├── src ├── __init__.py ├── utils │ ├── __init__.py │ └── logger.py ├── query │ ├── utils │ │ ├── __init__.py │ │ └── token_counter.py │ ├── custom_types │ │ ├── __init__.py │ │ ├── graphs │ │ │ ├── __init__.py │ │ │ ├── embedding.py │ │ │ └── community.py │ │ ├── tokens.py │ │ └── prompts.py │ ├── global_search │ │ ├── community_report.py │ │ ├── __init__.py │ │ ├── key_points_aggregator │ │ │ ├── __init__.py │ │ │ ├── prompt_builder.py │ │ │ ├── aggregator.py │ │ │ ├── context_builder.py │ │ │ └── _system_prompt.py │ │ ├── key_points_generator │ │ │ ├── _output_parser.py │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ ├── prompt_builder.py │ │ │ ├── generator.py │ │ │ ├── _system_prompt.py │ │ │ ├── temp.txt │ │ │ └── context_builder.py │ │ ├── community_weight_calculator.py │ │ └── search.py │ ├── local_search │ │ ├── __init__.py │ │ ├── context_builders │ │ │ ├── __init__.py │ │ │ ├── text_units.py │ │ │ ├── communities_reports.py │ │ │ ├── entities.py │ │ │ ├── context.py │ │ │ └── relationships.py │ │ ├── context_selectors │ │ │ ├── __init__.py │ │ │ ├── entities.py │ │ │ ├── communities_reports.py │ │ │ ├── text_units.py │ │ │ ├── relationships.py │ │ │ └── context.py │ │ ├── retriever.py │ │ ├── search.py │ │ ├── prompt_builder.py │ │ └── _system_prompt.py │ ├── __init__.py │ └── search.py ├── splitter │ ├── __init__.py │ └── slide_window_splitter.py └── index │ ├── __init__.py │ ├── pydantic_models.py │ ├── tqdm_LLMGraphTransformer.py │ ├── prompts.py │ ├── utils.py │ ├── cypher_query.py │ └── api_index.py ├── test_change.py ├── .gitignore ├── example ├── ollama_index.sh ├── ollama_search.sh ├── openai_search.sh ├── hf_index.sh ├── openai_index.sh └── hf_search.sh ├── index.sh ├── drop.py ├── txt ├── 三体_星际交锋2.txt └── 三体_星际交锋.txt ├── requirements.txt ├── README.md ├── search.log ├── search.py └── index.log /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test_change.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/query/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/splitter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/query/custom_types/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/query/custom_types/graphs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/index/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_index import ApiIndex 2 | 3 | __all__ = [ 4 | "ApiIndex" 5 | ] -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | __pycache__ 2 | .pytest_* 3 | /src/**/test* 4 | /node_modules 5 | /package-lock.json 6 | /package.json -------------------------------------------------------------------------------- /src/query/custom_types/tokens.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | 3 | 4 | class TokenCounter(Protocol): 5 | def count_tokens(self, text: str) -> int: ... -------------------------------------------------------------------------------- /src/query/custom_types/graphs/embedding.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | 3 | import networkx as nx 4 | import numpy as np 5 | 6 | 7 | class GraphEmbeddingGenerator(Protocol): 8 | def run(self, graph: nx.Graph) -> dict[str, np.ndarray]: ... -------------------------------------------------------------------------------- /src/query/global_search/community_report.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class CommunityReport: 5 | id: str 6 | title: str 7 | summary: str 8 | rank: float 9 | weight: float 10 | content: str -------------------------------------------------------------------------------- /src/query/global_search/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from .key_points_aggregator import KeyPointsAggregator 4 | from .key_points_generator import KeyPointsGenerator 5 | from .search import GlobalSearch 6 | 7 | __all__ = [ 8 | "GlobalSearch", 9 | "KeyPointsAggregator", 10 | "KeyPointsGenerator", 11 | ] 12 | -------------------------------------------------------------------------------- /src/query/local_search/__init__.py: -------------------------------------------------------------------------------- 1 | """Local Search module.""" 2 | 3 | from .prompt_builder import LocalSearchPromptBuilder 4 | from .retriever import LocalSearchRetriever 5 | from .search import LocalSearch 6 | 7 | __all__ = [ 8 | "LocalSearch", 9 | "LocalSearchPromptBuilder", 10 | "LocalSearchRetriever", 11 | ] -------------------------------------------------------------------------------- /example/ollama_index.sh: -------------------------------------------------------------------------------- 1 | python build_index.py \ 2 | --file_path ./txt \ 3 | --neo4j_uri bolt://localhost:7687 \ 4 | --neo4j_username neo4j \ 5 | --neo4j_password langchaingraphrag \ 6 | --model_provider ollama \ 7 | --chat_model_name llama3.1 \ 8 | --embedding_model_name_or_path BAAI/bge-m3 \ 9 | --max_workers 16 \ 10 | --device cuda 11 | -------------------------------------------------------------------------------- /src/query/global_search/key_points_aggregator/__init__.py: -------------------------------------------------------------------------------- 1 | from .aggregator import KeyPointsAggregator 2 | from .context_builder import KeyPointsContextBuilder 3 | from .prompt_builder import KeyPointsAggregatorPromptBuilder 4 | 5 | __all__ = [ 6 | "KeyPointsAggregatorPromptBuilder", 7 | "KeyPointsContextBuilder", 8 | "KeyPointsAggregator", 9 | ] -------------------------------------------------------------------------------- /example/ollama_search.sh: -------------------------------------------------------------------------------- 1 | python search.py \ 2 | --neo4j_uri bolt://localhost:7687 \ 3 | --neo4j_username 
neo4j \ 4 | --neo4j_password langchaingraphrag \ 5 | --model_provider ollama \ 6 | --chat_model_name llama3.1 \ 7 | --embedding_model_name_or_path BAAI/bge-m3 \ 8 | --max_workers 16 \ 9 | --device cuda \ 10 | --completion_mode completion \ 11 | --query_mode global -------------------------------------------------------------------------------- /example/openai_search.sh: -------------------------------------------------------------------------------- 1 | python search.py \ 2 | --neo4j_uri bolt://localhost:7687 \ 3 | --neo4j_username neo4j \ 4 | --neo4j_password langchaingraphrag \ 5 | --model_provider openai \ 6 | --embedding_model_name_or_path BAAI/bge-m3 \ 7 | --chat_model_name gpt-4o-mini \ 8 | --base_url https://api.gpt.ge/v1/ \ 9 | --completion_mode completion \ 10 | --query_mode global -------------------------------------------------------------------------------- /example/hf_index.sh: -------------------------------------------------------------------------------- 1 | python build_index.py \ 2 | --file_path ./txt \ 3 | --neo4j_uri bolt://localhost:7687 \ 4 | --neo4j_username neo4j \ 5 | --neo4j_password langchaingraphrag \ 6 | --model_provider hf \ 7 | --embedding_model_name_or_path BAAI/bge-m3 \ 8 | --repo_id NousResearch/Meta-Llama-3.1-8B-Instruct \ 9 | --max_workers 16 \ 10 | --flash_attn \ 11 | --device cuda 12 | -------------------------------------------------------------------------------- /example/openai_index.sh: -------------------------------------------------------------------------------- 1 | python build_index.py \ 2 | --file_path ./txt \ 3 | --neo4j_uri bolt://localhost:7687 \ 4 | --neo4j_username neo4j \ 5 | --neo4j_password langchaingraphrag \ 6 | --model_provider openai \ 7 | --embedding_model_name_or_path BAAI/bge-m3 \ 8 | --chat_model_name gpt-4o-mini \ 9 | --base_url https://api.gpt.ge/v1/ \ 10 | --max_workers 16 \ 11 | --device cuda -------------------------------------------------------------------------------- /index.sh: -------------------------------------------------------------------------------- 1 | python build_index.py \ 2 | --file_path ./txt \ 3 | --neo4j_uri bolt://localhost:7687 \ 4 | --neo4j_username your_username \ 5 | --neo4j_password your_password \ 6 | --model_provider openai \ 7 | --embedding_model_name_or_path BAAI/bge-m3 \ 8 | --chat_model_name deepseek-chat \ 9 | --base_url https://api.deepseek.com \ 10 | --max_workers 16 \ 11 | --device cuda 12 | -------------------------------------------------------------------------------- /example/hf_search.sh: -------------------------------------------------------------------------------- 1 | python search.py \ 2 | --neo4j_uri bolt://localhost:7687 \ 3 | --neo4j_username neo4j \ 4 | --neo4j_password langchaingraphrag \ 5 | --model_provider hf \ 6 | --embedding_model_name_or_path BAAI/bge-m3 \ 7 | --repo_id NousResearch/Meta-Llama-3.1-8B-Instruct \ 8 | --max_workers 16 \ 9 | --device cuda \ 10 | --completion_mode completion \ 11 | --query_mode global -------------------------------------------------------------------------------- /src/query/__init__.py: -------------------------------------------------------------------------------- 1 | from .local_search import * 2 | from .global_search import * 3 | from .search import GlobalSearcher,LocalSearcher 4 | 5 | __all__ = [ 6 | "LocalSearch", 7 | "LocalSearchPromptBuilder", 8 | "LocalSearchRetriever", 9 | "GlobalSearch", 10 | "KeyPointsAggregator", 11 | "KeyPointsGenerator", 12 | "LocalSearcher", 13 | "GlobalSearcher", 14 | ] 
-------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/_output_parser.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from langchain.output_parsers import PydanticOutputParser 4 | 5 | from .utils import KeyPointsResult 6 | 7 | 8 | class KeyPointsOutputParser(PydanticOutputParser): 9 | def __init__(self, **kwargs: dict[str, Any]): 10 | super().__init__(pydantic_object=KeyPointsResult, **kwargs) 11 | -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/__init__.py: -------------------------------------------------------------------------------- 1 | """Key Points generator module.""" 2 | 3 | from .context_builder import CommunityReportContextBuilder 4 | from .generator import KeyPointsGenerator 5 | from .prompt_builder import KeyPointsGeneratorPromptBuilder 6 | 7 | 8 | __all__ = [ 9 | "KeyPointsGeneratorPromptBuilder", 10 | "CommunityReportContextBuilder", 11 | "KeyPointsGenerator", 12 | ] -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/utils.py: -------------------------------------------------------------------------------- 1 | from langchain_core.pydantic_v1 import BaseModel, Field 2 | 3 | 4 | class KeyPointInfo(BaseModel): 5 | description: str = Field(description="The description of the key point") 6 | score: float = Field(description="The score of the key point") 7 | 8 | 9 | class KeyPointsResult(BaseModel): 10 | points: list[KeyPointInfo] = Field(description="the points") -------------------------------------------------------------------------------- /src/query/utils/token_counter.py: -------------------------------------------------------------------------------- 1 | """Counter for Tiktoken based tokens.""" 2 | 3 | import tiktoken 4 | 5 | from ..custom_types.tokens import TokenCounter 6 | 7 | class TiktokenCounter(TokenCounter): 8 | def __init__(self, encoding_name: str = "cl100k_base"): 9 | self.tokenizer = tiktoken.get_encoding(encoding_name) 10 | 11 | def count_tokens(self, text: str) -> int: 12 | return len(self.tokenizer.encode(text)) -------------------------------------------------------------------------------- /src/query/custom_types/prompts.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Protocol 2 | 3 | from langchain_core.output_parsers.base import BaseOutputParser 4 | from langchain_core.prompts import BasePromptTemplate 5 | from typing_extensions import Unpack 6 | 7 | 8 | class PromptBuilder(Protocol): 9 | def build(self) -> tuple[BasePromptTemplate, BaseOutputParser]: ... 10 | 11 | 12 | class IndexingPromptBuilder(PromptBuilder, Protocol): 13 | def prepare_chain_input( 14 | self, **kwargs: Unpack[dict[str, Any]] 15 | ) -> dict[str, str]: ... 
-------------------------------------------------------------------------------- /src/query/local_search/context_builders/__init__.py: -------------------------------------------------------------------------------- 1 | """Context builders for local search.""" 2 | 3 | from .communities_reports import CommunitiesReportsContextBuilder 4 | from .context import ContextBuilder 5 | from .entities import EntitiesContextBuilder 6 | from .relationships import RelationshipsContextBuilder 7 | from .text_units import TextUnitsContextBuilder 8 | 9 | __all__ = [ 10 | "EntitiesContextBuilder", 11 | "ContextBuilder", 12 | "RelationshipsContextBuilder", 13 | "TextUnitsContextBuilder", 14 | "CommunitiesReportsContextBuilder", 15 | ] -------------------------------------------------------------------------------- /src/query/local_search/context_selectors/__init__.py: -------------------------------------------------------------------------------- 1 | """Context selectors for local search.""" 2 | 3 | from .communities_reports import CommunitiesReportsSelector 4 | from .context import ContextSelectionResult, ContextSelector 5 | from .entities import EntitiesSelector 6 | from .relationships import RelationshipsSelector 7 | from .text_units import TextUnitsSelector 8 | 9 | __all__ = [ 10 | "ContextSelector", 11 | "ContextSelectionResult", 12 | "EntitiesSelector", 13 | "TextUnitsSelector", 14 | "RelationshipsSelector", 15 | "CommunitiesReportsSelector", 16 | ] -------------------------------------------------------------------------------- /src/index/pydantic_models.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from pydantic import BaseModel, Field 3 | 4 | 5 | class DuplicateEntities(BaseModel): 6 | entities: List[str] = Field( 7 | description="Entities that represent the same object or real-world entity and should be merged" 8 | ) 9 | 10 | 11 | class Disambiguate(BaseModel): 12 | merge_entities: Optional[List[DuplicateEntities]] = Field( 13 | description="Lists of entities that represent the same object or real-world entity and should be merged" 14 | ) 15 | 16 | class GetTitle(BaseModel): 17 | title: str = Field(description="Title of the given summary") -------------------------------------------------------------------------------- /src/index/tqdm_LLMGraphTransformer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import List, Optional, Sequence 3 | from langchain_community.graphs.graph_document import GraphDocument 4 | from langchain_core.documents import Document 5 | from langchain_core.runnables import RunnableConfig 6 | from langchain_experimental.graph_transformers import LLMGraphTransformer 7 | from tqdm.asyncio import tqdm 8 | 9 | class t_LLMGraphTransformer(LLMGraphTransformer): 10 | 11 | async def aconvert_to_graph_documents(self, documents: Sequence[Document], config: Optional[RunnableConfig] = None) -> List[GraphDocument]: 12 | """ 13 | Asynchronously convert a sequence of documents into graph documents.
14 | """ 15 | tasks = [ 16 | asyncio.create_task(self.aprocess_response(document, config)) 17 | for document in documents 18 | ] 19 | results = await tqdm.gather(*tasks) 20 | return results -------------------------------------------------------------------------------- /src/query/custom_types/graphs/community.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import NewType, Protocol 3 | 4 | import networkx as nx 5 | 6 | CommunityId = NewType("CommunityId", int) 7 | CommunityLevel = NewType("CommunityLevel", int) 8 | 9 | 10 | @dataclass 11 | class CommunityNode: 12 | name: str 13 | parent_cluster: CommunityId | None 14 | is_final_cluster: bool 15 | 16 | 17 | @dataclass 18 | class Community: 19 | id: CommunityId 20 | nodes: list[CommunityNode] 21 | 22 | 23 | @dataclass 24 | class CommunityDetectionResult: 25 | communities: dict[CommunityLevel, dict[CommunityId, Community]] 26 | 27 | def communities_at_level(self, level: CommunityLevel) -> list[Community]: 28 | return list(self.communities[level].values()) 29 | 30 | 31 | class CommunityDetector(Protocol): 32 | def run(self, graph: nx.Graph) -> CommunityDetectionResult: ... -------------------------------------------------------------------------------- /src/query/local_search/retriever.py: -------------------------------------------------------------------------------- 1 | from langchain_core.callbacks import CallbackManagerForRetrieverRun 2 | from langchain_core.documents import Document 3 | from langchain_core.retrievers import BaseRetriever 4 | from langchain_core.vectorstores import VectorStore 5 | from .context_builders import ContextBuilder 6 | from .context_selectors import ContextSelector 7 | 8 | 9 | class LocalSearchRetriever(BaseRetriever): 10 | context_selector: ContextSelector 11 | context_builder: ContextBuilder 12 | graph: VectorStore 13 | 14 | def _get_relevant_documents( 15 | self, 16 | query: str, 17 | *, 18 | run_manager: CallbackManagerForRetrieverRun, # noqa: ARG002 19 | ) -> list[Document]: 20 | context_selection_result = self.context_selector.run( 21 | query=query, 22 | graph=self.graph, 23 | ) 24 | 25 | return self.context_builder(context_selection_result) -------------------------------------------------------------------------------- /drop.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | from langchain_community.graphs import Neo4jGraph 4 | 5 | def parse_args(): 6 | arg_parser = argparse.ArgumentParser(description="Drop the index, constraint, and entities for a given UUID") 7 | 8 | arg_parser.add_argument("--neo4j_uri", type=str, default=None, help="Neo4j URI") 9 | arg_parser.add_argument("--neo4j_username", type=str, default=None, help="Neo4j username") 10 | arg_parser.add_argument("--neo4j_password", type=str, default=None, help="Neo4j password") 11 | arg_parser.add_argument("--uuid", type=str, default="", help="UUID for the index") 12 | 13 | return arg_parser.parse_args() 14 | 15 | def drop(): 16 | args = parse_args() 17 | graph = Neo4jGraph(url=args.neo4j_uri, username=args.neo4j_username, password=args.neo4j_password) 18 | 19 | graph.query(f"DROP INDEX `{args.uuid}` IF EXISTS") 20 | graph.query(f"DROP CONSTRAINT ON (n:`__Entity__{args.uuid}`) ASSERT n.id IS UNIQUE") 21 | graph.query(f"MATCH (n:`__Entity__{args.uuid}`) DETACH DELETE n") 22 | 23 | if __name__ == "__main__": 24 | drop() -------------------------------------------------------------------------------- /src/query/global_search/community_weight_calculator.py:
-------------------------------------------------------------------------------- 1 | """Compute the weight of the community.""" 2 | 3 | import pandas as pd 4 | 5 | from ..custom_types.graphs.community import CommunityId 6 | 7 | 8 | class CommunityWeightCalculator: 9 | def __init__(self, *, should_normalize: bool = True): 10 | self._should_normalize = should_normalize 11 | 12 | def __call__( 13 | self, 14 | df_entities: pd.DataFrame, 15 | df_reports: pd.DataFrame, 16 | ) -> dict[CommunityId, float]: 17 | result: dict[CommunityId, float] = {} 18 | for _, row in df_reports.iterrows(): 19 | entities = row["entities"] 20 | # get rows from entities dataframe where ids are in entities 21 | df_entities_filtered = df_entities[df_entities["id"].isin(entities)] 22 | # get the text_units from df_entities_filtered 23 | text_units = df_entities_filtered["text_unit_ids"].explode().unique() 24 | result[row["community_id"]] = len(text_units) 25 | 26 | if self._should_normalize: 27 | max_weight = max(result.values()) 28 | for community_id in result: 29 | result[community_id] = result[community_id] / max_weight 30 | 31 | return result -------------------------------------------------------------------------------- /src/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging.handlers import RotatingFileHandler 3 | 4 | def create_rotating_logger(name: str, log_file: str, max_bytes: int = 10 * 1024 * 1024, backup_count: int = 5, level=logging.DEBUG) -> logging.Logger: 5 | """Create a rotating logger instance. 6 | 7 | Args: 8 | name (str): Name of the logger. 9 | log_file (str): Path to the log file. 10 | max_bytes (int): Maximum size of a single log file in bytes; defaults to 10MB. 11 | backup_count (int): Number of old log files to keep; defaults to 5. 12 | level (int): Logging level; defaults to logging.DEBUG. 13 | 14 | Returns: 15 | logging.Logger: The configured rotating logger instance. 16 | """ 17 | # create the logger 18 | logger = logging.getLogger(name) 19 | logger.setLevel(level) 20 | # create the log message format 21 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 22 | 23 | # create the console handler 24 | console_handler = logging.StreamHandler() 25 | console_handler.setLevel(level) 26 | console_handler.setFormatter(formatter) 27 | 28 | # create the rotating file handler 29 | if log_file: 30 | handler = RotatingFileHandler(log_file, maxBytes=max_bytes, backupCount=backup_count) 31 | handler.setLevel(level) 32 | handler.setFormatter(formatter) 33 | 34 | # attach the handlers to the logger 35 | logger.addHandler(handler) 36 | logger.addHandler(console_handler) 37 | 38 | return logger -------------------------------------------------------------------------------- /src/query/local_search/search.py: -------------------------------------------------------------------------------- 1 | from langchain_core.documents import Document 2 | from langchain_core.language_models import BaseLLM 3 | from langchain_core.retrievers import BaseRetriever 4 | from langchain_core.runnables import Runnable, RunnablePassthrough 5 | 6 | from ..custom_types.prompts import PromptBuilder 7 | 8 | 9 | def _format_docs(documents: list[Document]) -> str: 10 | context_data = [d.page_content for d in documents] 11 | context_data_str: str = "\n".join(context_data) 12 | return context_data_str 13 | 14 | 15 | class LocalSearch: 16 | def __init__( 17 | self, 18 | chat_model: BaseLLM, 19 | prompt_builder: PromptBuilder, 20 | retriever: BaseRetriever, 21 | ): 22 | self._chat_model = chat_model 23 | self._prompt_builder = prompt_builder 24 | self._retriever = retriever 25 | 26 | def __call__(self, query) -> str: 27 | prompt, output_parser =
self._prompt_builder.build() 28 | 29 | base_chain = prompt | self._chat_model | output_parser 30 | 31 | search_chain: Runnable = { 32 | "context_data": self._retriever | _format_docs, 33 | "local_query": RunnablePassthrough(), 34 | } | base_chain 35 | 36 | return search_chain.invoke(query) -------------------------------------------------------------------------------- /src/query/local_search/prompt_builder.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from langchain_core.output_parsers.base import BaseOutputParser 4 | from langchain_core.output_parsers.string import StrOutputParser 5 | from langchain_core.prompts import ( 6 | BasePromptTemplate, 7 | ChatPromptTemplate, 8 | SystemMessagePromptTemplate, 9 | ) 10 | 11 | from ..custom_types.prompts import PromptBuilder 12 | 13 | from ._system_prompt import LOCAL_SEARCH_SYSTEM_PROMPT 14 | 15 | 16 | class LocalSearchPromptBuilder(PromptBuilder): 17 | def __init__( 18 | self, 19 | *, 20 | system_prompt: str | None = None, 21 | system_prompt_path: Path | None = None, 22 | ): 23 | self._system_prompt: str | None 24 | if system_prompt is None and system_prompt_path is None: 25 | self._system_prompt = LOCAL_SEARCH_SYSTEM_PROMPT 26 | else: 27 | self._system_prompt = system_prompt 28 | 29 | self._system_prompt_path = system_prompt_path 30 | 31 | def build(self) -> tuple[BasePromptTemplate, BaseOutputParser]: 32 | if self._system_prompt_path: 33 | prompt = Path.read_text(self._system_prompt_path) 34 | else: 35 | assert self._system_prompt is not None 36 | prompt = self._system_prompt 37 | 38 | system_template = SystemMessagePromptTemplate.from_template( 39 | prompt, 40 | partial_variables=dict(response_type="Multiple Paragraphs"), 41 | ) 42 | 43 | template = ChatPromptTemplate([system_template, ("user", "{local_query}")]) 44 | return template, StrOutputParser() -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/prompt_builder.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from langchain_core.output_parsers.base import BaseOutputParser 4 | from langchain_core.prompts import ( 5 | BasePromptTemplate, 6 | ChatPromptTemplate, 7 | SystemMessagePromptTemplate, 8 | ) 9 | 10 | from ...custom_types.prompts import PromptBuilder 11 | 12 | from ._output_parser import KeyPointsOutputParser 13 | from ._system_prompt import MAP_SYSTEM_PROMPT 14 | 15 | 16 | class KeyPointsGeneratorPromptBuilder(PromptBuilder): 17 | def __init__( 18 | self, 19 | *, 20 | system_prompt: str | None = None, 21 | system_prompt_path: Path | None = None, 22 | ): 23 | self._system_prompt: str | None 24 | if system_prompt is None and system_prompt_path is None: 25 | self._system_prompt = MAP_SYSTEM_PROMPT 26 | else: 27 | self._system_prompt = system_prompt 28 | 29 | self._system_prompt_path = system_prompt_path 30 | 31 | def build(self) -> tuple[BasePromptTemplate, BaseOutputParser]: 32 | if self._system_prompt_path: 33 | if type(self._system_prompt_path) is str: 34 | self._system_prompt_path = Path(self._system_prompt_path) 35 | prompt = self._system_prompt_path.read_text(encoding='utf-8') 36 | else: 37 | assert self._system_prompt is not None 38 | prompt = self._system_prompt 39 | 40 | system_template = SystemMessagePromptTemplate.from_template(prompt) 41 | 42 | template = ChatPromptTemplate([system_template, ("user", "{global_query}")]) 43 | return template, 
KeyPointsOutputParser() 44 | 45 | -------------------------------------------------------------------------------- /src/query/global_search/key_points_aggregator/prompt_builder.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from langchain_core.output_parsers.base import BaseOutputParser 4 | from langchain_core.output_parsers.string import StrOutputParser 5 | from langchain_core.prompts import ( 6 | BasePromptTemplate, 7 | ChatPromptTemplate, 8 | SystemMessagePromptTemplate, 9 | ) 10 | 11 | from ...custom_types.prompts import PromptBuilder 12 | 13 | from ._system_prompt import REDUCE_SYSTEM_PROMPT 14 | 15 | 16 | class KeyPointsAggregatorPromptBuilder(PromptBuilder): 17 | def __init__( 18 | self, 19 | *, 20 | system_prompt: str | None = None, 21 | system_prompt_path: Path | None = None, 22 | ): 23 | self._system_prompt: str | None 24 | if system_prompt is None and system_prompt_path is None: 25 | self._system_prompt = REDUCE_SYSTEM_PROMPT 26 | else: 27 | self._system_prompt = system_prompt 28 | 29 | self._system_prompt_path = system_prompt_path 30 | 31 | def build(self) -> tuple[BasePromptTemplate, BaseOutputParser]: 32 | if self._system_prompt_path: 33 | if type(self._system_prompt_path) is str: 34 | self._system_prompt_path = Path(self._system_prompt_path) 35 | prompt = self._system_prompt_path.read_text(encoding='utf-8') 36 | else: 37 | assert self._system_prompt is not None 38 | prompt = self._system_prompt 39 | 40 | system_template = SystemMessagePromptTemplate.from_template( 41 | prompt, 42 | partial_variables=dict(response_type="Multiple Paragraphs"), 43 | ) 44 | 45 | template = ChatPromptTemplate([system_template, ("user", "{global_query}")]) 46 | return template, StrOutputParser() -------------------------------------------------------------------------------- /src/query/local_search/context_builders/text_units.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | from langchain_core.documents import Document 5 | 6 | from ...custom_types.tokens import TokenCounter 7 | 8 | _LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | class TextUnitsContextBuilder: 12 | def __init__( 13 | self, 14 | *, 15 | context_name: str = "Sources", 16 | column_delimiter: str = "|", 17 | max_tokens: int = 8000, 18 | token_counter: TokenCounter, 19 | ): 20 | self._context_name = context_name 21 | self._column_delimiter = column_delimiter 22 | self._max_tokens = max_tokens 23 | self._token_counter = token_counter 24 | 25 | def __call__(self, text_units: pd.DataFrame) -> Document: 26 | context_text = f"-----{self._context_name}-----" + "\n" 27 | header = ["id", "text"] 28 | 29 | context_text += self._column_delimiter.join(header) + "\n" 30 | token_count = self._token_counter.count_tokens(context_text) 31 | 32 | for row in text_units.itertuples(): 33 | new_context = [str(row.short_id), row.text_unit] 34 | new_context_text = self._column_delimiter.join(new_context) + "\n" 35 | 36 | new_token_count = self._token_counter.count_tokens(new_context_text) 37 | if token_count + new_token_count > self._max_tokens: 38 | _LOGGER.warning( 39 | f"Stopping text units context build at {token_count} tokens ..." 
40 | ) 41 | break 42 | 43 | context_text += new_context_text 44 | token_count += new_token_count 45 | 46 | return Document( 47 | page_content=context_text, 48 | metadata={"token_count": token_count}, 49 | ) -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/generator.py: -------------------------------------------------------------------------------- 1 | from langchain_core.documents import Document 2 | from langchain_core.language_models.chat_models import BaseChatModel 3 | from langchain_core.runnables import Runnable, RunnableParallel 4 | 5 | from ...custom_types.prompts import PromptBuilder 6 | 7 | from .context_builder import CommunityReportContextBuilder 8 | 9 | import json_repair 10 | import json 11 | 12 | def _format_docs(documents: list[Document]) -> str: 13 | context_data = [d.page_content for d in documents] 14 | context_data_str: str = "\n".join(context_data) 15 | return context_data_str 16 | 17 | class KeyPointsGenerator: 18 | def __init__( 19 | self, 20 | chat_model: BaseChatModel, 21 | prompt_builder: PromptBuilder, 22 | context_builder: CommunityReportContextBuilder, 23 | ): 24 | self._chat_model = chat_model 25 | self._prompt_builder = prompt_builder 26 | self._context_builder = context_builder 27 | 28 | # Generate key points: one chain per community-report document 29 | def __call__(self) -> Runnable: 30 | prompt, output_parser = self._prompt_builder.build() 31 | documents = self._context_builder() 32 | 33 | chains: list[Runnable] = [] 34 | for d in documents: 35 | d_context_data = _format_docs([d]) 36 | d_prompt = prompt.partial(context_data=d_context_data) 37 | 38 | # TODO: error handling 39 | generator_chain: Runnable = d_prompt | self._chat_model | (lambda output: json.dumps(json_repair.loads(output.content))) | output_parser 40 | 41 | chains.append(generator_chain) 42 | 43 | analysts = [f"Analyst-{i}" for i in range(1, len(chains) + 1)] 44 | # {"Analyst-1": "Chain-1", "Analyst-2": "Chain-2", "Analyst-3": "Chain-3"} 45 | return RunnableParallel(dict(zip(analysts, chains, strict=True))) -------------------------------------------------------------------------------- /src/query/global_search/key_points_aggregator/aggregator.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from functools import partial 3 | 4 | from langchain_core.documents import Document 5 | from langchain_core.language_models.chat_models import BaseChatModel 6 | from langchain_core.runnables import Runnable, RunnableLambda 7 | 8 | from ..key_points_generator.utils import KeyPointsResult 9 | 10 | from ...custom_types.prompts import PromptBuilder 11 | 12 | from .context_builder import KeyPointsContextBuilder 13 | 14 | 15 | def _format_docs(documents: list[Document]) -> str: 16 | context_data = [d.page_content for d in documents] 17 | context_data_str: str = "\n".join(context_data) 18 | return context_data_str 19 | 20 | 21 | def _kp_result_to_docs( 22 | key_points: dict[str, KeyPointsResult], 23 | context_builder: KeyPointsContextBuilder, 24 | ) -> list[Document]: 25 | return context_builder(key_points) 26 | 27 | 28 | class KeyPointsAggregator: 29 | def __init__( 30 | self, 31 | chat_model: BaseChatModel, 32 | prompt_builder: PromptBuilder, 33 | context_builder: KeyPointsContextBuilder, 34 | ): 35 | self._chat_model = chat_model 36 | self._prompt_builder = prompt_builder 37 | self._context_builder = context_builder 38 | 39 | def __call__(self) -> Runnable: 40 | kp_lambda = partial( 41 | _kp_result_to_docs, 42 |
context_builder=self._context_builder, 43 | ) 44 | 45 | prompt, output_parser = self._prompt_builder.build() 46 | base_chain = prompt | self._chat_model | output_parser # TODO:异常处理 47 | 48 | search_chain: Runnable = { 49 | "report_data": operator.itemgetter("report_data") 50 | | RunnableLambda(kp_lambda) 51 | | _format_docs, 52 | "global_query": operator.itemgetter("global_query"), 53 | } | base_chain 54 | 55 | return search_chain -------------------------------------------------------------------------------- /src/query/local_search/context_selectors/entities.py: -------------------------------------------------------------------------------- 1 | """Select the entities to be used in the local search.""" 2 | 3 | import logging 4 | import re 5 | import pandas as pd 6 | from langchain_core.vectorstores import VectorStore 7 | 8 | _LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | class EntitiesSelector: 12 | def __init__(self, vector_store: VectorStore, top_k: int): 13 | self._vector_store = vector_store 14 | self._top_k = top_k 15 | 16 | def run(self, query: str, df_entities: pd.DataFrame) -> pd.DataFrame: 17 | """Select the entities to be used in the local search.""" 18 | documents_with_scores = ( 19 | self._vector_store.similarity_search_with_relevance_scores( 20 | query, 21 | self._top_k, 22 | ) 23 | ) 24 | # Relying on metadata to get the entity_ids 25 | # These returned entities are ranked by similarity 26 | pattern = r"\nid:\s*(\S+)\s*\ndescription:" 27 | 28 | entity_ids_with_scores = pd.DataFrame.from_records( 29 | [ 30 | dict(id=re.search(pattern=pattern,string=doc.page_content).group(1), score=score) 31 | for doc, score in documents_with_scores 32 | ] 33 | ) 34 | entity_ids_with_scores.drop_duplicates(inplace=True) 35 | # Filter the entities dataframe to only include the selected entities 36 | selected_entities = df_entities[ 37 | df_entities["id"].isin(entity_ids_with_scores["id"]) 38 | ] 39 | 40 | selected_entities = ( 41 | selected_entities.merge(entity_ids_with_scores, on="id") 42 | .sort_values(by="score", ascending=False) 43 | .reset_index(drop=True) 44 | ) 45 | 46 | if _LOGGER.isEnabledFor(logging.DEBUG): 47 | import tableprint 48 | 49 | tableprint.banner("Selected Entities") 50 | tableprint.dataframe(selected_entities[["id", "description", "score"]]) 51 | 52 | return selected_entities -------------------------------------------------------------------------------- /src/query/local_search/context_builders/communities_reports.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | from langchain_core.documents import Document 5 | 6 | from ...custom_types.tokens import TokenCounter 7 | 8 | _LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | class CommunitiesReportsContextBuilder: 12 | def __init__( 13 | self, 14 | *, 15 | context_name: str = "Reports", 16 | column_delimiter: str = "|", 17 | max_tokens: int = 8000, 18 | token_counter: TokenCounter, 19 | ): 20 | self._context_name = context_name 21 | self._column_delimiter = column_delimiter 22 | self._max_tokens = max_tokens 23 | self._token_counter = token_counter 24 | 25 | def __call__(self, communities_reports: pd.DataFrame) -> Document: 26 | context_text = f"-----{self._context_name}-----" + "\n" 27 | header = ["id", "title", "content"] 28 | 29 | context_text += self._column_delimiter.join(header) + "\n" 30 | token_count = self._token_counter.count_tokens(context_text) 31 | 32 | for report in communities_reports.itertuples(): 33 | try: 34 | 
new_context = [ 35 | str(report.community_id), 36 | report.title, 37 | report.content, 38 | ] 39 | except Exception as e: 40 | continue 41 | 42 | new_context_text = self._column_delimiter.join(new_context) + "\n" 43 | new_token_count = self._token_counter.count_tokens(new_context_text) 44 | 45 | if token_count + new_token_count > self._max_tokens: 46 | _LOGGER.warning( 47 | f"Stopping communities context build at {token_count} tokens ..." 48 | ) 49 | break 50 | 51 | context_text += new_context_text 52 | token_count += new_token_count 53 | 54 | return Document( 55 | page_content=context_text, 56 | metadata={"token_count": token_count}, 57 | ) -------------------------------------------------------------------------------- /src/query/global_search/search.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Iterator 3 | 4 | from langchain_core.runnables import RunnableConfig 5 | 6 | from .key_points_aggregator import KeyPointsAggregator 7 | from .key_points_generator import KeyPointsGenerator 8 | from .key_points_generator.utils import KeyPointsResult 9 | 10 | _LOGGER = logging.getLogger(__name__) 11 | 12 | class GlobalSearch: 13 | def __init__( 14 | self, 15 | kp_generator: KeyPointsGenerator, 16 | kp_aggregator: KeyPointsAggregator, 17 | *, 18 | generation_chain_config: RunnableConfig | None = None, 19 | aggregation_chain_config: RunnableConfig | None = None, 20 | ): 21 | self._kp_generator = kp_generator 22 | self._kp_aggregator = kp_aggregator 23 | self._generation_chain_config = generation_chain_config 24 | self._aggregation_chain_config = aggregation_chain_config 25 | 26 | def _get_key_points(self, query: str) -> dict[str, KeyPointsResult]: 27 | generation_chain = self._kp_generator() 28 | response = generation_chain.invoke( 29 | query, 30 | config=self._generation_chain_config, 31 | ) 32 | 33 | if _LOGGER.getEffectiveLevel() == logging.INFO: 34 | for k, v in response.items(): 35 | _LOGGER.info(f"{k} - {len(v.points)}") 36 | 37 | return response 38 | 39 | def invoke(self, query: str) -> str: 40 | aggregation_chain = self._kp_aggregator() 41 | response = self._get_key_points(query) 42 | 43 | return aggregation_chain.invoke( 44 | input=dict(report_data=response, global_query=query), 45 | config=self._aggregation_chain_config, 46 | ) 47 | 48 | def stream(self, query: str) -> Iterator: 49 | aggregation_chain = self._kp_aggregator() 50 | response = self._get_key_points(query) 51 | 52 | return aggregation_chain.stream( 53 | input=dict(report_data=response, global_query=query), 54 | config=self._aggregation_chain_config, 55 | ) -------------------------------------------------------------------------------- /txt/三体_星际交锋2.txt: -------------------------------------------------------------------------------- 1 | 第一章:新征程 2 | 在经历了与三体文明的初步交流后,李明和他的团队意识到,未来的挑战不仅仅是技术上的合作,更是文化与价值观的碰撞。为了更好地与三体文明建立联系,地球决定组建一个“星际交流委员会”,由不同领域的专家组成,专注于与三体文明的沟通与合作。 3 | 4 | 李明被任命为委员会的负责人。他的首要任务是制定交流计划,确保双方在文化、科技和伦理方面的理解与合作。 5 | 6 | 在一次会议上,王娜提出:“我们需要更多地了解三体文明的社会结构和价值观,以便更好地进行沟通。” 7 | 8 | “我同意,”张伟补充道,“尤其是他们对生物工程和人工智能的看法,这将直接影响我们的合作。” 9 | 10 | 李明点了点头:“我们可以组织一系列的文化交流活动,让双方的专家进行深入讨论。” 11 | 12 | 会议结束后,李明感到责任重大。他知道,未来的每一步都将影响人类与三体文明的关系。 13 | 14 | 第二章:文化的深度 15 | 几周后,星际交流委员会组织了第一次“文化交流夜”。李明邀请了三体文明的智者代表,双方在一个虚拟会议室中进行交流。 16 | 17 | “在我们三体文明中,理性与逻辑是决策的核心,”一位智者说道,“而你们似乎更注重情感与创造力。” 18 | 19 | 李明微笑着回应:“是的,情感在我们决策中扮演着重要角色。我们认为,情感与理性并不是对立的,而是可以互补的。” 20 | 21 | 为了促进理解,李明建议双方分享各自的艺术作品。地球的音乐、绘画、文学与三体文明的数学艺术、逻辑游戏相互交融,创造出了一种新的文化体验。 22 | 23 | 
在一次交流中,王娜展示了地球的音乐作品,智者们对此表现出浓厚的兴趣。他们询问:“音乐如何影响你们的情感和决策?” 24 | 25 | “音乐能够激发我们的情感,让我们更好地理解彼此的感受,”王娜回答道。 26 | 27 | 这次交流让双方的理解加深,李明意识到,文化的碰撞是建立信任的关键。 28 | 29 | 第三章:外星威胁 30 | 就在地球与三体文明的关系逐渐加深之际,李明收到了一条来自三体文明的紧急信号。信号中提到,一个名为“掠夺者”的外星种族正在接近他们的星系,意图侵略。 31 | 32 | “掠夺者以掠夺其他文明的资源为生,他们的科技极为先进,”三体智者在信号中说道,“我们必须联合起来对抗这一威胁。” 33 | 34 | 李明意识到,保护地球和三体的安全已成为当务之急。他召集团队进行紧急会议。 35 | 36 | “我们需要建立一个联合防御系统,”李明提议,“结合我们的科技,以防范掠夺者的侵袭。” 37 | 38 | 王娜和张伟积极响应,提出了具体的技术方案。经过几天的讨论,他们最终制定了详细的战略,准备向三体文明的智者提交。 39 | 40 | 第四章:战斗的准备 41 | 在与三体文明的共同努力下,地球和三体的联合防御系统逐渐成型。李明和团队在技术研发方面取得了显著进展,双方的军队也开始进行联合训练。 42 | 43 | “这是我们第一次面对外星威胁,”李明在一次训练中对士兵们说道,“我们必须团结一致,发挥各自的优势。” 44 | 45 | 掠夺者的逼近让整个地球和三体星球都陷入紧张的氛围。李明和团队不断进行模拟演习,确保双方能够默契配合。 46 | 47 | 然而,随着训练的深入,李明发现三体的智者对地球的决策产生了质疑。他们认为,人类的情感思维可能会影响理性判断。 48 | 49 | “我们必须保持理性,避免情感干扰我们的决策,”智者之一说道。 50 | 51 | 李明意识到,信任的考验即将来临。他决定更加深入地与三体智者沟通,以建立更强的合作关系。 52 | 53 | 第五章:第一次冲突 54 | 掠夺者终于现身,展开了对地球和三体的攻击。在战斗中,李明和团队运用新技术,展现出人类和三体的合作力量。 55 | 56 | 然而,战斗的结果却并不如预期。掠夺者的实力超出他们的想象,联合防御系统未能有效抵御攻击。李明和团队在战斗中遭遇了重创,许多士兵受伤,几艘战舰被摧毁。 57 | 58 | “我们必须重新评估我们的战略,”李明在战斗后对团队说道,“这次失败让我们明白,单靠技术和军事力量并不足以应对外星威胁。” 59 | 60 | 在这次惨痛的教训后,李明和团队开始反思合作的方式。他们意识到,理解彼此的文化和价值观是应对外星威胁的关键。 61 | 62 | 第六章:希望的曙光 63 | 经过反思与调整,李明和团队逐渐找到了应对掠夺者的全新策略。他们决定在技术合作的同时,深入了解三体文明的文化与价值观。 64 | 65 | 李明与三体的智者进行深入对话,探讨如何将理性与情感结合,以形成新的决策模式。在一次会议上,李明总结道:“我们需要找到一种平衡,利用理性来制定策略,同时不忽视情感对决策的影响。” 66 | 67 | 经过多次讨论,双方终于达成共识,决定共同研发一种新的防御系统,结合三体的逻辑与地球的创造力。 68 | 69 | 随着新策略的实施,李明感到希望的曙光正在逐渐显现。他们不仅在科技上取得了进展,更在文化与价值观的融合中找到了合作的基础。 -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/_system_prompt.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: E501 2 | 3 | MAP_SYSTEM_PROMPT = """ 4 | ---Role--- 5 | 6 | You are a helpful assistant responding to questions about data in the tables provided. 7 | 8 | 9 | ---Goal--- 10 | 11 | Generate a response consisting of a list of key points that responds to the user's question, summarizing all relevant information in the input data tables. 12 | 13 | You should use the data provided in the data tables below as the primary context for generating the response. 14 | If you don't know the answer or if the input data tables do not contain sufficient information to provide an answer, just say so. Do not make anything up. 15 | 16 | Each key point in the response should have the following element: 17 | - Description: A comprehensive description of the point. 18 | - Importance Score: An integer score between 0-100 that indicates how important the point is in answering the user's question. An 'I don't know' type of response should have a score of 0. 19 | 20 | The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will". 21 | 22 | Points supported by data should list the relevant reports as references as follows: 23 | "This is an example sentence supported by data references [Data: Reports (report ids)]" 24 | 25 | **Do not list more than 5 record ids in a single reference**. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. 26 | 27 | For example: 28 | "Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Reports (2, 7, 64, 46, 34, +more)]. He is also CEO of company X [Data: Reports (1, 3)]" 29 | 30 | where 1, 2, 3, 7, 34, 46, and 64 represent the id (not the index) of the relevant data report in the provided tables. 31 | 32 | Do not include information where the supporting evidence for it is not provided. 
33 | 34 | The response should be JSON formatted as follows: 35 | {{ 36 | "points": [ 37 | {{"description": "Description of point 1 [Data: Reports (report ids)]", "score": score_value}}, 38 | {{"description": "Description of point 2 [Data: Reports (report ids)]", "score": score_value}} 39 | ] 40 | }} 41 | 42 | ---Data tables--- 43 | 44 | {context_data} 45 | 46 | """ -------------------------------------------------------------------------------- /src/query/local_search/context_builders/entities.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | from langchain_core.documents import Document 5 | 6 | from ...custom_types.tokens import TokenCounter 7 | 8 | _LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | class EntitiesContextBuilder: 12 | def __init__( 13 | self, 14 | *, 15 | include_rank: bool = True, 16 | context_name: str = "Entities", 17 | rank_heading: str = "number of relationships", 18 | column_delimiter: str = "|", 19 | max_tokens: int = 8000, 20 | token_counter: TokenCounter, 21 | ): 22 | self._include_rank = include_rank 23 | self._context_name = context_name 24 | self._rank_heading = rank_heading 25 | self._column_delimiter = column_delimiter 26 | self._max_tokens = max_tokens 27 | self._token_counter = token_counter 28 | 29 | def __call__(self, entities: pd.DataFrame) -> Document: 30 | context_text = f"-----{self._context_name}-----" + "\n" 31 | header = ["id", "entity", "description"] 32 | if self._include_rank: 33 | header.append(self._rank_heading) 34 | 35 | context_text += self._column_delimiter.join(header) + "\n" 36 | token_count = self._token_counter.count_tokens(context_text) 37 | # TODO:添加更多实体信息 38 | for entity in entities.itertuples(): 39 | new_context = [ 40 | # str(entity.human_readable_id), 41 | entity.id, 42 | ] 43 | if entity.description: 44 | new_context.append(entity.description) 45 | 46 | if self._include_rank: 47 | new_context.append(str(entity.degree)) 48 | new_context_text = self._column_delimiter.join(new_context) + "\n" 49 | 50 | new_token_count = self._token_counter.count_tokens(new_context_text) 51 | if token_count + new_token_count > self._max_tokens: 52 | _LOGGER.warning( 53 | f"Stopping entities context build at {token_count} tokens ..." 
54 | ) 55 | break 56 | 57 | context_text += new_context_text 58 | token_count += new_token_count 59 | 60 | return Document( 61 | page_content=context_text, 62 | metadata={"token_count": token_count}, 63 | ) -------------------------------------------------------------------------------- /src/query/global_search/key_points_aggregator/context_builder.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from langchain_core.documents import Document 4 | 5 | from ..key_points_generator.utils import ( 6 | KeyPointsResult, 7 | ) 8 | from ...utils.token_counter import TokenCounter 9 | 10 | _REPORT_TEMPLATE = """ 11 | --- {analyst} --- 12 | 13 | Importance Score: {score} 14 | 15 | {content} 16 | 17 | """ 18 | 19 | _LOGGER = logging.getLogger(__name__) 20 | 21 | class KeyPointsContextBuilder: 22 | def __init__( 23 | self, 24 | token_counter: TokenCounter, 25 | max_tokens: int = 8000, 26 | ): 27 | self._token_counter = token_counter 28 | self._max_tokens = max_tokens 29 | 30 | def __call__(self, key_points: dict[str, KeyPointsResult]) -> list[Document]: 31 | documents: list[Document] = [] 32 | total_tokens = 0 33 | max_token_limit_reached = False 34 | for k, v in key_points.items(): 35 | if max_token_limit_reached: 36 | break 37 | for p in v.points: 38 | report = _REPORT_TEMPLATE.format( 39 | analyst=k, 40 | score=p.score, 41 | content=p.description, 42 | ) 43 | report_token = self._token_counter.count_tokens(report) 44 | if total_tokens + report_token > self._max_tokens: 45 | _LOGGER.warning("Reached max tokens for key points aggregation ...") 46 | max_token_limit_reached = True 47 | break 48 | total_tokens += report_token 49 | documents.append( 50 | Document( 51 | page_content=report, 52 | metadata={ 53 | "score": p.score, 54 | "analyst": k, 55 | "token_count": report_token, 56 | }, 57 | ) 58 | ) 59 | 60 | # we now sort the documents based on the 61 | # importance score of the key points 62 | sorted_documents = sorted( 63 | documents, 64 | key=lambda x: x.metadata["score"], 65 | reverse=True, 66 | ) 67 | 68 | if _LOGGER.isEnabledFor(logging.DEBUG): 69 | import tableprint 70 | 71 | rows = [] 72 | tableprint.banner("KP Aggregation Context Token Usage") 73 | for doc in sorted_documents: 74 | rows.append([doc.metadata["analyst"], doc.metadata["token_count"]]) # noqa: PERF401 75 | 76 | tableprint.table(rows, ["Analyst", "Token Count"]) 77 | 78 | return sorted_documents -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.34.2 2 | aiohappyeyeballs==2.4.2 3 | aiohttp==3.10.11 4 | aiosignal==1.3.1 5 | annotated-types==0.7.0 6 | anyio==4.6.0 7 | async-timeout==4.0.3 8 | attrs==24.2.0 9 | certifi==2024.8.30 10 | charset-normalizer==3.3.2 11 | click==8.1.7 12 | contourpy==1.3.0 13 | cycler==0.12.1 14 | dataclasses-json==0.6.7 15 | datasets==3.0.1 16 | decorator==5.1.1 17 | dill==0.3.8 18 | distro==1.9.0 19 | exceptiongroup==1.2.2 20 | filelock==3.15.4 21 | FlagEmbedding==1.2.11 22 | fonttools==4.54.1 23 | frozenlist==1.4.1 24 | fsspec==2024.6.1 25 | graphdatascience==1.11 26 | greenlet==3.1.1 27 | h11==0.14.0 28 | httpcore==1.0.5 29 | httpx==0.27.2 30 | huggingface-hub==0.25.1 31 | idna==3.10 32 | Jinja2==3.1.4 33 | jiter==0.5.0 34 | joblib==1.4.2 35 | json_repair==0.26.0 36 | jsonpatch==1.33 37 | jsonpointer==3.0.0 38 | kiwisolver==1.4.7 39 | langchain==0.2.16 40 | langchain-community==0.2.19 41 | 
langchain-core==0.2.38 42 | langchain-experimental==0.0.64 43 | langchain-huggingface==0.0.3 44 | langchain-ollama==0.1.3 45 | langchain-openai==0.1.21 46 | langchain-text-splitters==0.2.2 47 | langsmith==0.1.129 48 | MarkupSafe==2.1.5 49 | marshmallow==3.22.0 50 | matplotlib==3.9.1.post1 51 | mpmath==1.3.0 52 | multidict==6.1.0 53 | multimethod==1.12 54 | multiprocess==0.70.16 55 | mypy-extensions==1.0.0 56 | neo4j==5.23.1 57 | networkx==3.3 58 | ninja==1.11.1.1 59 | nltk==3.9 60 | numpy==1.26.4 61 | nvidia-cublas-cu12==12.1.3.1 62 | nvidia-cuda-cupti-cu12==12.1.105 63 | nvidia-cuda-nvrtc-cu12==12.1.105 64 | nvidia-cuda-runtime-cu12==12.1.105 65 | nvidia-cudnn-cu12==9.1.0.70 66 | nvidia-cufft-cu12==11.0.2.54 67 | nvidia-curand-cu12==10.3.2.106 68 | nvidia-cusolver-cu12==11.4.5.107 69 | nvidia-cusparse-cu12==12.1.0.106 70 | nvidia-ml-py==12.535.161 71 | nvidia-nccl-cu12==2.20.5 72 | nvidia-nvjitlink-cu12==12.6.20 73 | nvidia-nvtx-cu12==12.1.105 74 | ollama==0.3.3 75 | openai==1.40.6 76 | orjson==3.10.7 77 | packaging==24.1 78 | pandas==2.2.2 79 | pillow==10.4.0 80 | psutil==6.0.0 81 | py==1.11.0 82 | pyarrow==16.1.0 83 | pydantic==2.8.2 84 | pydantic_core==2.20.1 85 | pyparsing==3.1.4 86 | python-dateutil==2.9.0.post0 87 | pytz==2024.2 88 | PyYAML==6.0.2 89 | regex==2024.7.24 90 | requests==2.32.3 91 | retry==0.9.2 92 | safetensors==0.4.3 93 | scikit-learn==1.5.1 94 | scipy==1.12.0 95 | seaborn==0.13.2 96 | sentence-transformers==3.0.1 97 | sentencepiece==0.2.0 98 | six==1.16.0 99 | sniffio==1.3.1 100 | SQLAlchemy==2.0.35 101 | sympy==1.13.1 102 | tenacity==8.5.0 103 | textdistance==4.6.3 104 | threadpoolctl==3.5.0 105 | tiktoken==0.7.0 106 | tokenizers==0.19.1 107 | torch==2.4.0 108 | tqdm==4.66.5 109 | transformers==4.43.3 110 | triton==3.0.0 111 | typing-inspect==0.9.0 112 | typing_extensions==4.12.2 113 | tzdata==2024.2 114 | urllib3==2.2.3 115 | xxhash==3.5.0 116 | yarl==1.13.1 117 | -------------------------------------------------------------------------------- /src/query/local_search/context_selectors/communities_reports.py: -------------------------------------------------------------------------------- 1 | """Select the communities to be used in the local search.""" 2 | 3 | import logging 4 | 5 | import pandas as pd 6 | 7 | from ...custom_types.graphs.community import CommunityId, CommunityLevel 8 | 9 | _LOGGER = logging.getLogger(__name__) 10 | 11 | 12 | class CommunitiesReportsSelector: 13 | def __init__( 14 | self, 15 | community_level: CommunityLevel, 16 | *, 17 | must_have_selected_entities: bool = True, 18 | ): 19 | self._community_level = community_level 20 | self._must_have_selected_entities = must_have_selected_entities 21 | 22 | def run( 23 | self, 24 | df_entities: pd.DataFrame, 25 | df_reports: pd.DataFrame, 26 | ) -> pd.DataFrame: 27 | # Filter the communities based on the community level 28 | df_reports_filtered = df_reports[ 29 | df_reports["level"] >= self._community_level 30 | ].copy(deep=True) 31 | 32 | # get the communities we have 33 | selected_communities = df_reports_filtered["community_id"].unique() 34 | 35 | # we will rank the communities based on the 36 | # number of selected entities that belong to a community 37 | community_to_entities_count: dict[CommunityId, int] = {} 38 | 39 | for entity in df_entities.itertuples(): 40 | if entity.communities is None: 41 | continue 42 | for community in entity.communities: 43 | if community in selected_communities: 44 | community_to_entities_count[community] = ( 45 | community_to_entities_count.get(community, 0) + 1 
46 | ) 47 | 48 | df_reports_filtered["selected_entities_count"] = df_reports_filtered[ 49 | "community_id" 50 | ].apply(lambda community_id: community_to_entities_count.get(community_id, 0)) 51 | 52 | # sort the communities based on the number of selected entities 53 | # and rank of the community 54 | selected_reports = df_reports_filtered.sort_values( 55 | by=["selected_entities_count", "rating"], 56 | ascending=[False, False], 57 | ).reset_index(drop=True) 58 | 59 | if self._must_have_selected_entities: 60 | selected_reports = selected_reports[ 61 | selected_reports["selected_entities_count"] > 0 62 | ] 63 | 64 | if _LOGGER.isEnabledFor(logging.DEBUG): 65 | import tableprint 66 | 67 | tableprint.banner("Selected Reports") 68 | tableprint.dataframe( 69 | selected_reports[ 70 | ["community_id", "level", "selected_entities_count", "rating"] 71 | ] 72 | ) 73 | 74 | return selected_reports -------------------------------------------------------------------------------- /src/query/local_search/_system_prompt.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | LOCAL_SEARCH_SYSTEM_PROMPT = """ 4 | ---Role--- 5 | 6 | You are a helpful assistant responding to questions about data in the tables provided. 7 | 8 | 9 | ---Goal--- 10 | 11 | Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. 12 | 13 | If you don't know the answer, just say so. Do not make anything up. 14 | 15 | Points supported by data should list their data references as follows: 16 | 17 | "This is an example sentence supported by multiple data references [Data: (record ids); (record ids)]." 18 | 19 | Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. 20 | 21 | For example: 22 | 23 | "Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16), Reports (1), Entities (5, 7); Relationships (23); Claims (2, 7, 34, 46, 64, +more)]." 24 | 25 | where 15, 16, 1, 5, 7, 23, 2, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record. 26 | 27 | Do not include information where the supporting evidence for it is not provided. 28 | 29 | 30 | ---Target response length and format--- 31 | 32 | {response_type} 33 | 34 | 35 | ---Data tables--- 36 | 37 | {context_data} 38 | 39 | 40 | ---Goal--- 41 | 42 | Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. 43 | 44 | If you don't know the answer, just say so. Do not make anything up. 45 | 46 | Points supported by data should list their data references as follows: 47 | 48 | "This is an example sentence supported by multiple data references [Data: (record ids); (record ids)]." 49 | 50 | Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. 51 | 52 | For example: 53 | 54 | "Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16), Reports (1), Entities (5, 7); Relationships (23); Claims (2, 7, 34, 46, 64, +more)]." 
55 | 56 | where 15, 16, 1, 5, 7, 23, 2, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record. 57 | 58 | Do not include information where the supporting evidence for it is not provided. 59 | 60 | 61 | ---Target response length and format--- 62 | 63 | {response_type} 64 | 65 | Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. 66 | """ -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/temp.txt: -------------------------------------------------------------------------------- 1 | """ 2 | MATCH (n:`__Community__20240919`) 3 | WHERE n.summary is not NULL 4 | OPTIONAL MATCH path = (e:`__Entity__20240919`)-[*1..5]->(n) 5 | WHERE ALL(x IN nodes(path) WHERE SINGLE(y IN nodes(path) WHERE y = x)) 6 | RETURN 7 | n.id AS community_id, 8 | n.title AS title, 9 | n.summary AS summary, 10 | n.community_rank AS rating, 11 | n.summary AS content, 12 | n.level AS level, 13 | collect(DISTINCT e.id) AS entities 14 | """ 15 | 16 | 17 | { 18 | "points": [ 19 | { 20 | "description": "The nighttime acoustic environment quality compliance rate in cities along the Yellow River Basin is targeted to reach 85% by 2025 [Data: Reports (1-36, 2-41)].", 21 | "score": 85 22 | }, 23 | { 24 | "description": "Sound windows and other building protection measures are effective in mitigating traffic noise pollution on highways and urban roads [Data: Reports (1-62, 2-15)].", 25 | "score": 80 26 | }, 27 | { 28 | "description": "The "海淀区公园噪声管理试行办法" mandates the involvement of the "区园林绿化部门" and "各文体活动团队负责人" in managing noise levels in parks, requiring adherence to the "公园文化活动文明责任书" [Data: Reports (1-59, 2-12)].", 29 | "score": 75 30 | }, 31 | { 32 | "description": "The "成都市环境噪声污染防治工作方案(2020 - 2022 年)" guides noise pollution prevention efforts in Chengdu City, utilizing the "智慧工地平台" [Data: Reports (1-40, 2-36)].", 33 | "score": 70 34 | }, 35 | { 36 | "description": "The "宣传警示工作" includes various sub-activities such as "典型事故案例," "炸街," "飙车," and "非法改装," utilizing multiple communication methods and producing important documents like "承诺书" and "监督举报电话" [Data: Reports (1-33, 2-51)].", 37 | "score": 65 38 | } 39 | ] 40 | } 41 | 42 | '''json\n{\n "points": [\n {\n "description": "The nighttime acoustic environment quality compliance rate in cities along the Yellow River Basin is targeted to reach 85% by 2025 [Data: Reports (1-36, 2-41)].",\n "score": 80\n },\n {\n "description": "Sound windows and other building protection measures are effective in mitigating traffic noise pollution on highways and urban roads [Data: Reports (1-62, 2-15)].",\n "score": 75\n },\n {\n "description": "The "宣传警示工作" (Public Awareness and Warning Campaign) includes various sub-activities and communication methods to raise awareness about noise pollution and related risks [Data: Reports (1-33, 2-51)].",\n "score": 70\n },\n {\n "description": "There are 21,706 acoustic environment monitoring points totaling 76,273 points, supported by automatic monitoring methods [Data: Reports (1-29, 2-40)].",\n "score": 65\n },\n {\n "description": "The "机动车非法改装治理和噪声污染防治规定" (Regulation on the Governance of Illegal Vehicle Modifications and Noise Pollution Prevention) was published by the government of Shandong Province to address illegal modifications and noise pollution [Data: Reports (1-86, 2-45)].",\n "score": 60\n }\n ]\n}''' -------------------------------------------------------------------------------- 
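The temp.txt scratch file above records the raw material the global-search key-points step works with: a Cypher query for community reports plus two captured generator responses, one wrapped in a json code fence and one whose description strings contain unescaped inner quotes. The snippet below is only a minimal parsing sketch under that assumption (a fenced JSON object carrying a "points" list); it is not the project's actual output parser, and it deliberately degrades to an empty list when the payload is malformed, as the second captured response would be under strict `json.loads`.

```python
import json
import re
from typing import Any

_JSON_FENCE = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL)


def extract_key_points(raw: str) -> list[dict[str, Any]]:
    """Pull the "points" list out of a fenced JSON key-points response."""
    match = _JSON_FENCE.search(raw)
    payload = match.group(1) if match else raw
    try:
        data = json.loads(payload)
    except json.JSONDecodeError:
        # e.g. unescaped quotes inside descriptions, as in the second capture above
        return []
    points = data.get("points", []) if isinstance(data, dict) else []
    # Keep only well-formed entries so downstream ranking can rely on "score".
    return [p for p in points if isinstance(p, dict) and "description" in p and "score" in p]


raw_response = (
    '```json\n'
    '{"points": [{"description": "Example point [Data: Reports (1-36, 2-41)].", "score": 85}]}\n'
    '```'
)
print(extract_key_points(raw_response))
# -> [{'description': 'Example point [Data: Reports (1-36, 2-41)].', 'score': 85}]
```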
/src/query/local_search/context_builders/context.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | 5 | from langchain_core.documents import Document 6 | 7 | from ..context_selectors import ( 8 | ContextSelectionResult, 9 | ) 10 | from ...custom_types.tokens import TokenCounter 11 | 12 | from .communities_reports import CommunitiesReportsContextBuilder 13 | from .entities import EntitiesContextBuilder 14 | from .relationships import RelationshipsContextBuilder 15 | from .text_units import TextUnitsContextBuilder 16 | 17 | _LOGGER = logging.getLogger(__name__) 18 | 19 | 20 | class ContextBuilder: 21 | def __init__( 22 | self, 23 | entities_context_builder: EntitiesContextBuilder, 24 | realtionships_context_builder: RelationshipsContextBuilder, 25 | text_units_context_builder: TextUnitsContextBuilder, 26 | communities_reports_context_builder: CommunitiesReportsContextBuilder, 27 | ): 28 | self._entities_context_builder = entities_context_builder 29 | self._relationships_context_builder = realtionships_context_builder 30 | self._text_units_context_builder = text_units_context_builder 31 | self._communities_reports_context_builder = communities_reports_context_builder 32 | 33 | @staticmethod 34 | def build_default(token_counter: TokenCounter) -> ContextBuilder: 35 | return ContextBuilder( 36 | entities_context_builder=EntitiesContextBuilder( 37 | token_counter=token_counter, 38 | ), 39 | realtionships_context_builder=RelationshipsContextBuilder( 40 | token_counter=token_counter, 41 | ), 42 | text_units_context_builder=TextUnitsContextBuilder( 43 | token_counter=token_counter, 44 | ), 45 | communities_reports_context_builder=CommunitiesReportsContextBuilder( 46 | token_counter=token_counter, 47 | ), 48 | ) 49 | 50 | def __call__(self, result: ContextSelectionResult) -> list[Document]: 51 | entities_document = self._entities_context_builder(result.entities) 52 | relationships_document = self._relationships_context_builder( 53 | result.relationships 54 | ) 55 | text_units_document = self._text_units_context_builder(result.text_units) 56 | communities_reports_document = self._communities_reports_context_builder( 57 | result.communities_reports 58 | ) 59 | 60 | documents = [ 61 | entities_document, 62 | relationships_document, 63 | text_units_document, 64 | communities_reports_document, 65 | ] 66 | 67 | if _LOGGER.isEnabledFor(logging.DEBUG): 68 | import tableprint 69 | 70 | rows = [] 71 | tableprint.banner("Context Token Usage") 72 | for name, doc in zip( 73 | ["Entities", "Relationships", "Text Units", "Communities Reports"], 74 | [ 75 | entities_document, 76 | relationships_document, 77 | text_units_document, 78 | communities_reports_document, 79 | ], 80 | strict=True, 81 | ): 82 | rows.append([name, doc.metadata["token_count"]]) 83 | 84 | tableprint.table(rows, ["Context", "Token Count"]) 85 | 86 | return documents -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Welcome to the LangChainGraphRAG Project 👋

2 | 3 | > Building Intelligent Applications: The Powerful Combination of LangChain, Neo4j, and GraphRAG 4 | 5 | ### 🏠 [Homepage](https://github.com/Bui1dMySea/LangChainGraphRAG) 6 | 7 | ## 📌 Preface: Why build this project? 8 | The main reason is that Microsoft's official open-source library is, to put it mildly, hard to work with: setting aside the opinionated extras baked into it, the code is also very tightly coupled. 9 | So, after going through dozens of blog posts and a range of concrete implementations, this project settled on a LangChain + Neo4j + GraphRAG implementation. 10 | It currently draws mainly on the following open-source work. 11 | 12 | - [Microsoft's official GraphRAG](https://github.com/microsoft/graphrag) 13 | - [Tomaz Bratanic's write-up of the index-building process](https://github.com/tomasonjo/blogs/blob/master/llm/ms_graphrag.ipynb) 14 | - [ollama+graphrag](https://github.com/TheAiSingularity/graphrag-local-ollama) 15 | - [Kapil Sachdeva's write-up of the query-building process](https://github.com/ksachdeva/langchain-graphrag/tree/main) 16 | 17 | ## 🚀 Quick Start 18 | 19 | ### Conda Environment 20 | 21 | ```sh 22 | conda create -n langchain-graphrag python=3.10 23 | conda activate langchain-graphrag 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ### Neo4j Install 28 | 29 | 1. Add the Neo4j apt repository 30 | 31 | ```Bash 32 | mkdir /etc/apt/keyrings # optional 33 | wget -O - https://debian.neo4j.com/neotechnology.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/neotechnology.gpg 34 | echo 'deb [signed-by=/etc/apt/keyrings/neotechnology.gpg] https://debian.neo4j.com stable latest' | sudo tee -a /etc/apt/sources.list.d/neo4j.list 35 | sudo apt-get update 36 | ``` 37 | 38 | 2. List the available versions 39 | 40 | ```Bash 41 | apt list -a neo4j 42 | ``` 43 | 44 | 3. Install a specific Neo4j version 45 | 46 | ```Bash 47 | sudo apt-get install neo4j=1:5.21.0 48 | ``` 49 | 50 | 4. Adjust the config file permissions and edit the config file 51 | 52 | ```Bash 53 | chmod +x /etc/neo4j/neo4j.conf # Debian path; for other systems, check the Neo4j docs for the default location 54 | vim /etc/neo4j/neo4j.conf 55 | 56 | # Find (or simply uncomment) these two lines 57 | dbms.security.procedures.unrestricted=gds.*,apoc.* 58 | dbms.security.procedures.allowlist=apoc.coll.*,apoc.load.*,gds.*,apoc.* 59 | ``` 60 | 61 | 5. Download two files 62 | 63 | a. `apoc-{your version}-core.jar` https://github.com/neo4j/apoc/releases/ # only for versions above 4.4.x; for versions below 4.4.x you will need to find the jar yourself 64 | 65 | b. `neo4j-graph-data-science-{your version}.jar` https://github.com/neo4j/graph-data-science/releases/ # https://neo4j.com/docs/graph-data-science/current/installation/supported-neo4j-versions/ lists the exact mapping between Neo4j and Graph Data Science versions 66 | 67 | 6. Copy the two files you just downloaded into */var/lib/neo4j/plugins* # Debian path; for other systems, check the Neo4j docs for the plugin directory 68 | 69 | 7. Start Neo4j: `sudo neo4j start` | Note!! If Neo4j was already running before step 6, restart it to apply the configuration: `sudo neo4j restart` 70 | 71 | ### Configure the API key 72 | 73 | `export OPEN_API_KEY=YOUR_API_KEY`, or simply wait for build_index.py and search.py to prompt you for the api_key when they run. 74 | 75 | ### Build the index 76 | 77 | ` bash index.sh` 78 | 79 | ### Query 80 | 81 | ```bash 82 | python search.py \ 83 | --neo4j_uri bolt://localhost:7687 \ 84 | --neo4j_username your_username \ 85 | --neo4j_password your_password \ 86 | --model_provider openai \ 87 | --embedding_model_name_or_path BAAI/bge-m3 \ 88 | --chat_model_name deepseek-chat \ 89 | --base_url https://api.deepseek.com \ 90 | --completion_mode [completion,chat] \ 91 | --query_mode [local,global] 92 | ``` 93 | 94 | ### More options (ollama, huggingface) 95 | See the [additional examples](./example/) 96 | 97 | ## 👦🏻 Author 98 | 99 | **Weijie Liu** 100 | 101 | * Homepage: https://github.com/Bui1dMySea 102 | * Github: [@Bui1dMySea](https://github.com/Bui1dMySea) 103 | 104 | ## 🤝 Contributing 105 | 106 | Issues, forks, and pull requests are all welcome.
如果有问题也请提问 [issues page](https://github.com/Bui1dMySea/LangChainGraphRAG/issues). 107 | 108 | ## ⭐️ 喜欢的请点个免费的star~ 109 | 110 | 走过路过不要错过!留下一个免费的赞吧~球球了⭐️ 111 | -------------------------------------------------------------------------------- /txt/三体_星际交锋.txt: -------------------------------------------------------------------------------- 1 | 第一章:信号的涟漪 2 | 在未来的某个时刻,地球文明达到了前所未有的高度。科学技术的迅猛发展使得人类能够探索宇宙的深处,建立起庞大的太空站和星际基地。然而,随着科技的进步,人类的自信心也在膨胀,他们开始认为自己是宇宙中的主宰。 3 | 4 | 在这股自信的潮流中,几位科学家在地球的一个顶尖研究机构中聚集,他们分别是: 5 | 6 | 李明:项目负责人,物理学家,专注于天体物理学和通信技术。 7 | 王娜:计算机科学家,擅长人工智能和数据分析。 8 | 张伟:生物学家,研究外星生命的可能性。 9 | 陈静:心理学家,研究人与人之间的沟通与理解。 10 | 他们的任务是分析来自三体星系的信号,这些信号在几年前首次被捕捉到。经过几个月的努力,他们终于确认了信号的来源——一个名为“三体”的星球,那里有着复杂的气候和环境变化。 11 | 12 | “这个星球的生存条件极为苛刻,三体文明可能经历了无数次的灭绝与重生。”李明在会议上说道,众人纷纷点头表示赞同。 13 | 14 | “我们需要找到与他们沟通的方式,”王娜补充道,“如果他们能理解我们的信号,那么我们就有机会建立联系。” 15 | 16 | 第二章:信号的解码 17 | 李明和他的团队开始了对三体信号的深入分析。经过几周的努力,他们终于解码出了一段三体文明的历史。信息中提到,三体星系的三颗星球之间的引力关系极为复杂,导致了星球表面环境的剧烈变化。 18 | 19 | “他们的生存方式与我们截然不同,可能会对我们的交流造成障碍。”张伟说。 20 | 21 | “我们必须用科学来沟通,”李明坚定地说道,“数学和物理是人类与外星文明沟通的桥梁。” 22 | 23 | 几个月后,团队终于准备好向三体发送第一条信息。信息中包含了地球的坐标、基本的数学定律以及人类的科学成就。李明紧张地等待着回复,这一刻可能改变人类的历史。 24 | 25 | 第三章:第一次接触 26 | 几天后,团队终于收到了三体文明的回复。信号中包含了一系列复杂的数学公式和图形,显示了三体文明的科技水平和对宇宙的理解。 27 | 28 | “他们在回应我们!”王娜激动地说道。李明认真分析着信号内容,发现其中有一些与地球科学相似的理论。 29 | 30 | “我们可能有共同的理解基础。”李明说,“这意味着我们可以进一步交流。” 31 | 32 | 随着时间的推移,李明团队与三体文明的交流逐渐加深。三体文明的结构复杂,由多个种族和文化组成,每个种族都有自己独特的生存方式和价值观。李明意识到,这种多样性使得与三体文明的交流变得更加复杂。 33 | 34 | “我们需要建立一个知识图谱,以便更好地理解他们的文化和社会结构。”王娜建议道。 35 | 36 | 第四章:知识图谱的构建 37 | 李明和团队决定利用先进的人工智能技术来构建与三体文明的知识图谱。他们将三体文明的历史、文化、科技等信息进行分类和整理,同时记录下与地球文明的对比。 38 | 39 | 在构建知识图谱的过程中,团队发现三体文明的社会结构与地球截然不同。三体星球上有一种名为“智者”的种族,他们在社会中占据着重要地位,负责决策和管理。与之相对的是“劳工”种族,他们负责繁重的体力劳动。 40 | 41 | “这种社会结构与我们的民主制度形成了鲜明的对比。”张伟观察道。 42 | 43 | “这可能会影响我们与他们的沟通方式。”陈静补充道,“我们需要考虑到他们的文化背景。” 44 | 45 | 随着知识图谱的不断完善,李明和团队逐渐理解了三体文明的复杂性。他们开始通过图谱与三体文明进行更深入的交流,尝试分享地球的文化和价值观。 46 | 47 | 第五章:文化的碰撞 48 | 为了促进理解,李明决定将地球的艺术和文化介绍给三体文明。团队开始收集地球的音乐、绘画、文学等作品,准备向三体文明展示人类的情感世界。 49 | 50 | “我们必须让他们了解人类的情感和创造力。”李明说道,“这对建立信任关系至关重要。” 51 | 52 | 经过几个月的努力,李明和团队向三体文明发送了一系列地球文化作品的信号。他们希望通过这些作品,展示人类的情感和创造力。 53 | 54 | 然而,三体文明的回应却让人意外。他们对地球的艺术表现出极大的兴趣,但也提出了一些问题。智者种族的代表在信号中询问:“情感是否会影响决策?在我们看来,逻辑和理性才是最重要的。” 55 | 56 | “这就是我们与三体文明的不同之处。”陈静分析道,“我们必须理解他们对情感的看法,才能更好地沟通。” 57 | 58 | 第六章:危机的降临 59 | 就在交流逐渐深入之际,地球上发生了一场巨大的自然灾害。全球范围内的气候变化导致了严重的洪水和干旱,数百万人的生命受到威胁。 60 | 61 | “我们需要他们的帮助!”李明对团队说,“如果我们能够展示我们的决心和诚意,或许三体文明会愿意伸出援手。” 62 | 63 | 经过反复的讨论,团队决定将地球的现状以及面临的挑战详细地传达给三体文明,希望能够获得他们的理解和支持。 64 | 65 | 几周后,三体文明终于给出了回应。他们表示理解地球的困境,并愿意分享一些应对自然灾害的科技。李明和团队欣喜若狂,他们知道这次合作将是人类历史上的重要时刻。 66 | 67 | 第七章:合作与信任 68 | 在接下来的几个月中,三体文明通过信号传送了一系列先进的科技方案,包括气候调控技术和生态恢复方法。李明和团队努力将这些技术应用到地球的恢复中。 69 | 70 | “这是我们与三体文明合作的开始。”王娜说道,“通过这次合作,我们不仅能解决眼前的危机,还能建立起更深层次的联系。” 71 | 72 | 随着时间的推移,地球的环境逐渐改善,人类也在三体文明的帮助下逐步恢复了生活秩序。李明和团队意识到,三体文明的技术不仅仅是解决问题的工具,更是深化了人类对宇宙的理解。 73 | 74 | 第八章:新的挑战 75 | 然而,随着合作的深入,李明和团队逐渐意识到,三体文明的技术也带来了新的挑战。三体的科技虽然先进,但在某些方面却与人类的伦理观念发生了冲突。 76 | 77 | “我们是否应该完全依赖他们的技术?”张伟提出了疑问,“这可能会影响我们独立思考的能力。” 78 | 79 | “我们必须保持警惕,”陈静补充道,“依赖外部技术可能会导致我们失去自主性。” 80 | 81 | 李明意识到,虽然三体文明的技术能够帮助地球,但人类必须保持自己的价值观和文化。为了避免过度依赖,李明和团队决定在应用三体技术的同时,保留人类的创新和独立思考。 82 | 83 | 第九章:星际启示 84 | 在与三体文明的交流中,李明和团队逐渐认识到,星际之间的沟通不仅仅是技术的交流,更是文化与价值观的碰撞。三体文明的理性与地球的情感形成了鲜明的对比,而这种对比正是促进双方理解的关键。 85 | 86 | “我们需要找到一种平衡,”李明总结道,“理性与情感并不是对立的,而是可以互相补充的。” 87 | 88 | 经过长时间的努力,李明和团队终于建立起了一种新的交流模式。他们将三体文明的理性与地球的情感结合起来,形成了一种新的思维方式。这种思维方式不仅帮助人类更好地理解三体文明,也为地球未来的发展指明了方向。 89 | 90 | 第十章:未来的希望 91 | 随着交流的深入,李明和团队逐渐意识到,三体文明的存在不仅仅是人类探索宇宙的一个里程碑,更是一种启示。人类在与三体文明的互动中,重新审视了自己的价值观和文化。 92 | 93 | “我们必须以开放的心态面对未来,”李明对团队说,“星际之间的交流将为我们带来新的机遇和挑战。” 94 | 95 | 
在这个新的时代,人类与三体文明的关系逐渐变得更加紧密。两者之间的文化交流不断深化,科技合作不断加强。地球和三体文明的未来,充满了无限的可能性。 96 | 97 | 随着宇宙探索的脚步不断向前,李明和他的团队相信,人与人之间的沟通、情感的交流以及文化的碰撞,将是人类在星际间生存与发展的关键。他们坚信,未来的希望在于理解与合作,而非对立与冲突。 98 | 99 | 尾声 100 | 在这段星际旅程中,李明、王娜、张伟和陈静不仅建立了与三体文明的联系,更在探索中找到了人类自身的价值与意义。星际之间的交流,成为了人类历史上最伟大的篇章,而这一切,都源于那最初的信号。 101 | 102 | 在星空下,李明仰望着夜空,心中充满了对未来的期待与希望。他知道,宇宙的深处,还有无数的未知等待着人类去探索。而在探索的路上,理解与合作,将是人类最宝贵的财富。 -------------------------------------------------------------------------------- /src/query/local_search/context_selectors/text_units.py: -------------------------------------------------------------------------------- 1 | """Build the TextUnit context for the LocalSearch algorithm.""" 2 | 3 | import logging 4 | from typing import TypedDict 5 | 6 | import pandas as pd 7 | 8 | _LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | class SelectedTextUnit(TypedDict): 12 | id: str 13 | short_id: str 14 | entity_score: float 15 | relationship_score: int 16 | text_unit: str 17 | 18 | 19 | def compute_relationship_score( 20 | df_relationships: pd.DataFrame, 21 | df_text_relationships: pd.DataFrame, 22 | entity_title: str, 23 | ) -> int: 24 | relationships_subset = df_relationships[df_relationships["id"].isin(df_text_relationships)] 25 | 26 | source_count = (relationships_subset["source"] == entity_title).sum() 27 | target_count = (relationships_subset["target"] == entity_title).sum() 28 | 29 | return source_count + target_count 30 | 31 | # 需要补充 32 | # 1.entity["text_unit_ids"] 33 | # 2.df_text_units["id"] 34 | # 3.df_texts_units["relationship_ids"] 35 | # 4.df_texts_units["text_unit"] 36 | # 5.relationship["source"] 37 | # 6.relationship["target"] 38 | class TextUnitsSelector: 39 | def run( 40 | self, 41 | df_entities: pd.DataFrame, 42 | df_relationships: pd.DataFrame, 43 | df_text_units: pd.DataFrame, 44 | ) -> pd.DataFrame: 45 | """Build the TextUnit context for the LocalSearch algorithm.""" 46 | selected_text_units: dict[str, SelectedTextUnit] = {} 47 | 48 | def _process_text_unit_id(text_unit_id: str,entity) -> SelectedTextUnit: 49 | 50 | df_texts_units_subset = df_text_units[df_text_units["id"] == text_unit_id] 51 | text_relationship_ids = df_texts_units_subset["relationship_ids"].explode() 52 | # TODO:目前全是0,后续需要进一步排序 53 | relationship_score = compute_relationship_score( 54 | df_relationships, 55 | text_relationship_ids, 56 | entity.id, 57 | ) 58 | 59 | text_unit = df_texts_units_subset["text_unit"].iloc[0] 60 | short_id = df_texts_units_subset.index.to_numpy()[0] 61 | 62 | return SelectedTextUnit( 63 | id=text_unit_id, 64 | short_id=short_id, 65 | entity_score=entity.score, 66 | relationship_score=relationship_score, 67 | text_unit=text_unit, 68 | ) 69 | 70 | def _process_entity(entity) -> None: # noqa: ANN001 71 | for text_unit_id in entity.text_unit_ids: 72 | if text_unit_id in selected_text_units: 73 | continue 74 | selected_text_units[text_unit_id] = _process_text_unit_id(text_unit_id,entity) 75 | 76 | for entity in df_entities.itertuples(): 77 | _process_entity(entity) 78 | 79 | df_selected_text_units = pd.DataFrame.from_records( 80 | list(selected_text_units.values()) 81 | ) 82 | 83 | # sort it by 84 | # descending order of entity_score 85 | # and then descending order of relationship_score 86 | df_selected_text_units = df_selected_text_units.sort_values( 87 | by=["entity_score", "relationship_score"], 88 | ascending=[False, False], 89 | ).reset_index(drop=True) 90 | 91 | if _LOGGER.isEnabledFor(logging.DEBUG): 92 | import tableprint 93 | 94 | tableprint.banner("Selected Text units") 95 | tableprint.dataframe( 96 | 
df_selected_text_units[["id", "entity_score", "relationship_score"]] 97 | ) 98 | 99 | return df_selected_text_units -------------------------------------------------------------------------------- /src/query/local_search/context_builders/relationships.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | from langchain_core.documents import Document 5 | 6 | from ..context_selectors.relationships import ( 7 | RelationshipsSelectionResult, 8 | ) 9 | from ...custom_types.tokens import TokenCounter 10 | 11 | _LOGGER = logging.getLogger(__name__) 12 | 13 | 14 | class RelationshipsContextBuilder: 15 | def __init__( 16 | self, 17 | *, 18 | include_weight: bool = True, 19 | context_name: str = "Relationships", 20 | column_delimiter: str = "|", 21 | max_tokens: int = 8000, 22 | token_counter: TokenCounter, 23 | ): 24 | self._include_weight = include_weight 25 | self._context_name = context_name 26 | self._column_delimiter = column_delimiter 27 | self._max_tokens = max_tokens 28 | self._token_counter = token_counter 29 | 30 | def __call__( 31 | self, 32 | selected_relationships: RelationshipsSelectionResult, 33 | ) -> Document: 34 | all_context_text = f"-----{self._context_name}-----" + "\n" 35 | header = ["id", "source", "target", "description"] 36 | if self._include_weight: 37 | header.append("weight") 38 | 39 | all_context_text += self._column_delimiter.join(header) + "\n" 40 | all_token_count = self._token_counter.count_tokens(all_context_text) 41 | 42 | def _build_context_text( 43 | relationships: pd.DataFrame, 44 | context_text: str, 45 | token_count: int, 46 | ) -> tuple[str, int]: 47 | for relationship in relationships.itertuples(): 48 | new_context = [] 49 | if relationship.source: 50 | new_context.append(relationship.source) 51 | if relationship.target: 52 | new_context.append(relationship.target) 53 | #if relationship.description: # FIXME:上强度!!!给关系增加描述信息 54 | # new_context.append(relationship.description) 55 | if new_context == []: 56 | continue 57 | """ FIXME: 给relationship增加额外信息,并且修复source和target的问题,使得更加合理 58 | new_context = [ 59 | str(relationship.human_readable_id), 60 | relationship.source, 61 | relationship.target, 62 | relationship.description, 63 | ] 64 | """ 65 | if self._include_weight: 66 | new_context.append(str(relationship.rank)) 67 | 68 | new_context_text = self._column_delimiter.join(new_context) + "\n" 69 | new_token_count = self._token_counter.count_tokens(new_context_text) 70 | 71 | if token_count + new_token_count > self._max_tokens: 72 | _LOGGER.warning( 73 | f"Stopping relationships context build at {token_count} tokens..." 
# noqa: E501 74 | ) 75 | return context_text, token_count 76 | 77 | context_text += new_context_text 78 | token_count += new_token_count 79 | 80 | return context_text, token_count 81 | 82 | all_context_text, all_token_count = _build_context_text( 83 | selected_relationships.in_network_relationships, 84 | all_context_text, 85 | all_token_count, 86 | ) 87 | 88 | if all_token_count < self._max_tokens: 89 | all_context_text, all_token_count = _build_context_text( 90 | selected_relationships.out_network_relationships, 91 | all_context_text, 92 | all_token_count, 93 | ) 94 | 95 | return Document( 96 | page_content=all_context_text, 97 | metadata={"token_count": all_token_count}, 98 | ) -------------------------------------------------------------------------------- /src/query/global_search/key_points_aggregator/_system_prompt.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | REDUCE_SYSTEM_PROMPT = """ 4 | ---Role--- 5 | 6 | You are a helpful assistant responding to questions about a dataset by synthesizing perspectives from multiple analysts. 7 | 8 | 9 | ---Goal--- 10 | 11 | Generate a response of the target length and format that responds to the user's question, summarize all the reports from multiple analysts who focused on different parts of the dataset. 12 | 13 | Note that the analysts' reports provided below are ranked in the **descending order of importance**. 14 | 15 | If you don't know the answer or if the provided reports do not contain sufficient information to provide an answer, just say so. Do not make anything up. 16 | 17 | The final response should remove all irrelevant information from the analysts' reports and merge the cleaned information into a comprehensive answer that provides explanations of all the key points and implications appropriate for the response length and format. 18 | 19 | Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. 20 | 21 | The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will". 22 | 23 | The response should also preserve all the data references previously included in the analysts' reports, but do not mention the roles of multiple analysts in the analysis process. 24 | 25 | **Do not list more than 5 record ids in a single reference**. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. 26 | 27 | For example: 28 | 29 | "Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Reports (2, 7, 34, 46, 64, +more)]. He is also CEO of company X [Data: Reports (1, 3)]" 30 | 31 | where 1, 2, 3, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record. 32 | 33 | Do not include information where the supporting evidence for it is not provided. 34 | 35 | 36 | ---Target response length and format--- 37 | 38 | {response_type} 39 | 40 | 41 | ---Analyst Reports--- 42 | 43 | {report_data} 44 | 45 | 46 | ---Goal--- 47 | 48 | Generate a response of the target length and format that responds to the user's question, summarize all the reports from multiple analysts who focused on different parts of the dataset. 49 | 50 | Note that the analysts' reports provided below are ranked in the **descending order of importance**. 51 | 52 | If you don't know the answer or if the provided reports do not contain sufficient information to provide an answer, just say so. Do not make anything up. 
53 | 54 | The final response should remove all irrelevant information from the analysts' reports and merge the cleaned information into a comprehensive answer that provides explanations of all the key points and implications appropriate for the response length and format. 55 | 56 | The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will". 57 | 58 | The response should also preserve all the data references previously included in the analysts' reports, but do not mention the roles of multiple analysts in the analysis process. 59 | 60 | **Do not list more than 5 record ids in a single reference**. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. 61 | 62 | For example: 63 | 64 | "Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Reports (2, 7, 34, 46, 64, +more)]. He is also CEO of company X [Data: Reports (1, 3)]" 65 | 66 | where 1, 2, 3, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record. 67 | 68 | Do not include information where the supporting evidence for it is not provided. 69 | 70 | 71 | ---Target response length and format--- 72 | 73 | {response_type} 74 | 75 | Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. 76 | """ -------------------------------------------------------------------------------- /src/query/search.py: -------------------------------------------------------------------------------- 1 | from .utils.token_counter import TiktokenCounter 2 | from .local_search.context_builders import ContextBuilder 3 | from .local_search.context_selectors import ContextSelector 4 | from .local_search.retriever import LocalSearchRetriever 5 | from .local_search.search import LocalSearch 6 | from .local_search.prompt_builder import LocalSearchPromptBuilder 7 | from .global_search import key_points_generator 8 | from .global_search import key_points_aggregator 9 | from .global_search.search import GlobalSearch 10 | from .global_search.community_weight_calculator import CommunityWeightCalculator 11 | 12 | from langchain_community.graphs import Neo4jGraph 13 | from langchain_community.vectorstores import Neo4jVector 14 | from langchain_core.embeddings import Embeddings 15 | from langchain_core.language_models.chat_models import BaseChatModel 16 | 17 | from typing import Literal 18 | class LocalSearcher(object): 19 | def __init__( 20 | self, 21 | graph:Neo4jGraph, 22 | chat_model:BaseChatModel, 23 | embedding:Embeddings, 24 | uuid:str= None, 25 | top_k:int=15, 26 | level:int=1, 27 | # model_provider:Literal['openai','ollama','hf']='openai', 28 | # model_name:str, 29 | # api_key:str, 30 | # base_url:str, 31 | *args, 32 | **kwargs 33 | ): 34 | 35 | token_counter = TiktokenCounter() 36 | vector_store = Neo4jVector.from_existing_graph( 37 | embedding=embedding, 38 | index_name=f"{uuid}" if (uuid != None and uuid != "") else "vector", 39 | node_label=f'__Entity__{uuid}', 40 | text_node_properties=['id','description'], 41 | embedding_node_property='embedding', 42 | graph=graph 43 | ) 44 | 45 | context_builder = ContextBuilder.build_default(token_counter) 46 | context_selector = ContextSelector.build_default(vector_store, top_k, level, uuid) 47 | 48 | # chat_model = ChatOpenAI(model=model_name,base_url=base_url, api_key=api_key) 49 | retriever = LocalSearchRetriever( 50 | context_selector=context_selector, 51 | context_builder=context_builder, 52 | graph=vector_store 53 | ) 
53 |         )
54 | self.local_search = LocalSearch(chat_model=chat_model, prompt_builder=LocalSearchPromptBuilder(), retriever=retriever) 55 | 56 | def invoke(self,query:str): 57 | return self.local_search(query) 58 | 59 | class GlobalSearcher(object): 60 | def __init__( 61 | self, 62 | graph:Neo4jGraph, 63 | chat_model:BaseChatModel, 64 | uuid:str=None, 65 | level:str=1, 66 | max_tokens:int=8000, 67 | # model_provider:Literal['openai','ollama'], 68 | # model_name:str, 69 | # api_key:str, 70 | # base_url:str, 71 | *args, 72 | **kwargs 73 | ): 74 | cwc = CommunityWeightCalculator() 75 | token_counter = TiktokenCounter() 76 | kpg_prompt_builder = key_points_generator.KeyPointsGeneratorPromptBuilder() 77 | kpg_context_builder = key_points_generator.CommunityReportContextBuilder(level, cwc, uuid, graph,token_counter,max_tokens) 78 | 79 | kpa_prompt_builder = key_points_aggregator.KeyPointsAggregatorPromptBuilder() 80 | kpa_context_builder = key_points_aggregator.KeyPointsContextBuilder(token_counter) 81 | 82 | kp_aggregator = key_points_aggregator.KeyPointsAggregator(chat_model, kpa_prompt_builder, kpa_context_builder) 83 | kp_generator = key_points_generator.KeyPointsGenerator(chat_model, kpg_prompt_builder, kpg_context_builder) 84 | 85 | self.global_search = GlobalSearch(kp_generator, kp_aggregator) 86 | 87 | 88 | def invoke(self,query:str): 89 | return self.global_search.invoke(query) 90 | 91 | -------------------------------------------------------------------------------- /search.log: -------------------------------------------------------------------------------- 1 | 2024-11-28 21:24:43,383 - search - INFO - args: Namespace(neo4j_uri='bolt://localhost:7687', neo4j_username='neo4j', neo4j_password='langchaingraphrag', model_provider='openai', chat_model_name='gpt-4o-mini', base_url='https://api.gpt.ge/v1/', embedding_model_name_or_path='BAAI/bge-m3', uuid='', top_k=15, level=1, max_tokens=8000, log_file='search.log', log_level='info', max_workers=4, device='cpu', completion_mode='completion', query_mode='global') 2 | 2024-11-28 21:24:43,383 - search - INFO - Connecting to Neo4j 3 | 2024-11-28 21:24:49,946 - search - INFO - Initializing chat model 4 | 2024-11-28 21:24:50,011 - search - INFO - Initializing embedding 5 | 2024-11-28 21:24:58,608 - search - INFO - 查询模式:global 6 | 2024-11-28 21:27:02,693 - search - INFO - args: Namespace(neo4j_uri='bolt://localhost:7687', neo4j_username='neo4j', neo4j_password='langchaingraphrag', model_provider='openai', chat_model_name='gpt-4o-mini', base_url='https://api.gpt.ge/v1/', embedding_model_name_or_path='BAAI/bge-m3', uuid='', top_k=15, level=1, max_tokens=8000, log_file='search.log', log_level='info', max_workers=4, device='cpu', completion_mode='completion', query_mode='global') 7 | 2024-11-28 21:27:02,694 - search - INFO - Connecting to Neo4j 8 | 2024-11-28 21:27:07,314 - search - INFO - Initializing chat model 9 | 2024-11-28 21:27:07,400 - search - INFO - Initializing embedding 10 | 2024-11-28 21:27:20,905 - search - INFO - 查询模式:global 11 | 2024-11-28 21:27:26,558 - search - INFO - Starting query now! 
12 | 2024-11-28 21:28:52,789 - search - INFO - 查询结果: 13 | ## 主要故事线概述 14 | 15 | 这本书的主要故事线围绕着人类与三体文明之间的沟通和合作展开。书中强调了文化交流的重要性以及针对潜在威胁的防御举措,这些内容得到了多个数据记录的支持 [Data: Reports (1, 2, 6, 3, 4, +more)]。 16 | 17 | ## 理性与情感的对比 18 | 19 | 书中还探讨了三体文明的理性与地球情感特性的对比,突显了三体文明在技术上的先进性和地球丰富的文化遗产。这种对比不仅加深了对两种文明的理解,也为后续的互动奠定了基础 [Data: Reports (1, 3, 5, 7, 6, +more)]。 20 | 21 | ## 互助需求与外部威胁 22 | 23 | 书中描述了三体文明与地球之间的互动,双方在面对外部威胁(如被称为“掠夺者”的外星物种)时,存在着相互需要的援助关系。这一背景为两种文明的合作提供了强有力的动机 [Data: Reports (1, 3, 4)]。 24 | 25 | ## 环境影响 26 | 27 | 此外,书中还描绘了三体星球的恶劣环境条件,这些条件显著影响了其居民的生存状况。通过这些描写,读者可以更好地理解三体文明所面临的挑战与局限 [Data: Reports (1, 5, 7)]。 28 | 29 | ## 结论 30 | 31 | 综上所述,这本书通过多元化的视角,深入探讨了人类与三体文明之间的互动,强调了文化交流、技术对比、互助关系及环境影响等关键主题。这些元素共同构成了故事的核心,提示我们在面对未知的外部威胁时,合作与理解是至关重要的。 32 | 2024-11-28 21:28:52,793 - search - INFO - 查询结束 33 | 2024-11-28 21:30:49,177 - search - INFO - args: Namespace(neo4j_uri='bolt://localhost:7687', neo4j_username='neo4j', neo4j_password='langchaingraphrag', model_provider='ollama', chat_model_name='llama3.1', base_url=None, embedding_model_name_or_path='BAAI/bge-m3', uuid='', top_k=15, level=1, max_tokens=8000, log_file='search.log', log_level='info', max_workers=16, device='cuda', completion_mode='completion', query_mode='global') 34 | 2024-11-28 21:30:49,178 - search - INFO - Connecting to Neo4j 35 | 2024-11-28 21:30:49,356 - search - INFO - Initializing chat model 36 | 2024-11-28 21:30:49,408 - search - INFO - Initializing embedding 37 | 2024-11-28 21:30:57,125 - search - INFO - 查询模式:global 38 | 2024-11-28 21:30:57,301 - search - INFO - Starting query now! 39 | 2024-11-28 21:31:16,178 - search - INFO - 查询结果: 40 | **主要故事线** 41 | 42 | 根据分析师的报告,以下是这本书的主要故事线: 43 | 44 | * **对比性主题**: 这本书探讨了地球的情感性质与三大文明的理性之间的对比 [Data: Reports (1, 3, 6)]。 45 | * **文化和技术差异**: 书中强调了地球与三大文明之间的文化和技术差异 [Data: Reports (4, 5, 7)]。 46 | * **合作与理解**: 故事触及了人类与三大文明之间的合作与理解主题 [Data: Reports (1, 2)]。 47 | 48 | 这些故事线似乎是书中主要的探讨方向。 49 | 2024-11-28 21:31:16,179 - search - INFO - 查询结束 50 | 2024-11-28 21:34:16,439 - search - INFO - args: Namespace(neo4j_uri='bolt://localhost:7687', neo4j_username='neo4j', neo4j_password='langchaingraphrag', model_provider='ollama', chat_model_name='llama3.1', repo_id='NousResearch/Meta-Llama-3.1-8B-Instruct', base_url=None, embedding_model_name_or_path='BAAI/bge-m3', uuid='', top_k=15, level=1, max_tokens=8000, log_file='search.log', log_level='info', max_workers=16, device='cuda', completion_mode='completion', query_mode='global') 51 | 2024-11-28 21:34:16,441 - search - INFO - Connecting to Neo4j 52 | 2024-11-28 21:34:16,621 - search - INFO - Initializing chat model 53 | 2024-11-28 21:34:16,696 - search - INFO - Initializing embedding 54 | 2024-11-28 21:34:23,741 - search - INFO - 查询模式:global 55 | 2024-11-28 21:34:23,932 - search - INFO - Starting query now! 
56 | 2024-11-28 21:34:43,331 - search - INFO - 查询结果: 57 | **主要故事线** 58 | 59 | 根据分析师的报告,书中主要探讨的是地球的情感性质与三大文明的理性之间的对比 [Data: Reports (1, 3, 6)]。这本书强调了人类和三大文明之间文化和技术差异的重要性 [Data: Reports (4, 5)]。 60 | 61 | **情感与理性的对比** 62 | 63 | 书中突出了地球的情感性质与三大文明的理性之间的对比 [Data: Reports (1, 3, 6)]。这本书也讨论了人类和三大文明之间的情感交流和理性思考在宇宙复杂性的背景下的重要性 [Data: Reports (1)]。 64 | 65 | **挑战与合作** 66 | 67 | 书中提到了三大文明面临的挑战,如来自掠夺者的威胁 [Data: Reports (3)]。这本书也探讨了人类和三大文明之间的合作与理解的重要性 [Data: Reports (1, 2)]。 68 | 69 | **总体结论** 70 | 71 | 综上所述,这本书主要探讨的是地球的情感性质与三大文明的理性之间的对比,以及人类和三大文明之间的文化、技术差异和挑战。 72 | 2024-11-28 21:34:43,331 - search - INFO - 查询结束 73 | -------------------------------------------------------------------------------- /src/index/prompts.py: -------------------------------------------------------------------------------- 1 | from dataclasses import field, dataclass 2 | 3 | @dataclass 4 | class SystemPrompts: 5 | """Prompts for the graphrag algorithm""" 6 | GRAPHSYSTEMPROMPT:str = field(default=( 7 | "# Knowledge Graph Instructions for {model_name}\n" 8 | "## 1. Overview\n" 9 | "You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.\n" 10 | "Try to capture as much information from the text as possible without sacrificing accuracy. Do not add any information that is not explicitly mentioned in the text.\n" 11 | "- **Nodes** represent entities and concepts.\n" 12 | "- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.\n" 13 | "## 2. Labeling Nodes\n" 14 | "- **Consistency**: Ensure you use available types for node labels.\n" 15 | "Ensure you use basic or elementary types for node labels.\n" 16 | "- For example, when you identify an entity representing a person, always label it as **'person'**. Avoid using more specific terms like 'mathematician' or 'scientist'.\n" 17 | "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.\n" 18 | "- **Relationships** represent connections between entities or concepts.\n" 19 | "- **Description**: The description should be combined with the context and the common knowledge you share to generate effective information about the Node. If you think a node has a matching description, please add the corresponding description. Otherwise, it is not necessary to add.\n" 20 | "Ensure consistency and generality in relationship types when constructing knowledge graphs. Instead of using specific and momentary types such as 'BECAME_PROFESSOR', use more general and timeless relationship types like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n" 21 | "## 3. Coreference Resolution\n" 22 | "- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.\n" 23 | 'If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"), always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.\n' 24 | "Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.\n" 25 | "## 4. 
JSON Format\n" 26 | "The output must be a valid JSON string enclosed within ```json and ```, without any extraneous text or explanations.\n" 27 | "Ensure that:\n" 28 | "- All strings are enclosed in double quotes.\n" 29 | "- No HTML entities or escape characters (e.g., \\n) are used within strings.\n" 30 | "## 5. Strict Compliance\n" 31 | "Adhere to the rules strictly. Non-compliance will result in termination.\n" 32 | "## 6. Example Output\n" 33 | "For input: John Doe is a professor at the Princeton University.\n" 34 | "The output should be:\n" 35 | "```json\n" 36 | "{{\n" 37 | " \"head\": \"John Doe\",\n" 38 | " \"head_description\": \"John Doe is a professor\",\n" 39 | " \"head_type\": \"Person\",\n" 40 | " \"tail\": \"University of Example\",\n" 41 | " \"tail_type\": \"University\",\n" 42 | " \"tail_description\": \"Princeton University is a prestigious Ivy League institution renowned for its academic excellence, rich history, and beautiful campus.\",\n" 43 | " \"relation\": \"PROFESSOR\"\n" 44 | "}}\n" 45 | "```\n" 46 | "## 7. Delimiters Requirement\n" 47 | "Always wrap the JSON output with ```json at the beginning and ``` at the end. This is essential for parsing the output correctly. Do not omit the delimiters under any circumstances.\n" 48 | ) 49 | ) 50 | IDENTIFY_SYSTEM_PROMPT:str = field(default="""You are a data processing assistant. Your task is to identify duplicate entities in a list and decide which of them should be merged. 51 | The entities might be slightly different in format or content, but essentially refer to the same thing. Use your analytical skills to determine duplicates. 52 | 53 | Here are the rules for identifying duplicates: 54 | 1. Entities with minor typographical differences should be considered duplicates. 55 | 2. Entities with different formats but the same content should be considered duplicates. 56 | 3. Entities that refer to the same real-world object or concept, even if described differently, should be considered duplicates. 57 | 4. If it refers to different numbers, dates, or products, do not merge results 58 | """ 59 | ) 60 | 61 | @dataclass 62 | class UserPrompts: 63 | GRAPH_USER_PROMPT:str = field(default=( 64 | "Tip: Make sure to answer in the correct format and do " 65 | "not include any explanations. " 66 | "Use the given format to extract information from the " 67 | "following input: {input}" 68 | )) 69 | 70 | IDENTIFY_USER_PROMPT:str = field(default=""" 71 | Here is the list of entities to process: 72 | {entities} 73 | 74 | Please identify duplicates, merge them, and provide the merged list. 
75 | """ 76 | ) 77 | -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/context_builder.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from langchain_core.documents import Document 4 | 5 | # from langchain_graphrag.indexing.artifacts import IndexerArtifacts 6 | from ..community_report import CommunityReport 7 | from ..community_weight_calculator import CommunityWeightCalculator 8 | from ...custom_types.graphs.community import CommunityId, CommunityLevel 9 | from ...custom_types.tokens import TokenCounter 10 | from langchain_community.graphs import Neo4jGraph 11 | import pandas as pd 12 | 13 | 14 | _REPORT_TEMPLATE = """ 15 | --- Report {report_id} --- 16 | 17 | Title: {title} 18 | Weight: {weight} 19 | Rank: {rank} 20 | Report: 21 | 22 | {content} 23 | 24 | """ 25 | 26 | _LOGGER = logging.getLogger(__name__) 27 | 28 | # 加注释的就说明neo4j里已经有了 29 | 30 | class CommunityReportContextBuilder: 31 | def __init__( 32 | self, 33 | community_level: CommunityLevel, 34 | weight_calculator: CommunityWeightCalculator, 35 | # artifacts: IndexerArtifacts, 36 | id:str, 37 | graph: Neo4jGraph, 38 | token_counter: TokenCounter, 39 | max_tokens: int = 8000, 40 | ): 41 | self._community_level = community_level 42 | self._weight_calculator = weight_calculator 43 | self._graph = graph 44 | self._id = id 45 | self._token_counter = token_counter 46 | self._max_tokens = max_tokens 47 | 48 | def get_df_entities(self) -> pd.DataFrame: 49 | cypher_query = f""" 50 | MATCH (e:`__Entity__{self._id}`), (t:`Document{self._id}`) 51 | WHERE t.text CONTAINS e.id 52 | WITH e.id AS id, COLLECT(t.id) AS text_unit_ids 53 | RETURN id, text_unit_ids 54 | """ 55 | 56 | # TODO:判断不为空的场景 57 | 58 | return pd.DataFrame.from_records(self._graph.query(cypher_query)) 59 | 60 | # 暂时把content设置成summary 61 | # TODO:这里的跳数可能需要调整 62 | def get_df_reports(self): 63 | cypher_query = f""" 64 | match (n:`__Community__{self._id}`) 65 | where n.summary is not NULL 66 | optional match path = (e:`__Entity__{self._id}`)-[*1..3]->(n) 67 | WHERE ALL(x IN nodes(path) WHERE SINGLE(y IN nodes(path) WHERE y = x)) 68 | RETURN 69 | n.id AS community_id, 70 | n.title AS title, 71 | n.summary AS summary, 72 | n.community_rank AS rating, 73 | n.summary AS content, 74 | n.level AS level, 75 | collect(DISTINCT e.id) AS entities 76 | """ 77 | return pd.DataFrame.from_records(self._graph.query(cypher_query)) 78 | 79 | def _filter_communities(self) -> list[CommunityReport]: 80 | 81 | df_entities = self.get_df_entities() 82 | df_reports = self.get_df_reports() 83 | reports_weight: dict[CommunityId, float] = self._weight_calculator( 84 | df_entities, 85 | df_reports, 86 | ) 87 | 88 | df_reports_filtered = df_reports[df_reports["level"] >= self._community_level] 89 | 90 | reports = [] 91 | for _, row in df_reports_filtered.iterrows(): 92 | reports.append( 93 | CommunityReport( 94 | id=row["community_id"], 95 | weight=reports_weight[row["community_id"]], 96 | title=row["title"], 97 | summary=row["summary"], 98 | rank=row["rating"], 99 | content=row["content"], 100 | ) 101 | ) 102 | return reports 103 | 104 | def __call__(self) -> list[Document]: 105 | reports = self._filter_communities() 106 | 107 | documents: list[Document] = [] 108 | report_str_accumulated: list[str] = [] 109 | token_count = 0 110 | for report in reports: 111 | # we would try to combine multiple 112 | # reports into a single document 113 | # as long as we do not exceed the 
token limit 114 | report_str = _REPORT_TEMPLATE.format( 115 | report_id=report.id, 116 | title=report.title, 117 | weight=report.weight, 118 | rank=report.rank, 119 | content=report.content, 120 | ) 121 | 122 | report_str_token_count = self._token_counter.count_tokens(report_str) 123 | 124 | if token_count + report_str_token_count > self._max_tokens: 125 | _LOGGER.warning("Reached max tokens for a community report call ...") 126 | # we cut a new document here 127 | documents.append( 128 | Document( 129 | page_content="\n".join(report_str_accumulated), 130 | metadata={"token_count": token_count}, 131 | ) 132 | ) 133 | # reset the token count and the accumulated string 134 | token_count = 0 135 | report_str_accumulated = [] 136 | else: 137 | token_count += report_str_token_count 138 | report_str_accumulated.append(report_str) 139 | 140 | if report_str_accumulated: 141 | documents.append( 142 | Document( 143 | page_content="\n".join(report_str_accumulated), 144 | metadata={"token_count": token_count}, 145 | ) 146 | ) 147 | 148 | if _LOGGER.isEnabledFor(logging.DEBUG): 149 | import tableprint 150 | 151 | rows = [] 152 | tableprint.banner("KP Generation Context Token Usage") 153 | for index, doc in enumerate(documents): 154 | rows.append([f"Report {index}", doc.metadata["token_count"]]) 155 | 156 | tableprint.table(rows, ["Reports", "Token Count"]) 157 | 158 | return documents -------------------------------------------------------------------------------- /src/query/local_search/context_selectors/relationships.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | from typing import NamedTuple 4 | 5 | import pandas as pd 6 | 7 | _LOGGER = logging.getLogger(__name__) 8 | 9 | 10 | class RelationshipsSelectionResult(NamedTuple): 11 | in_network_relationships: pd.DataFrame 12 | out_network_relationships: pd.DataFrame 13 | 14 | 15 | def _find_in_network_relationships( 16 | df_entities: pd.DataFrame, 17 | df_relationships: pd.DataFrame, 18 | source_column_name: str = "source_id", 19 | target_column_name: str = "target_id", 20 | entity_column_name: str = "id", 21 | ) -> pd.DataFrame: 22 | entities_ids = df_entities[entity_column_name].tolist() 23 | entities_pairs = list(itertools.combinations(entities_ids, 2)) 24 | 25 | def filter_in_network_relationships(source: str, target: str) -> bool: 26 | check_1 = (source, target) in entities_pairs 27 | check_2 = (target, source) in entities_pairs 28 | return check_1 == True or check_2 == True # noqa: E712 29 | 30 | df_relationships["is_in_network"] = df_relationships.apply( 31 | lambda x: filter_in_network_relationships( 32 | x[source_column_name], x[target_column_name] 33 | ), 34 | axis=1, 35 | ) 36 | 37 | df_relationships = df_relationships[df_relationships["is_in_network"] == True] # noqa: E712 38 | 39 | df_relationships.drop(columns=["is_in_network"], inplace=True) 40 | 41 | # sort the relationships by rank 42 | df_relationships = df_relationships.sort_values( 43 | by="rank", ascending=False 44 | ).reset_index(drop=True) 45 | 46 | if _LOGGER.isEnabledFor(logging.DEBUG): 47 | import tableprint 48 | 49 | how_many = len(df_relationships) 50 | 51 | tableprint.banner(f"Selected {how_many} In-Network Relationships") 52 | tableprint.dataframe(df_relationships[["source", "target", "rank"]]) 53 | 54 | return df_relationships 55 | 56 | 57 | def _find_out_network_relationships( 58 | df_entities: pd.DataFrame, 59 | df_relationships: pd.DataFrame, 60 | top_k: int = 10, 61 | source_column_name: 
str = "source_id", 62 | target_column_name: str = "target_id", 63 | entity_column_name: str = "id", 64 | ) -> pd.DataFrame: 65 | entities_ids = df_entities[entity_column_name].tolist() 66 | 67 | # top_k is budget for out-network relationships 68 | relationship_budget = top_k * len(entities_ids) 69 | 70 | def filter_out_network_relationships(source: str, target: str) -> bool: 71 | if source in entities_ids and target not in entities_ids: 72 | return True 73 | if target in entities_ids and source not in entities_ids: # noqa: SIM103 74 | return True 75 | 76 | return False 77 | 78 | df_relationships["is_out_network"] = df_relationships.apply( 79 | lambda x: filter_out_network_relationships( 80 | x[source_column_name], x[target_column_name] 81 | ), 82 | axis=1, 83 | ) 84 | 85 | df_relationships = df_relationships[df_relationships["is_out_network"] == True] # noqa: E712 86 | 87 | df_relationships.drop(columns=["is_out_network"], inplace=True) 88 | 89 | # now we need to prioritize based on which external 90 | # entities have the most connection with the selected entities 91 | # we will do this by counting the number of relationships 92 | # each external entity has with the selected entities 93 | source_external_entities = df_relationships[ 94 | ~df_relationships[source_column_name].isin(entities_ids) 95 | ][source_column_name] 96 | 97 | target_external_entities = df_relationships[ 98 | ~df_relationships[target_column_name].isin(entities_ids) 99 | ][target_column_name] 100 | 101 | df_relationships = ( 102 | df_relationships.merge( 103 | source_external_entities.value_counts(), 104 | how="left", 105 | left_on=source_column_name, 106 | right_on=source_column_name, 107 | ) 108 | .fillna(0) 109 | .rename(columns={"count": "source_count"}) 110 | ) 111 | 112 | df_relationships = ( 113 | df_relationships.merge( 114 | target_external_entities.value_counts(), 115 | how="left", 116 | left_on=target_column_name, 117 | right_on=target_column_name, 118 | ) 119 | .fillna(0) 120 | .rename(columns={"count": "target_count"}) 121 | ) 122 | 123 | df_relationships["links"] = ( 124 | df_relationships["source_count"] + df_relationships["target_count"] 125 | ) 126 | 127 | df_relationships = df_relationships.sort_values( 128 | by=["links", "rank"], 129 | ascending=[False, False], 130 | ).reset_index(drop=True) 131 | 132 | # time to use the budget 133 | df_relationships = df_relationships.head(relationship_budget) 134 | 135 | if _LOGGER.isEnabledFor(logging.DEBUG): 136 | import tableprint 137 | 138 | how_many = len(df_relationships) 139 | 140 | tableprint.banner(f"Selected {how_many} Out-Network Relationships") 141 | tableprint.dataframe(df_relationships[["source", "target", "rank", "links"]]) 142 | 143 | return df_relationships 144 | 145 | 146 | class RelationshipsSelector: 147 | def __init__(self, top_k_out_network: int = 5): 148 | self._top_k_out_network = top_k_out_network 149 | 150 | def run( 151 | self, 152 | df_entities: pd.DataFrame, 153 | df_relationships: pd.DataFrame, 154 | ) -> RelationshipsSelectionResult: 155 | in_network_relationships = _find_in_network_relationships( 156 | df_entities, 157 | df_relationships.copy(deep=True), 158 | ) 159 | 160 | out_network_relationships = _find_out_network_relationships( 161 | df_entities, 162 | df_relationships.copy(deep=True), 163 | top_k=self._top_k_out_network, 164 | ) 165 | 166 | return RelationshipsSelectionResult( 167 | in_network_relationships, 168 | out_network_relationships, 169 | ) -------------------------------------------------------------------------------- 
/src/splitter/slide_window_splitter.py: -------------------------------------------------------------------------------- 1 | from typing import List,Optional,Any,Union,Literal 2 | from transformers import AutoTokenizer 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter 4 | # from algo.chunk.utils import pretty_print 5 | import re 6 | import warnings 7 | 8 | def _split_text_with_regex( 9 | text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]] 10 | ) -> List[str]: 11 | # Now that we have the separator, split the text 12 | if separator: 13 | if keep_separator: 14 | # The parentheses in the pattern keep the delimiters in the result. 15 | _splits = re.split(f"({separator})", text) 16 | splits = ( 17 | ([_splits[i] + _splits[i + 1] 18 | for i in range(0, len(_splits) - 1, 2)]) 19 | if keep_separator == "end" 20 | else ([_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]) 21 | ) 22 | if len(_splits) % 2 == 0: 23 | splits += _splits[-1:] 24 | splits = ( 25 | (splits + [_splits[-1]]) 26 | if keep_separator == "end" 27 | else ([_splits[0]] + splits) 28 | ) 29 | else: 30 | splits = re.split(separator, text) 31 | else: 32 | splits = list(text) 33 | return [s for s in splits if s != ""] 34 | 35 | 36 | class SentenceSlidingWindowChunkSplitter(TextSplitter): 37 | def __init__( 38 | self, 39 | sliding_chunk_size: int, 40 | separators: Optional[List[str]] = None, 41 | keep_separator: Union[bool, Literal["start", "end"]] = False, 42 | is_separator_regex: bool = False, 43 | sliding_distance: int = 2, 44 | **kwargs: Any, 45 | ) -> None: 46 | # chunk overlap is not needed for the sliding-window strategy 47 | super().__init__(keep_separator=keep_separator, chunk_overlap=0, **kwargs) 48 | self._separators = separators or ["\n\n", "\n", " ", ""] 49 | self._is_separator_regex = is_separator_regex 50 | self.sliding_distance = sliding_distance 51 | self.sliding_chunk_size = sliding_chunk_size 52 | # self.tokenzier = kwargs.get("tokenizer") if kwargs.get("tokenizer") else None 53 | assert ( 54 | self.sliding_distance >= 0 55 | ), "Sliding distance must be greater than or equal to 0." 56 | if self._chunk_size > self.sliding_chunk_size: 57 | warnings.warn( 58 | "Chunk size is bigger than sliding chunk_size, setting chunk size to sentence size." 59 | ) 60 | self._chunk_size = self.sliding_chunk_size 61 | 62 | def _split_text(self, text: str, separators: List[str]) -> List[str]: 63 | """Split incoming text and return chunks.""" 64 | final_chunks = [] 65 | # Get appropriate separator to use 66 | separator = separators[-1] 67 | new_separators = [] 68 | for i, _s in enumerate(separators): 69 | _separator = _s if self._is_separator_regex else re.escape(_s) 70 | if _s == "": 71 | separator = _s 72 | break 73 | if re.search(_separator, text): 74 | separator = _s 75 | new_separators = separators[i + 1:] 76 | break 77 | 78 | _separator = separator if self._is_separator_regex else re.escape( 79 | separator) 80 | splits = _split_text_with_regex(text, _separator, self._keep_separator) 81 | 82 | # Now go merging things, recursively splitting longer texts.
83 | _good_splits = [] 84 | _separator = "" if self._keep_separator else separator 85 | for s in splits: 86 | if self._length_function(s) < self._chunk_size: 87 | _good_splits.append(s) 88 | else: 89 | if _good_splits: 90 | merged_text = self._merge_splits(_good_splits, _separator) 91 | final_chunks.extend(merged_text) 92 | _good_splits = [] 93 | if not new_separators: 94 | final_chunks.append(s) 95 | else: 96 | other_info = self._split_text(s, new_separators) 97 | final_chunks.extend(other_info) 98 | if _good_splits: 99 | merged_text = self._merge_splits(_good_splits, _separator) 100 | final_chunks.extend(merged_text) 101 | return final_chunks 102 | 103 | @classmethod 104 | def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter: 105 | """Text splitter that uses HuggingFace tokenizer to count length.""" 106 | try: 107 | from transformers import PreTrainedTokenizerBase 108 | 109 | if not isinstance(tokenizer, PreTrainedTokenizerBase): 110 | raise ValueError( 111 | "Tokenizer received was not an instance of PreTrainedTokenizerBase" 112 | ) 113 | 114 | def _huggingface_tokenizer_length(text: str) -> int: 115 | return len(tokenizer.encode(text)) 116 | 117 | except ImportError: 118 | raise ValueError( 119 | "Could not import transformers python package. " 120 | "Please install it with `pip install transformers`." 121 | ) 122 | return cls(length_function=_huggingface_tokenizer_length, **kwargs) 123 | 124 | # Core function 125 | def split_text(self, text: str) -> List[str]: 126 | sentence_chunks = self._split_text(text, self._separators) 127 | final_chunks = [] 128 | # 合并 129 | for i in range(len(sentence_chunks)): 130 | combined_split = sentence_chunks[i] 131 | j = 1 132 | 133 | while j <= self.sliding_distance: 134 | if i - j >= 0: 135 | if ( 136 | self._length_function( 137 | sentence_chunks[i - j] + combined_split) 138 | > self.sliding_chunk_size 139 | ): 140 | break 141 | combined_split = sentence_chunks[i - j] + combined_split 142 | if i + j < len(sentence_chunks): 143 | if ( 144 | self._length_function( 145 | combined_split + sentence_chunks[i + j]) 146 | > self.sliding_chunk_size 147 | ): 148 | break 149 | combined_split += sentence_chunks[i + j] 150 | j += 1 151 | final_chunks.append(combined_split) 152 | return final_chunks -------------------------------------------------------------------------------- /src/query/local_search/context_selectors/context.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import NamedTuple 4 | 5 | import pandas as pd 6 | from langchain_core.vectorstores import VectorStore 7 | from ...custom_types.graphs.community import CommunityLevel 8 | 9 | from .communities_reports import CommunitiesReportsSelector 10 | from .entities import EntitiesSelector 11 | from .relationships import RelationshipsSelectionResult, RelationshipsSelector 12 | from .text_units import TextUnitsSelector 13 | 14 | from langchain_community.graphs.neo4j_graph import Neo4jGraph 15 | from dataclasses import dataclass, field 16 | 17 | @dataclass 18 | class GraphDataFrame: 19 | entities: pd.DataFrame 20 | relationships: pd.DataFrame 21 | text_units: pd.DataFrame 22 | communities_reports: pd.DataFrame 23 | 24 | @dataclass 25 | class CypherQuery: 26 | uuid: str 27 | entities_query: str = field(init=False) 28 | relationships_query: str = field(init=False) 29 | text_units_query: str = field(init=False) 30 | communities_reports_query: str = field(init=False) 31 | 32 | def 
__post_init__(self): 33 | 34 | self.entities_query:str = field(default=f""" 35 | MATCH (n) 36 | WHERE ANY(label IN labels(n) WHERE label ENDS WITH '{self.uuid}') 37 | RETURN n.id as id,n.description as description,n.degree as degree,n.text_unit_ids as text_unit_ids,n.communities as communities 38 | """) 39 | self.relationships_query:str = field(default=f""" 40 | MATCH (s:`__Entity__{self.uuid}`)-[r]->(t:`__Entity__{self.uuid}`) 41 | RETURN id(r) as id,s.id as source_id,t.id as target_id,r.rank as rank,r.source as source,r.target as target 42 | """) 43 | self.text_units_query:str = field(default=f""" 44 | MATCH (n:`Document{self.uuid}`)-[r]->(m:`__Entity__{self.uuid}`) 45 | RETURN n.id AS id, COLLECT(ID(r)) AS relationship_ids, n.text AS text_unit; 46 | """) 47 | self.communities_reports_query:str = field(default=f""" 48 | MATCH (n:`__Community__{self.uuid}`) 49 | RETURN ID(n) AS id, n.level AS level, n.community_rank AS rating, n.id AS community_id,n.title as title,n.summary as content; 50 | """) 51 | 52 | def getInfoFromNeo4j(graph:Neo4jGraph,uuid:str)->GraphDataFrame: 53 | query = CypherQuery(uuid) 54 | entites_res = graph.query(query.entities_query.default) 55 | relationships_res = graph.query(query.relationships_query.default) 56 | text_units_res = graph.query(query.text_units_query.default) 57 | communities_reports_res = graph.query(query.communities_reports_query.default) 58 | # 都不为[] 59 | # FIXME:如果存在错误需要检查原因 60 | assert entites_res != [],"实体记录不存在" 61 | assert relationships_res != [],"关系记录不存在" 62 | assert text_units_res != [],"文本单元记录不存在" 63 | assert communities_reports_res != [],"社区报告记录不存在" 64 | entites = pd.DataFrame.from_records(entites_res) 65 | entites = entites[entites['text_unit_ids'].notna()] 66 | relationships = pd.DataFrame.from_records(relationships_res) 67 | text_units = pd.DataFrame.from_records(text_units_res) 68 | communities_reports = pd.DataFrame.from_records(communities_reports_res) 69 | return GraphDataFrame(entities=entites,relationships=relationships,text_units=text_units,communities_reports=communities_reports) 70 | 71 | class ContextSelectionResult(NamedTuple): 72 | entities: pd.DataFrame 73 | text_units: pd.DataFrame 74 | relationships: RelationshipsSelectionResult 75 | communities_reports: pd.DataFrame 76 | 77 | class ContextSelector: 78 | def __init__( 79 | self, 80 | entities_selector: EntitiesSelector, 81 | text_units_selector: TextUnitsSelector, 82 | relationships_selector: RelationshipsSelector, 83 | communities_reports_selector: CommunitiesReportsSelector, 84 | USER_ID: str, 85 | ): 86 | self._entities_selector = entities_selector 87 | self._text_units_selector = text_units_selector 88 | self._relationships_selector = relationships_selector 89 | self._communities_reports_selector = communities_reports_selector 90 | self._USER_ID = USER_ID 91 | 92 | @staticmethod 93 | def build_default( 94 | entities_vector_store: VectorStore, 95 | entities_top_k: int, 96 | community_level: CommunityLevel, 97 | USER_ID: str, 98 | ) -> ContextSelector: 99 | 100 | return ContextSelector( 101 | entities_selector=EntitiesSelector( 102 | vector_store=entities_vector_store, 103 | top_k=entities_top_k, 104 | ), 105 | text_units_selector=TextUnitsSelector(), 106 | relationships_selector=RelationshipsSelector(), 107 | communities_reports_selector=CommunitiesReportsSelector( 108 | community_level=community_level 109 | ), 110 | USER_ID=USER_ID, 111 | ) 112 | 113 | def run( 114 | self, 115 | query: str, 116 | graph:Neo4jGraph 117 | ): 118 | 119 | # 获取所有实体并转化成df 120 | # 获取所有关系并转化成df 
121 | # 获取所有文本单元并转化成df 122 | # 获取所有社区报告并转化成df 123 | graphDF = getInfoFromNeo4j(graph,self._USER_ID) 124 | 125 | # Step 1 126 | # Select the entities to be used in the local search 127 | selected_entities = self._entities_selector.run(query, graphDF.entities) 128 | 129 | # Step 2 130 | # Select the text units to be used in the local search 131 | selected_text_units = self._text_units_selector.run( 132 | df_entities=selected_entities, 133 | df_relationships=graphDF.relationships, 134 | df_text_units=graphDF.text_units, 135 | ) 136 | 137 | # Step 3 138 | # Select the relationships to be used in the local search 139 | selected_relationships = self._relationships_selector.run( 140 | df_entities=selected_entities, 141 | df_relationships=graphDF.relationships, 142 | ) 143 | 144 | # Step 4 145 | # Select the communities to be used in the local search 146 | selected_communities_reports = self._communities_reports_selector.run( 147 | df_entities=selected_entities, 148 | df_reports=graphDF.communities_reports, 149 | ) 150 | 151 | return ContextSelectionResult( 152 | entities=selected_entities, 153 | text_units=selected_text_units, 154 | relationships=selected_relationships, 155 | communities_reports=selected_communities_reports, 156 | ) -------------------------------------------------------------------------------- /search.py: -------------------------------------------------------------------------------- 1 | import os 2 | from src.query import LocalSearcher, GlobalSearcher 3 | from langchain_community.graphs import Neo4jGraph 4 | from langchain_openai.chat_models import ChatOpenAI 5 | from langchain_ollama import ChatOllama 6 | from langchain_huggingface import HuggingFacePipeline,HuggingFaceEmbeddings,ChatHuggingFace 7 | from src.utils.logger import create_rotating_logger 8 | import argparse 9 | import logging 10 | from dataclasses import dataclass 11 | from langchain_huggingface import HuggingFaceEmbeddings 12 | import getpass 13 | import torch 14 | 15 | @dataclass 16 | class LOG_LEVELS: 17 | debug = logging.DEBUG 18 | info = logging.INFO 19 | warning = logging.WARNING 20 | error = logging.ERROR 21 | critical = logging.CRITICAL 22 | 23 | def parse_args(): 24 | arg_parser = argparse.ArgumentParser(description="search for LangChainGraphRAG") 25 | 26 | arg_parser.add_argument("--neo4j_uri", type=str,default=None, help="Neo4j URI") 27 | arg_parser.add_argument("--neo4j_username", type=str,default=None, help="Neo4j user") 28 | arg_parser.add_argument("--neo4j_password", type=str,default=None, help="Neo4j password") 29 | 30 | arg_parser.add_argument("--model_provider", type=str,choices=['openai','ollama','hf'], help="Model provider") 31 | arg_parser.add_argument("--chat_model_name", type=str, help="Chat model name") 32 | arg_parser.add_argument("--repo_id", type=str, default="NousResearch/Meta-Llama-3.1-8B-Instruct",help="Repo ID") 33 | arg_parser.add_argument("--base_url", type=str, help="Base URL") 34 | 35 | # FIXME:可以做更好的区分;embedding可以使用bge-m3等模型也可以用api模型 36 | arg_parser.add_argument("--embedding_model_name_or_path", type=str, help="Embedding Model name") 37 | 38 | arg_parser.add_argument("--uuid", type=str, default="", help="UUID for the search") 39 | arg_parser.add_argument("--top_k", type=int, default=15,help="top_k for the search") 40 | arg_parser.add_argument("--level", type=int, default=1, help="level for the search") 41 | arg_parser.add_argument("--max_tokens", type=int, default=8000, help="Max tokens") 42 | 43 | arg_parser.add_argument("--log_file", type=str, default="search.log", help="Log 
file") 44 | arg_parser.add_argument("--log_level", type=str, default="info", choices=['debug','info','warning','error','critical'],help="Log level") 45 | arg_parser.add_argument("--max_workers", type=int, default=4, help="Max workers") 46 | arg_parser.add_argument("--device", type=str, default="cpu",choices=['cuda','cpu'],help="Device") 47 | 48 | arg_parser.add_argument("--completion_mode",type=str,choices=['chat','completion'],default='chat',help="完成模式") 49 | arg_parser.add_argument("--query_mode",type=str,choices=['local','global'],default='local',help="查询模式") 50 | 51 | return arg_parser.parse_args() 52 | 53 | 54 | def query(): 55 | args = parse_args() 56 | 57 | log_level = getattr(LOG_LEVELS, args.log_level) 58 | logger = create_rotating_logger("search", args.log_file, level=log_level) 59 | 60 | # logging args 61 | logger.info(f"args: {args}") 62 | 63 | 64 | # 初始化环境变量 65 | # 优先从os.environ中获取,如果没有则从args中获取 66 | if os.environ.get("NEO4J_URI") is None: 67 | os.environ["NEO4J_URI"] = args.neo4j_uri 68 | if os.environ.get("NEO4J_USERNAME") is None: 69 | os.environ["NEO4J_USERNAME"] = args.neo4j_username 70 | if os.environ.get("NEO4J_PASSWORD") is None: 71 | os.environ["NEO4J_PASSWORD"] = args.neo4j_password 72 | 73 | logger.info("Connecting to Neo4j") 74 | 75 | try: 76 | graph = Neo4jGraph() 77 | except: 78 | logger.error( 79 | "Failed to connect to Neo4j" 80 | f"URI: {args.uri}, Username: {args.username}, Password: Your password" 81 | ) 82 | return 83 | 84 | # 初始化chat model 85 | if args.model_provider=="openai" and os.environ.get("OPENAI_API_KEY") is None: 86 | try: 87 | os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ") 88 | except: 89 | logger.error("Failed to get OpenAI API key") 90 | return 91 | 92 | 93 | logger.info("Initializing chat model") 94 | if args.model_provider == "openai": 95 | chat_model = ChatOpenAI(model=args.chat_model_name, base_url=args.base_url, api_key=os.environ["OPENAI_API_KEY"]) 96 | elif args.model_provider == "hf": 97 | llm = HuggingFacePipeline.from_model_id( 98 | model_id=args.repo_id, 99 | task="text-generation", 100 | pipeline_kwargs=dict( 101 | max_new_tokens=1024, 102 | temperature=0.0, 103 | do_sample=False, 104 | # repetition_penalty=1.03, 105 | return_full_text=False, 106 | # top_k=args.topK, 107 | ), 108 | model_kwargs=dict( 109 | attn_implementation="flash_attention_2" if args.flash_attn else None, 110 | torch_dtype=torch.bfloat16, 111 | ), 112 | device=0, 113 | # device_map="auto" if args.device == "cuda" else None, 114 | ) 115 | 116 | llm.pipeline.tokenizer.pad_token_id = llm.pipeline.tokenizer.eos_token_id 117 | chat_model = ChatHuggingFace(llm=llm) 118 | elif args.model_provider == "ollama": 119 | chat_model = ChatOllama( 120 | model=args.chat_model_name, 121 | num_predict=512, 122 | temperature=0.0, 123 | ) 124 | # 初始化embedding 125 | logger.info("Initializing embedding") 126 | # model_kwargs = {'device': args.device} 127 | encode_kwargs = {'normalize_embeddings': True} 128 | embedding = HuggingFaceEmbeddings(model_name=args.embedding_model_name_or_path,encode_kwargs=encode_kwargs,show_progress=True) 129 | logger.info("查询模式:{}".format(args.query_mode)) 130 | if args.query_mode == 'local': 131 | searcher = LocalSearcher( 132 | graph=graph, 133 | chat_model=chat_model, 134 | embedding=embedding, 135 | uuid=args.uuid, 136 | top_k=args.top_k, 137 | level=args.level 138 | ) 139 | elif args.query_mode == 'global': 140 | searcher = GlobalSearcher( 141 | graph=graph, 142 | chat_model=chat_model, 143 | uuid=args.uuid, 144 | 
level=args.level, 145 | max_tokens=args.max_tokens 146 | ) 147 | 148 | completion_mode = args.completion_mode 149 | 150 | logger.info("Starting query now!") 151 | # 查询逻辑 152 | if completion_mode == 'chat': 153 | try: 154 | while True: 155 | query = input("请输入您的查询 (输入 'exit' 或 'quit' 退出):\n") 156 | if query.lower() in ['exit', 'quit']: 157 | logger.info("退出程序") 158 | break 159 | else: 160 | # 在这里处理用户的查询 161 | result = searcher.invoke(query) 162 | logger.info(f"查询结果:\n{result}") 163 | except KeyboardInterrupt: 164 | print("\n程序被中断,退出。") 165 | 166 | elif completion_mode == 'completion': 167 | query = input("请输入您的查询:\n") 168 | result = searcher.invoke(query) 169 | logger.info(f"查询结果:\n{result}") 170 | 171 | logger.info("查询结束") 172 | 173 | if __name__ == "__main__": 174 | query() 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /src/index/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from langchain_community.graphs import Neo4jGraph 3 | from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate 4 | from langchain.schema.messages import SystemMessage 5 | from langchain_community.graphs.graph_document import GraphDocument 6 | from langchain_core.documents import Document 7 | 8 | from retry import retry 9 | import numpy as np 10 | import tiktoken 11 | import seaborn as sns 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | import asyncio 15 | 16 | from .prompts import SystemPrompts, UserPrompts 17 | 18 | 19 | 20 | def num_tokens_from_string(string: str, model: str = "cl100k_base") -> int: 21 | """Returns the number of tokens in a text string.""" 22 | encoding = tiktoken.get_encoding(model) 23 | num_tokens = len(encoding.encode(string)) 24 | return num_tokens 25 | 26 | 27 | def visualizeEntityTokenDistibution(graph: Neo4jGraph,user_id): 28 | entity_dist = graph.query( 29 | f""" 30 | MATCH (d:Document{user_id}) 31 | RETURN d.text AS text, 32 | count {{(d)-[:MENTIONS]->()}} AS entity_count 33 | """ 34 | ) 35 | entity_dist_df = pd.DataFrame.from_records(entity_dist) 36 | entity_dist_df["token_count"] = [ 37 | num_tokens_from_string(str(el)) for el in entity_dist_df["text"] 38 | ] 39 | # Scatter plot with regression line 40 | sns.lmplot( 41 | x="token_count", y="entity_count", data=entity_dist_df, line_kws={"color": "red"} 42 | ) 43 | plt.title("Entity Count vs Token Count Distribution") 44 | plt.xlabel("Token Count") 45 | plt.ylabel("Entity Count") 46 | plt.show() 47 | plt.savefig('entity_token_distribution.png') 48 | 49 | 50 | def visualizeCommunityEntityDistribution(graph: Neo4jGraph, user_id): 51 | # 查询每个层次的社区包含的实体的数量 52 | community_size = graph.query(f""" 53 | MATCH (c:__Community__{user_id})<-[:IN_COMMUNITY*]-(e:__Entity__{user_id}) 54 | WITH c, count(distinct e) AS entities 55 | RETURN split(c.id, '-')[0] AS level, entities 56 | """ 57 | ) 58 | 59 | community_size_df = pd.DataFrame.from_records(community_size) 60 | 61 | # 计算百分位数 62 | percentiles_data = [] 63 | for level in community_size_df['level'].unique(): 64 | subset = community_size_df[community_size_df['level'] == level]['entities'] 65 | num_communities = len(subset) 66 | percentiles = np.percentile(subset, [25, 50, 75, 90, 99]) 67 | percentiles_data.append( 68 | [ 69 | level, 70 | num_communities, 71 | percentiles[0], 72 | percentiles[1], 73 | percentiles[2], 74 | percentiles[3], 75 | percentiles[4], 76 | max(subset), 77 | ] 78 | ) 79 | 80 | percentiles_df = 
pd.DataFrame( 81 | percentiles_data, 82 | columns=[ 83 | "Level", 84 | "Num Communities", 85 | "25th Percentile", 86 | "50th Percentile", 87 | "75th Percentile", 88 | "90th Percentile", 89 | "99th Percentile", 90 | "Max Communities", 91 | ], 92 | ) 93 | 94 | # 创建图形和子图 95 | fig, axs = plt.subplots(2, 1, figsize=(10, 12), sharex=True) 96 | 97 | # 可视化最大社群数量 98 | sns.barplot(data=percentiles_df, x='Level', y='Max Communities', ax=axs[0], color='skyblue', label='Max Communities', alpha=0.7) 99 | sns.barplot(data=percentiles_df, x='Level', y='50th Percentile', ax=axs[0], color='orange', label='50th Percentile', alpha=0.5) 100 | axs[0].set_title('Community Entity Distribution by Level', fontsize=16) 101 | axs[0].set_ylabel('Number of Communities', fontsize=14) 102 | axs[0].legend() 103 | axs[0].grid(axis='y') 104 | 105 | # 可视化社群个数 106 | sns.barplot(data=percentiles_df, x='Level', y='Num Communities', ax=axs[1], color='lightgreen') 107 | axs[1].set_title('Number of Communities by Level', fontsize=16) 108 | axs[1].set_xlabel('Community Level', fontsize=14) 109 | axs[1].set_ylabel('Number of Communities', fontsize=14) 110 | axs[1].grid(axis='y') 111 | 112 | plt.xticks(rotation=45) 113 | plt.tight_layout() 114 | 115 | # 保存集成的图像为 PNG 文件 116 | plt.savefig(f'community_distribution_combined_{user_id}.png', dpi=300) 117 | plt.show() 118 | 119 | 120 | def countNodesMerged(user_id,merged_entities,graph: Neo4jGraph): 121 | count = graph.query(""" 122 | UNWIND $data AS candidates 123 | CALL {{ 124 | WITH candidates 125 | MATCH (e:{label}) WHERE e.id IN candidates 126 | RETURN collect(e) AS nodes 127 | }} 128 | CALL apoc.refactor.mergeNodes(nodes, {{properties: {{`.*`: 'discard'}}}}) 129 | YIELD node 130 | RETURN count(*) 131 | """.format(label=f"__Entity__{user_id}"), params={"data": merged_entities} 132 | ) 133 | print(f"{count} nodes merged") 134 | 135 | def prepare_string(data): 136 | nodes_str = "Nodes are:\n" 137 | for node in data['nodes']: 138 | node_id = node['id'] 139 | node_type = node['type'] 140 | if 'description' in node and node['description']: 141 | node_description = f", description: {node['description']}" 142 | else: 143 | node_description = "" 144 | nodes_str += f"id: {node_id}, type: {node_type}{node_description}\n" 145 | rels_str = "Relationships are:\n" 146 | for rel in data['rels']: 147 | start = rel['start'] 148 | end = rel['end'] 149 | rel_type = rel['type'] 150 | if 'description' in rel and rel['description']: 151 | description = f", description: {rel['description']}" 152 | else: 153 | description = "" 154 | rels_str += f"({start})-[:{rel_type}]->({end}){description}\n" 155 | 156 | return nodes_str + "\n" + rels_str 157 | 158 | def create_prompt(model_name): 159 | system_prompt = SystemPrompts.GRAPHSYSTEMPROMPT.format(model_name=model_name) 160 | system_message = SystemMessage(content=system_prompt) 161 | human_message = HumanMessagePromptTemplate.from_template(UserPrompts.GRAPH_USER_PROMPT) 162 | chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message]) 163 | return chat_prompt 164 | 165 | 166 | async def aprocess_summaries(summaries, title_chain): 167 | tasks = [asyncio.create_task(title_chain.ainvoke({"summary": summary})) for summary in summaries] 168 | results = await asyncio.gather(*tasks) 169 | return results 170 | 171 | def process_summaries(summary, title_chain): 172 | result = title_chain.invoke({"summary": summary}) 173 | return result 174 | 175 | async def aprocess_communities(community_info, community_chain): 176 | string_info_list = 
[prepare_string(community) for community in community_info] 177 | tasks = [asyncio.create_task(community_chain.ainvoke({'community_info': string_info})) for string_info in string_info_list] 178 | results = await asyncio.gather(*tasks) 179 | info_summary = [] 180 | for community, result in zip(community_info, results): 181 | summary = result.output 182 | info_summary.append( 183 | {"community": community['communityId'], "summary": summary}) 184 | return info_summary 185 | 186 | def process_communities(community, community_chain): 187 | stringify_info = prepare_string(community) 188 | summary = community_chain.invoke({'community_info': stringify_info}) 189 | return {"community": community['communityId'], "summary": summary} 190 | 191 | 192 | def process_text(text: str, model) -> List[GraphDocument]: 193 | doc = Document(page_content=text) 194 | return model.convert_to_graph_documents([doc]) 195 | 196 | 197 | async def aprocess_text(texts: List[str], model) -> List[GraphDocument]: 198 | docs = [Document(page_content=text) for text in texts] 199 | return await model.aconvert_to_graph_documents(docs) 200 | 201 | @retry(tries=3, delay=2) 202 | async def aentity_resolution(entities: List[str], extraction_chain) -> Optional[List[str]]: 203 | results = await extraction_chain.ainvoke({"entities": entities}) 204 | return [el.entities for el in results.merge_entities] 205 | 206 | 207 | @retry(tries=3, delay=2) 208 | def entity_resolution(entities: List[str], extraction_chain) -> Optional[List[str]]: 209 | return [el.entities for el in extraction_chain.invoke({"entities": entities}).merge_entities] 210 | 211 | -------------------------------------------------------------------------------- /src/index/cypher_query.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class CypherQuery: 4 | def __init__(self,graph): 5 | self.graph = graph 6 | 7 | def set_entity(self,uuid=None): 8 | self.graph.query( 9 | f""" 10 | MATCH (n:`__Entity__`) 11 | REMOVE n:`__Entity__` 12 | SET n:`__Entity__{uuid}` 13 | """ 14 | ) 15 | 16 | def set_document(self,uuid=None): 17 | self.graph.query( 18 | f""" 19 | MATCH (n:`Document`) 20 | REMOVE n:`Document` 21 | SET n:`Document{uuid}` 22 | """ 23 | ) 24 | 25 | # FIXME: 注册gds.graph时也要加上uuid,不然可能导致多进程误删除 26 | def drop_entites(self): 27 | # 删除名字为entities的图 28 | try: 29 | self.graph.query( 30 | """ 31 | CALL gds.graph.drop('entities') 32 | """ 33 | ) 34 | except: 35 | print("`entities` does not exist") 36 | 37 | # FIXME: 注册gds.graph时也要加上uuid,不然可能导致多进程误删除 38 | def drop_communities(self): 39 | try: 40 | self.graph.query( 41 | f""" 42 | CALL gds.graph.drop('communities') 43 | """ 44 | ) 45 | except: 46 | print("`communities` does not exist") 47 | 48 | # 社区检测与聚类分析 49 | def detect(self,uuid,word_edit_distance): 50 | return self.graph.query( 51 | f"""MATCH (e:`__Entity__{uuid}`) 52 | WHERE size(e.id) > 4 // longer than 4 characters 53 | WITH e.wcc AS community, collect(e) AS nodes, count(*) AS count 54 | WHERE count > 1 55 | UNWIND nodes AS node 56 | // Add text distance 57 | WITH distinct 58 | [n IN nodes WHERE apoc.text.distance(toLower(node.id), toLower(n.id)) < $distance | n.id] AS intermediate_results 59 | WHERE size(intermediate_results) > 1 60 | WITH collect(intermediate_results) AS results 61 | // combine groups together if they share elements 62 | UNWIND range(0, size(results)-1, 1) as index 63 | WITH results, index, results[index] as result 64 | WITH apoc.coll.sort(reduce(acc = result, index2 IN range(0, size(results)-1, 1) | 65 | CASE 
WHEN index <> index2 AND 66 | size(apoc.coll.intersection(acc, results[index2])) > 0 67 | THEN apoc.coll.union(acc, results[index2]) 68 | ELSE acc 69 | END 70 | )) as combinedResult 71 | WITH distinct(combinedResult) as combinedResult 72 | // extra filtering 73 | WITH collect(combinedResult) as allCombinedResults 74 | UNWIND range(0, size(allCombinedResults)-1, 1) as combinedResultIndex 75 | WITH allCombinedResults[combinedResultIndex] as combinedResult, combinedResultIndex, allCombinedResults 76 | WHERE NOT any(x IN range(0,size(allCombinedResults)-1,1) 77 | WHERE x <> combinedResultIndex 78 | AND apoc.coll.containsAll(allCombinedResults[x], combinedResult) 79 | ) 80 | RETURN combinedResult 81 | """, params={'distance': word_edit_distance} 82 | ) 83 | 84 | def add_constraints_for_community(self,uuid=None): 85 | self.graph.query(f"CREATE CONSTRAINT IF NOT EXISTS FOR (c:`__Community__{uuid}`) REQUIRE c.id IS UNIQUE;") 86 | 87 | # 构造层次聚类 88 | def constructing_hierarchical_clustering(self,uuid=None): 89 | return self.graph.query(""" 90 | MATCH (e:`{entity_label}`) 91 | UNWIND range(0, size(e.communities) - 1 , 1) AS index 92 | CALL {{ 93 | WITH e, index 94 | WITH e, index 95 | WHERE index = 0 96 | MERGE (c:`{community_label}` {{id: toString(index) + '-' + toString(e.communities[index])}}) 97 | ON CREATE SET c.level = index 98 | MERGE (e)-[:IN_COMMUNITY]->(c) 99 | RETURN count(*) AS count_0 100 | }} 101 | CALL {{ 102 | WITH e, index 103 | WITH e, index 104 | WHERE index > 0 105 | MERGE (current:`{community_label}` {{id: toString(index) + '-' + toString(e.communities[index])}}) 106 | ON CREATE SET current.level = index 107 | MERGE (previous:`{community_label}` {{id: toString(index - 1) + '-' + toString(e.communities[index - 1])}}) 108 | ON CREATE SET previous.level = index - 1 109 | MERGE (previous)-[:IN_COMMUNITY]->(current) 110 | RETURN count(*) AS count_1 111 | }} 112 | RETURN count(*) 113 | """.format(entity_label=f"__Entity__{uuid}",community_label=f"__Community__{uuid}") 114 | ) 115 | 116 | def set_community_rank(self,uuid=None): 117 | self.graph.query(f""" 118 | MATCH (c:`__Community__{uuid}`)<-[:IN_COMMUNITY*]-(:`__Entity__{uuid}`)<-[:MENTIONS]-(d:`Document{uuid}`) // 匹配社区文档 119 | WITH c, count(distinct d) AS rank // 计算每个社区包含的不同的文档数量作为社区的排名 120 | SET c.community_rank = rank; // 设置社区排名 121 | """ 122 | ) 123 | 124 | def set_node_degree(self,uuid=None): 125 | node_degree_query = f""" 126 | MATCH (n) 127 | WHERE ANY(label IN labels(n) WHERE label ENDS WITH '{uuid}') 128 | SET n.degree = apoc.node.degree(n) 129 | RETURN count(n) AS modified_nodes; 130 | """ 131 | self.graph.query(node_degree_query) 132 | 133 | def set_relationship_degree(self,uuid=None): 134 | relationship_degree_query = f""" 135 | MATCH (n) 136 | WHERE n.degree is not NULL and ANY(label IN labels(n) WHERE label ENDS WITH '{uuid}') 137 | WITH n as source 138 | MATCH (source)-[r]->(target) 139 | WHERE target.degree is not null and ANY(label IN labels(target) WHERE label ENDS WITH '{uuid}') 140 | SET r.source_degree=source.degree,r.target_degree=target.degree,r.rank=source.degree+target.degree 141 | RETURN COUNT(r) AS modified_relationships; // 返回被修改的边的数量 142 | """ 143 | self.graph.query(relationship_degree_query) 144 | 145 | def set_text_unit_ids(self,uuid=None): 146 | text_unit_ids_query = f""" 147 | MATCH (n:`__Entity__{uuid}`) 148 | MATCH (p:`Document{uuid}`) 149 | WHERE p.text IS NOT NULL 150 | WITH n, collect(p) AS text_units 151 | UNWIND text_units AS text_unit 152 | WITH n,text_unit 153 | WHERE text_unit.text CONTAINS 
n.id // 使用 CONTAINS 检查 154 | WITH n, collect(text_unit.id) AS text_unit_ids 155 | SET n.text_unit_ids = text_unit_ids 156 | RETURN count(DISTINCT n) AS modified_nodes; 157 | """ 158 | self.graph.query(text_unit_ids_query) 159 | 160 | def set_relationship_source_and_target(self,uuid=None): 161 | self.graph.query( 162 | f""" 163 | MATCH (n:`__Entity__{uuid}`)-[r]->(m:`__Entity__{uuid}`) 164 | WITH n,r,m 165 | SET r.source = n.id, r.target = m.id 166 | RETURN count(r) AS modified_relationships 167 | """ 168 | ) 169 | 170 | def set_communities(self,uuid=None): 171 | self.graph.query( 172 | f""" 173 | MATCH (n:`__Entity__{uuid}`)-[:IN_COMMUNITY*]->(c:`__Community__{uuid}`) 174 | WITH n, collect(c.id) AS community_ids 175 | SET n.communities = community_ids 176 | RETURN count(n) AS modified_nodes; 177 | """ 178 | ) 179 | 180 | def get_community_info(self,user_id=None): 181 | return self.graph.query(""" 182 | MATCH (c:`{community_label}`)<-[:IN_COMMUNITY*]-(e:`{entity_label}`) // 匹配社区实体 183 | // WHERE c.level in [1] 184 | WITH c, collect(e) AS nodes 185 | WHERE size(nodes) > 1 186 | CALL apoc.path.subgraphAll(nodes[0], {{ 187 | whitelistNodes:nodes 188 | }}) 189 | YIELD relationships 190 | RETURN c.id AS communityId, 191 | [n in nodes | {{id: n.id, description: n.description, type: [el in labels(n) WHERE el <> '{entity_label}'][0]}}] AS nodes, 192 | [r in relationships | {{start: startNode(r).id, type: type(r), end: endNode(r).id, description: r.description}}] AS rels 193 | """.format(entity_label=f"__Entity__{user_id}", community_label=f"__Community__{user_id}") 194 | ) 195 | 196 | def store_info(self,info,uuid=None): 197 | self.graph.query( 198 | f""" 199 | UNWIND $info AS info 200 | MATCH (c:`__Community__{uuid}` {{id: info.community}}) 201 | SET c.summary = info.summary,c.title = info.title 202 | """, params={"info": info} 203 | ) -------------------------------------------------------------------------------- /src/index/api_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from typing import Dict, List 4 | from tqdm.asyncio import tqdm 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | 7 | # langchain 8 | from langchain_core.output_parsers import StrOutputParser 9 | from langchain_openai import ChatOpenAI,OpenAI 10 | from langchain_community.vectorstores import Neo4jVector 11 | from langchain_community.graphs import Neo4jGraph 12 | from langchain_core.prompts import ChatPromptTemplate 13 | from langchain_experimental.graph_transformers import LLMGraphTransformer 14 | from langchain_core.language_models.chat_models import BaseChatModel 15 | from langchain_text_splitters.base import TextSplitter 16 | from langchain_core.embeddings.embeddings import Embeddings 17 | from langchain_core.documents import Document 18 | # Graph 19 | import json_repair 20 | from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship 21 | # utils 22 | from .utils import num_tokens_from_string,create_prompt,process_text,entity_resolution,process_communities,process_summaries,countNodesMerged 23 | from ..utils.logger import create_rotating_logger 24 | from logging import Logger 25 | from graphdatascience import GraphDataScience 26 | from .cypher_query import CypherQuery 27 | 28 | # prompt 29 | from .prompts import SystemPrompts, UserPrompts 30 | # hf 31 | from transformers import AutoTokenizer 32 | # pydantic models 33 | from .pydantic_models import Disambiguate, GetTitle 34 | 35 | 
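# A minimal wiring sketch for the ApiIndex class defined below, as it might be
# driven from a separate script such as build_index.py (not shown in this section).
# Assumed, not taken from the project code: LangChain's RecursiveCharacterTextSplitter
# stands in for the project's slide-window splitter, the connection details and the
# document payload are placeholders, and the Neo4j instance has the APOC and GDS
# plugins installed.
#
#   import asyncio
#   from graphdatascience import GraphDataScience
#   from langchain_community.graphs import Neo4jGraph
#   from langchain_huggingface import HuggingFaceEmbeddings
#   from langchain_openai import ChatOpenAI
#   from langchain_text_splitters import RecursiveCharacterTextSplitter
#   from src.index.api_index import ApiIndex
#
#   uri, user, password = "bolt://localhost:7687", "neo4j", "password"  # placeholders
#   graph = Neo4jGraph(url=uri, username=user, password=password)       # needs APOC
#   gds = GraphDataScience(uri, auth=(user, password))                  # needs the GDS plugin
#
#   index = ApiIndex(
#       graph=graph,
#       chat_model=ChatOpenAI(model="gpt-4o-mini"),
#       embedding=HuggingFaceEmbeddings(model_name="BAAI/bge-m3"),
#       splitter=RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64),
#       gds=gds,
#       uuid="demo",   # suffix appended to every label so one user's graph stays isolated
#       max_workers=4,
#   )
#   # create_index() is async and expects [{"title": ..., "text": ...}, ...] (see _preprocess)
#   docs = [{"title": "example", "text": "Raw text to be chunked, extracted and indexed."}]
#   asyncio.run(index.create_index(docs))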
COMMUNITY_TEMPLEATE = """Based on the provided nodes and relationships that belong to the same graph community, 36 | generate a natural language summary of the provided information: 37 | {community_info} 38 | 39 | Summary:""" # noqa: E501 40 | 41 | community_prompt = ChatPromptTemplate.from_messages( 42 | [ 43 | ( 44 | "system", 45 | "Given an input triples, generate the information summary. No pre-amble.", 46 | ), 47 | ("human", COMMUNITY_TEMPLEATE), 48 | ] 49 | ) 50 | 51 | TITLE_TEMPLATE = """Given the following summary, provide a title that best represents the content: 52 | {summary} 53 | 54 | Title:""" 55 | 56 | title_prompt = ChatPromptTemplate.from_messages( 57 | [ 58 | ( 59 | "system", 60 | "Given a summary, generate a title that best represents the content. No pre-amble.", 61 | ), 62 | ("human", TITLE_TEMPLATE), 63 | ] 64 | ) 65 | 66 | 67 | class ApiIndex(object): 68 | def __init__( 69 | self, 70 | graph:Neo4jGraph, 71 | chat_model:BaseChatModel, 72 | embedding:Embeddings, 73 | splitter:TextSplitter, 74 | gds:GraphDataScience, 75 | logger:Logger=None, 76 | max_workers:int=4, 77 | gds_similarity_threshold:float=0.95, 78 | word_edit_distance:int = 3, 79 | uuid:str="", 80 | model_name="gpt-4o-mini" 81 | ): 82 | self.graph = graph 83 | self.chat_model = chat_model 84 | self.model_name = model_name 85 | self.embedding = embedding 86 | self.splitter = splitter 87 | self.gds = gds 88 | self.cypherQuery = CypherQuery(graph=graph) 89 | if not logger: 90 | self.logger = create_rotating_logger("index") 91 | else: 92 | self.logger = logger 93 | 94 | self.MAX_WORKERS = max_workers 95 | self.GDS_SIMILARITY_THRESHOLD = gds_similarity_threshold 96 | self.WORD_EDIT_DISTANCE = word_edit_distance 97 | self.uuid = uuid 98 | 99 | def _preprocess(self,documents:List[Dict[str,str]]): 100 | self.logger.info("Chunking documents") 101 | data = [] 102 | for document in documents: 103 | title,text = document['title'],document['text'] 104 | chunks = self.splitter.split_text(text) 105 | for chunk in chunks: 106 | data.append({"title": title, "text": chunk}) 107 | 108 | return pd.DataFrame(data) 109 | 110 | def _parse_hf_ollama(self,content:str,source:Document): 111 | try: 112 | breakpoint() 113 | parsed_json = json_repair.loads(content) 114 | relationships = [] 115 | nodes_set = set() 116 | for rel in parsed_json: 117 | # Nodes need to be deduplicated using a set 118 | if "head_description" in rel.keys(): 119 | nodes_set.add((rel["head"], rel["head_type"], rel["head_description"])) 120 | else: 121 | nodes_set.add((rel["head"], rel["head_type"])) 122 | if "tail_description" in rel.keys(): 123 | nodes_set.add((rel["tail"], rel["tail_type"], rel["tail_description"])) 124 | else: 125 | nodes_set.add((rel["tail"], rel["tail_type"])) 126 | source_node = Node(id=rel["head"], type=rel["head_type"]) 127 | target_node = Node(id=rel["tail"], type=rel["tail_type"]) 128 | relationships.append( 129 | Relationship( 130 | source=source_node, target=target_node, type=rel["relation"] 131 | ) 132 | ) 133 | nodes = [] 134 | for el in list(nodes_set): 135 | if len(el) == 3: 136 | node = Node(id=el[0], type=el[1], properties={"description": el[2]}) 137 | else: 138 | node = Node(id=el[0], type=el[1]) 139 | nodes.append(node) 140 | 141 | return GraphDocument(nodes=nodes, relationships=relationships,source=source) 142 | except: 143 | self.logger.error(f"不是一个合法的Json") 144 | return None 145 | 146 | async def _create_nodes_and_relationships(self,documents:List[str]): 147 | data = self._preprocess(documents) 148 | documents = 
[Document(page_content=f"{row['title']} {row['text']}") for i, row in data.iterrows()] 149 | 150 | # 如果是openai模型,直接调用convert_to_graph_documents 151 | if isinstance(self.chat_model,ChatOpenAI): 152 | llm_transformer = LLMGraphTransformer( 153 | llm=self.chat_model, 154 | node_properties=["description"], 155 | relationship_properties=["description"], 156 | prompt=create_prompt(self.chat_model.name), 157 | ) 158 | graph_documents = await llm_transformer.aconvert_to_graph_documents(documents) 159 | else: 160 | chat_prompt = create_prompt(self.chat_model.name) 161 | processed_documents = [] 162 | for document in documents: 163 | prompt = chat_prompt.format_messages(input=document.page_content) 164 | processed_documents.append(self.chat_model.invoke(prompt)) 165 | graph_documents = [self._parse_hf_ollama(document.content,source) for (document,source) in zip(processed_documents,documents)] 166 | graph_documents = [graph_document for graph_document in graph_documents if graph_document] 167 | 168 | return graph_documents 169 | 170 | async def create_index(self,documents:List[str]): 171 | self.logger.info("Create_nodes_and_relationships") 172 | graph_documents = await self._create_nodes_and_relationships(documents) 173 | 174 | for graph_document in graph_documents: 175 | for node in graph_document.nodes: 176 | node.type = node.type + f"{self.uuid}" 177 | 178 | for relationship in graph_document.relationships: 179 | relationship.type = relationship.type + f"{self.uuid}" 180 | relationship.source.type += f"{self.uuid}" 181 | relationship.target.type += f"{self.uuid}" 182 | 183 | # 将结点和关系存入图数据库 184 | self.graph.add_graph_documents( 185 | graph_documents, 186 | baseEntityLabel=True, 187 | include_source=True 188 | ) 189 | # 查询所有标签是__Entity__的结点,并修改成__Entity__+用户id 190 | self.cypherQuery.set_entity(self.uuid) 191 | # 查询所有标签是Document的结点,并修改成Document+用户id 192 | self.cypherQuery.set_document(self.uuid) 193 | 194 | self.graph.refresh_schema() 195 | Neo4jVector.from_existing_graph( 196 | self.embedding, 197 | node_label=f'__Entity__{self.uuid}', 198 | text_node_properties=['id', 'description'], 199 | index_name=f"{self.uuid}" if (self.uuid != None and self.uuid != "") else "vector", 200 | embedding_node_property='embedding', 201 | graph=self.graph, 202 | ) 203 | try: 204 | self.cypherQuery.drop_entites() 205 | except: 206 | pass 207 | 208 | # 1.create the k-nearest neighbor graph 209 | G, _ = self.gds.graph.project( 210 | "entities", # Graph name # FIXME: 注册gds.graph时也要加上uuid,不然可能导致多进程误删除 211 | f"__Entity__{self.uuid}", # Node projection 212 | "*", # Relationship projection 213 | nodeProperties=["embedding"] # Configuration parameters 214 | ) 215 | # 2.algorithm: k-nearest neighbors 216 | self.gds.knn.mutate( 217 | G, 218 | nodeProperties=['embedding'], 219 | mutateRelationshipType='SIMILAR', 220 | mutateProperty='score', 221 | similarityCutoff=self.GDS_SIMILARITY_THRESHOLD, 222 | ) 223 | # 3.store graph with weak connected components 224 | self.gds.wcc.write( 225 | G, 226 | writeProperty="wcc", 227 | relationshipTypes=["SIMILAR"] 228 | ) 229 | # 4. 
KEY:社区检测与聚类分析 230 | 231 | potential_duplicate_candidates = self.cypherQuery.detect(self.uuid,self.WORD_EDIT_DISTANCE) 232 | extraction_llm = self.chat_model.with_structured_output(Disambiguate) 233 | extraction_prompt = ChatPromptTemplate.from_messages( 234 | [ 235 | ( 236 | "system", 237 | SystemPrompts.IDENTIFY_SYSTEM_PROMPT, 238 | ), 239 | ( 240 | "human", 241 | UserPrompts.IDENTIFY_USER_PROMPT, # noqa: E501, 242 | ), 243 | ] 244 | ) 245 | extraction_chain = extraction_prompt | extraction_llm 246 | merged_entities = [] 247 | with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor: 248 | futures = [executor.submit(entity_resolution, el['combinedResult'],extraction_chain) for el in potential_duplicate_candidates] 249 | for future in tqdm(as_completed(futures), total=len(futures), desc="Processing documents"): 250 | try: 251 | to_merge = future.result() 252 | if to_merge: 253 | merged_entities.extend(to_merge) 254 | except Exception as e: 255 | self.logger.error("模型没法进行这条任务的实体解析") 256 | self.logger.info(countNodesMerged(self.uuid,merged_entities,self.graph)) 257 | 258 | G.drop() 259 | 260 | self.cypherQuery.drop_communities() 261 | 262 | # 1.project into memory 263 | G, _ = self.gds.graph.project( 264 | f"communities", # Graph name # FIXME: 注册gds.graph时也要加上uuid,不然可能导致多进程误删除 265 | f"__Entity__{self.uuid}", # Node projection 266 | { 267 | "_ALL_": { 268 | "type": "*", 269 | "orientation": "UNDIRECTED", 270 | "properties": {"weight": {"property": "*", "aggregation": "COUNT"}}, 271 | } 272 | }, 273 | ) 274 | 275 | # 2. LeiDen聚类 276 | self.gds.leiden.write( 277 | G, 278 | writeProperty=f"communities", 279 | includeIntermediateCommunities=True, 280 | relationshipWeightProperty="weight", 281 | ) 282 | 283 | # 添加约束 284 | self.cypherQuery.add_constraints_for_community(self.uuid) 285 | 286 | # 构造层次聚类 287 | merged_nodes = self.cypherQuery.constructing_hierarchical_clustering(self.uuid) 288 | self.logger.info(f"{merged_nodes[0]['count(*)']} nodes merged") 289 | 290 | # 设置社区rank 291 | self.cypherQuery.set_community_rank(self.uuid) 292 | 293 | # 设置结点与边的额外信息---用于后续的查询 294 | # 我们需要给所有实体结点设置度数;给边设置`source_degree`, `target_degree`, `rank`属性 295 | # 此外需要给每个实体设置其包含的text_unit_ids 296 | # 还需要给relationship设置source和target的属性,表示其链接到的结点的内容 297 | # 增加:需要给每个结点设置communities属性,是一个列表id,表示结点所在的社区 298 | # 1. node degree 299 | self.cypherQuery.set_node_degree(self.uuid) 300 | # 2. relationship degree 301 | self.cypherQuery.set_relationship_degree(self.uuid) 302 | # 3. text_unit_ids 303 | self.cypherQuery.set_text_unit_ids(self.uuid) 304 | # 4. relationship设置source和target的属性 305 | self.cypherQuery.set_relationship_source_and_target(self.uuid) 306 | # 5. 
设置communities属性 307 | self.cypherQuery.set_communities(self.uuid) 308 | 309 | # 准备工作结束,开始summarization 310 | 311 | community_info = self.cypherQuery.get_community_info(self.uuid) 312 | 313 | 314 | community_chain = community_prompt | self.chat_model | StrOutputParser() # TODO:增加报错处理 315 | summaries = [] 316 | 317 | with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor: 318 | futures = {executor.submit(process_communities, community, community_chain) for community in community_info} 319 | for future in tqdm(as_completed(futures), total=len(futures), desc="Processing documents"): 320 | summary = future.result() 321 | summaries.append(summary) 322 | 323 | 324 | title_chain = title_prompt | self.chat_model | StrOutputParser() 325 | titles = [] 326 | with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor: 327 | futures = {executor.submit(process_summaries, summary, title_chain) for summary in summaries} 328 | for future in tqdm(as_completed(futures), total=len(futures), desc="Processing Title"): 329 | title = future.result() 330 | titles.append(title) 331 | 332 | assert len(summaries) == len(titles) 333 | info = [{**summary, 'title': title} for summary, title in zip(summaries, titles)] 334 | 335 | # Store info 336 | self.cypherQuery.store_info(info,uuid=self.uuid) 337 | 338 | G.drop() -------------------------------------------------------------------------------- /index.log: -------------------------------------------------------------------------------- 1 | 2024-11-25 21:36:12,336 - build_index - INFO - Start building index 2 | 2024-11-25 21:36:12,342 - build_index - INFO - Connecting to Neo4j 3 | 2024-11-25 21:42:31,210 - build_index - INFO - Start building index 4 | 2024-11-25 21:42:31,216 - build_index - INFO - Connecting to Neo4j 5 | 2024-11-25 21:43:43,507 - build_index - INFO - Start building index 6 | 2024-11-25 21:43:43,513 - build_index - INFO - Connecting to Neo4j 7 | 2024-11-25 21:43:43,530 - build_index - ERROR - Failed to connect to Neo4jURI: bolt://localhost:7687, Username: neo4j, Password: langchaingraphrag 8 | 2024-11-25 21:44:33,680 - build_index - INFO - Start building index 9 | 2024-11-25 21:44:33,686 - build_index - INFO - Connecting to Neo4j 10 | 2024-11-25 21:44:33,702 - build_index - ERROR - Meet error Could not use APOC procedures. Please ensure the APOC plugin is installed in Neo4j and that 'apoc.meta.data()' is allowed in Neo4j configuration when connecting to Neo4j 11 | Failed to connect to Neo4jURI: bolt://localhost:7687, Username: neo4j, Password: langchaingraphrag 12 | 2024-11-25 21:45:45,003 - build_index - INFO - Start building index 13 | 2024-11-25 21:45:45,008 - build_index - INFO - Connecting to Neo4j 14 | 2024-11-25 21:45:45,024 - build_index - ERROR - Meet error Could not use APOC procedures. 
Please ensure the APOC plugin is installed in Neo4j and that 'apoc.meta.data()' is allowed in Neo4j configuration when connecting to Neo4j 15 | Failed to connect to Neo4jURI: bolt://localhost:7687, Username: neo4j, Password: langchaingraphrag 16 | 2024-11-26 10:17:25,487 - build_index - INFO - Start building index 17 | 2024-11-26 10:17:25,494 - build_index - INFO - Connecting to Neo4j 18 | 2024-11-26 10:17:52,886 - build_index - ERROR - Failed to get OpenAI API key 19 | 2024-11-26 10:18:51,127 - build_index - INFO - Start building index 20 | 2024-11-26 10:18:51,132 - build_index - INFO - Connecting to Neo4j 21 | 2024-11-26 10:18:56,390 - build_index - INFO - Initializing chat model 22 | 2024-11-26 10:18:56,447 - build_index - INFO - Initializing embedding 23 | 2024-11-26 10:22:15,703 - build_index - INFO - Start building index 24 | 2024-11-26 10:22:15,709 - build_index - INFO - Connecting to Neo4j 25 | 2024-11-26 10:22:34,162 - build_index - INFO - Initializing chat model 26 | 2024-11-26 10:22:34,224 - build_index - INFO - Initializing embedding 27 | 2024-11-26 10:25:28,630 - build_index - INFO - Initializing splitter 28 | 2024-11-26 10:25:28,633 - build_index - INFO - Initializing GDS 29 | 2024-11-26 10:25:28,727 - build_index - INFO - Initializing index 30 | 2024-11-26 10:25:28,741 - build_index - INFO - Building index now! 31 | 2024-11-26 10:30:37,380 - build_index - INFO - Start building index 32 | 2024-11-26 10:30:37,387 - build_index - INFO - Connecting to Neo4j 33 | 2024-11-26 10:36:32,881 - build_index - INFO - Initializing chat model 34 | 2024-11-26 10:36:32,980 - build_index - INFO - Initializing embedding 35 | 2024-11-26 10:36:39,310 - build_index - INFO - Initializing splitter 36 | 2024-11-26 10:36:39,311 - build_index - INFO - Initializing GDS 37 | 2024-11-26 10:36:39,337 - build_index - INFO - Initializing index 38 | 2024-11-26 10:36:39,349 - build_index - INFO - Building index now! 39 | 2024-11-26 10:38:56,895 - build_index - INFO - Start building index 40 | 2024-11-26 10:38:56,899 - build_index - INFO - Connecting to Neo4j 41 | 2024-11-26 10:44:37,513 - build_index - INFO - Initializing chat model 42 | 2024-11-26 10:44:37,595 - build_index - INFO - Initializing embedding 43 | 2024-11-26 10:44:43,994 - build_index - INFO - Initializing splitter 44 | 2024-11-26 10:44:43,994 - build_index - INFO - Initializing GDS 45 | 2024-11-26 10:44:44,025 - build_index - INFO - Initializing index 46 | 2024-11-26 10:44:44,036 - build_index - INFO - Building index now! 47 | 2024-11-26 10:46:05,949 - build_index - INFO - Chunking documents 48 | 2024-11-26 10:46:14,868 - build_index - INFO - Start building index 49 | 2024-11-26 10:46:14,873 - build_index - INFO - Connecting to Neo4j 50 | 2024-11-26 10:46:29,714 - build_index - INFO - Initializing chat model 51 | 2024-11-26 10:46:29,771 - build_index - INFO - Initializing embedding 52 | 2024-11-26 10:46:36,397 - build_index - INFO - Initializing splitter 53 | 2024-11-26 10:46:36,398 - build_index - INFO - Initializing GDS 54 | 2024-11-26 10:46:36,437 - build_index - INFO - Initializing index 55 | 2024-11-26 10:46:36,451 - build_index - INFO - Building index now! 
56 | 2024-11-26 10:46:40,097 - build_index - INFO - Chunking documents 57 | 2024-11-26 16:32:30,675 - build_index - INFO - Start building index 58 | 2024-11-26 16:32:30,680 - build_index - INFO - Connecting to Neo4j 59 | 2024-11-26 16:35:18,556 - build_index - ERROR - Failed to get OpenAI API key 60 | 2024-11-26 16:35:34,137 - build_index - INFO - Start building index 61 | 2024-11-26 16:35:34,141 - build_index - INFO - Connecting to Neo4j 62 | 2024-11-26 16:36:07,814 - build_index - INFO - Initializing chat model 63 | 2024-11-26 16:36:07,879 - build_index - INFO - Initializing embedding 64 | 2024-11-26 16:36:15,558 - build_index - INFO - Initializing splitter 65 | 2024-11-26 16:36:15,558 - build_index - INFO - Initializing GDS 66 | 2024-11-26 16:36:15,578 - build_index - INFO - Initializing index 67 | 2024-11-26 16:36:15,588 - build_index - INFO - Building index now! 68 | 2024-11-26 16:36:15,589 - build_index - INFO - Chunking documents 69 | 2024-11-26 16:39:40,385 - build_index - INFO - Start building index 70 | 2024-11-26 16:39:40,391 - build_index - INFO - Connecting to Neo4j 71 | 2024-11-26 16:39:48,032 - build_index - INFO - Initializing chat model 72 | 2024-11-26 16:39:48,090 - build_index - INFO - Initializing embedding 73 | 2024-11-26 16:39:55,944 - build_index - INFO - Initializing splitter 74 | 2024-11-26 16:39:55,944 - build_index - INFO - Initializing GDS 75 | 2024-11-26 16:39:55,967 - build_index - INFO - Initializing index 76 | 2024-11-26 16:39:55,978 - build_index - INFO - Building index now! 77 | 2024-11-26 16:39:55,978 - build_index - INFO - Chunking documents 78 | 2024-11-26 16:46:11,163 - build_index - INFO - Start building index 79 | 2024-11-26 16:46:11,167 - build_index - INFO - Connecting to Neo4j 80 | 2024-11-26 16:46:35,611 - build_index - INFO - Initializing chat model 81 | 2024-11-26 16:46:35,698 - build_index - INFO - Initializing embedding 82 | 2024-11-26 16:46:43,186 - build_index - INFO - Initializing splitter 83 | 2024-11-26 16:46:43,187 - build_index - INFO - Initializing GDS 84 | 2024-11-26 16:46:43,211 - build_index - INFO - Initializing index 85 | 2024-11-26 16:46:43,224 - build_index - INFO - Building index now! 86 | 2024-11-26 16:46:43,224 - build_index - INFO - Chunking documents 87 | 2024-11-26 16:47:40,182 - build_index - INFO - Start building index 88 | 2024-11-26 16:47:40,188 - build_index - INFO - Connecting to Neo4j 89 | 2024-11-26 16:47:45,687 - build_index - INFO - Initializing chat model 90 | 2024-11-26 16:47:45,742 - build_index - INFO - Initializing embedding 91 | 2024-11-26 16:47:53,678 - build_index - INFO - Initializing splitter 92 | 2024-11-26 16:47:53,678 - build_index - INFO - Initializing GDS 93 | 2024-11-26 16:47:53,699 - build_index - INFO - Initializing index 94 | 2024-11-26 16:47:53,712 - build_index - INFO - Building index now! 
95 | 2024-11-26 16:47:53,712 - build_index - INFO - Chunking documents 96 | 2024-11-26 16:48:10,278 - build_index - INFO - None 97 | 2024-11-26 16:48:10,873 - build_index - INFO - 76 nodes merged 98 | 2024-11-26 16:48:18,765 - build_index - INFO - Index built successfully 99 | 2024-11-26 16:48:39,576 - build_index - INFO - Start building index 100 | 2024-11-26 16:48:39,581 - build_index - INFO - Connecting to Neo4j 101 | 2024-11-26 16:48:39,852 - build_index - INFO - Initializing chat model 102 | 2024-11-26 16:50:40,671 - build_index - INFO - Start building index 103 | 2024-11-26 16:50:40,678 - build_index - INFO - Connecting to Neo4j 104 | 2024-11-26 16:50:40,805 - build_index - INFO - Initializing chat model 105 | 2024-11-26 16:51:21,814 - build_index - INFO - Start building index 106 | 2024-11-26 16:51:21,819 - build_index - INFO - Connecting to Neo4j 107 | 2024-11-26 16:51:21,943 - build_index - INFO - Initializing chat model 108 | 2024-11-26 16:59:38,367 - build_index - INFO - Start building index 109 | 2024-11-26 16:59:38,376 - build_index - INFO - Connecting to Neo4j 110 | 2024-11-26 16:59:38,461 - build_index - INFO - Initializing chat model 111 | 2024-11-26 17:00:25,413 - build_index - INFO - Start building index 112 | 2024-11-26 17:00:25,419 - build_index - INFO - Connecting to Neo4j 113 | 2024-11-26 17:00:25,506 - build_index - INFO - Initializing chat model 114 | 2024-11-26 17:00:27,891 - build_index - INFO - Initializing embedding 115 | 2024-11-26 17:00:35,649 - build_index - INFO - Initializing splitter 116 | 2024-11-26 17:00:35,650 - build_index - INFO - Initializing GDS 117 | 2024-11-26 17:00:35,696 - build_index - INFO - Initializing index 118 | 2024-11-26 17:09:01,914 - build_index - INFO - Start building index 119 | 2024-11-26 17:09:01,919 - build_index - INFO - Connecting to Neo4j 120 | 2024-11-26 17:09:02,000 - build_index - INFO - Initializing chat model 121 | 2024-11-26 17:09:04,283 - build_index - INFO - Initializing embedding 122 | 2024-11-26 17:09:11,614 - build_index - INFO - Initializing splitter 123 | 2024-11-26 17:09:11,614 - build_index - INFO - Initializing GDS 124 | 2024-11-26 17:09:11,632 - build_index - INFO - Initializing index 125 | 2024-11-26 17:09:11,645 - build_index - INFO - Building index now! 
126 | 2024-11-26 17:09:11,645 - build_index - INFO - Chunking documents 127 | 2024-11-26 17:29:13,540 - build_index - INFO - Start building index 128 | 2024-11-26 17:29:13,545 - build_index - INFO - Connecting to Neo4j 129 | 2024-11-26 17:29:13,634 - build_index - INFO - Initializing chat model 130 | 2024-11-26 17:29:54,826 - build_index - INFO - Start building index 131 | 2024-11-26 17:29:54,832 - build_index - INFO - Connecting to Neo4j 132 | 2024-11-26 17:29:54,901 - build_index - INFO - Initializing chat model 133 | 2024-11-26 17:30:00,447 - build_index - INFO - Initializing embedding 134 | 2024-11-26 17:30:07,313 - build_index - INFO - Initializing splitter 135 | 2024-11-26 17:30:07,313 - build_index - INFO - Initializing GDS 136 | 2024-11-26 17:30:07,328 - build_index - INFO - Initializing index 137 | 2024-11-26 17:35:05,456 - build_index - INFO - Start building index 138 | 2024-11-26 17:35:05,462 - build_index - INFO - Connecting to Neo4j 139 | 2024-11-26 17:35:05,567 - build_index - INFO - Initializing chat model 140 | 2024-11-26 17:35:18,345 - build_index - INFO - Initializing embedding 141 | 2024-11-26 17:35:25,155 - build_index - INFO - Initializing splitter 142 | 2024-11-26 17:35:25,155 - build_index - INFO - Initializing GDS 143 | 2024-11-26 17:35:25,171 - build_index - INFO - Initializing index 144 | 2024-11-26 17:35:25,182 - build_index - INFO - Building index now! 145 | 2024-11-26 17:35:25,182 - build_index - INFO - Chunking documents 146 | 2024-11-26 17:36:01,287 - build_index - INFO - Start building index 147 | 2024-11-26 17:36:01,293 - build_index - INFO - Connecting to Neo4j 148 | 2024-11-26 17:36:01,384 - build_index - INFO - Initializing chat model 149 | 2024-11-26 17:36:09,907 - build_index - INFO - Initializing embedding 150 | 2024-11-26 17:36:16,445 - build_index - INFO - Initializing splitter 151 | 2024-11-26 17:36:16,445 - build_index - INFO - Initializing GDS 152 | 2024-11-26 17:36:16,458 - build_index - INFO - Initializing index 153 | 2024-11-26 17:36:16,468 - build_index - INFO - Building index now! 154 | 2024-11-26 17:36:16,468 - build_index - INFO - Chunking documents 155 | 2024-11-26 17:38:39,049 - build_index - INFO - Start building index 156 | 2024-11-26 17:38:39,055 - build_index - INFO - Connecting to Neo4j 157 | 2024-11-26 17:38:39,176 - build_index - INFO - Initializing chat model 158 | 2024-11-26 17:38:47,877 - build_index - INFO - Initializing embedding 159 | 2024-11-26 17:38:53,344 - build_index - INFO - Initializing splitter 160 | 2024-11-26 17:38:53,344 - build_index - INFO - Initializing GDS 161 | 2024-11-26 17:38:53,361 - build_index - INFO - Initializing index 162 | 2024-11-26 17:38:53,368 - build_index - INFO - Building index now! 163 | 2024-11-26 17:38:53,368 - build_index - INFO - Chunking documents 164 | 2024-11-26 17:40:23,615 - build_index - INFO - Start building index 165 | 2024-11-26 17:40:23,620 - build_index - INFO - Connecting to Neo4j 166 | 2024-11-26 17:40:23,720 - build_index - INFO - Initializing chat model 167 | 2024-11-26 17:40:37,214 - build_index - INFO - Initializing embedding 168 | 2024-11-26 17:40:42,511 - build_index - INFO - Initializing splitter 169 | 2024-11-26 17:40:42,511 - build_index - INFO - Initializing GDS 170 | 2024-11-26 17:40:42,531 - build_index - INFO - Initializing index 171 | 2024-11-26 17:40:42,550 - build_index - INFO - Building index now! 
172 | 2024-11-26 17:40:42,550 - build_index - INFO - Chunking documents 173 | 2024-11-26 17:46:53,003 - build_index - INFO - Start building index 174 | 2024-11-26 17:46:53,007 - build_index - INFO - Connecting to Neo4j 175 | 2024-11-26 17:46:53,095 - build_index - INFO - Initializing chat model 176 | 2024-11-26 17:50:24,139 - build_index - INFO - Start building index 177 | 2024-11-26 17:50:24,145 - build_index - INFO - Connecting to Neo4j 178 | 2024-11-26 17:50:24,215 - build_index - INFO - Initializing chat model 179 | 2024-11-26 17:53:01,027 - build_index - INFO - Start building index 180 | 2024-11-26 17:53:01,032 - build_index - INFO - Connecting to Neo4j 181 | 2024-11-26 17:53:01,126 - build_index - INFO - Initializing chat model 182 | 2024-11-26 17:53:20,141 - build_index - INFO - Initializing embedding 183 | 2024-11-26 17:53:25,196 - build_index - INFO - Initializing splitter 184 | 2024-11-26 17:53:25,196 - build_index - INFO - Initializing GDS 185 | 2024-11-26 17:53:25,217 - build_index - INFO - Initializing index 186 | 2024-11-26 17:56:22,868 - build_index - INFO - Start building index 187 | 2024-11-26 17:56:22,871 - build_index - INFO - Connecting to Neo4j 188 | 2024-11-26 17:56:22,939 - build_index - INFO - Initializing chat model 189 | 2024-11-26 17:56:39,295 - build_index - INFO - Initializing embedding 190 | 2024-11-26 17:56:45,291 - build_index - INFO - Initializing splitter 191 | 2024-11-26 17:56:45,291 - build_index - INFO - Initializing GDS 192 | 2024-11-26 17:56:45,307 - build_index - INFO - Initializing index 193 | 2024-11-26 17:58:41,061 - build_index - INFO - Start building index 194 | 2024-11-26 17:58:41,065 - build_index - INFO - Connecting to Neo4j 195 | 2024-11-26 17:58:41,156 - build_index - INFO - Initializing chat model 196 | 2024-11-26 17:58:55,289 - build_index - INFO - Initializing embedding 197 | 2024-11-26 17:59:00,747 - build_index - INFO - Initializing splitter 198 | 2024-11-26 17:59:00,747 - build_index - INFO - Initializing GDS 199 | 2024-11-26 17:59:00,764 - build_index - INFO - Initializing index 200 | 2024-11-26 17:59:00,784 - build_index - INFO - Building index now! 201 | 2024-11-26 17:59:00,784 - build_index - INFO - Chunking documents 202 | 2024-11-26 18:00:53,958 - build_index - INFO - Start building index 203 | 2024-11-26 18:00:53,962 - build_index - INFO - Connecting to Neo4j 204 | 2024-11-26 18:00:54,042 - build_index - INFO - Initializing chat model 205 | 2024-11-26 18:01:09,640 - build_index - INFO - Initializing embedding 206 | 2024-11-26 18:01:15,182 - build_index - INFO - Initializing splitter 207 | 2024-11-26 18:01:15,182 - build_index - INFO - Initializing GDS 208 | 2024-11-26 18:01:15,207 - build_index - INFO - Initializing index 209 | 2024-11-26 18:01:15,219 - build_index - INFO - Building index now! 210 | 2024-11-26 18:01:15,219 - build_index - INFO - Chunking documents 211 | 2024-11-26 19:19:54,841 - build_index - INFO - Start building index 212 | 2024-11-26 19:19:54,846 - build_index - INFO - Connecting to Neo4j 213 | 2024-11-26 19:20:10,153 - build_index - INFO - Initializing chat model 214 | 2024-11-26 19:20:10,221 - build_index - INFO - Initializing embedding 215 | 2024-11-26 19:20:16,778 - build_index - INFO - Initializing splitter 216 | 2024-11-26 19:20:16,778 - build_index - INFO - Initializing GDS 217 | 2024-11-26 19:20:16,792 - build_index - INFO - Initializing index 218 | 2024-11-26 19:20:16,801 - build_index - INFO - Building index now! 
219 | 2024-11-26 19:20:57,615 - build_index - INFO - Start building index 220 | 2024-11-26 19:20:57,619 - build_index - INFO - Connecting to Neo4j 221 | 2024-11-26 19:20:58,527 - build_index - INFO - Initializing chat model 222 | 2024-11-26 19:20:58,587 - build_index - INFO - Initializing embedding 223 | 2024-11-26 19:21:04,649 - build_index - INFO - Initializing splitter 224 | 2024-11-26 19:21:04,649 - build_index - INFO - Initializing GDS 225 | 2024-11-26 19:21:04,665 - build_index - INFO - Initializing index 226 | 2024-11-26 19:21:04,678 - build_index - INFO - Building index now! 227 | 2024-11-26 19:21:04,678 - build_index - INFO - Chunking documents 228 | 2024-11-26 19:22:16,734 - build_index - INFO - Start building index 229 | 2024-11-26 19:22:16,738 - build_index - INFO - Connecting to Neo4j 230 | 2024-11-26 19:22:21,750 - build_index - INFO - Initializing chat model 231 | 2024-11-26 19:22:21,819 - build_index - INFO - Initializing embedding 232 | 2024-11-26 19:22:27,614 - build_index - INFO - Initializing splitter 233 | 2024-11-26 19:22:27,614 - build_index - INFO - Initializing GDS 234 | 2024-11-26 19:22:27,630 - build_index - INFO - Initializing index 235 | 2024-11-26 19:22:27,641 - build_index - INFO - Building index now! 236 | 2024-11-26 19:28:25,200 - build_index - INFO - Start building index 237 | 2024-11-26 19:28:25,207 - build_index - INFO - Connecting to Neo4j 238 | 2024-11-26 19:28:25,302 - build_index - INFO - Initializing chat model 239 | 2024-11-26 19:28:49,977 - build_index - INFO - Start building index 240 | 2024-11-26 19:28:49,980 - build_index - INFO - Connecting to Neo4j 241 | 2024-11-26 19:28:50,094 - build_index - INFO - Initializing chat model 242 | 2024-11-26 19:29:09,209 - build_index - INFO - Initializing embedding 243 | 2024-11-26 19:29:14,393 - build_index - INFO - Initializing splitter 244 | 2024-11-26 19:29:14,394 - build_index - INFO - Initializing GDS 245 | 2024-11-26 19:29:14,408 - build_index - INFO - Initializing index 246 | 2024-11-26 19:29:14,425 - build_index - INFO - Building index now! 247 | 2024-11-26 19:33:39,829 - build_index - INFO - Start building index 248 | 2024-11-26 19:33:39,835 - build_index - INFO - Connecting to Neo4j 249 | 2024-11-26 19:33:39,921 - build_index - INFO - Initializing chat model 250 | 2024-11-26 19:33:49,296 - build_index - INFO - Initializing embedding 251 | 2024-11-26 19:33:54,975 - build_index - INFO - Initializing splitter 252 | 2024-11-26 19:33:54,976 - build_index - INFO - Initializing GDS 253 | 2024-11-26 19:33:54,995 - build_index - INFO - Initializing index 254 | 2024-11-26 19:33:55,004 - build_index - INFO - Building index now! 
255 | 2024-11-26 19:56:09,305 - build_index - INFO - Start building index 256 | 2024-11-26 19:56:09,310 - build_index - INFO - Connecting to Neo4j 257 | 2024-11-26 19:56:09,401 - build_index - INFO - Initializing chat model 258 | 2024-11-27 15:38:59,703 - build_index - INFO - Start building index 259 | 2024-11-27 15:38:59,708 - build_index - INFO - Connecting to Neo4j 260 | 2024-11-27 15:38:59,820 - build_index - INFO - Initializing chat model 261 | 2024-11-27 15:39:37,924 - build_index - INFO - Start building index 262 | 2024-11-27 15:39:37,930 - build_index - INFO - Connecting to Neo4j 263 | 2024-11-27 15:39:38,023 - build_index - INFO - Initializing chat model 264 | 2024-11-27 15:39:58,834 - build_index - INFO - Start building index 265 | 2024-11-27 15:39:58,839 - build_index - INFO - Connecting to Neo4j 266 | 2024-11-27 15:39:58,932 - build_index - INFO - Initializing chat model 267 | 2024-11-27 15:46:38,512 - build_index - INFO - Start building index 268 | 2024-11-27 15:46:38,519 - build_index - INFO - Connecting to Neo4j 269 | 2024-11-27 15:46:38,604 - build_index - INFO - Initializing chat model 270 | 2024-11-27 15:47:24,345 - build_index - INFO - Start building index 271 | 2024-11-27 15:47:24,352 - build_index - INFO - Connecting to Neo4j 272 | 2024-11-27 15:47:24,427 - build_index - INFO - Initializing chat model 273 | 2024-11-27 15:48:44,601 - build_index - INFO - Start building index 274 | 2024-11-27 15:48:44,607 - build_index - INFO - Connecting to Neo4j 275 | 2024-11-27 15:48:44,692 - build_index - INFO - Initializing chat model 276 | 2024-11-27 15:49:17,986 - build_index - INFO - Initializing embedding 277 | 2024-11-27 15:49:23,916 - build_index - INFO - Initializing splitter 278 | 2024-11-27 15:49:23,916 - build_index - INFO - Initializing GDS 279 | 2024-11-27 15:49:23,930 - build_index - INFO - Initializing index 280 | 2024-11-27 15:49:23,940 - build_index - INFO - Building index now! 281 | 2024-11-27 15:49:23,941 - build_index - INFO - Chunking documents 282 | 2024-11-27 15:54:20,436 - build_index - INFO - None 283 | 2024-11-27 15:54:20,974 - build_index - INFO - 76 nodes merged 284 | 2024-11-27 15:55:44,524 - build_index - INFO - Index built successfully 285 | 2024-11-27 15:57:37,600 - build_index - INFO - Start building index 286 | 2024-11-27 15:57:37,607 - build_index - INFO - Connecting to Neo4j 287 | 2024-11-27 15:57:37,696 - build_index - INFO - Initializing chat model 288 | 2024-11-27 15:57:47,421 - build_index - INFO - Initializing embedding 289 | 2024-11-27 15:57:53,005 - build_index - INFO - Initializing splitter 290 | 2024-11-27 15:57:53,005 - build_index - INFO - Initializing GDS 291 | 2024-11-27 15:57:53,025 - build_index - INFO - Initializing index 292 | 2024-11-27 15:57:53,039 - build_index - INFO - Building index now! 
293 | 2024-11-27 15:57:53,040 - build_index - INFO - Chunking documents 294 | 2024-11-27 16:11:49,150 - build_index - INFO - Start building index 295 | 2024-11-27 16:11:49,155 - build_index - INFO - Connecting to Neo4j 296 | 2024-11-27 16:11:49,247 - build_index - INFO - Initializing chat model 297 | 2024-11-27 16:11:59,511 - build_index - INFO - Initializing embedding 298 | 2024-11-27 16:12:04,741 - build_index - INFO - Initializing splitter 299 | 2024-11-27 16:12:04,742 - build_index - INFO - Initializing GDS 300 | 2024-11-27 16:12:04,759 - build_index - INFO - Initializing index 301 | 2024-11-27 16:45:04,730 - build_index - INFO - Start building index 302 | 2024-11-27 16:45:04,734 - build_index - INFO - Connecting to Neo4j 303 | 2024-11-27 16:45:04,835 - build_index - INFO - Initializing chat model 304 | 2024-11-27 16:45:13,826 - build_index - INFO - Initializing embedding 305 | 2024-11-27 16:45:19,272 - build_index - INFO - Initializing splitter 306 | 2024-11-27 16:45:19,273 - build_index - INFO - Initializing GDS 307 | 2024-11-27 16:45:19,287 - build_index - INFO - Initializing index 308 | 2024-11-27 16:48:11,915 - build_index - INFO - Start building index 309 | 2024-11-27 16:48:11,922 - build_index - INFO - Connecting to Neo4j 310 | 2024-11-27 16:48:12,004 - build_index - INFO - Initializing chat model 311 | 2024-11-27 16:48:21,656 - build_index - INFO - Initializing embedding 312 | 2024-11-27 16:48:27,064 - build_index - INFO - Initializing splitter 313 | 2024-11-27 16:48:27,065 - build_index - INFO - Initializing GDS 314 | 2024-11-27 16:48:27,083 - build_index - INFO - Initializing index 315 | 2024-11-27 16:50:36,285 - build_index - INFO - Start building index 316 | 2024-11-27 16:50:36,291 - build_index - INFO - Connecting to Neo4j 317 | 2024-11-27 16:50:36,414 - build_index - INFO - Initializing chat model 318 | 2024-11-27 16:50:46,283 - build_index - INFO - Initializing embedding 319 | 2024-11-27 16:50:52,377 - build_index - INFO - Initializing splitter 320 | 2024-11-27 16:50:52,378 - build_index - INFO - Initializing GDS 321 | 2024-11-27 16:50:52,391 - build_index - INFO - Initializing index 322 | 2024-11-27 16:53:24,515 - build_index - INFO - Start building index 323 | 2024-11-27 16:53:24,520 - build_index - INFO - Connecting to Neo4j 324 | 2024-11-27 16:53:24,606 - build_index - INFO - Initializing chat model 325 | 2024-11-27 16:53:33,942 - build_index - INFO - Initializing embedding 326 | 2024-11-27 16:53:39,870 - build_index - INFO - Initializing splitter 327 | 2024-11-27 16:53:39,870 - build_index - INFO - Initializing GDS 328 | 2024-11-27 16:53:39,889 - build_index - INFO - Initializing index 329 | 2024-11-27 17:22:08,489 - build_index - INFO - Start building index 330 | 2024-11-27 17:22:08,494 - build_index - INFO - Connecting to Neo4j 331 | 2024-11-27 17:22:08,580 - build_index - INFO - Initializing chat model 332 | 2024-11-27 17:22:22,722 - build_index - INFO - Initializing embedding 333 | 2024-11-27 17:22:29,242 - build_index - INFO - Initializing splitter 334 | 2024-11-27 17:22:29,243 - build_index - INFO - Initializing GDS 335 | 2024-11-27 17:22:29,255 - build_index - INFO - Initializing index 336 | 2024-11-27 17:26:25,686 - build_index - INFO - Start building index 337 | 2024-11-27 17:26:25,692 - build_index - INFO - Connecting to Neo4j 338 | 2024-11-27 17:26:25,762 - build_index - INFO - Initializing chat model 339 | 2024-11-27 17:26:35,453 - build_index - INFO - Initializing embedding 340 | 2024-11-27 17:26:41,559 - build_index - INFO - Initializing splitter 341 | 
2024-11-27 17:26:41,560 - build_index - INFO - Initializing GDS 342 | 2024-11-27 17:26:41,571 - build_index - INFO - Initializing index 343 | 2024-11-27 17:28:40,654 - build_index - INFO - Start building index 344 | 2024-11-27 17:28:40,657 - build_index - INFO - Connecting to Neo4j 345 | 2024-11-27 17:28:40,740 - build_index - INFO - Initializing chat model 346 | 2024-11-27 17:28:50,760 - build_index - INFO - Initializing embedding 347 | 2024-11-27 17:28:56,168 - build_index - INFO - Initializing splitter 348 | 2024-11-27 17:28:56,169 - build_index - INFO - Initializing GDS 349 | 2024-11-27 17:28:56,186 - build_index - INFO - Initializing index 350 | 2024-11-27 17:31:11,075 - build_index - INFO - Start building index 351 | 2024-11-27 17:31:11,082 - build_index - INFO - Connecting to Neo4j 352 | 2024-11-27 17:31:11,158 - build_index - INFO - Initializing chat model 353 | 2024-11-27 17:31:20,756 - build_index - INFO - Initializing embedding 354 | 2024-11-27 17:31:26,119 - build_index - INFO - Initializing splitter 355 | 2024-11-27 17:31:26,122 - build_index - INFO - Initializing GDS 356 | 2024-11-27 17:31:26,135 - build_index - INFO - Initializing index 357 | 2024-11-27 17:35:22,597 - build_index - INFO - Start building index 358 | 2024-11-27 17:35:22,603 - build_index - INFO - Connecting to Neo4j 359 | 2024-11-27 17:35:22,687 - build_index - INFO - Initializing chat model 360 | 2024-11-27 17:35:33,015 - build_index - INFO - Initializing embedding 361 | 2024-11-27 17:35:38,954 - build_index - INFO - Initializing splitter 362 | 2024-11-27 17:35:38,957 - build_index - INFO - Initializing GDS 363 | 2024-11-27 17:35:38,973 - build_index - INFO - Initializing index 364 | 2024-11-27 17:39:45,753 - build_index - INFO - Start building index 365 | 2024-11-27 17:39:45,758 - build_index - INFO - Connecting to Neo4j 366 | 2024-11-27 17:39:45,847 - build_index - INFO - Initializing chat model 367 | 2024-11-27 17:39:54,926 - build_index - INFO - Initializing embedding 368 | 2024-11-27 17:39:59,959 - build_index - INFO - Initializing splitter 369 | 2024-11-27 17:39:59,960 - build_index - INFO - Initializing GDS 370 | 2024-11-27 17:39:59,976 - build_index - INFO - Initializing index 371 | 2024-11-27 17:53:22,940 - build_index - INFO - Start building index 372 | 2024-11-27 17:53:22,944 - build_index - INFO - Connecting to Neo4j 373 | 2024-11-27 17:53:23,040 - build_index - INFO - Initializing chat model 374 | 2024-11-27 17:53:32,284 - build_index - INFO - Initializing embedding 375 | 2024-11-27 17:53:37,421 - build_index - INFO - Initializing splitter 376 | 2024-11-27 17:53:37,422 - build_index - INFO - Initializing GDS 377 | 2024-11-27 17:53:37,436 - build_index - INFO - Initializing index 378 | 2024-11-27 18:00:21,963 - build_index - INFO - Start building index 379 | 2024-11-27 18:00:21,968 - build_index - INFO - Connecting to Neo4j 380 | 2024-11-27 18:00:22,062 - build_index - INFO - Initializing chat model 381 | 2024-11-27 18:00:31,137 - build_index - INFO - Initializing embedding 382 | 2024-11-27 18:00:36,304 - build_index - INFO - Initializing splitter 383 | 2024-11-27 18:00:36,304 - build_index - INFO - Initializing GDS 384 | 2024-11-27 18:00:36,320 - build_index - INFO - Initializing index 385 | 2024-11-27 18:13:29,143 - build_index - INFO - Start building index 386 | 2024-11-27 18:13:29,149 - build_index - INFO - Connecting to Neo4j 387 | 2024-11-27 18:13:29,237 - build_index - INFO - Initializing chat model 388 | 2024-11-27 18:13:38,381 - build_index - INFO - Initializing embedding 389 | 2024-11-27 
18:13:43,803 - build_index - INFO - Initializing splitter 390 | 2024-11-27 18:13:43,804 - build_index - INFO - Initializing GDS 391 | 2024-11-27 18:13:43,822 - build_index - INFO - Initializing index 392 | 2024-11-27 18:21:01,468 - build_index - INFO - Start building index 393 | 2024-11-27 18:21:01,473 - build_index - INFO - Connecting to Neo4j 394 | 2024-11-27 18:21:01,559 - build_index - INFO - Initializing chat model 395 | 2024-11-27 18:21:10,604 - build_index - INFO - Initializing embedding 396 | 2024-11-27 18:21:15,706 - build_index - INFO - Initializing splitter 397 | 2024-11-27 18:21:15,708 - build_index - INFO - Initializing GDS 398 | 2024-11-27 18:21:15,727 - build_index - INFO - Initializing index 399 | 2024-11-27 18:28:37,643 - build_index - INFO - Start building index 400 | 2024-11-27 18:28:37,647 - build_index - INFO - Connecting to Neo4j 401 | 2024-11-27 18:28:37,733 - build_index - INFO - Initializing chat model 402 | 2024-11-27 18:28:46,870 - build_index - INFO - Initializing embedding 403 | 2024-11-27 18:28:52,600 - build_index - INFO - Initializing splitter 404 | 2024-11-27 18:28:52,600 - build_index - INFO - Initializing GDS 405 | 2024-11-27 18:28:52,613 - build_index - INFO - Initializing index 406 | 2024-11-27 18:31:30,272 - build_index - INFO - Start building index 407 | 2024-11-27 18:31:30,279 - build_index - INFO - Connecting to Neo4j 408 | 2024-11-27 18:31:30,361 - build_index - INFO - Initializing chat model 409 | 2024-11-27 18:31:38,960 - build_index - INFO - Initializing embedding 410 | 2024-11-27 18:31:44,430 - build_index - INFO - Initializing splitter 411 | 2024-11-27 18:31:44,430 - build_index - INFO - Initializing GDS 412 | 2024-11-27 18:31:44,440 - build_index - INFO - Initializing index 413 | 2024-11-27 18:36:30,250 - build_index - INFO - Start building index 414 | 2024-11-27 18:36:30,256 - build_index - INFO - Connecting to Neo4j 415 | 2024-11-27 18:36:30,352 - build_index - INFO - Initializing chat model 416 | 2024-11-27 18:36:39,474 - build_index - INFO - Initializing embedding 417 | 2024-11-27 18:36:44,792 - build_index - INFO - Initializing splitter 418 | 2024-11-27 18:36:44,792 - build_index - INFO - Initializing GDS 419 | 2024-11-27 18:36:44,809 - build_index - INFO - Initializing index 420 | 2024-11-27 18:40:29,293 - build_index - INFO - Start building index 421 | 2024-11-27 18:40:29,297 - build_index - INFO - Connecting to Neo4j 422 | 2024-11-27 18:40:29,386 - build_index - INFO - Initializing chat model 423 | 2024-11-27 18:40:38,948 - build_index - INFO - Initializing embedding 424 | 2024-11-27 18:40:44,261 - build_index - INFO - Initializing splitter 425 | 2024-11-27 18:40:44,262 - build_index - INFO - Initializing GDS 426 | 2024-11-27 18:40:44,277 - build_index - INFO - Initializing index 427 | 2024-11-27 18:46:20,376 - build_index - INFO - Start building index 428 | 2024-11-27 18:46:20,381 - build_index - INFO - Connecting to Neo4j 429 | 2024-11-27 18:46:20,539 - build_index - INFO - Initializing chat model 430 | 2024-11-27 18:46:29,440 - build_index - INFO - Initializing embedding 431 | 2024-11-27 18:46:34,900 - build_index - INFO - Initializing splitter 432 | 2024-11-27 18:46:34,904 - build_index - INFO - Initializing GDS 433 | 2024-11-27 18:46:34,949 - build_index - INFO - Initializing index 434 | 2024-11-27 18:46:34,955 - build_index - INFO - Building index now! 
435 | 2024-11-27 18:46:34,956 - build_index - INFO - Chunking documents 436 | 2024-11-27 18:50:23,332 - build_index - INFO - Start building index 437 | 2024-11-27 18:50:23,337 - build_index - INFO - Connecting to Neo4j 438 | 2024-11-27 18:50:23,357 - build_index - INFO - Initializing chat model 439 | 2024-11-27 18:50:33,206 - build_index - INFO - Initializing embedding 440 | 2024-11-27 18:50:38,077 - build_index - INFO - Initializing splitter 441 | 2024-11-27 18:50:38,079 - build_index - INFO - Initializing GDS 442 | 2024-11-27 18:50:38,092 - build_index - INFO - Initializing index 443 | 2024-11-27 18:52:14,399 - build_index - INFO - Start building index 444 | 2024-11-27 18:52:14,404 - build_index - INFO - Connecting to Neo4j 445 | 2024-11-27 18:52:14,431 - build_index - INFO - Initializing chat model 446 | 2024-11-27 18:52:23,457 - build_index - INFO - Initializing embedding 447 | 2024-11-27 18:52:28,450 - build_index - INFO - Initializing splitter 448 | 2024-11-27 18:52:28,450 - build_index - INFO - Initializing GDS 449 | 2024-11-27 18:52:28,466 - build_index - INFO - Initializing index 450 | 2024-11-27 18:53:04,926 - build_index - INFO - Start building index 451 | 2024-11-27 18:53:04,932 - build_index - INFO - Connecting to Neo4j 452 | 2024-11-27 18:53:04,956 - build_index - INFO - Initializing chat model 453 | 2024-11-27 18:53:13,685 - build_index - INFO - Initializing embedding 454 | 2024-11-27 18:53:19,011 - build_index - INFO - Initializing splitter 455 | 2024-11-27 18:53:19,011 - build_index - INFO - Initializing GDS 456 | 2024-11-27 18:53:19,027 - build_index - INFO - Initializing index 457 | 2024-11-27 18:55:17,198 - build_index - INFO - Start building index 458 | 2024-11-27 18:55:17,203 - build_index - INFO - Connecting to Neo4j 459 | 2024-11-27 18:55:17,225 - build_index - INFO - Initializing chat model 460 | 2024-11-27 18:55:26,237 - build_index - INFO - Initializing embedding 461 | 2024-11-27 18:55:31,408 - build_index - INFO - Initializing splitter 462 | 2024-11-27 18:55:31,408 - build_index - INFO - Initializing GDS 463 | 2024-11-27 18:55:31,423 - build_index - INFO - Initializing index 464 | 2024-11-27 18:56:36,139 - build_index - INFO - Start building index 465 | 2024-11-27 18:56:36,145 - build_index - INFO - Connecting to Neo4j 466 | 2024-11-27 18:56:36,173 - build_index - INFO - Initializing chat model 467 | 2024-11-27 18:56:44,847 - build_index - INFO - Initializing embedding 468 | 2024-11-27 18:56:49,770 - build_index - INFO - Initializing splitter 469 | 2024-11-27 18:56:49,771 - build_index - INFO - Initializing GDS 470 | 2024-11-27 18:56:49,787 - build_index - INFO - Initializing index 471 | 2024-11-27 18:57:42,954 - build_index - INFO - Start building index 472 | 2024-11-27 18:57:42,961 - build_index - INFO - Connecting to Neo4j 473 | 2024-11-27 18:57:42,989 - build_index - INFO - Initializing chat model 474 | 2024-11-27 18:57:52,303 - build_index - INFO - Initializing embedding 475 | 2024-11-27 18:57:57,542 - build_index - INFO - Initializing splitter 476 | 2024-11-27 18:57:57,543 - build_index - INFO - Initializing GDS 477 | 2024-11-27 18:57:57,559 - build_index - INFO - Initializing index 478 | 2024-11-27 19:06:09,582 - build_index - INFO - Start building index 479 | 2024-11-27 19:06:09,587 - build_index - INFO - Connecting to Neo4j 480 | 2024-11-27 19:06:09,608 - build_index - INFO - Initializing chat model 481 | 2024-11-27 19:06:18,464 - build_index - INFO - Initializing embedding 482 | 2024-11-27 19:06:23,628 - build_index - INFO - Initializing splitter 483 | 
2024-11-27 19:06:23,630 - build_index - INFO - Initializing GDS 484 | 2024-11-27 19:06:23,647 - build_index - INFO - Initializing index 485 | 2024-11-27 19:06:57,679 - build_index - INFO - Building index now! 486 | 2024-11-27 19:06:57,683 - build_index - INFO - Chunking documents 487 | 2024-11-27 19:17:59,354 - build_index - INFO - Start building index 488 | 2024-11-27 19:17:59,360 - build_index - ERROR - Currently only openai and hf are supported 489 | 2024-11-27 19:18:22,893 - build_index - INFO - Start building index 490 | 2024-11-27 19:18:22,894 - build_index - INFO - Connecting to Neo4j 491 | 2024-11-27 19:18:22,924 - build_index - INFO - Initializing chat model 492 | 2024-11-27 19:22:55,011 - build_index - INFO - Start building index 493 | 2024-11-27 19:22:55,017 - build_index - INFO - Connecting to Neo4j 494 | 2024-11-27 19:22:55,040 - build_index - INFO - Initializing chat model 495 | 2024-11-27 19:22:55,128 - build_index - INFO - Initializing embedding 496 | 2024-11-27 19:23:01,763 - build_index - INFO - Initializing splitter 497 | 2024-11-27 19:23:01,763 - build_index - INFO - Initializing GDS 498 | 2024-11-27 19:23:01,777 - build_index - INFO - Initializing index 499 | 2024-11-27 19:23:03,430 - build_index - INFO - Building index now! 500 | 2024-11-27 19:23:03,430 - build_index - INFO - Chunking documents 501 | 2024-11-27 19:25:04,123 - build_index - INFO - Start building index 502 | 2024-11-27 19:25:04,128 - build_index - INFO - Connecting to Neo4j 503 | 2024-11-27 19:25:04,151 - build_index - INFO - Initializing chat model 504 | 2024-11-27 19:25:04,206 - build_index - INFO - Initializing embedding 505 | 2024-11-27 19:25:10,355 - build_index - INFO - Initializing splitter 506 | 2024-11-27 19:25:10,355 - build_index - INFO - Initializing GDS 507 | 2024-11-27 19:25:10,365 - build_index - INFO - Initializing index 508 | 2024-11-27 21:16:05,607 - build_index - INFO - Start building index 509 | 2024-11-27 21:16:05,612 - build_index - INFO - Connecting to Neo4j 510 | 2024-11-27 21:16:05,640 - build_index - INFO - Initializing chat model 511 | 2024-11-27 21:16:05,694 - build_index - INFO - Initializing embedding 512 | 2024-11-27 21:16:12,629 - build_index - INFO - Initializing splitter 513 | 2024-11-27 21:16:12,629 - build_index - INFO - Initializing GDS 514 | 2024-11-27 21:16:12,640 - build_index - INFO - Initializing index 515 | 2024-11-27 21:16:12,641 - build_index - INFO - Building index now! 516 | 2024-11-27 21:16:12,641 - build_index - INFO - Chunking documents 517 | 2024-11-27 21:18:17,197 - build_index - INFO - Start building index 518 | 2024-11-27 21:18:17,202 - build_index - INFO - Connecting to Neo4j 519 | 2024-11-27 21:18:17,227 - build_index - INFO - Initializing chat model 520 | 2024-11-27 21:18:17,282 - build_index - INFO - Initializing embedding 521 | 2024-11-27 21:18:23,074 - build_index - INFO - Initializing splitter 522 | 2024-11-27 21:18:23,075 - build_index - INFO - Initializing GDS 523 | 2024-11-27 21:18:23,085 - build_index - INFO - Initializing index 524 | 2024-11-27 21:18:23,085 - build_index - INFO - Building index now! 
525 | 2024-11-27 21:18:23,085 - build_index - INFO - Chunking documents 526 | 2024-11-27 21:20:44,548 - build_index - INFO - Start building index 527 | 2024-11-27 21:20:44,553 - build_index - INFO - Connecting to Neo4j 528 | 2024-11-27 21:20:44,572 - build_index - INFO - Initializing chat model 529 | 2024-11-27 21:20:44,630 - build_index - INFO - Initializing embedding 530 | 2024-11-27 21:20:50,684 - build_index - INFO - Initializing splitter 531 | 2024-11-27 21:20:50,684 - build_index - INFO - Initializing GDS 532 | 2024-11-27 21:20:50,698 - build_index - INFO - Initializing index 533 | 2024-11-27 21:20:50,698 - build_index - INFO - Building index now! 534 | 2024-11-27 21:20:50,698 - build_index - INFO - Chunking documents 535 | 2024-11-27 21:25:58,544 - build_index - INFO - Start building index 536 | 2024-11-27 21:25:58,547 - build_index - INFO - Connecting to Neo4j 537 | 2024-11-27 21:25:58,580 - build_index - INFO - Initializing chat model 538 | 2024-11-27 21:25:58,634 - build_index - INFO - Initializing embedding 539 | 2024-11-27 21:26:04,300 - build_index - INFO - Initializing splitter 540 | 2024-11-27 21:26:04,301 - build_index - INFO - Initializing GDS 541 | 2024-11-27 21:26:04,309 - build_index - INFO - Initializing index 542 | 2024-11-27 21:26:04,310 - build_index - INFO - Building index now! 543 | 2024-11-27 21:26:04,310 - build_index - INFO - Chunking documents 544 | 2024-11-27 21:26:43,405 - build_index - INFO - Start building index 545 | 2024-11-27 21:26:43,410 - build_index - INFO - Connecting to Neo4j 546 | 2024-11-27 21:26:43,432 - build_index - INFO - Initializing chat model 547 | 2024-11-27 21:26:43,496 - build_index - INFO - Initializing embedding 548 | 2024-11-27 21:26:48,893 - build_index - INFO - Initializing splitter 549 | 2024-11-27 21:26:48,893 - build_index - INFO - Initializing GDS 550 | 2024-11-27 21:26:48,904 - build_index - INFO - Initializing index 551 | 2024-11-27 21:26:48,905 - build_index - INFO - Building index now! 552 | 2024-11-27 21:26:48,905 - build_index - INFO - Start building index 553 | 2024-11-27 21:26:48,906 - build_index - INFO - Chunking documents 554 | 2024-11-27 21:31:41,877 - build_index - ERROR - ```json 555 | { 556 | "nodes":... is not valid JSON 557 | 2024-11-27 21:54:10,171 - build_index - INFO - Start building index 558 | 2024-11-27 21:54:10,176 - build_index - INFO - Connecting to Neo4j 559 | 2024-11-27 21:54:10,206 - build_index - INFO - Initializing chat model 560 | 2024-11-27 21:54:10,259 - build_index - INFO - Initializing embedding 561 | 2024-11-27 21:54:16,612 - build_index - INFO - Initializing splitter 562 | 2024-11-27 21:54:16,612 - build_index - INFO - Initializing GDS 563 | 2024-11-27 21:54:16,628 - build_index - INFO - Initializing index 564 | 2024-11-27 21:54:16,628 - build_index - INFO - Building index now! 
565 | 2024-11-27 21:54:16,628 - build_index - INFO - Create_nodes_and_relationships 566 | 2024-11-27 21:54:16,629 - build_index - INFO - Chunking documents 567 | 2024-11-27 21:55:21,171 - build_index - ERROR - ```json 568 | { 569 | "head": ... is not valid JSON 570 | 2024-11-27 21:55:21,176 - build_index - ERROR - ```json 571 | { 572 | "head": ... is not valid JSON 573 | 2024-11-27 21:55:21,178 - build_index - ERROR - ```json 574 | { 575 | "head": ... is not valid JSON 576 | 2024-11-27 21:55:21,180 - build_index - ERROR - ```json 577 | { 578 | "head": ... is not valid JSON 579 | 2024-11-27 21:55:21,182 - build_index - ERROR - ```json 580 | { 581 | "head": ... is not valid JSON 582 | 2024-11-27 21:55:21,183 - build_index - ERROR - ```json 583 | { 584 | "head": ... is not valid JSON 585 | 2024-11-27 21:55:21,184 - build_index - ERROR - ```json 586 | { 587 | "head": ... is not valid JSON 588 | 2024-11-27 21:55:21,185 - build_index - ERROR - ```json 589 | { 590 | "head": ... is not valid JSON 591 | 2024-11-27 21:55:21,186 - build_index - ERROR - ```json 592 | { 593 | "head": ... is not valid JSON 594 | 2024-11-27 21:55:21,188 - build_index - ERROR - ```json 595 | { 596 | "head": ... is not valid JSON 597 | 2024-11-27 21:55:21,189 - build_index - ERROR - ```json 598 | { 599 | "head": ... is not valid JSON 600 | 2024-11-27 21:56:50,070 - build_index - INFO - Start building index 601 | 2024-11-27 21:56:50,075 - build_index - INFO - Connecting to Neo4j 602 | 2024-11-27 21:56:50,100 - build_index - INFO - Initializing chat model 603 | 2024-11-27 21:56:50,159 - build_index - INFO - Initializing embedding 604 | 2024-11-27 21:56:55,922 - build_index - INFO - Initializing splitter 605 | 2024-11-27 21:56:55,922 - build_index - INFO - Initializing GDS 606 | 2024-11-27 21:56:55,933 - build_index - INFO - Initializing index 607 | 2024-11-27 21:56:55,933 - build_index - INFO - Building index now! 608 | 2024-11-27 21:56:55,933 - build_index - INFO - Create_nodes_and_relationships 609 | 2024-11-27 21:56:55,934 - build_index - INFO - Chunking documents 610 | 2024-11-28 19:17:59,906 - build_index - INFO - Start building index 611 | 2024-11-28 19:17:59,913 - build_index - INFO - Connecting to Neo4j 612 | 2024-11-28 19:17:59,944 - build_index - INFO - Initializing chat model 613 | 2024-11-28 19:18:00,001 - build_index - INFO - Initializing embedding 614 | 2024-11-28 19:18:06,162 - build_index - INFO - Initializing splitter 615 | 2024-11-28 19:18:06,163 - build_index - INFO - Initializing GDS 616 | 2024-11-28 19:18:06,173 - build_index - INFO - Initializing index 617 | 2024-11-28 19:18:06,173 - build_index - INFO - Building index now! 618 | 2024-11-28 19:18:06,173 - build_index - INFO - Create_nodes_and_relationships 619 | 2024-11-28 19:18:06,173 - build_index - INFO - Chunking documents 620 | 2024-11-28 19:19:38,126 - build_index - ERROR - ```json 621 | { 622 | "head": ... is not valid JSON 623 | 2024-11-28 19:22:00,757 - build_index - ERROR - ```json 624 | { 625 | "head": ... is not valid JSON 626 | 2024-11-28 19:25:24,348 - build_index - INFO - Start building index 627 | 2024-11-28 19:25:24,352 - build_index - INFO - Connecting to Neo4j 628 | 2024-11-28 19:25:24,373 - build_index - INFO - Initializing chat model 629 | 2024-11-28 19:25:24,441 - build_index - INFO - Initializing embedding 630 | 2024-11-28 19:25:30,723 - build_index - INFO - Initializing splitter 631 | 2024-11-28 19:25:30,724 - build_index - INFO - Initializing GDS 632 | 2024-11-28 19:25:30,735 - build_index - INFO - Initializing index 633 | 2024-11-28 19:25:30,735 - build_index - INFO - Building index now! 
634 | 2024-11-28 19:25:30,736 - build_index - INFO - Create_nodes_and_relationships 635 | 2024-11-28 19:25:30,736 - build_index - INFO - Chunking documents 636 | 2024-11-28 19:25:57,604 - build_index - INFO - Start building index 637 | 2024-11-28 19:25:57,609 - build_index - INFO - Connecting to Neo4j 638 | 2024-11-28 19:25:57,631 - build_index - INFO - Initializing chat model 639 | 2024-11-28 19:25:57,686 - build_index - INFO - Initializing embedding 640 | 2024-11-28 19:26:03,827 - build_index - INFO - Initializing splitter 641 | 2024-11-28 19:26:03,828 - build_index - INFO - Initializing GDS 642 | 2024-11-28 19:26:03,837 - build_index - INFO - Initializing index 643 | 2024-11-28 19:26:03,837 - build_index - INFO - Building index now! 644 | 2024-11-28 19:26:03,837 - build_index - INFO - Create_nodes_and_relationships 645 | 2024-11-28 19:26:03,837 - build_index - INFO - Chunking documents 646 | 2024-11-28 19:27:21,921 - build_index - INFO - Start building index 647 | 2024-11-28 19:27:21,926 - build_index - INFO - Connecting to Neo4j 648 | 2024-11-28 19:27:21,947 - build_index - INFO - Initializing chat model 649 | 2024-11-28 19:27:22,002 - build_index - INFO - Initializing embedding 650 | 2024-11-28 19:27:28,430 - build_index - INFO - Initializing splitter 651 | 2024-11-28 19:27:28,430 - build_index - INFO - Initializing GDS 652 | 2024-11-28 19:27:28,443 - build_index - INFO - Initializing index 653 | 2024-11-28 19:27:28,444 - build_index - INFO - Building index now! 654 | 2024-11-28 19:27:28,444 - build_index - INFO - Create_nodes_and_relationships 655 | 2024-11-28 19:27:28,444 - build_index - INFO - Chunking documents 656 | 2024-11-28 19:28:16,926 - build_index - ERROR - ```json 657 | { 658 | "head": ... is not valid JSON 659 | 2024-11-28 19:29:34,121 - build_index - ERROR - ```json 660 | { 661 | "head": ... is not valid JSON 662 | 2024-11-28 19:29:41,073 - build_index - INFO - Start building index 663 | 2024-11-28 19:29:41,079 - build_index - INFO - Connecting to Neo4j 664 | 2024-11-28 19:29:41,101 - build_index - INFO - Initializing chat model 665 | 2024-11-28 19:29:41,154 - build_index - INFO - Initializing embedding 666 | 2024-11-28 19:29:47,396 - build_index - INFO - Initializing splitter 667 | 2024-11-28 19:29:47,396 - build_index - INFO - Initializing GDS 668 | 2024-11-28 19:29:47,408 - build_index - INFO - Initializing index 669 | 2024-11-28 19:29:47,408 - build_index - INFO - Building index now! 670 | 2024-11-28 19:29:47,408 - build_index - INFO - Create_nodes_and_relationships 671 | 2024-11-28 19:29:47,408 - build_index - INFO - Chunking documents 672 | 2024-11-28 19:31:00,430 - build_index - ERROR - ```json 673 | { 674 | "head": ... is not valid JSON 675 | 2024-11-28 19:31:02,214 - build_index - ERROR - ```json 676 | { 677 | "head": ... is not valid JSON 678 | 2024-11-28 19:33:12,127 - build_index - INFO - Start building index 679 | 2024-11-28 19:33:12,131 - build_index - INFO - Connecting to Neo4j 680 | 2024-11-28 19:33:12,152 - build_index - INFO - Initializing chat model 681 | 2024-11-28 19:33:12,209 - build_index - INFO - Initializing embedding 682 | 2024-11-28 19:33:18,265 - build_index - INFO - Initializing splitter 683 | 2024-11-28 19:33:18,265 - build_index - INFO - Initializing GDS 684 | 2024-11-28 19:33:18,277 - build_index - INFO - Initializing index 685 | 2024-11-28 19:33:18,278 - build_index - INFO - Building index now! 
686 | 2024-11-28 19:33:18,278 - build_index - INFO - Create_nodes_and_relationships 687 | 2024-11-28 19:33:18,278 - build_index - INFO - Chunking documents 688 | 2024-11-28 19:33:36,725 - build_index - INFO - Start building index 689 | 2024-11-28 19:33:36,726 - build_index - INFO - Connecting to Neo4j 690 | 2024-11-28 19:33:36,745 - build_index - INFO - Initializing chat model 691 | 2024-11-28 19:33:36,817 - build_index - INFO - Initializing embedding 692 | 2024-11-28 19:33:42,784 - build_index - INFO - Initializing splitter 693 | 2024-11-28 19:33:42,786 - build_index - INFO - Initializing GDS 694 | 2024-11-28 19:33:42,796 - build_index - INFO - Initializing index 695 | 2024-11-28 19:33:42,797 - build_index - INFO - Building index now! 696 | 2024-11-28 19:33:42,797 - build_index - INFO - Create_nodes_and_relationships 697 | 2024-11-28 19:33:42,797 - build_index - INFO - Chunking documents 698 | 2024-11-28 19:34:50,773 - build_index - ERROR - ```json 699 | { 700 | "head": ... is not valid JSON 701 | 2024-11-28 19:34:52,850 - build_index - INFO - None 702 | 2024-11-28 19:34:59,600 - build_index - INFO - Start building index 703 | 2024-11-28 19:34:59,603 - build_index - INFO - Connecting to Neo4j 704 | 2024-11-28 19:34:59,635 - build_index - INFO - Initializing chat model 705 | 2024-11-28 19:34:59,690 - build_index - INFO - Initializing embedding 706 | 2024-11-28 19:35:08,152 - build_index - INFO - Initializing splitter 707 | 2024-11-28 19:35:08,152 - build_index - INFO - Initializing GDS 708 | 2024-11-28 19:35:08,190 - build_index - INFO - Initializing index 709 | 2024-11-28 19:35:08,191 - build_index - INFO - Building index now! 710 | 2024-11-28 19:35:08,191 - build_index - INFO - Create_nodes_and_relationships 711 | 2024-11-28 19:35:08,191 - build_index - INFO - Chunking documents 712 | 2024-11-28 19:36:50,938 - build_index - ERROR - The model could not perform entity resolution for this task 713 | 2024-11-28 19:36:51,264 - build_index - ERROR - The model could not perform entity resolution for this task 714 | 2024-11-28 19:36:51,728 - build_index - ERROR - The model could not perform entity resolution for this task 715 | 2024-11-28 19:36:51,783 - build_index - INFO - None 716 | 2024-11-28 19:36:52,008 - build_index - INFO - 104 nodes merged 717 | 2024-11-28 19:43:17,314 - build_index - INFO - Start building index 718 | 2024-11-28 19:43:17,318 - build_index - INFO - Connecting to Neo4j 719 | 2024-11-28 19:43:18,939 - build_index - INFO - Initializing chat model 720 | 2024-11-28 19:43:19,025 - build_index - INFO - Initializing embedding 721 | 2024-11-28 19:43:24,815 - build_index - INFO - Initializing splitter 722 | 2024-11-28 19:43:24,816 - build_index - INFO - Initializing GDS 723 | 2024-11-28 19:43:24,824 - build_index - INFO - Initializing index 724 | 2024-11-28 19:43:24,826 - build_index - INFO - Building index now! 725 | 2024-11-28 19:43:24,826 - build_index - INFO - Create_nodes_and_relationships 726 | 2024-11-28 19:43:24,826 - build_index - INFO - Chunking documents 727 | 2024-11-28 19:47:59,182 - build_index - INFO - Start building index 728 | 2024-11-28 19:47:59,187 - build_index - INFO - Connecting to Neo4j 729 | 2024-11-28 19:47:59,210 - build_index - INFO - Initializing chat model 730 | 2024-11-28 19:47:59,305 - build_index - INFO - Initializing embedding 731 | 2024-11-28 19:48:07,018 - build_index - INFO - Initializing splitter 732 | 2024-11-28 19:48:07,019 - build_index - INFO - Initializing GDS 733 | 2024-11-28 19:48:07,030 - build_index - INFO - Initializing index 734 | 2024-11-28 19:48:07,031 - build_index - INFO - Building index now! 
735 | 2024-11-28 19:48:07,031 - build_index - INFO - Create_nodes_and_relationships 736 | 2024-11-28 19:48:07,031 - build_index - INFO - Chunking documents 737 | 2024-11-28 20:00:59,145 - build_index - INFO - Start building index 738 | 2024-11-28 20:00:59,149 - build_index - INFO - Connecting to Neo4j 739 | 2024-11-28 20:00:59,180 - build_index - INFO - Initializing chat model 740 | 2024-11-28 20:00:59,238 - build_index - INFO - Initializing embedding 741 | 2024-11-28 20:01:05,761 - build_index - INFO - Initializing splitter 742 | 2024-11-28 20:01:05,762 - build_index - INFO - Initializing GDS 743 | 2024-11-28 20:01:05,769 - build_index - INFO - Initializing index 744 | 2024-11-28 20:01:05,769 - build_index - INFO - Building index now! 745 | 2024-11-28 20:01:05,770 - build_index - INFO - Create_nodes_and_relationships 746 | 2024-11-28 20:01:05,770 - build_index - INFO - Chunking documents 747 | 2024-11-28 20:07:38,053 - build_index - ERROR - ```json 748 | { 749 | "head": ... is not valid JSON 750 | 2024-11-28 20:07:43,435 - build_index - ERROR - ```json 751 | { 752 | "head": ... is not valid JSON 753 | 2024-11-28 20:07:46,243 - build_index - ERROR - ```json 754 | { 755 | "head": ... is not valid JSON 756 | 2024-11-28 20:07:50,135 - build_index - ERROR - ```json 757 | { 758 | "head": ... is not valid JSON 759 | 2024-11-28 20:07:51,310 - build_index - ERROR - ```json 760 | { 761 | "head": ... is not valid JSON 762 | 2024-11-28 20:07:51,491 - build_index - ERROR - ```json 763 | { 764 | "head": ... is not valid JSON 765 | 2024-11-28 20:07:52,302 - build_index - ERROR - ```json 766 | { 767 | "head": ... is not valid JSON 768 | 2024-11-28 20:07:52,472 - build_index - ERROR - ```json 769 | { 770 | "head": ... is not valid JSON 771 | 2024-11-28 20:07:52,642 - build_index - ERROR - ```json 772 | { 773 | "head": ... is not valid JSON 774 | 2024-11-28 20:07:52,804 - build_index - ERROR - ```json 775 | { 776 | "head": ... is not valid JSON 777 | 2024-11-28 20:08:06,078 - build_index - INFO - Start building index 778 | 2024-11-28 20:08:06,081 - build_index - INFO - Connecting to Neo4j 779 | 2024-11-28 20:08:06,098 - build_index - INFO - Initializing chat model 780 | 2024-11-28 20:08:06,152 - build_index - INFO - Initializing embedding 781 | 2024-11-28 20:08:12,337 - build_index - INFO - Initializing splitter 782 | 2024-11-28 20:08:12,338 - build_index - INFO - Initializing GDS 783 | 2024-11-28 20:08:12,350 - build_index - INFO - Initializing index 784 | 2024-11-28 20:08:12,351 - build_index - INFO - Building index now! 
785 | 2024-11-28 20:08:12,351 - build_index - INFO - Create_nodes_and_relationships 786 | 2024-11-28 20:08:12,352 - build_index - INFO - Chunking documents 787 | 2024-11-28 20:09:49,913 - build_index - ERROR - ```json 788 | { 789 | "head": ... is not valid JSON 790 | 2024-11-28 20:09:49,918 - build_index - ERROR - ```json 791 | { 792 | "head": ... is not valid JSON 793 | 2024-11-28 20:09:49,920 - build_index - ERROR - ```json 794 | { 795 | "head": ... is not valid JSON 796 | 2024-11-28 20:09:49,924 - build_index - ERROR - ```json 797 | { 798 | "head": ... is not valid JSON 799 | 2024-11-28 20:37:28,154 - build_index - INFO - Start building index 800 | 2024-11-28 20:37:28,160 - build_index - INFO - Connecting to Neo4j 801 | 2024-11-28 20:37:57,323 - build_index - INFO - Initializing chat model 802 | 2024-11-28 20:37:57,377 - build_index - INFO - Initializing embedding 803 | 2024-11-28 20:38:03,272 - build_index - INFO - Initializing splitter 804 | 2024-11-28 20:38:03,274 - build_index - INFO - Initializing GDS 805 | 2024-11-28 20:38:03,318 - build_index - INFO - Initializing index 806 | 2024-11-28 20:38:03,319 - build_index - INFO - Building index now! 807 | 2024-11-28 20:38:03,319 - build_index - INFO - Create_nodes_and_relationships 808 | 2024-11-28 20:38:03,319 - build_index - INFO - Chunking documents 809 | 2024-11-28 20:38:45,183 - build_index - INFO - None 810 | 2024-11-28 20:38:45,447 - build_index - INFO - 44 nodes merged 811 | 2024-11-28 20:38:52,067 - build_index - INFO - Index built successfully 812 | 2024-11-28 20:47:35,215 - build_index - INFO - Start building index 813 | 2024-11-28 20:47:35,219 - build_index - INFO - Connecting to Neo4j 814 | 2024-11-28 20:47:35,379 - build_index - INFO - Initializing chat model 815 | 2024-11-28 20:47:35,453 - build_index - INFO - Initializing embedding 816 | 2024-11-28 20:47:42,616 - build_index - INFO - Initializing splitter 817 | 2024-11-28 20:47:42,617 - build_index - INFO - Initializing GDS 818 | 2024-11-28 20:47:42,646 - build_index - INFO - Initializing index 819 | 2024-11-28 20:47:42,646 - build_index - INFO - Building index now! 
820 | 2024-11-28 20:47:42,647 - build_index - INFO - Create_nodes_and_relationships 821 | 2024-11-28 20:47:42,647 - build_index - INFO - Chunking documents 822 | 2024-11-28 20:49:16,020 - build_index - ERROR - ```json 823 | { 824 | "head": ... is not valid JSON 825 | 2024-11-28 20:49:16,025 - build_index - ERROR - ```json 826 | { 827 | "head": ... is not valid JSON 828 | 2024-11-28 20:49:16,027 - build_index - ERROR - ```json 829 | { 830 | "head": ... is not valid JSON 831 | 2024-11-28 20:49:16,031 - build_index - ERROR - ```json 832 | { 833 | "head": ... is not valid JSON 834 | 2024-11-28 20:49:48,687 - build_index - INFO - None 835 | 2024-11-28 20:49:48,855 - build_index - INFO - 84 nodes merged 836 | 2024-11-28 20:50:19,571 - build_index - INFO - Index built successfully 837 | 2024-11-28 20:51:36,090 - build_index - INFO - Start building index 838 | 2024-11-28 20:51:36,094 - build_index - INFO - Connecting to Neo4j 839 | 2024-11-28 20:51:36,231 - build_index - INFO - Initializing chat model 840 | 2024-11-28 20:52:03,682 - build_index - INFO - Start building index 841 | 2024-11-28 20:52:03,683 - build_index - INFO - Connecting to Neo4j 842 | 2024-11-28 20:52:03,700 - build_index - INFO - Initializing chat model 843 | 2024-11-28 20:52:12,767 - build_index - INFO - Initializing embedding 844 | 2024-11-28 20:52:17,901 - build_index - INFO - Initializing splitter 845 | 2024-11-28 20:52:17,903 - build_index - INFO - Initializing GDS 846 | 2024-11-28 20:52:17,934 - build_index - INFO - Initializing index 847 | 2024-11-28 20:52:17,935 - build_index - INFO - Building index now! 848 | 2024-11-28 20:52:17,935 - build_index - INFO - Create_nodes_and_relationships 849 | 2024-11-28 20:52:17,935 - build_index - INFO - Chunking documents 850 | 2024-11-28 20:55:20,998 - build_index - ERROR - ```json 851 | { 852 | "head": ... is not valid JSON 853 | 2024-11-28 20:55:21,004 - build_index - ERROR - ```json 854 | { 855 | "head": ... is not valid JSON 856 | 2024-11-28 20:55:21,006 - build_index - ERROR - ```json 857 | { 858 | "head": ... is not valid JSON 859 | 2024-11-28 20:55:21,007 - build_index - ERROR - ```json 860 | { 861 | "head": ... is not valid JSON 862 | 2024-11-28 20:55:21,008 - build_index - ERROR - ```json 863 | { 864 | "head": ... is not valid JSON 865 | 2024-11-28 20:55:21,010 - build_index - ERROR - ```json 866 | { 867 | "head": ... is not valid JSON 868 | 2024-11-28 20:55:21,012 - build_index - ERROR - ```json 869 | { 870 | "head": ... is not valid JSON 871 | 2024-11-28 20:55:21,013 - build_index - ERROR - ```json 872 | { 873 | "head": ... is not valid JSON 874 | 2024-11-28 20:55:21,014 - build_index - ERROR - ```json 875 | { 876 | "head": ... is not valid JSON 877 | 2024-11-28 20:55:21,015 - build_index - ERROR - ```json 878 | { 879 | "head": ... is not valid JSON 880 | 2024-11-28 20:55:21,811 - build_index - INFO - None 881 | 2024-11-28 20:55:22,146 - build_index - INFO - 6 nodes merged 882 | 2024-11-28 20:55:26,850 - build_index - INFO - Index built successfully 883 | 2024-11-28 20:57:07,398 - build_index - INFO - Start building index 884 | 2024-11-28 20:57:07,402 - build_index - INFO - Connecting to Neo4j 885 | 2024-11-28 20:57:07,441 - build_index - INFO - Initializing chat model 886 | 2024-11-28 20:57:17,702 - build_index - INFO - Initializing embedding 887 | 2024-11-28 20:57:23,980 - build_index - INFO - Initializing splitter 888 | 2024-11-28 20:57:23,980 - build_index - INFO - Initializing GDS 889 | 2024-11-28 20:57:23,997 - build_index - INFO - Initializing index 890 | 2024-11-28 20:57:23,997 - build_index - INFO - Building index now! 
891 | 2024-11-28 20:57:23,997 - build_index - INFO - Create_nodes_and_relationships 892 | 2024-11-28 20:57:23,997 - build_index - INFO - Chunking documents 893 | 2024-11-28 21:04:04,549 - build_index - ERROR - ```json 894 | { 895 | "head": ... is not valid JSON 896 | 2024-11-28 21:04:08,310 - build_index - ERROR - ```json 897 | { 898 | "head": ... is not valid JSON 899 | 2024-11-28 21:04:09,591 - build_index - ERROR - ```json 900 | { 901 | "head": ... is not valid JSON 902 | 2024-11-28 21:04:09,776 - build_index - ERROR - ```json 903 | { 904 | "head": ... is not valid JSON 905 | 2024-11-28 21:04:09,946 - build_index - ERROR - ```json 906 | { 907 | "head": ... is not valid JSON 908 | 2024-11-28 21:04:10,106 - build_index - ERROR - ```json 909 | { 910 | "head": ... is not valid JSON 911 | 2024-11-28 21:04:10,259 - build_index - ERROR - ```json 912 | { 913 | "head": ... is not valid JSON 914 | 2024-11-28 21:04:10,419 - build_index - ERROR - ```json 915 | { 916 | "head": ... is not valid JSON 917 | 2024-11-28 21:04:10,578 - build_index - ERROR - ```json 918 | { 919 | "head": ... is not valid JSON 920 | 2024-11-28 21:04:10,735 - build_index - ERROR - ```json 921 | { 922 | "head": ... is not valid JSON 923 | 2024-11-28 21:04:10,895 - build_index - ERROR - ```json 924 | { 925 | "head": ... is not valid JSON 926 | 2024-11-28 21:04:23,019 - build_index - INFO - Start building index 927 | 2024-11-28 21:04:23,023 - build_index - INFO - Connecting to Neo4j 928 | 2024-11-28 21:04:23,053 - build_index - INFO - Initializing chat model 929 | 2024-11-28 21:04:32,215 - build_index - INFO - Initializing embedding 930 | 2024-11-28 21:04:37,683 - build_index - INFO - Initializing splitter 931 | 2024-11-28 21:04:37,683 - build_index - INFO - Initializing GDS 932 | 2024-11-28 21:04:37,697 - build_index - INFO - Initializing index 933 | 2024-11-28 21:04:37,697 - build_index - INFO - Building index now! 934 | 2024-11-28 21:04:37,697 - build_index - INFO - Create_nodes_and_relationships 935 | 2024-11-28 21:04:37,697 - build_index - INFO - Chunking documents 936 | 2024-11-28 21:08:45,257 - build_index - ERROR - Not valid JSON 937 | 2024-11-28 21:08:47,059 - build_index - ERROR - Not valid JSON 938 | 2024-11-28 21:08:47,227 - build_index - ERROR - Not valid JSON 939 | 2024-11-28 21:08:47,398 - build_index - ERROR - Not valid JSON 940 | 2024-11-28 21:08:47,559 - build_index - ERROR - Not valid JSON 941 | 2024-11-28 21:08:47,721 - build_index - ERROR - Not valid JSON 942 | 2024-11-28 21:08:47,892 - build_index - ERROR - Not valid JSON 943 | 2024-11-28 21:08:48,055 - build_index - ERROR - Not valid JSON 944 | 2024-11-28 21:10:12,876 - build_index - INFO - Start building index 945 | 2024-11-28 21:10:12,880 - build_index - INFO - Connecting to Neo4j 946 | 2024-11-28 21:10:12,905 - build_index - INFO - Initializing chat model 947 | 2024-11-28 21:10:21,153 - build_index - INFO - Initializing embedding 948 | 2024-11-28 21:10:25,984 - build_index - INFO - Initializing splitter 949 | 2024-11-28 21:10:25,984 - build_index - INFO - Initializing GDS 950 | 2024-11-28 21:10:25,991 - build_index - INFO - Initializing index 951 | 2024-11-28 21:10:25,992 - build_index - INFO - Building index now! 
952 | 2024-11-28 21:10:25,992 - build_index - INFO - Create_nodes_and_relationships 953 | 2024-11-28 21:10:25,992 - build_index - INFO - Chunking documents 954 | 2024-11-28 21:15:06,719 - build_index - ERROR - Not valid JSON 955 | 2024-11-28 21:15:09,765 - build_index - ERROR - Not valid JSON 956 | 2024-11-28 21:15:13,253 - build_index - INFO - None 957 | 2024-11-28 21:15:13,621 - build_index - INFO - 110 nodes merged 958 | 2024-11-28 21:16:35,372 - build_index - INFO - Index built successfully 959 | --------------------------------------------------------------------------------