├── src ├── __init__.py ├── utils │ ├── __init__.py │ └── logger.py ├── query │ ├── utils │ │ ├── __init__.py │ │ └── token_counter.py │ ├── custom_types │ │ ├── __init__.py │ │ ├── graphs │ │ │ ├── __init__.py │ │ │ ├── embedding.py │ │ │ └── community.py │ │ ├── tokens.py │ │ └── prompts.py │ ├── global_search │ │ ├── community_report.py │ │ ├── __init__.py │ │ ├── key_points_aggregator │ │ │ ├── __init__.py │ │ │ ├── prompt_builder.py │ │ │ ├── aggregator.py │ │ │ ├── context_builder.py │ │ │ └── _system_prompt.py │ │ ├── key_points_generator │ │ │ ├── _output_parser.py │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ ├── prompt_builder.py │ │ │ ├── generator.py │ │ │ ├── _system_prompt.py │ │ │ ├── temp.txt │ │ │ └── context_builder.py │ │ ├── community_weight_calculator.py │ │ └── search.py │ ├── local_search │ │ ├── __init__.py │ │ ├── context_builders │ │ │ ├── __init__.py │ │ │ ├── text_units.py │ │ │ ├── communities_reports.py │ │ │ ├── entities.py │ │ │ ├── context.py │ │ │ └── relationships.py │ │ ├── context_selectors │ │ │ ├── __init__.py │ │ │ ├── entities.py │ │ │ ├── communities_reports.py │ │ │ ├── text_units.py │ │ │ ├── relationships.py │ │ │ └── context.py │ │ ├── retriever.py │ │ ├── search.py │ │ ├── prompt_builder.py │ │ └── _system_prompt.py │ ├── __init__.py │ └── search.py ├── splitter │ ├── __init__.py │ └── slide_window_splitter.py └── index │ ├── __init__.py │ ├── pydantic_models.py │ ├── tqdm_LLMGraphTransformer.py │ ├── prompts.py │ ├── utils.py │ ├── cypher_query.py │ └── api_index.py ├── test_change.py ├── .gitignore ├── example ├── ollama_index.sh ├── ollama_search.sh ├── openai_search.sh ├── hf_index.sh ├── openai_index.sh └── hf_search.sh ├── index.sh ├── drop.py ├── txt ├── 三体_星际交锋2.txt └── 三体_星际交锋.txt ├── requirements.txt ├── README.md ├── search.log ├── search.py └── index.log /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test_change.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/query/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/splitter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/query/custom_types/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/query/custom_types/graphs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/index/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_index import ApiIndex 2 | 3 | __all__ = [ 4 | "ApiIndex" 5 | ] -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | __pycache__ 2 | .pytest_* 3 | /src/**/test* 4 | /node_modules 5 | /package-lock.json 6 | /package.json -------------------------------------------------------------------------------- /src/query/custom_types/tokens.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | 3 | 4 | class TokenCounter(Protocol): 5 | def count_tokens(self, text: str) -> int: ... -------------------------------------------------------------------------------- /src/query/custom_types/graphs/embedding.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | 3 | import networkx as nx 4 | import numpy as np 5 | 6 | 7 | class GraphEmbeddingGenerator(Protocol): 8 | def run(self, graph: nx.Graph) -> dict[str, np.ndarray]: ... -------------------------------------------------------------------------------- /src/query/global_search/community_report.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class CommunityReport: 5 | id: str 6 | title: str 7 | summary: str 8 | rank: float 9 | weight: float 10 | content: str -------------------------------------------------------------------------------- /src/query/global_search/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from .key_points_aggregator import KeyPointsAggregator 4 | from .key_points_generator import KeyPointsGenerator 5 | from .search import GlobalSearch 6 | 7 | __all__ = [ 8 | "GlobalSearch", 9 | "KeyPointsAggregator", 10 | "KeyPointsGenerator", 11 | ] 12 | -------------------------------------------------------------------------------- /src/query/local_search/__init__.py: -------------------------------------------------------------------------------- 1 | """Local Search module.""" 2 | 3 | from .prompt_builder import LocalSearchPromptBuilder 4 | from .retriever import LocalSearchRetriever 5 | from .search import LocalSearch 6 | 7 | __all__ = [ 8 | "LocalSearch", 9 | "LocalSearchPromptBuilder", 10 | "LocalSearchRetriever", 11 | ] -------------------------------------------------------------------------------- /example/ollama_index.sh: -------------------------------------------------------------------------------- 1 | python build_index.py \ 2 | --file_path ./txt \ 3 | --neo4j_uri bolt://localhost:7687 \ 4 | --neo4j_username neo4j \ 5 | --neo4j_password langchaingraphrag \ 6 | --model_provider ollama \ 7 | --chat_model_name llama3.1 \ 8 | --embedding_model_name_or_path BAAI/bge-m3 \ 9 | --max_workers 16 \ 10 | --device cuda 11 | -------------------------------------------------------------------------------- /src/query/global_search/key_points_aggregator/__init__.py: -------------------------------------------------------------------------------- 1 | from .aggregator import KeyPointsAggregator 2 | from .context_builder import KeyPointsContextBuilder 3 | from .prompt_builder import KeyPointsAggregatorPromptBuilder 4 | 5 | __all__ = [ 6 | "KeyPointsAggregatorPromptBuilder", 7 | "KeyPointsContextBuilder", 8 | "KeyPointsAggregator", 9 | ] -------------------------------------------------------------------------------- /example/ollama_search.sh: -------------------------------------------------------------------------------- 1 | python search.py \ 2 | --neo4j_uri bolt://localhost:7687 \ 3 | --neo4j_username 
neo4j \ 4 | --neo4j_password langchaingraphrag \ 5 | --model_provider ollama \ 6 | --chat_model_name llama3.1 \ 7 | --embedding_model_name_or_path BAAI/bge-m3 \ 8 | --max_workers 16 \ 9 | --device cuda \ 10 | --completion_mode completion \ 11 | --query_mode global -------------------------------------------------------------------------------- /example/openai_search.sh: -------------------------------------------------------------------------------- 1 | python search.py \ 2 | --neo4j_uri bolt://localhost:7687 \ 3 | --neo4j_username neo4j \ 4 | --neo4j_password langchaingraphrag \ 5 | --model_provider openai \ 6 | --embedding_model_name_or_path BAAI/bge-m3 \ 7 | --chat_model_name gpt-4o-mini \ 8 | --base_url https://api.gpt.ge/v1/ \ 9 | --completion_mode completion \ 10 | --query_mode global -------------------------------------------------------------------------------- /example/hf_index.sh: -------------------------------------------------------------------------------- 1 | python build_index.py \ 2 | --file_path ./txt \ 3 | --neo4j_uri bolt://localhost:7687 \ 4 | --neo4j_username neo4j \ 5 | --neo4j_password langchaingraphrag \ 6 | --model_provider hf \ 7 | --embedding_model_name_or_path BAAI/bge-m3 \ 8 | --repo_id NousResearch/Meta-Llama-3.1-8B-Instruct \ 9 | --max_workers 16 \ 10 | --flash_attn \ 11 | --device cuda 12 | -------------------------------------------------------------------------------- /example/openai_index.sh: -------------------------------------------------------------------------------- 1 | python build_index.py \ 2 | --file_path ./txt \ 3 | --neo4j_uri bolt://localhost:7687 \ 4 | --neo4j_username neo4j \ 5 | --neo4j_password langchaingraphrag \ 6 | --model_provider openai \ 7 | --embedding_model_name_or_path BAAI/bge-m3 \ 8 | --chat_model_name gpt-4o-mini \ 9 | --base_url https://api.gpt.ge/v1/ \ 10 | --max_workers 16 \ 11 | --device cuda -------------------------------------------------------------------------------- /index.sh: -------------------------------------------------------------------------------- 1 | python build_index.py \ 2 | --file_path ./txt \ 3 | --neo4j_uri bolt://localhost:7687 \ 4 | --neo4j_username your_username \ 5 | --neo4j_password your_password \ 6 | --model_provider openai \ 7 | --embedding_model_name_or_path BAAI/bge-m3 \ 8 | --chat_model_name deepseek-chat \ 9 | --base_url https://api.deepseek.com \ 10 | --max_workers 16 \ 11 | --device cuda 12 | -------------------------------------------------------------------------------- /example/hf_search.sh: -------------------------------------------------------------------------------- 1 | python search.py \ 2 | --neo4j_uri bolt://localhost:7687 \ 3 | --neo4j_username neo4j \ 4 | --neo4j_password langchaingraphrag \ 5 | --model_provider hf \ 6 | --embedding_model_name_or_path BAAI/bge-m3 \ 7 | --repo_id NousResearch/Meta-Llama-3.1-8B-Instruct \ 8 | --max_workers 16 \ 9 | --device cuda \ 10 | --completion_mode completion \ 11 | --query_mode global -------------------------------------------------------------------------------- /src/query/__init__.py: -------------------------------------------------------------------------------- 1 | from .local_search import * 2 | from .global_search import * 3 | from .search import GlobalSearcher,LocalSearcher 4 | 5 | __all__ = [ 6 | "LocalSearch", 7 | "LocalSearchPromptBuilder", 8 | "LocalSearchRetriever", 9 | "GlobalSearch", 10 | "KeyPointsAggregator", 11 | "KeyPointsGenerator", 12 | "LocalSearcher", 13 | "GlobalSearcher", 14 | ] 
-------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/_output_parser.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from langchain.output_parsers import PydanticOutputParser 4 | 5 | from .utils import KeyPointsResult 6 | 7 | 8 | class KeyPointsOutputParser(PydanticOutputParser): 9 | def __init__(self, **kwargs: dict[str, Any]): 10 | super().__init__(pydantic_object=KeyPointsResult, **kwargs) 11 | -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/__init__.py: -------------------------------------------------------------------------------- 1 | """Key Points generator module.""" 2 | 3 | from .context_builder import CommunityReportContextBuilder 4 | from .generator import KeyPointsGenerator 5 | from .prompt_builder import KeyPointsGeneratorPromptBuilder 6 | 7 | 8 | __all__ = [ 9 | "KeyPointsGeneratorPromptBuilder", 10 | "CommunityReportContextBuilder", 11 | "KeyPointsGenerator", 12 | ] -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/utils.py: -------------------------------------------------------------------------------- 1 | from langchain_core.pydantic_v1 import BaseModel, Field 2 | 3 | 4 | class KeyPointInfo(BaseModel): 5 | description: str = Field(description="The description of the key point") 6 | score: float = Field(description="The score of the key point") 7 | 8 | 9 | class KeyPointsResult(BaseModel): 10 | points: list[KeyPointInfo] = Field(description="the points") -------------------------------------------------------------------------------- /src/query/utils/token_counter.py: -------------------------------------------------------------------------------- 1 | """Counter for Tiktoken based tokens.""" 2 | 3 | import tiktoken 4 | 5 | from ..custom_types.tokens import TokenCounter 6 | 7 | class TiktokenCounter(TokenCounter): 8 | def __init__(self, encoding_name: str = "cl100k_base"): 9 | self.tokenizer = tiktoken.get_encoding(encoding_name) 10 | 11 | def count_tokens(self, text: str) -> int: 12 | return len(self.tokenizer.encode(text)) -------------------------------------------------------------------------------- /src/query/custom_types/prompts.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Protocol 2 | 3 | from langchain_core.output_parsers.base import BaseOutputParser 4 | from langchain_core.prompts import BasePromptTemplate 5 | from typing_extensions import Unpack 6 | 7 | 8 | class PromptBuilder(Protocol): 9 | def build(self) -> tuple[BasePromptTemplate, BaseOutputParser]: ... 10 | 11 | 12 | class IndexingPromptBuilder(PromptBuilder, Protocol): 13 | def prepare_chain_input( 14 | self, **kwargs: Unpack[dict[str, Any]] 15 | ) -> dict[str, str]: ... 
-------------------------------------------------------------------------------- /src/query/local_search/context_builders/__init__.py: -------------------------------------------------------------------------------- 1 | """Context builders for local search.""" 2 | 3 | from .communities_reports import CommunitiesReportsContextBuilder 4 | from .context import ContextBuilder 5 | from .entities import EntitiesContextBuilder 6 | from .relationships import RelationshipsContextBuilder 7 | from .text_units import TextUnitsContextBuilder 8 | 9 | __all__ = [ 10 | "EntitiesContextBuilder", 11 | "ContextBuilder", 12 | "RelationshipsContextBuilder", 13 | "TextUnitsContextBuilder", 14 | "CommunitiesReportsContextBuilder", 15 | ] -------------------------------------------------------------------------------- /src/query/local_search/context_selectors/__init__.py: -------------------------------------------------------------------------------- 1 | """Context selectors for local search.""" 2 | 3 | from .communities_reports import CommunitiesReportsSelector 4 | from .context import ContextSelectionResult, ContextSelector 5 | from .entities import EntitiesSelector 6 | from .relationships import RelationshipsSelector 7 | from .text_units import TextUnitsSelector 8 | 9 | __all__ = [ 10 | "ContextSelector", 11 | "ContextSelectionResult", 12 | "EntitiesSelector", 13 | "TextUnitsSelector", 14 | "RelationshipsSelector", 15 | "CommunitiesReportsSelector", 16 | ] -------------------------------------------------------------------------------- /src/index/pydantic_models.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from pydantic import BaseModel, Field 3 | 4 | 5 | class DuplicateEntities(BaseModel): 6 | entities: List[str] = Field( 7 | description="Entities that represent the same object or real-world entity and should be merged" 8 | ) 9 | 10 | 11 | class Disambiguate(BaseModel): 12 | merge_entities: Optional[List[DuplicateEntities]] = Field( 13 | description="Lists of entities that represent the same object or real-world entity and should be merged" 14 | ) 15 | 16 | class GetTitle(BaseModel): 17 | title: str = Field(description="Title of the given summary") -------------------------------------------------------------------------------- /src/index/tqdm_LLMGraphTransformer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import List, Optional, Sequence 3 | from langchain_community.graphs.graph_document import GraphDocument 4 | from langchain_core.documents import Document 5 | from langchain_core.runnables import RunnableConfig 6 | from langchain_experimental.graph_transformers import LLMGraphTransformer 7 | from tqdm.asyncio import tqdm 8 | 9 | class t_LLMGraphTransformer(LLMGraphTransformer): 10 | 11 | async def aconvert_to_graph_documents(self, documents: Sequence[Document], config: Optional[RunnableConfig] = None) -> List[GraphDocument]: 12 | """ 13 | Asynchronously convert a sequence of documents into graph documents.
14 | """ 15 | tasks = [ 16 | asyncio.create_task(self.aprocess_response(document, config)) 17 | for document in documents 18 | ] 19 | results = await tqdm.gather(*tasks) 20 | return results -------------------------------------------------------------------------------- /src/query/custom_types/graphs/community.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import NewType, Protocol 3 | 4 | import networkx as nx 5 | 6 | CommunityId = NewType("CommunityId", int) 7 | CommunityLevel = NewType("CommunityLevel", int) 8 | 9 | 10 | @dataclass 11 | class CommunityNode: 12 | name: str 13 | parent_cluster: CommunityId | None 14 | is_final_cluster: bool 15 | 16 | 17 | @dataclass 18 | class Community: 19 | id: CommunityId 20 | nodes: list[CommunityNode] 21 | 22 | 23 | @dataclass 24 | class CommunityDetectionResult: 25 | communities: dict[CommunityLevel, dict[CommunityId, Community]] 26 | 27 | def communities_at_level(self, level: CommunityLevel) -> list[Community]: 28 | return list(self.communities[level].values()) 29 | 30 | 31 | class CommunityDetector(Protocol): 32 | def run(self, graph: nx.Graph) -> CommunityDetectionResult: ... -------------------------------------------------------------------------------- /src/query/local_search/retriever.py: -------------------------------------------------------------------------------- 1 | from langchain_core.callbacks import CallbackManagerForRetrieverRun 2 | from langchain_core.documents import Document 3 | from langchain_core.retrievers import BaseRetriever 4 | from langchain_core.vectorstores import VectorStore 5 | from .context_builders import ContextBuilder 6 | from .context_selectors import ContextSelector 7 | 8 | 9 | class LocalSearchRetriever(BaseRetriever): 10 | context_selector: ContextSelector 11 | context_builder: ContextBuilder 12 | graph: VectorStore 13 | 14 | def _get_relevant_documents( 15 | self, 16 | query: str, 17 | *, 18 | run_manager: CallbackManagerForRetrieverRun, # noqa: ARG002 19 | ) -> list[Document]: 20 | context_selection_result = self.context_selector.run( 21 | query=query, 22 | graph=self.graph, 23 | ) 24 | 25 | return self.context_builder(context_selection_result) -------------------------------------------------------------------------------- /drop.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | from langchain_community.graphs import Neo4jGraph 4 | 5 | def parse_args(): 6 | arg_parser = argparse.ArgumentParser(description="Drop the index, constraint, and entities for a given UUID") 7 | 8 | arg_parser.add_argument("--neo4j_uri", type=str, default=None, help="Neo4j URI") 9 | arg_parser.add_argument("--neo4j_username", type=str, default=None, help="Neo4j username") 10 | arg_parser.add_argument("--neo4j_password", type=str, default=None, help="Neo4j password") 11 | arg_parser.add_argument("--uuid", type=str, default="", help="UUID for the index") 12 | 13 | return arg_parser.parse_args() 14 | 15 | def drop(): 16 | args = parse_args() 17 | graph = Neo4jGraph(url=args.neo4j_uri, username=args.neo4j_username, password=args.neo4j_password) 18 | 19 | graph.query(f"DROP INDEX `{args.uuid}` IF EXISTS") 20 | graph.query(f"DROP CONSTRAINT ON (n:`__Entity__{args.uuid}`) ASSERT n.id IS UNIQUE") 21 | graph.query(f"MATCH (n:`__Entity__{args.uuid}`) DETACH DELETE n") 22 | 23 | if __name__ == "__main__": 24 | drop() -------------------------------------------------------------------------------- /src/query/global_search/community_weight_calculator.py:
-------------------------------------------------------------------------------- 1 | """Compute the weight of the community.""" 2 | 3 | import pandas as pd 4 | 5 | from ..custom_types.graphs.community import CommunityId 6 | 7 | 8 | class CommunityWeightCalculator: 9 | def __init__(self, *, should_normalize: bool = True): 10 | self._should_normalize = should_normalize 11 | 12 | def __call__( 13 | self, 14 | df_entities: pd.DataFrame, 15 | df_reports: pd.DataFrame, 16 | ) -> dict[CommunityId, float]: 17 | result: dict[CommunityId, float] = {} 18 | for _, row in df_reports.iterrows(): 19 | entities = row["entities"] 20 | # get rows from entities dataframe where ids are in entities 21 | df_entities_filtered = df_entities[df_entities["id"].isin(entities)] 22 | # get the text_units from df_entities_filtered 23 | text_units = df_entities_filtered["text_unit_ids"].explode().unique() 24 | result[row["community_id"]] = len(text_units) 25 | 26 | if self._should_normalize: 27 | max_weight = max(result.values()) 28 | for community_id in result: 29 | result[community_id] = result[community_id] / max_weight 30 | 31 | return result -------------------------------------------------------------------------------- /src/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging.handlers import RotatingFileHandler 3 | 4 | def create_rotating_logger(name: str, log_file: str, max_bytes: int = 10 * 1024 * 1024, backup_count: int = 5, level=logging.DEBUG) -> logging.Logger: 5 | """Create a rotating logger instance. 6 | 7 | Args: 8 | name (str): Name of the logger. 9 | log_file (str): Path to the log file. 10 | max_bytes (int): Maximum size of a single log file in bytes; defaults to 10MB. 11 | backup_count (int): Number of old log files to keep; defaults to 5. 12 | level (int): Logging level; defaults to logging.DEBUG. 13 | 14 | Returns: 15 | logging.Logger: The configured rotating logger instance. 16 | """ 17 | # create the logger 18 | logger = logging.getLogger(name) 19 | logger.setLevel(level) 20 | # create the log message format 21 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 22 | 23 | # create the console handler 24 | console_handler = logging.StreamHandler() 25 | console_handler.setLevel(level) 26 | console_handler.setFormatter(formatter) 27 | 28 | # create the rotating file handler 29 | if log_file: 30 | handler = RotatingFileHandler(log_file, maxBytes=max_bytes, backupCount=backup_count) 31 | handler.setLevel(level) 32 | handler.setFormatter(formatter) 33 | 34 | # attach the handlers to the logger 35 | logger.addHandler(handler) 36 | logger.addHandler(console_handler) 37 | 38 | return logger -------------------------------------------------------------------------------- /src/query/local_search/search.py: -------------------------------------------------------------------------------- 1 | from langchain_core.documents import Document 2 | from langchain_core.language_models import BaseLLM 3 | from langchain_core.retrievers import BaseRetriever 4 | from langchain_core.runnables import Runnable, RunnablePassthrough 5 | 6 | from ..custom_types.prompts import PromptBuilder 7 | 8 | 9 | def _format_docs(documents: list[Document]) -> str: 10 | context_data = [d.page_content for d in documents] 11 | context_data_str: str = "\n".join(context_data) 12 | return context_data_str 13 | 14 | 15 | class LocalSearch: 16 | def __init__( 17 | self, 18 | chat_model: BaseLLM, 19 | prompt_builder: PromptBuilder, 20 | retriever: BaseRetriever, 21 | ): 22 | self._chat_model = chat_model 23 | self._prompt_builder = prompt_builder 24 | self._retriever = retriever 25 | 26 | def __call__(self, query) -> str: 27 | prompt, output_parser =
self._prompt_builder.build() 28 | 29 | base_chain = prompt | self._chat_model | output_parser 30 | 31 | search_chain: Runnable = { 32 | "context_data": self._retriever | _format_docs, 33 | "local_query": RunnablePassthrough(), 34 | } | base_chain 35 | 36 | return search_chain.invoke(query) -------------------------------------------------------------------------------- /src/query/local_search/prompt_builder.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from langchain_core.output_parsers.base import BaseOutputParser 4 | from langchain_core.output_parsers.string import StrOutputParser 5 | from langchain_core.prompts import ( 6 | BasePromptTemplate, 7 | ChatPromptTemplate, 8 | SystemMessagePromptTemplate, 9 | ) 10 | 11 | from ..custom_types.prompts import PromptBuilder 12 | 13 | from ._system_prompt import LOCAL_SEARCH_SYSTEM_PROMPT 14 | 15 | 16 | class LocalSearchPromptBuilder(PromptBuilder): 17 | def __init__( 18 | self, 19 | *, 20 | system_prompt: str | None = None, 21 | system_prompt_path: Path | None = None, 22 | ): 23 | self._system_prompt: str | None 24 | if system_prompt is None and system_prompt_path is None: 25 | self._system_prompt = LOCAL_SEARCH_SYSTEM_PROMPT 26 | else: 27 | self._system_prompt = system_prompt 28 | 29 | self._system_prompt_path = system_prompt_path 30 | 31 | def build(self) -> tuple[BasePromptTemplate, BaseOutputParser]: 32 | if self._system_prompt_path: 33 | prompt = Path.read_text(self._system_prompt_path) 34 | else: 35 | assert self._system_prompt is not None 36 | prompt = self._system_prompt 37 | 38 | system_template = SystemMessagePromptTemplate.from_template( 39 | prompt, 40 | partial_variables=dict(response_type="Multiple Paragraphs"), 41 | ) 42 | 43 | template = ChatPromptTemplate([system_template, ("user", "{local_query}")]) 44 | return template, StrOutputParser() -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/prompt_builder.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from langchain_core.output_parsers.base import BaseOutputParser 4 | from langchain_core.prompts import ( 5 | BasePromptTemplate, 6 | ChatPromptTemplate, 7 | SystemMessagePromptTemplate, 8 | ) 9 | 10 | from ...custom_types.prompts import PromptBuilder 11 | 12 | from ._output_parser import KeyPointsOutputParser 13 | from ._system_prompt import MAP_SYSTEM_PROMPT 14 | 15 | 16 | class KeyPointsGeneratorPromptBuilder(PromptBuilder): 17 | def __init__( 18 | self, 19 | *, 20 | system_prompt: str | None = None, 21 | system_prompt_path: Path | None = None, 22 | ): 23 | self._system_prompt: str | None 24 | if system_prompt is None and system_prompt_path is None: 25 | self._system_prompt = MAP_SYSTEM_PROMPT 26 | else: 27 | self._system_prompt = system_prompt 28 | 29 | self._system_prompt_path = system_prompt_path 30 | 31 | def build(self) -> tuple[BasePromptTemplate, BaseOutputParser]: 32 | if self._system_prompt_path: 33 | if type(self._system_prompt_path) is str: 34 | self._system_prompt_path = Path(self._system_prompt_path) 35 | prompt = self._system_prompt_path.read_text(encoding='utf-8') 36 | else: 37 | assert self._system_prompt is not None 38 | prompt = self._system_prompt 39 | 40 | system_template = SystemMessagePromptTemplate.from_template(prompt) 41 | 42 | template = ChatPromptTemplate([system_template, ("user", "{global_query}")]) 43 | return template, 
KeyPointsOutputParser() 44 | 45 | -------------------------------------------------------------------------------- /src/query/global_search/key_points_aggregator/prompt_builder.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from langchain_core.output_parsers.base import BaseOutputParser 4 | from langchain_core.output_parsers.string import StrOutputParser 5 | from langchain_core.prompts import ( 6 | BasePromptTemplate, 7 | ChatPromptTemplate, 8 | SystemMessagePromptTemplate, 9 | ) 10 | 11 | from ...custom_types.prompts import PromptBuilder 12 | 13 | from ._system_prompt import REDUCE_SYSTEM_PROMPT 14 | 15 | 16 | class KeyPointsAggregatorPromptBuilder(PromptBuilder): 17 | def __init__( 18 | self, 19 | *, 20 | system_prompt: str | None = None, 21 | system_prompt_path: Path | None = None, 22 | ): 23 | self._system_prompt: str | None 24 | if system_prompt is None and system_prompt_path is None: 25 | self._system_prompt = REDUCE_SYSTEM_PROMPT 26 | else: 27 | self._system_prompt = system_prompt 28 | 29 | self._system_prompt_path = system_prompt_path 30 | 31 | def build(self) -> tuple[BasePromptTemplate, BaseOutputParser]: 32 | if self._system_prompt_path: 33 | if type(self._system_prompt_path) is str: 34 | self._system_prompt_path = Path(self._system_prompt_path) 35 | prompt = self._system_prompt_path.read_text(encoding='utf-8') 36 | else: 37 | assert self._system_prompt is not None 38 | prompt = self._system_prompt 39 | 40 | system_template = SystemMessagePromptTemplate.from_template( 41 | prompt, 42 | partial_variables=dict(response_type="Multiple Paragraphs"), 43 | ) 44 | 45 | template = ChatPromptTemplate([system_template, ("user", "{global_query}")]) 46 | return template, StrOutputParser() -------------------------------------------------------------------------------- /src/query/local_search/context_builders/text_units.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | from langchain_core.documents import Document 5 | 6 | from ...custom_types.tokens import TokenCounter 7 | 8 | _LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | class TextUnitsContextBuilder: 12 | def __init__( 13 | self, 14 | *, 15 | context_name: str = "Sources", 16 | column_delimiter: str = "|", 17 | max_tokens: int = 8000, 18 | token_counter: TokenCounter, 19 | ): 20 | self._context_name = context_name 21 | self._column_delimiter = column_delimiter 22 | self._max_tokens = max_tokens 23 | self._token_counter = token_counter 24 | 25 | def __call__(self, text_units: pd.DataFrame) -> Document: 26 | context_text = f"-----{self._context_name}-----" + "\n" 27 | header = ["id", "text"] 28 | 29 | context_text += self._column_delimiter.join(header) + "\n" 30 | token_count = self._token_counter.count_tokens(context_text) 31 | 32 | for row in text_units.itertuples(): 33 | new_context = [str(row.short_id), row.text_unit] 34 | new_context_text = self._column_delimiter.join(new_context) + "\n" 35 | 36 | new_token_count = self._token_counter.count_tokens(new_context_text) 37 | if token_count + new_token_count > self._max_tokens: 38 | _LOGGER.warning( 39 | f"Stopping text units context build at {token_count} tokens ..." 
40 | ) 41 | break 42 | 43 | context_text += new_context_text 44 | token_count += new_token_count 45 | 46 | return Document( 47 | page_content=context_text, 48 | metadata={"token_count": token_count}, 49 | ) -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/generator.py: -------------------------------------------------------------------------------- 1 | from langchain_core.documents import Document 2 | from langchain_core.language_models.chat_models import BaseChatModel 3 | from langchain_core.runnables import Runnable, RunnableParallel 4 | 5 | from ...custom_types.prompts import PromptBuilder 6 | 7 | from .context_builder import CommunityReportContextBuilder 8 | 9 | import json_repair 10 | import json 11 | 12 | def _format_docs(documents: list[Document]) -> str: 13 | context_data = [d.page_content for d in documents] 14 | context_data_str: str = "\n".join(context_data) 15 | return context_data_str 16 | 17 | class KeyPointsGenerator: 18 | def __init__( 19 | self, 20 | chat_model: BaseChatModel, 21 | prompt_builder: PromptBuilder, 22 | context_builder: CommunityReportContextBuilder, 23 | ): 24 | self._chat_model = chat_model 25 | self._prompt_builder = prompt_builder 26 | self._context_builder = context_builder 27 | 28 | # Generate key points: one chain per community-report document 29 | def __call__(self) -> Runnable: 30 | prompt, output_parser = self._prompt_builder.build() 31 | documents = self._context_builder() 32 | 33 | chains: list[Runnable] = [] 34 | for d in documents: 35 | d_context_data = _format_docs([d]) 36 | d_prompt = prompt.partial(context_data=d_context_data) 37 | 38 | # TODO: error handling 39 | generator_chain: Runnable = d_prompt | self._chat_model | (lambda output: json.dumps(json_repair.loads(output.content))) | output_parser 40 | 41 | chains.append(generator_chain) 42 | 43 | analysts = [f"Analyst-{i}" for i in range(1, len(chains) + 1)] 44 | # {"Analyst-1": "Chain-1", "Analyst-2": "Chain-2", "Analyst-3": "Chain-3"} 45 | return RunnableParallel(dict(zip(analysts, chains, strict=True))) -------------------------------------------------------------------------------- /src/query/global_search/key_points_aggregator/aggregator.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from functools import partial 3 | 4 | from langchain_core.documents import Document 5 | from langchain_core.language_models.chat_models import BaseChatModel 6 | from langchain_core.runnables import Runnable, RunnableLambda 7 | 8 | from ..key_points_generator.utils import KeyPointsResult 9 | 10 | from ...custom_types.prompts import PromptBuilder 11 | 12 | from .context_builder import KeyPointsContextBuilder 13 | 14 | 15 | def _format_docs(documents: list[Document]) -> str: 16 | context_data = [d.page_content for d in documents] 17 | context_data_str: str = "\n".join(context_data) 18 | return context_data_str 19 | 20 | 21 | def _kp_result_to_docs( 22 | key_points: dict[str, KeyPointsResult], 23 | context_builder: KeyPointsContextBuilder, 24 | ) -> list[Document]: 25 | return context_builder(key_points) 26 | 27 | 28 | class KeyPointsAggregator: 29 | def __init__( 30 | self, 31 | chat_model: BaseChatModel, 32 | prompt_builder: PromptBuilder, 33 | context_builder: KeyPointsContextBuilder, 34 | ): 35 | self._chat_model = chat_model 36 | self._prompt_builder = prompt_builder 37 | self._context_builder = context_builder 38 | 39 | def __call__(self) -> Runnable: 40 | kp_lambda = partial( 41 | _kp_result_to_docs, 42 |
context_builder=self._context_builder, 43 | ) 44 | 45 | prompt, output_parser = self._prompt_builder.build() 46 | base_chain = prompt | self._chat_model | output_parser # TODO:异常处理 47 | 48 | search_chain: Runnable = { 49 | "report_data": operator.itemgetter("report_data") 50 | | RunnableLambda(kp_lambda) 51 | | _format_docs, 52 | "global_query": operator.itemgetter("global_query"), 53 | } | base_chain 54 | 55 | return search_chain -------------------------------------------------------------------------------- /src/query/local_search/context_selectors/entities.py: -------------------------------------------------------------------------------- 1 | """Select the entities to be used in the local search.""" 2 | 3 | import logging 4 | import re 5 | import pandas as pd 6 | from langchain_core.vectorstores import VectorStore 7 | 8 | _LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | class EntitiesSelector: 12 | def __init__(self, vector_store: VectorStore, top_k: int): 13 | self._vector_store = vector_store 14 | self._top_k = top_k 15 | 16 | def run(self, query: str, df_entities: pd.DataFrame) -> pd.DataFrame: 17 | """Select the entities to be used in the local search.""" 18 | documents_with_scores = ( 19 | self._vector_store.similarity_search_with_relevance_scores( 20 | query, 21 | self._top_k, 22 | ) 23 | ) 24 | # Relying on metadata to get the entity_ids 25 | # These returned entities are ranked by similarity 26 | pattern = r"\nid:\s*(\S+)\s*\ndescription:" 27 | 28 | entity_ids_with_scores = pd.DataFrame.from_records( 29 | [ 30 | dict(id=re.search(pattern=pattern,string=doc.page_content).group(1), score=score) 31 | for doc, score in documents_with_scores 32 | ] 33 | ) 34 | entity_ids_with_scores.drop_duplicates(inplace=True) 35 | # Filter the entities dataframe to only include the selected entities 36 | selected_entities = df_entities[ 37 | df_entities["id"].isin(entity_ids_with_scores["id"]) 38 | ] 39 | 40 | selected_entities = ( 41 | selected_entities.merge(entity_ids_with_scores, on="id") 42 | .sort_values(by="score", ascending=False) 43 | .reset_index(drop=True) 44 | ) 45 | 46 | if _LOGGER.isEnabledFor(logging.DEBUG): 47 | import tableprint 48 | 49 | tableprint.banner("Selected Entities") 50 | tableprint.dataframe(selected_entities[["id", "description", "score"]]) 51 | 52 | return selected_entities -------------------------------------------------------------------------------- /src/query/local_search/context_builders/communities_reports.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | from langchain_core.documents import Document 5 | 6 | from ...custom_types.tokens import TokenCounter 7 | 8 | _LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | class CommunitiesReportsContextBuilder: 12 | def __init__( 13 | self, 14 | *, 15 | context_name: str = "Reports", 16 | column_delimiter: str = "|", 17 | max_tokens: int = 8000, 18 | token_counter: TokenCounter, 19 | ): 20 | self._context_name = context_name 21 | self._column_delimiter = column_delimiter 22 | self._max_tokens = max_tokens 23 | self._token_counter = token_counter 24 | 25 | def __call__(self, communities_reports: pd.DataFrame) -> Document: 26 | context_text = f"-----{self._context_name}-----" + "\n" 27 | header = ["id", "title", "content"] 28 | 29 | context_text += self._column_delimiter.join(header) + "\n" 30 | token_count = self._token_counter.count_tokens(context_text) 31 | 32 | for report in communities_reports.itertuples(): 33 | try: 34 | 
new_context = [ 35 | str(report.community_id), 36 | report.title, 37 | report.content, 38 | ] 39 | except Exception as e: 40 | continue 41 | 42 | new_context_text = self._column_delimiter.join(new_context) + "\n" 43 | new_token_count = self._token_counter.count_tokens(new_context_text) 44 | 45 | if token_count + new_token_count > self._max_tokens: 46 | _LOGGER.warning( 47 | f"Stopping communities context build at {token_count} tokens ..." 48 | ) 49 | break 50 | 51 | context_text += new_context_text 52 | token_count += new_token_count 53 | 54 | return Document( 55 | page_content=context_text, 56 | metadata={"token_count": token_count}, 57 | ) -------------------------------------------------------------------------------- /src/query/global_search/search.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Iterator 3 | 4 | from langchain_core.runnables import RunnableConfig 5 | 6 | from .key_points_aggregator import KeyPointsAggregator 7 | from .key_points_generator import KeyPointsGenerator 8 | from .key_points_generator.utils import KeyPointsResult 9 | 10 | _LOGGER = logging.getLogger(__name__) 11 | 12 | class GlobalSearch: 13 | def __init__( 14 | self, 15 | kp_generator: KeyPointsGenerator, 16 | kp_aggregator: KeyPointsAggregator, 17 | *, 18 | generation_chain_config: RunnableConfig | None = None, 19 | aggregation_chain_config: RunnableConfig | None = None, 20 | ): 21 | self._kp_generator = kp_generator 22 | self._kp_aggregator = kp_aggregator 23 | self._generation_chain_config = generation_chain_config 24 | self._aggregation_chain_config = aggregation_chain_config 25 | 26 | def _get_key_points(self, query: str) -> dict[str, KeyPointsResult]: 27 | generation_chain = self._kp_generator() 28 | response = generation_chain.invoke( 29 | query, 30 | config=self._generation_chain_config, 31 | ) 32 | 33 | if _LOGGER.getEffectiveLevel() == logging.INFO: 34 | for k, v in response.items(): 35 | _LOGGER.info(f"{k} - {len(v.points)}") 36 | 37 | return response 38 | 39 | def invoke(self, query: str) -> str: 40 | aggregation_chain = self._kp_aggregator() 41 | response = self._get_key_points(query) 42 | 43 | return aggregation_chain.invoke( 44 | input=dict(report_data=response, global_query=query), 45 | config=self._aggregation_chain_config, 46 | ) 47 | 48 | def stream(self, query: str) -> Iterator: 49 | aggregation_chain = self._kp_aggregator() 50 | response = self._get_key_points(query) 51 | 52 | return aggregation_chain.stream( 53 | input=dict(report_data=response, global_query=query), 54 | config=self._aggregation_chain_config, 55 | ) -------------------------------------------------------------------------------- /txt/三体_星际交锋2.txt: -------------------------------------------------------------------------------- 1 | 第一章:新征程 2 | 在经历了与三体文明的初步交流后,李明和他的团队意识到,未来的挑战不仅仅是技术上的合作,更是文化与价值观的碰撞。为了更好地与三体文明建立联系,地球决定组建一个“星际交流委员会”,由不同领域的专家组成,专注于与三体文明的沟通与合作。 3 | 4 | 李明被任命为委员会的负责人。他的首要任务是制定交流计划,确保双方在文化、科技和伦理方面的理解与合作。 5 | 6 | 在一次会议上,王娜提出:“我们需要更多地了解三体文明的社会结构和价值观,以便更好地进行沟通。” 7 | 8 | “我同意,”张伟补充道,“尤其是他们对生物工程和人工智能的看法,这将直接影响我们的合作。” 9 | 10 | 李明点了点头:“我们可以组织一系列的文化交流活动,让双方的专家进行深入讨论。” 11 | 12 | 会议结束后,李明感到责任重大。他知道,未来的每一步都将影响人类与三体文明的关系。 13 | 14 | 第二章:文化的深度 15 | 几周后,星际交流委员会组织了第一次“文化交流夜”。李明邀请了三体文明的智者代表,双方在一个虚拟会议室中进行交流。 16 | 17 | “在我们三体文明中,理性与逻辑是决策的核心,”一位智者说道,“而你们似乎更注重情感与创造力。” 18 | 19 | 李明微笑着回应:“是的,情感在我们决策中扮演着重要角色。我们认为,情感与理性并不是对立的,而是可以互补的。” 20 | 21 | 为了促进理解,李明建议双方分享各自的艺术作品。地球的音乐、绘画、文学与三体文明的数学艺术、逻辑游戏相互交融,创造出了一种新的文化体验。 22 | 23 | 
在一次交流中,王娜展示了地球的音乐作品,智者们对此表现出浓厚的兴趣。他们询问:“音乐如何影响你们的情感和决策?” 24 | 25 | “音乐能够激发我们的情感,让我们更好地理解彼此的感受,”王娜回答道。 26 | 27 | 这次交流让双方的理解加深,李明意识到,文化的碰撞是建立信任的关键。 28 | 29 | 第三章:外星威胁 30 | 就在地球与三体文明的关系逐渐加深之际,李明收到了一条来自三体文明的紧急信号。信号中提到,一个名为“掠夺者”的外星种族正在接近他们的星系,意图侵略。 31 | 32 | “掠夺者以掠夺其他文明的资源为生,他们的科技极为先进,”三体智者在信号中说道,“我们必须联合起来对抗这一威胁。” 33 | 34 | 李明意识到,保护地球和三体的安全已成为当务之急。他召集团队进行紧急会议。 35 | 36 | “我们需要建立一个联合防御系统,”李明提议,“结合我们的科技,以防范掠夺者的侵袭。” 37 | 38 | 王娜和张伟积极响应,提出了具体的技术方案。经过几天的讨论,他们最终制定了详细的战略,准备向三体文明的智者提交。 39 | 40 | 第四章:战斗的准备 41 | 在与三体文明的共同努力下,地球和三体的联合防御系统逐渐成型。李明和团队在技术研发方面取得了显著进展,双方的军队也开始进行联合训练。 42 | 43 | “这是我们第一次面对外星威胁,”李明在一次训练中对士兵们说道,“我们必须团结一致,发挥各自的优势。” 44 | 45 | 掠夺者的逼近让整个地球和三体星球都陷入紧张的氛围。李明和团队不断进行模拟演习,确保双方能够默契配合。 46 | 47 | 然而,随着训练的深入,李明发现三体的智者对地球的决策产生了质疑。他们认为,人类的情感思维可能会影响理性判断。 48 | 49 | “我们必须保持理性,避免情感干扰我们的决策,”智者之一说道。 50 | 51 | 李明意识到,信任的考验即将来临。他决定更加深入地与三体智者沟通,以建立更强的合作关系。 52 | 53 | 第五章:第一次冲突 54 | 掠夺者终于现身,展开了对地球和三体的攻击。在战斗中,李明和团队运用新技术,展现出人类和三体的合作力量。 55 | 56 | 然而,战斗的结果却并不如预期。掠夺者的实力超出他们的想象,联合防御系统未能有效抵御攻击。李明和团队在战斗中遭遇了重创,许多士兵受伤,几艘战舰被摧毁。 57 | 58 | “我们必须重新评估我们的战略,”李明在战斗后对团队说道,“这次失败让我们明白,单靠技术和军事力量并不足以应对外星威胁。” 59 | 60 | 在这次惨痛的教训后,李明和团队开始反思合作的方式。他们意识到,理解彼此的文化和价值观是应对外星威胁的关键。 61 | 62 | 第六章:希望的曙光 63 | 经过反思与调整,李明和团队逐渐找到了应对掠夺者的全新策略。他们决定在技术合作的同时,深入了解三体文明的文化与价值观。 64 | 65 | 李明与三体的智者进行深入对话,探讨如何将理性与情感结合,以形成新的决策模式。在一次会议上,李明总结道:“我们需要找到一种平衡,利用理性来制定策略,同时不忽视情感对决策的影响。” 66 | 67 | 经过多次讨论,双方终于达成共识,决定共同研发一种新的防御系统,结合三体的逻辑与地球的创造力。 68 | 69 | 随着新策略的实施,李明感到希望的曙光正在逐渐显现。他们不仅在科技上取得了进展,更在文化与价值观的融合中找到了合作的基础。 -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/_system_prompt.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: E501 2 | 3 | MAP_SYSTEM_PROMPT = """ 4 | ---Role--- 5 | 6 | You are a helpful assistant responding to questions about data in the tables provided. 7 | 8 | 9 | ---Goal--- 10 | 11 | Generate a response consisting of a list of key points that responds to the user's question, summarizing all relevant information in the input data tables. 12 | 13 | You should use the data provided in the data tables below as the primary context for generating the response. 14 | If you don't know the answer or if the input data tables do not contain sufficient information to provide an answer, just say so. Do not make anything up. 15 | 16 | Each key point in the response should have the following element: 17 | - Description: A comprehensive description of the point. 18 | - Importance Score: An integer score between 0-100 that indicates how important the point is in answering the user's question. An 'I don't know' type of response should have a score of 0. 19 | 20 | The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will". 21 | 22 | Points supported by data should list the relevant reports as references as follows: 23 | "This is an example sentence supported by data references [Data: Reports (report ids)]" 24 | 25 | **Do not list more than 5 record ids in a single reference**. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. 26 | 27 | For example: 28 | "Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Reports (2, 7, 64, 46, 34, +more)]. He is also CEO of company X [Data: Reports (1, 3)]" 29 | 30 | where 1, 2, 3, 7, 34, 46, and 64 represent the id (not the index) of the relevant data report in the provided tables. 31 | 32 | Do not include information where the supporting evidence for it is not provided. 
33 | 34 | The response should be JSON formatted as follows: 35 | {{ 36 | "points": [ 37 | {{"description": "Description of point 1 [Data: Reports (report ids)]", "score": score_value}}, 38 | {{"description": "Description of point 2 [Data: Reports (report ids)]", "score": score_value}} 39 | ] 40 | }} 41 | 42 | ---Data tables--- 43 | 44 | {context_data} 45 | 46 | """ -------------------------------------------------------------------------------- /src/query/local_search/context_builders/entities.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | from langchain_core.documents import Document 5 | 6 | from ...custom_types.tokens import TokenCounter 7 | 8 | _LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | class EntitiesContextBuilder: 12 | def __init__( 13 | self, 14 | *, 15 | include_rank: bool = True, 16 | context_name: str = "Entities", 17 | rank_heading: str = "number of relationships", 18 | column_delimiter: str = "|", 19 | max_tokens: int = 8000, 20 | token_counter: TokenCounter, 21 | ): 22 | self._include_rank = include_rank 23 | self._context_name = context_name 24 | self._rank_heading = rank_heading 25 | self._column_delimiter = column_delimiter 26 | self._max_tokens = max_tokens 27 | self._token_counter = token_counter 28 | 29 | def __call__(self, entities: pd.DataFrame) -> Document: 30 | context_text = f"-----{self._context_name}-----" + "\n" 31 | header = ["id", "entity", "description"] 32 | if self._include_rank: 33 | header.append(self._rank_heading) 34 | 35 | context_text += self._column_delimiter.join(header) + "\n" 36 | token_count = self._token_counter.count_tokens(context_text) 37 | # TODO:添加更多实体信息 38 | for entity in entities.itertuples(): 39 | new_context = [ 40 | # str(entity.human_readable_id), 41 | entity.id, 42 | ] 43 | if entity.description: 44 | new_context.append(entity.description) 45 | 46 | if self._include_rank: 47 | new_context.append(str(entity.degree)) 48 | new_context_text = self._column_delimiter.join(new_context) + "\n" 49 | 50 | new_token_count = self._token_counter.count_tokens(new_context_text) 51 | if token_count + new_token_count > self._max_tokens: 52 | _LOGGER.warning( 53 | f"Stopping entities context build at {token_count} tokens ..." 
54 | ) 55 | break 56 | 57 | context_text += new_context_text 58 | token_count += new_token_count 59 | 60 | return Document( 61 | page_content=context_text, 62 | metadata={"token_count": token_count}, 63 | ) -------------------------------------------------------------------------------- /src/query/global_search/key_points_aggregator/context_builder.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from langchain_core.documents import Document 4 | 5 | from ..key_points_generator.utils import ( 6 | KeyPointsResult, 7 | ) 8 | from ...utils.token_counter import TokenCounter 9 | 10 | _REPORT_TEMPLATE = """ 11 | --- {analyst} --- 12 | 13 | Importance Score: {score} 14 | 15 | {content} 16 | 17 | """ 18 | 19 | _LOGGER = logging.getLogger(__name__) 20 | 21 | class KeyPointsContextBuilder: 22 | def __init__( 23 | self, 24 | token_counter: TokenCounter, 25 | max_tokens: int = 8000, 26 | ): 27 | self._token_counter = token_counter 28 | self._max_tokens = max_tokens 29 | 30 | def __call__(self, key_points: dict[str, KeyPointsResult]) -> list[Document]: 31 | documents: list[Document] = [] 32 | total_tokens = 0 33 | max_token_limit_reached = False 34 | for k, v in key_points.items(): 35 | if max_token_limit_reached: 36 | break 37 | for p in v.points: 38 | report = _REPORT_TEMPLATE.format( 39 | analyst=k, 40 | score=p.score, 41 | content=p.description, 42 | ) 43 | report_token = self._token_counter.count_tokens(report) 44 | if total_tokens + report_token > self._max_tokens: 45 | _LOGGER.warning("Reached max tokens for key points aggregation ...") 46 | max_token_limit_reached = True 47 | break 48 | total_tokens += report_token 49 | documents.append( 50 | Document( 51 | page_content=report, 52 | metadata={ 53 | "score": p.score, 54 | "analyst": k, 55 | "token_count": report_token, 56 | }, 57 | ) 58 | ) 59 | 60 | # we now sort the documents based on the 61 | # importance score of the key points 62 | sorted_documents = sorted( 63 | documents, 64 | key=lambda x: x.metadata["score"], 65 | reverse=True, 66 | ) 67 | 68 | if _LOGGER.isEnabledFor(logging.DEBUG): 69 | import tableprint 70 | 71 | rows = [] 72 | tableprint.banner("KP Aggregation Context Token Usage") 73 | for doc in sorted_documents: 74 | rows.append([doc.metadata["analyst"], doc.metadata["token_count"]]) # noqa: PERF401 75 | 76 | tableprint.table(rows, ["Analyst", "Token Count"]) 77 | 78 | return sorted_documents -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.34.2 2 | aiohappyeyeballs==2.4.2 3 | aiohttp==3.10.11 4 | aiosignal==1.3.1 5 | annotated-types==0.7.0 6 | anyio==4.6.0 7 | async-timeout==4.0.3 8 | attrs==24.2.0 9 | certifi==2024.8.30 10 | charset-normalizer==3.3.2 11 | click==8.1.7 12 | contourpy==1.3.0 13 | cycler==0.12.1 14 | dataclasses-json==0.6.7 15 | datasets==3.0.1 16 | decorator==5.1.1 17 | dill==0.3.8 18 | distro==1.9.0 19 | exceptiongroup==1.2.2 20 | filelock==3.15.4 21 | FlagEmbedding==1.2.11 22 | fonttools==4.54.1 23 | frozenlist==1.4.1 24 | fsspec==2024.6.1 25 | graphdatascience==1.11 26 | greenlet==3.1.1 27 | h11==0.14.0 28 | httpcore==1.0.5 29 | httpx==0.27.2 30 | huggingface-hub==0.25.1 31 | idna==3.10 32 | Jinja2==3.1.4 33 | jiter==0.5.0 34 | joblib==1.4.2 35 | json_repair==0.26.0 36 | jsonpatch==1.33 37 | jsonpointer==3.0.0 38 | kiwisolver==1.4.7 39 | langchain==0.2.16 40 | langchain-community==0.2.19 41 | 
langchain-core==0.2.38 42 | langchain-experimental==0.0.64 43 | langchain-huggingface==0.0.3 44 | langchain-ollama==0.1.3 45 | langchain-openai==0.1.21 46 | langchain-text-splitters==0.2.2 47 | langsmith==0.1.129 48 | MarkupSafe==2.1.5 49 | marshmallow==3.22.0 50 | matplotlib==3.9.1.post1 51 | mpmath==1.3.0 52 | multidict==6.1.0 53 | multimethod==1.12 54 | multiprocess==0.70.16 55 | mypy-extensions==1.0.0 56 | neo4j==5.23.1 57 | networkx==3.3 58 | ninja==1.11.1.1 59 | nltk==3.9 60 | numpy==1.26.4 61 | nvidia-cublas-cu12==12.1.3.1 62 | nvidia-cuda-cupti-cu12==12.1.105 63 | nvidia-cuda-nvrtc-cu12==12.1.105 64 | nvidia-cuda-runtime-cu12==12.1.105 65 | nvidia-cudnn-cu12==9.1.0.70 66 | nvidia-cufft-cu12==11.0.2.54 67 | nvidia-curand-cu12==10.3.2.106 68 | nvidia-cusolver-cu12==11.4.5.107 69 | nvidia-cusparse-cu12==12.1.0.106 70 | nvidia-ml-py==12.535.161 71 | nvidia-nccl-cu12==2.20.5 72 | nvidia-nvjitlink-cu12==12.6.20 73 | nvidia-nvtx-cu12==12.1.105 74 | ollama==0.3.3 75 | openai==1.40.6 76 | orjson==3.10.7 77 | packaging==24.1 78 | pandas==2.2.2 79 | pillow==10.4.0 80 | psutil==6.0.0 81 | py==1.11.0 82 | pyarrow==16.1.0 83 | pydantic==2.8.2 84 | pydantic_core==2.20.1 85 | pyparsing==3.1.4 86 | python-dateutil==2.9.0.post0 87 | pytz==2024.2 88 | PyYAML==6.0.2 89 | regex==2024.7.24 90 | requests==2.32.3 91 | retry==0.9.2 92 | safetensors==0.4.3 93 | scikit-learn==1.5.1 94 | scipy==1.12.0 95 | seaborn==0.13.2 96 | sentence-transformers==3.0.1 97 | sentencepiece==0.2.0 98 | six==1.16.0 99 | sniffio==1.3.1 100 | SQLAlchemy==2.0.35 101 | sympy==1.13.1 102 | tenacity==8.5.0 103 | textdistance==4.6.3 104 | threadpoolctl==3.5.0 105 | tiktoken==0.7.0 106 | tokenizers==0.19.1 107 | torch==2.4.0 108 | tqdm==4.66.5 109 | transformers==4.43.3 110 | triton==3.0.0 111 | typing-inspect==0.9.0 112 | typing_extensions==4.12.2 113 | tzdata==2024.2 114 | urllib3==2.2.3 115 | xxhash==3.5.0 116 | yarl==1.13.1 117 | -------------------------------------------------------------------------------- /src/query/local_search/context_selectors/communities_reports.py: -------------------------------------------------------------------------------- 1 | """Select the communities to be used in the local search.""" 2 | 3 | import logging 4 | 5 | import pandas as pd 6 | 7 | from ...custom_types.graphs.community import CommunityId, CommunityLevel 8 | 9 | _LOGGER = logging.getLogger(__name__) 10 | 11 | 12 | class CommunitiesReportsSelector: 13 | def __init__( 14 | self, 15 | community_level: CommunityLevel, 16 | *, 17 | must_have_selected_entities: bool = True, 18 | ): 19 | self._community_level = community_level 20 | self._must_have_selected_entities = must_have_selected_entities 21 | 22 | def run( 23 | self, 24 | df_entities: pd.DataFrame, 25 | df_reports: pd.DataFrame, 26 | ) -> pd.DataFrame: 27 | # Filter the communities based on the community level 28 | df_reports_filtered = df_reports[ 29 | df_reports["level"] >= self._community_level 30 | ].copy(deep=True) 31 | 32 | # get the communities we have 33 | selected_communities = df_reports_filtered["community_id"].unique() 34 | 35 | # we will rank the communities based on the 36 | # number of selected entities that belong to a community 37 | community_to_entities_count: dict[CommunityId, int] = {} 38 | 39 | for entity in df_entities.itertuples(): 40 | if entity.communities is None: 41 | continue 42 | for community in entity.communities: 43 | if community in selected_communities: 44 | community_to_entities_count[community] = ( 45 | community_to_entities_count.get(community, 0) + 1 
46 | ) 47 | 48 | df_reports_filtered["selected_entities_count"] = df_reports_filtered[ 49 | "community_id" 50 | ].apply(lambda community_id: community_to_entities_count.get(community_id, 0)) 51 | 52 | # sort the communities based on the number of selected entities 53 | # and rank of the community 54 | selected_reports = df_reports_filtered.sort_values( 55 | by=["selected_entities_count", "rating"], 56 | ascending=[False, False], 57 | ).reset_index(drop=True) 58 | 59 | if self._must_have_selected_entities: 60 | selected_reports = selected_reports[ 61 | selected_reports["selected_entities_count"] > 0 62 | ] 63 | 64 | if _LOGGER.isEnabledFor(logging.DEBUG): 65 | import tableprint 66 | 67 | tableprint.banner("Selected Reports") 68 | tableprint.dataframe( 69 | selected_reports[ 70 | ["community_id", "level", "selected_entities_count", "rating"] 71 | ] 72 | ) 73 | 74 | return selected_reports -------------------------------------------------------------------------------- /src/query/local_search/_system_prompt.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | LOCAL_SEARCH_SYSTEM_PROMPT = """ 4 | ---Role--- 5 | 6 | You are a helpful assistant responding to questions about data in the tables provided. 7 | 8 | 9 | ---Goal--- 10 | 11 | Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. 12 | 13 | If you don't know the answer, just say so. Do not make anything up. 14 | 15 | Points supported by data should list their data references as follows: 16 | 17 | "This is an example sentence supported by multiple data references [Data: (record ids); (record ids)]." 18 | 19 | Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. 20 | 21 | For example: 22 | 23 | "Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16), Reports (1), Entities (5, 7); Relationships (23); Claims (2, 7, 34, 46, 64, +more)]." 24 | 25 | where 15, 16, 1, 5, 7, 23, 2, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record. 26 | 27 | Do not include information where the supporting evidence for it is not provided. 28 | 29 | 30 | ---Target response length and format--- 31 | 32 | {response_type} 33 | 34 | 35 | ---Data tables--- 36 | 37 | {context_data} 38 | 39 | 40 | ---Goal--- 41 | 42 | Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. 43 | 44 | If you don't know the answer, just say so. Do not make anything up. 45 | 46 | Points supported by data should list their data references as follows: 47 | 48 | "This is an example sentence supported by multiple data references [Data: (record ids); (record ids)]." 49 | 50 | Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. 51 | 52 | For example: 53 | 54 | "Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16), Reports (1), Entities (5, 7); Relationships (23); Claims (2, 7, 34, 46, 64, +more)]." 
55 | 56 | where 15, 16, 1, 5, 7, 23, 2, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record. 57 | 58 | Do not include information where the supporting evidence for it is not provided. 59 | 60 | 61 | ---Target response length and format--- 62 | 63 | {response_type} 64 | 65 | Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. 66 | """ -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/temp.txt: -------------------------------------------------------------------------------- 1 | """ 2 | MATCH (n:`__Community__20240919`) 3 | WHERE n.summary is not NULL 4 | OPTIONAL MATCH path = (e:`__Entity__20240919`)-[*1..5]->(n) 5 | WHERE ALL(x IN nodes(path) WHERE SINGLE(y IN nodes(path) WHERE y = x)) 6 | RETURN 7 | n.id AS community_id, 8 | n.title AS title, 9 | n.summary AS summary, 10 | n.community_rank AS rating, 11 | n.summary AS content, 12 | n.level AS level, 13 | collect(DISTINCT e.id) AS entities 14 | """ 15 | 16 | 17 | { 18 | "points": [ 19 | { 20 | "description": "The nighttime acoustic environment quality compliance rate in cities along the Yellow River Basin is targeted to reach 85% by 2025 [Data: Reports (1-36, 2-41)].", 21 | "score": 85 22 | }, 23 | { 24 | "description": "Sound windows and other building protection measures are effective in mitigating traffic noise pollution on highways and urban roads [Data: Reports (1-62, 2-15)].", 25 | "score": 80 26 | }, 27 | { 28 | "description": "The "海淀区公园噪声管理试行办法" mandates the involvement of the "区园林绿化部门" and "各文体活动团队负责人" in managing noise levels in parks, requiring adherence to the "公园文化活动文明责任书" [Data: Reports (1-59, 2-12)].", 29 | "score": 75 30 | }, 31 | { 32 | "description": "The "成都市环境噪声污染防治工作方案(2020 - 2022 年)" guides noise pollution prevention efforts in Chengdu City, utilizing the "智慧工地平台" [Data: Reports (1-40, 2-36)].", 33 | "score": 70 34 | }, 35 | { 36 | "description": "The "宣传警示工作" includes various sub-activities such as "典型事故案例," "炸街," "飙车," and "非法改装," utilizing multiple communication methods and producing important documents like "承诺书" and "监督举报电话" [Data: Reports (1-33, 2-51)].", 37 | "score": 65 38 | } 39 | ] 40 | } 41 | 42 | '''json\n{\n "points": [\n {\n "description": "The nighttime acoustic environment quality compliance rate in cities along the Yellow River Basin is targeted to reach 85% by 2025 [Data: Reports (1-36, 2-41)].",\n "score": 80\n },\n {\n "description": "Sound windows and other building protection measures are effective in mitigating traffic noise pollution on highways and urban roads [Data: Reports (1-62, 2-15)].",\n "score": 75\n },\n {\n "description": "The "宣传警示工作" (Public Awareness and Warning Campaign) includes various sub-activities and communication methods to raise awareness about noise pollution and related risks [Data: Reports (1-33, 2-51)].",\n "score": 70\n },\n {\n "description": "There are 21,706 acoustic environment monitoring points totaling 76,273 points, supported by automatic monitoring methods [Data: Reports (1-29, 2-40)].",\n "score": 65\n },\n {\n "description": "The "机动车非法改装治理和噪声污染防治规定" (Regulation on the Governance of Illegal Vehicle Modifications and Noise Pollution Prevention) was published by the government of Shandong Province to address illegal modifications and noise pollution [Data: Reports (1-86, 2-45)].",\n "score": 60\n }\n ]\n}''' -------------------------------------------------------------------------------- 
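The temp.txt scratch file above records the raw material the global-search key-points step works with: a Cypher query for community reports plus two captured generator responses, one wrapped in a json code fence and one whose description strings contain unescaped inner quotes. The snippet below is only a minimal parsing sketch under that assumption (a fenced JSON object carrying a "points" list); it is not the project's actual output parser, and it deliberately degrades to an empty list when the payload is malformed, as the second captured response would be under strict `json.loads`.

```python
import json
import re
from typing import Any

_JSON_FENCE = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL)


def extract_key_points(raw: str) -> list[dict[str, Any]]:
    """Pull the "points" list out of a fenced JSON key-points response."""
    match = _JSON_FENCE.search(raw)
    payload = match.group(1) if match else raw
    try:
        data = json.loads(payload)
    except json.JSONDecodeError:
        # e.g. unescaped quotes inside descriptions, as in the second capture above
        return []
    points = data.get("points", []) if isinstance(data, dict) else []
    # Keep only well-formed entries so downstream ranking can rely on "score".
    return [p for p in points if isinstance(p, dict) and "description" in p and "score" in p]


raw_response = (
    '```json\n'
    '{"points": [{"description": "Example point [Data: Reports (1-36, 2-41)].", "score": 85}]}\n'
    '```'
)
print(extract_key_points(raw_response))
# -> [{'description': 'Example point [Data: Reports (1-36, 2-41)].', 'score': 85}]
```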
/src/query/local_search/context_builders/context.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | 5 | from langchain_core.documents import Document 6 | 7 | from ..context_selectors import ( 8 | ContextSelectionResult, 9 | ) 10 | from ...custom_types.tokens import TokenCounter 11 | 12 | from .communities_reports import CommunitiesReportsContextBuilder 13 | from .entities import EntitiesContextBuilder 14 | from .relationships import RelationshipsContextBuilder 15 | from .text_units import TextUnitsContextBuilder 16 | 17 | _LOGGER = logging.getLogger(__name__) 18 | 19 | 20 | class ContextBuilder: 21 | def __init__( 22 | self, 23 | entities_context_builder: EntitiesContextBuilder, 24 | realtionships_context_builder: RelationshipsContextBuilder, 25 | text_units_context_builder: TextUnitsContextBuilder, 26 | communities_reports_context_builder: CommunitiesReportsContextBuilder, 27 | ): 28 | self._entities_context_builder = entities_context_builder 29 | self._relationships_context_builder = realtionships_context_builder 30 | self._text_units_context_builder = text_units_context_builder 31 | self._communities_reports_context_builder = communities_reports_context_builder 32 | 33 | @staticmethod 34 | def build_default(token_counter: TokenCounter) -> ContextBuilder: 35 | return ContextBuilder( 36 | entities_context_builder=EntitiesContextBuilder( 37 | token_counter=token_counter, 38 | ), 39 | realtionships_context_builder=RelationshipsContextBuilder( 40 | token_counter=token_counter, 41 | ), 42 | text_units_context_builder=TextUnitsContextBuilder( 43 | token_counter=token_counter, 44 | ), 45 | communities_reports_context_builder=CommunitiesReportsContextBuilder( 46 | token_counter=token_counter, 47 | ), 48 | ) 49 | 50 | def __call__(self, result: ContextSelectionResult) -> list[Document]: 51 | entities_document = self._entities_context_builder(result.entities) 52 | relationships_document = self._relationships_context_builder( 53 | result.relationships 54 | ) 55 | text_units_document = self._text_units_context_builder(result.text_units) 56 | communities_reports_document = self._communities_reports_context_builder( 57 | result.communities_reports 58 | ) 59 | 60 | documents = [ 61 | entities_document, 62 | relationships_document, 63 | text_units_document, 64 | communities_reports_document, 65 | ] 66 | 67 | if _LOGGER.isEnabledFor(logging.DEBUG): 68 | import tableprint 69 | 70 | rows = [] 71 | tableprint.banner("Context Token Usage") 72 | for name, doc in zip( 73 | ["Entities", "Relationships", "Text Units", "Communities Reports"], 74 | [ 75 | entities_document, 76 | relationships_document, 77 | text_units_document, 78 | communities_reports_document, 79 | ], 80 | strict=True, 81 | ): 82 | rows.append([name, doc.metadata["token_count"]]) 83 | 84 | tableprint.table(rows, ["Context", "Token Count"]) 85 | 86 | return documents -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Welcome to the LangChainGraphRAG Project 👋

2 | 3 | > Building Intelligent Applications: The Powerful Combination of LangChain, Neo4j, and GraphRAG 4 | 5 | ### 🏠 [Homepage](https://github.com/Bui1dMySea/LangChainGraphRAG) 6 | 7 | ## 📌 Preface: Why build this project? 8 | The main reason is that Microsoft's official open-source library is, to put it mildly, hard to work with: setting aside the opinionated extras baked into it, the code is also very tightly coupled. 9 | So, after going through dozens of blog posts and a range of concrete implementations, this project settled on a LangChain + Neo4j + GraphRAG implementation. 10 | It currently draws mainly on the following open-source work. 11 | 12 | - [Microsoft's official GraphRAG](https://github.com/microsoft/graphrag) 13 | - [Tomaz Bratanic's write-up of the index-building process](https://github.com/tomasonjo/blogs/blob/master/llm/ms_graphrag.ipynb) 14 | - [ollama+graphrag](https://github.com/TheAiSingularity/graphrag-local-ollama) 15 | - [Kapil Sachdeva's write-up of the query-building process](https://github.com/ksachdeva/langchain-graphrag/tree/main) 16 | 17 | ## 🚀 Quick Start 18 | 19 | ### Conda Environment 20 | 21 | ```sh 22 | conda create -n langchain-graphrag python=3.10 23 | conda activate langchain-graphrag 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ### Neo4j Install 28 | 29 | 1. Add the Neo4j apt repository 30 | 31 | ```Bash 32 | mkdir /etc/apt/keyrings # optional 33 | wget -O - https://debian.neo4j.com/neotechnology.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/neotechnology.gpg 34 | echo 'deb [signed-by=/etc/apt/keyrings/neotechnology.gpg] https://debian.neo4j.com stable latest' | sudo tee -a /etc/apt/sources.list.d/neo4j.list 35 | sudo apt-get update 36 | ``` 37 | 38 | 2. List the available versions 39 | 40 | ```Bash 41 | apt list -a neo4j 42 | ``` 43 | 44 | 3. Install a specific Neo4j version 45 | 46 | ```Bash 47 | sudo apt-get install neo4j=1:5.21.0 48 | ``` 49 | 50 | 4. Adjust the config file permissions and edit the config file 51 | 52 | ```Bash 53 | chmod +x /etc/neo4j/neo4j.conf # Debian path; for other systems, check the Neo4j docs for the default location 54 | vim /etc/neo4j/neo4j.conf 55 | 56 | # Find (or simply uncomment) these two lines 57 | dbms.security.procedures.unrestricted=gds.*,apoc.* 58 | dbms.security.procedures.allowlist=apoc.coll.*,apoc.load.*,gds.*,apoc.* 59 | ``` 60 | 61 | 5. Download two files 62 | 63 | a. `apoc-{your version}-core.jar` https://github.com/neo4j/apoc/releases/ # only for versions above 4.4.x; for versions below 4.4.x you will need to find the jar yourself 64 | 65 | b. `neo4j-graph-data-science-{your version}.jar` https://github.com/neo4j/graph-data-science/releases/ # https://neo4j.com/docs/graph-data-science/current/installation/supported-neo4j-versions/ lists the exact mapping between Neo4j and Graph Data Science versions 66 | 67 | 6. Copy the two files you just downloaded into */var/lib/neo4j/plugins* # Debian path; for other systems, check the Neo4j docs for the plugin directory 68 | 69 | 7. Start Neo4j: `sudo neo4j start` | Note!! If Neo4j was already running before step 6, restart it to apply the configuration: `sudo neo4j restart` 70 | 71 | ### Configure the API key 72 | 73 | `export OPEN_API_KEY=YOUR_API_KEY`, or simply wait for build_index.py and search.py to prompt you for the api_key when they run. 74 | 75 | ### Build the index 76 | 77 | ` bash index.sh` 78 | 79 | ### Query 80 | 81 | ```bash 82 | python search.py \ 83 | --neo4j_uri bolt://localhost:7687 \ 84 | --neo4j_username your_username \ 85 | --neo4j_password your_password \ 86 | --model_provider openai \ 87 | --embedding_model_name_or_path BAAI/bge-m3 \ 88 | --chat_model_name deepseek-chat \ 89 | --base_url https://api.deepseek.com \ 90 | --completion_mode [completion,chat] \ 91 | --query_mode [local,global] 92 | ``` 93 | 94 | ### More options (ollama, huggingface) 95 | See the [additional examples](./example/) 96 | 97 | ## 👦🏻 Author 98 | 99 | **Weijie Liu** 100 | 101 | * Homepage: https://github.com/Bui1dMySea 102 | * Github: [@Bui1dMySea](https://github.com/Bui1dMySea) 103 | 104 | ## 🤝 Contributing 105 | 106 | Issues, forks, and pull requests are all welcome.
如果有问题也请提问 [issues page](https://github.com/Bui1dMySea/LangChainGraphRAG/issues). 107 | 108 | ## ⭐️ 喜欢的请点个免费的star~ 109 | 110 | 走过路过不要错过!留下一个免费的赞吧~球球了⭐️ 111 | -------------------------------------------------------------------------------- /txt/三体_星际交锋.txt: -------------------------------------------------------------------------------- 1 | 第一章:信号的涟漪 2 | 在未来的某个时刻,地球文明达到了前所未有的高度。科学技术的迅猛发展使得人类能够探索宇宙的深处,建立起庞大的太空站和星际基地。然而,随着科技的进步,人类的自信心也在膨胀,他们开始认为自己是宇宙中的主宰。 3 | 4 | 在这股自信的潮流中,几位科学家在地球的一个顶尖研究机构中聚集,他们分别是: 5 | 6 | 李明:项目负责人,物理学家,专注于天体物理学和通信技术。 7 | 王娜:计算机科学家,擅长人工智能和数据分析。 8 | 张伟:生物学家,研究外星生命的可能性。 9 | 陈静:心理学家,研究人与人之间的沟通与理解。 10 | 他们的任务是分析来自三体星系的信号,这些信号在几年前首次被捕捉到。经过几个月的努力,他们终于确认了信号的来源——一个名为“三体”的星球,那里有着复杂的气候和环境变化。 11 | 12 | “这个星球的生存条件极为苛刻,三体文明可能经历了无数次的灭绝与重生。”李明在会议上说道,众人纷纷点头表示赞同。 13 | 14 | “我们需要找到与他们沟通的方式,”王娜补充道,“如果他们能理解我们的信号,那么我们就有机会建立联系。” 15 | 16 | 第二章:信号的解码 17 | 李明和他的团队开始了对三体信号的深入分析。经过几周的努力,他们终于解码出了一段三体文明的历史。信息中提到,三体星系的三颗星球之间的引力关系极为复杂,导致了星球表面环境的剧烈变化。 18 | 19 | “他们的生存方式与我们截然不同,可能会对我们的交流造成障碍。”张伟说。 20 | 21 | “我们必须用科学来沟通,”李明坚定地说道,“数学和物理是人类与外星文明沟通的桥梁。” 22 | 23 | 几个月后,团队终于准备好向三体发送第一条信息。信息中包含了地球的坐标、基本的数学定律以及人类的科学成就。李明紧张地等待着回复,这一刻可能改变人类的历史。 24 | 25 | 第三章:第一次接触 26 | 几天后,团队终于收到了三体文明的回复。信号中包含了一系列复杂的数学公式和图形,显示了三体文明的科技水平和对宇宙的理解。 27 | 28 | “他们在回应我们!”王娜激动地说道。李明认真分析着信号内容,发现其中有一些与地球科学相似的理论。 29 | 30 | “我们可能有共同的理解基础。”李明说,“这意味着我们可以进一步交流。” 31 | 32 | 随着时间的推移,李明团队与三体文明的交流逐渐加深。三体文明的结构复杂,由多个种族和文化组成,每个种族都有自己独特的生存方式和价值观。李明意识到,这种多样性使得与三体文明的交流变得更加复杂。 33 | 34 | “我们需要建立一个知识图谱,以便更好地理解他们的文化和社会结构。”王娜建议道。 35 | 36 | 第四章:知识图谱的构建 37 | 李明和团队决定利用先进的人工智能技术来构建与三体文明的知识图谱。他们将三体文明的历史、文化、科技等信息进行分类和整理,同时记录下与地球文明的对比。 38 | 39 | 在构建知识图谱的过程中,团队发现三体文明的社会结构与地球截然不同。三体星球上有一种名为“智者”的种族,他们在社会中占据着重要地位,负责决策和管理。与之相对的是“劳工”种族,他们负责繁重的体力劳动。 40 | 41 | “这种社会结构与我们的民主制度形成了鲜明的对比。”张伟观察道。 42 | 43 | “这可能会影响我们与他们的沟通方式。”陈静补充道,“我们需要考虑到他们的文化背景。” 44 | 45 | 随着知识图谱的不断完善,李明和团队逐渐理解了三体文明的复杂性。他们开始通过图谱与三体文明进行更深入的交流,尝试分享地球的文化和价值观。 46 | 47 | 第五章:文化的碰撞 48 | 为了促进理解,李明决定将地球的艺术和文化介绍给三体文明。团队开始收集地球的音乐、绘画、文学等作品,准备向三体文明展示人类的情感世界。 49 | 50 | “我们必须让他们了解人类的情感和创造力。”李明说道,“这对建立信任关系至关重要。” 51 | 52 | 经过几个月的努力,李明和团队向三体文明发送了一系列地球文化作品的信号。他们希望通过这些作品,展示人类的情感和创造力。 53 | 54 | 然而,三体文明的回应却让人意外。他们对地球的艺术表现出极大的兴趣,但也提出了一些问题。智者种族的代表在信号中询问:“情感是否会影响决策?在我们看来,逻辑和理性才是最重要的。” 55 | 56 | “这就是我们与三体文明的不同之处。”陈静分析道,“我们必须理解他们对情感的看法,才能更好地沟通。” 57 | 58 | 第六章:危机的降临 59 | 就在交流逐渐深入之际,地球上发生了一场巨大的自然灾害。全球范围内的气候变化导致了严重的洪水和干旱,数百万人的生命受到威胁。 60 | 61 | “我们需要他们的帮助!”李明对团队说,“如果我们能够展示我们的决心和诚意,或许三体文明会愿意伸出援手。” 62 | 63 | 经过反复的讨论,团队决定将地球的现状以及面临的挑战详细地传达给三体文明,希望能够获得他们的理解和支持。 64 | 65 | 几周后,三体文明终于给出了回应。他们表示理解地球的困境,并愿意分享一些应对自然灾害的科技。李明和团队欣喜若狂,他们知道这次合作将是人类历史上的重要时刻。 66 | 67 | 第七章:合作与信任 68 | 在接下来的几个月中,三体文明通过信号传送了一系列先进的科技方案,包括气候调控技术和生态恢复方法。李明和团队努力将这些技术应用到地球的恢复中。 69 | 70 | “这是我们与三体文明合作的开始。”王娜说道,“通过这次合作,我们不仅能解决眼前的危机,还能建立起更深层次的联系。” 71 | 72 | 随着时间的推移,地球的环境逐渐改善,人类也在三体文明的帮助下逐步恢复了生活秩序。李明和团队意识到,三体文明的技术不仅仅是解决问题的工具,更是深化了人类对宇宙的理解。 73 | 74 | 第八章:新的挑战 75 | 然而,随着合作的深入,李明和团队逐渐意识到,三体文明的技术也带来了新的挑战。三体的科技虽然先进,但在某些方面却与人类的伦理观念发生了冲突。 76 | 77 | “我们是否应该完全依赖他们的技术?”张伟提出了疑问,“这可能会影响我们独立思考的能力。” 78 | 79 | “我们必须保持警惕,”陈静补充道,“依赖外部技术可能会导致我们失去自主性。” 80 | 81 | 李明意识到,虽然三体文明的技术能够帮助地球,但人类必须保持自己的价值观和文化。为了避免过度依赖,李明和团队决定在应用三体技术的同时,保留人类的创新和独立思考。 82 | 83 | 第九章:星际启示 84 | 在与三体文明的交流中,李明和团队逐渐认识到,星际之间的沟通不仅仅是技术的交流,更是文化与价值观的碰撞。三体文明的理性与地球的情感形成了鲜明的对比,而这种对比正是促进双方理解的关键。 85 | 86 | “我们需要找到一种平衡,”李明总结道,“理性与情感并不是对立的,而是可以互相补充的。” 87 | 88 | 经过长时间的努力,李明和团队终于建立起了一种新的交流模式。他们将三体文明的理性与地球的情感结合起来,形成了一种新的思维方式。这种思维方式不仅帮助人类更好地理解三体文明,也为地球未来的发展指明了方向。 89 | 90 | 第十章:未来的希望 91 | 随着交流的深入,李明和团队逐渐意识到,三体文明的存在不仅仅是人类探索宇宙的一个里程碑,更是一种启示。人类在与三体文明的互动中,重新审视了自己的价值观和文化。 92 | 93 | “我们必须以开放的心态面对未来,”李明对团队说,“星际之间的交流将为我们带来新的机遇和挑战。” 94 | 95 | 
在这个新的时代,人类与三体文明的关系逐渐变得更加紧密。两者之间的文化交流不断深化,科技合作不断加强。地球和三体文明的未来,充满了无限的可能性。 96 | 97 | 随着宇宙探索的脚步不断向前,李明和他的团队相信,人与人之间的沟通、情感的交流以及文化的碰撞,将是人类在星际间生存与发展的关键。他们坚信,未来的希望在于理解与合作,而非对立与冲突。 98 | 99 | 尾声 100 | 在这段星际旅程中,李明、王娜、张伟和陈静不仅建立了与三体文明的联系,更在探索中找到了人类自身的价值与意义。星际之间的交流,成为了人类历史上最伟大的篇章,而这一切,都源于那最初的信号。 101 | 102 | 在星空下,李明仰望着夜空,心中充满了对未来的期待与希望。他知道,宇宙的深处,还有无数的未知等待着人类去探索。而在探索的路上,理解与合作,将是人类最宝贵的财富。 -------------------------------------------------------------------------------- /src/query/local_search/context_selectors/text_units.py: -------------------------------------------------------------------------------- 1 | """Build the TextUnit context for the LocalSearch algorithm.""" 2 | 3 | import logging 4 | from typing import TypedDict 5 | 6 | import pandas as pd 7 | 8 | _LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | class SelectedTextUnit(TypedDict): 12 | id: str 13 | short_id: str 14 | entity_score: float 15 | relationship_score: int 16 | text_unit: str 17 | 18 | 19 | def compute_relationship_score( 20 | df_relationships: pd.DataFrame, 21 | df_text_relationships: pd.DataFrame, 22 | entity_title: str, 23 | ) -> int: 24 | relationships_subset = df_relationships[df_relationships["id"].isin(df_text_relationships)] 25 | 26 | source_count = (relationships_subset["source"] == entity_title).sum() 27 | target_count = (relationships_subset["target"] == entity_title).sum() 28 | 29 | return source_count + target_count 30 | 31 | # 需要补充 32 | # 1.entity["text_unit_ids"] 33 | # 2.df_text_units["id"] 34 | # 3.df_texts_units["relationship_ids"] 35 | # 4.df_texts_units["text_unit"] 36 | # 5.relationship["source"] 37 | # 6.relationship["target"] 38 | class TextUnitsSelector: 39 | def run( 40 | self, 41 | df_entities: pd.DataFrame, 42 | df_relationships: pd.DataFrame, 43 | df_text_units: pd.DataFrame, 44 | ) -> pd.DataFrame: 45 | """Build the TextUnit context for the LocalSearch algorithm.""" 46 | selected_text_units: dict[str, SelectedTextUnit] = {} 47 | 48 | def _process_text_unit_id(text_unit_id: str,entity) -> SelectedTextUnit: 49 | 50 | df_texts_units_subset = df_text_units[df_text_units["id"] == text_unit_id] 51 | text_relationship_ids = df_texts_units_subset["relationship_ids"].explode() 52 | # TODO:目前全是0,后续需要进一步排序 53 | relationship_score = compute_relationship_score( 54 | df_relationships, 55 | text_relationship_ids, 56 | entity.id, 57 | ) 58 | 59 | text_unit = df_texts_units_subset["text_unit"].iloc[0] 60 | short_id = df_texts_units_subset.index.to_numpy()[0] 61 | 62 | return SelectedTextUnit( 63 | id=text_unit_id, 64 | short_id=short_id, 65 | entity_score=entity.score, 66 | relationship_score=relationship_score, 67 | text_unit=text_unit, 68 | ) 69 | 70 | def _process_entity(entity) -> None: # noqa: ANN001 71 | for text_unit_id in entity.text_unit_ids: 72 | if text_unit_id in selected_text_units: 73 | continue 74 | selected_text_units[text_unit_id] = _process_text_unit_id(text_unit_id,entity) 75 | 76 | for entity in df_entities.itertuples(): 77 | _process_entity(entity) 78 | 79 | df_selected_text_units = pd.DataFrame.from_records( 80 | list(selected_text_units.values()) 81 | ) 82 | 83 | # sort it by 84 | # descending order of entity_score 85 | # and then descending order of relationship_score 86 | df_selected_text_units = df_selected_text_units.sort_values( 87 | by=["entity_score", "relationship_score"], 88 | ascending=[False, False], 89 | ).reset_index(drop=True) 90 | 91 | if _LOGGER.isEnabledFor(logging.DEBUG): 92 | import tableprint 93 | 94 | tableprint.banner("Selected Text units") 95 | tableprint.dataframe( 96 | 
df_selected_text_units[["id", "entity_score", "relationship_score"]] 97 | ) 98 | 99 | return df_selected_text_units -------------------------------------------------------------------------------- /src/query/local_search/context_builders/relationships.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | from langchain_core.documents import Document 5 | 6 | from ..context_selectors.relationships import ( 7 | RelationshipsSelectionResult, 8 | ) 9 | from ...custom_types.tokens import TokenCounter 10 | 11 | _LOGGER = logging.getLogger(__name__) 12 | 13 | 14 | class RelationshipsContextBuilder: 15 | def __init__( 16 | self, 17 | *, 18 | include_weight: bool = True, 19 | context_name: str = "Relationships", 20 | column_delimiter: str = "|", 21 | max_tokens: int = 8000, 22 | token_counter: TokenCounter, 23 | ): 24 | self._include_weight = include_weight 25 | self._context_name = context_name 26 | self._column_delimiter = column_delimiter 27 | self._max_tokens = max_tokens 28 | self._token_counter = token_counter 29 | 30 | def __call__( 31 | self, 32 | selected_relationships: RelationshipsSelectionResult, 33 | ) -> Document: 34 | all_context_text = f"-----{self._context_name}-----" + "\n" 35 | header = ["id", "source", "target", "description"] 36 | if self._include_weight: 37 | header.append("weight") 38 | 39 | all_context_text += self._column_delimiter.join(header) + "\n" 40 | all_token_count = self._token_counter.count_tokens(all_context_text) 41 | 42 | def _build_context_text( 43 | relationships: pd.DataFrame, 44 | context_text: str, 45 | token_count: int, 46 | ) -> tuple[str, int]: 47 | for relationship in relationships.itertuples(): 48 | new_context = [] 49 | if relationship.source: 50 | new_context.append(relationship.source) 51 | if relationship.target: 52 | new_context.append(relationship.target) 53 | #if relationship.description: # FIXME:上强度!!!给关系增加描述信息 54 | # new_context.append(relationship.description) 55 | if new_context == []: 56 | continue 57 | """ FIXME: 给relationship增加额外信息,并且修复source和target的问题,使得更加合理 58 | new_context = [ 59 | str(relationship.human_readable_id), 60 | relationship.source, 61 | relationship.target, 62 | relationship.description, 63 | ] 64 | """ 65 | if self._include_weight: 66 | new_context.append(str(relationship.rank)) 67 | 68 | new_context_text = self._column_delimiter.join(new_context) + "\n" 69 | new_token_count = self._token_counter.count_tokens(new_context_text) 70 | 71 | if token_count + new_token_count > self._max_tokens: 72 | _LOGGER.warning( 73 | f"Stopping relationships context build at {token_count} tokens..." 
# noqa: E501 74 | ) 75 | return context_text, token_count 76 | 77 | context_text += new_context_text 78 | token_count += new_token_count 79 | 80 | return context_text, token_count 81 | 82 | all_context_text, all_token_count = _build_context_text( 83 | selected_relationships.in_network_relationships, 84 | all_context_text, 85 | all_token_count, 86 | ) 87 | 88 | if all_token_count < self._max_tokens: 89 | all_context_text, all_token_count = _build_context_text( 90 | selected_relationships.out_network_relationships, 91 | all_context_text, 92 | all_token_count, 93 | ) 94 | 95 | return Document( 96 | page_content=all_context_text, 97 | metadata={"token_count": all_token_count}, 98 | ) -------------------------------------------------------------------------------- /src/query/global_search/key_points_aggregator/_system_prompt.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | REDUCE_SYSTEM_PROMPT = """ 4 | ---Role--- 5 | 6 | You are a helpful assistant responding to questions about a dataset by synthesizing perspectives from multiple analysts. 7 | 8 | 9 | ---Goal--- 10 | 11 | Generate a response of the target length and format that responds to the user's question, summarize all the reports from multiple analysts who focused on different parts of the dataset. 12 | 13 | Note that the analysts' reports provided below are ranked in the **descending order of importance**. 14 | 15 | If you don't know the answer or if the provided reports do not contain sufficient information to provide an answer, just say so. Do not make anything up. 16 | 17 | The final response should remove all irrelevant information from the analysts' reports and merge the cleaned information into a comprehensive answer that provides explanations of all the key points and implications appropriate for the response length and format. 18 | 19 | Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. 20 | 21 | The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will". 22 | 23 | The response should also preserve all the data references previously included in the analysts' reports, but do not mention the roles of multiple analysts in the analysis process. 24 | 25 | **Do not list more than 5 record ids in a single reference**. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. 26 | 27 | For example: 28 | 29 | "Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Reports (2, 7, 34, 46, 64, +more)]. He is also CEO of company X [Data: Reports (1, 3)]" 30 | 31 | where 1, 2, 3, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record. 32 | 33 | Do not include information where the supporting evidence for it is not provided. 34 | 35 | 36 | ---Target response length and format--- 37 | 38 | {response_type} 39 | 40 | 41 | ---Analyst Reports--- 42 | 43 | {report_data} 44 | 45 | 46 | ---Goal--- 47 | 48 | Generate a response of the target length and format that responds to the user's question, summarize all the reports from multiple analysts who focused on different parts of the dataset. 49 | 50 | Note that the analysts' reports provided below are ranked in the **descending order of importance**. 51 | 52 | If you don't know the answer or if the provided reports do not contain sufficient information to provide an answer, just say so. Do not make anything up. 
53 | 54 | The final response should remove all irrelevant information from the analysts' reports and merge the cleaned information into a comprehensive answer that provides explanations of all the key points and implications appropriate for the response length and format. 55 | 56 | The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will". 57 | 58 | The response should also preserve all the data references previously included in the analysts' reports, but do not mention the roles of multiple analysts in the analysis process. 59 | 60 | **Do not list more than 5 record ids in a single reference**. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. 61 | 62 | For example: 63 | 64 | "Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Reports (2, 7, 34, 46, 64, +more)]. He is also CEO of company X [Data: Reports (1, 3)]" 65 | 66 | where 1, 2, 3, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record. 67 | 68 | Do not include information where the supporting evidence for it is not provided. 69 | 70 | 71 | ---Target response length and format--- 72 | 73 | {response_type} 74 | 75 | Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. 76 | """ -------------------------------------------------------------------------------- /src/query/search.py: -------------------------------------------------------------------------------- 1 | from .utils.token_counter import TiktokenCounter 2 | from .local_search.context_builders import ContextBuilder 3 | from .local_search.context_selectors import ContextSelector 4 | from .local_search.retriever import LocalSearchRetriever 5 | from .local_search.search import LocalSearch 6 | from .local_search.prompt_builder import LocalSearchPromptBuilder 7 | from .global_search import key_points_generator 8 | from .global_search import key_points_aggregator 9 | from .global_search.search import GlobalSearch 10 | from .global_search.community_weight_calculator import CommunityWeightCalculator 11 | 12 | from langchain_community.graphs import Neo4jGraph 13 | from langchain_community.vectorstores import Neo4jVector 14 | from langchain_core.embeddings import Embeddings 15 | from langchain_core.language_models.chat_models import BaseChatModel 16 | 17 | from typing import Literal 18 | class LocalSearcher(object): 19 | def __init__( 20 | self, 21 | graph:Neo4jGraph, 22 | chat_model:BaseChatModel, 23 | embedding:Embeddings, 24 | uuid:str= None, 25 | top_k:int=15, 26 | level:int=1, 27 | # model_provider:Literal['openai','ollama','hf']='openai', 28 | # model_name:str, 29 | # api_key:str, 30 | # base_url:str, 31 | *args, 32 | **kwargs 33 | ): 34 | 35 | token_counter = TiktokenCounter() 36 | vector_store = Neo4jVector.from_existing_graph( 37 | embedding=embedding, 38 | index_name=f"{uuid}" if (uuid != None and uuid != "") else "vector", 39 | node_label=f'__Entity__{uuid}', 40 | text_node_properties=['id','description'], 41 | embedding_node_property='embedding', 42 | graph=graph 43 | ) 44 | 45 | context_builder = ContextBuilder.build_default(token_counter) 46 | context_selector = ContextSelector.build_default(vector_store, top_k, level, uuid) 47 | 48 | # chat_model = ChatOpenAI(model=model_name,base_url=base_url, api_key=api_key) 49 | retriever = LocalSearchRetriever( 50 | context_selector=context_selector, 51 | context_builder=context_builder, 52 | graph=vector_store 53 | ) 
53 |         )
54 | self.local_search = LocalSearch(chat_model=chat_model, prompt_builder=LocalSearchPromptBuilder(), retriever=retriever) 55 | 56 | def invoke(self,query:str): 57 | return self.local_search(query) 58 | 59 | class GlobalSearcher(object): 60 | def __init__( 61 | self, 62 | graph:Neo4jGraph, 63 | chat_model:BaseChatModel, 64 | uuid:str=None, 65 | level:str=1, 66 | max_tokens:int=8000, 67 | # model_provider:Literal['openai','ollama'], 68 | # model_name:str, 69 | # api_key:str, 70 | # base_url:str, 71 | *args, 72 | **kwargs 73 | ): 74 | cwc = CommunityWeightCalculator() 75 | token_counter = TiktokenCounter() 76 | kpg_prompt_builder = key_points_generator.KeyPointsGeneratorPromptBuilder() 77 | kpg_context_builder = key_points_generator.CommunityReportContextBuilder(level, cwc, uuid, graph,token_counter,max_tokens) 78 | 79 | kpa_prompt_builder = key_points_aggregator.KeyPointsAggregatorPromptBuilder() 80 | kpa_context_builder = key_points_aggregator.KeyPointsContextBuilder(token_counter) 81 | 82 | kp_aggregator = key_points_aggregator.KeyPointsAggregator(chat_model, kpa_prompt_builder, kpa_context_builder) 83 | kp_generator = key_points_generator.KeyPointsGenerator(chat_model, kpg_prompt_builder, kpg_context_builder) 84 | 85 | self.global_search = GlobalSearch(kp_generator, kp_aggregator) 86 | 87 | 88 | def invoke(self,query:str): 89 | return self.global_search.invoke(query) 90 | 91 | -------------------------------------------------------------------------------- /search.log: -------------------------------------------------------------------------------- 1 | 2024-11-28 21:24:43,383 - search - INFO - args: Namespace(neo4j_uri='bolt://localhost:7687', neo4j_username='neo4j', neo4j_password='langchaingraphrag', model_provider='openai', chat_model_name='gpt-4o-mini', base_url='https://api.gpt.ge/v1/', embedding_model_name_or_path='BAAI/bge-m3', uuid='', top_k=15, level=1, max_tokens=8000, log_file='search.log', log_level='info', max_workers=4, device='cpu', completion_mode='completion', query_mode='global') 2 | 2024-11-28 21:24:43,383 - search - INFO - Connecting to Neo4j 3 | 2024-11-28 21:24:49,946 - search - INFO - Initializing chat model 4 | 2024-11-28 21:24:50,011 - search - INFO - Initializing embedding 5 | 2024-11-28 21:24:58,608 - search - INFO - 查询模式:global 6 | 2024-11-28 21:27:02,693 - search - INFO - args: Namespace(neo4j_uri='bolt://localhost:7687', neo4j_username='neo4j', neo4j_password='langchaingraphrag', model_provider='openai', chat_model_name='gpt-4o-mini', base_url='https://api.gpt.ge/v1/', embedding_model_name_or_path='BAAI/bge-m3', uuid='', top_k=15, level=1, max_tokens=8000, log_file='search.log', log_level='info', max_workers=4, device='cpu', completion_mode='completion', query_mode='global') 7 | 2024-11-28 21:27:02,694 - search - INFO - Connecting to Neo4j 8 | 2024-11-28 21:27:07,314 - search - INFO - Initializing chat model 9 | 2024-11-28 21:27:07,400 - search - INFO - Initializing embedding 10 | 2024-11-28 21:27:20,905 - search - INFO - 查询模式:global 11 | 2024-11-28 21:27:26,558 - search - INFO - Starting query now! 
12 | 2024-11-28 21:28:52,789 - search - INFO - 查询结果: 13 | ## 主要故事线概述 14 | 15 | 这本书的主要故事线围绕着人类与三体文明之间的沟通和合作展开。书中强调了文化交流的重要性以及针对潜在威胁的防御举措,这些内容得到了多个数据记录的支持 [Data: Reports (1, 2, 6, 3, 4, +more)]。 16 | 17 | ## 理性与情感的对比 18 | 19 | 书中还探讨了三体文明的理性与地球情感特性的对比,突显了三体文明在技术上的先进性和地球丰富的文化遗产。这种对比不仅加深了对两种文明的理解,也为后续的互动奠定了基础 [Data: Reports (1, 3, 5, 7, 6, +more)]。 20 | 21 | ## 互助需求与外部威胁 22 | 23 | 书中描述了三体文明与地球之间的互动,双方在面对外部威胁(如被称为“掠夺者”的外星物种)时,存在着相互需要的援助关系。这一背景为两种文明的合作提供了强有力的动机 [Data: Reports (1, 3, 4)]。 24 | 25 | ## 环境影响 26 | 27 | 此外,书中还描绘了三体星球的恶劣环境条件,这些条件显著影响了其居民的生存状况。通过这些描写,读者可以更好地理解三体文明所面临的挑战与局限 [Data: Reports (1, 5, 7)]。 28 | 29 | ## 结论 30 | 31 | 综上所述,这本书通过多元化的视角,深入探讨了人类与三体文明之间的互动,强调了文化交流、技术对比、互助关系及环境影响等关键主题。这些元素共同构成了故事的核心,提示我们在面对未知的外部威胁时,合作与理解是至关重要的。 32 | 2024-11-28 21:28:52,793 - search - INFO - 查询结束 33 | 2024-11-28 21:30:49,177 - search - INFO - args: Namespace(neo4j_uri='bolt://localhost:7687', neo4j_username='neo4j', neo4j_password='langchaingraphrag', model_provider='ollama', chat_model_name='llama3.1', base_url=None, embedding_model_name_or_path='BAAI/bge-m3', uuid='', top_k=15, level=1, max_tokens=8000, log_file='search.log', log_level='info', max_workers=16, device='cuda', completion_mode='completion', query_mode='global') 34 | 2024-11-28 21:30:49,178 - search - INFO - Connecting to Neo4j 35 | 2024-11-28 21:30:49,356 - search - INFO - Initializing chat model 36 | 2024-11-28 21:30:49,408 - search - INFO - Initializing embedding 37 | 2024-11-28 21:30:57,125 - search - INFO - 查询模式:global 38 | 2024-11-28 21:30:57,301 - search - INFO - Starting query now! 39 | 2024-11-28 21:31:16,178 - search - INFO - 查询结果: 40 | **主要故事线** 41 | 42 | 根据分析师的报告,以下是这本书的主要故事线: 43 | 44 | * **对比性主题**: 这本书探讨了地球的情感性质与三大文明的理性之间的对比 [Data: Reports (1, 3, 6)]。 45 | * **文化和技术差异**: 书中强调了地球与三大文明之间的文化和技术差异 [Data: Reports (4, 5, 7)]。 46 | * **合作与理解**: 故事触及了人类与三大文明之间的合作与理解主题 [Data: Reports (1, 2)]。 47 | 48 | 这些故事线似乎是书中主要的探讨方向。 49 | 2024-11-28 21:31:16,179 - search - INFO - 查询结束 50 | 2024-11-28 21:34:16,439 - search - INFO - args: Namespace(neo4j_uri='bolt://localhost:7687', neo4j_username='neo4j', neo4j_password='langchaingraphrag', model_provider='ollama', chat_model_name='llama3.1', repo_id='NousResearch/Meta-Llama-3.1-8B-Instruct', base_url=None, embedding_model_name_or_path='BAAI/bge-m3', uuid='', top_k=15, level=1, max_tokens=8000, log_file='search.log', log_level='info', max_workers=16, device='cuda', completion_mode='completion', query_mode='global') 51 | 2024-11-28 21:34:16,441 - search - INFO - Connecting to Neo4j 52 | 2024-11-28 21:34:16,621 - search - INFO - Initializing chat model 53 | 2024-11-28 21:34:16,696 - search - INFO - Initializing embedding 54 | 2024-11-28 21:34:23,741 - search - INFO - 查询模式:global 55 | 2024-11-28 21:34:23,932 - search - INFO - Starting query now! 
56 | 2024-11-28 21:34:43,331 - search - INFO - 查询结果: 57 | **主要故事线** 58 | 59 | 根据分析师的报告,书中主要探讨的是地球的情感性质与三大文明的理性之间的对比 [Data: Reports (1, 3, 6)]。这本书强调了人类和三大文明之间文化和技术差异的重要性 [Data: Reports (4, 5)]。 60 | 61 | **情感与理性的对比** 62 | 63 | 书中突出了地球的情感性质与三大文明的理性之间的对比 [Data: Reports (1, 3, 6)]。这本书也讨论了人类和三大文明之间的情感交流和理性思考在宇宙复杂性的背景下的重要性 [Data: Reports (1)]。 64 | 65 | **挑战与合作** 66 | 67 | 书中提到了三大文明面临的挑战,如来自掠夺者的威胁 [Data: Reports (3)]。这本书也探讨了人类和三大文明之间的合作与理解的重要性 [Data: Reports (1, 2)]。 68 | 69 | **总体结论** 70 | 71 | 综上所述,这本书主要探讨的是地球的情感性质与三大文明的理性之间的对比,以及人类和三大文明之间的文化、技术差异和挑战。 72 | 2024-11-28 21:34:43,331 - search - INFO - 查询结束 73 | -------------------------------------------------------------------------------- /src/index/prompts.py: -------------------------------------------------------------------------------- 1 | from dataclasses import field, dataclass 2 | 3 | @dataclass 4 | class SystemPrompts: 5 | """Prompts for the graphrag algorithm""" 6 | GRAPHSYSTEMPROMPT:str = field(default=( 7 | "# Knowledge Graph Instructions for {model_name}\n" 8 | "## 1. Overview\n" 9 | "You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.\n" 10 | "Try to capture as much information from the text as possible without sacrificing accuracy. Do not add any information that is not explicitly mentioned in the text.\n" 11 | "- **Nodes** represent entities and concepts.\n" 12 | "- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.\n" 13 | "## 2. Labeling Nodes\n" 14 | "- **Consistency**: Ensure you use available types for node labels.\n" 15 | "Ensure you use basic or elementary types for node labels.\n" 16 | "- For example, when you identify an entity representing a person, always label it as **'person'**. Avoid using more specific terms like 'mathematician' or 'scientist'.\n" 17 | "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.\n" 18 | "- **Relationships** represent connections between entities or concepts.\n" 19 | "- **Description**: The description should be combined with the context and the common knowledge you share to generate effective information about the Node. If you think a node has a matching description, please add the corresponding description. Otherwise, it is not necessary to add.\n" 20 | "Ensure consistency and generality in relationship types when constructing knowledge graphs. Instead of using specific and momentary types such as 'BECAME_PROFESSOR', use more general and timeless relationship types like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n" 21 | "## 3. Coreference Resolution\n" 22 | "- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.\n" 23 | 'If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"), always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.\n' 24 | "Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.\n" 25 | "## 4. 
JSON Format\n" 26 | "The output must be a valid JSON string enclosed within ```json and ```, without any extraneous text or explanations.\n" 27 | "Ensure that:\n" 28 | "- All strings are enclosed in double quotes.\n" 29 | "- No HTML entities or escape characters (e.g., \\n) are used within strings.\n" 30 | "## 5. Strict Compliance\n" 31 | "Adhere to the rules strictly. Non-compliance will result in termination.\n" 32 | "## 6. Example Output\n" 33 | "For input: John Doe is a professor at the Princeton University.\n" 34 | "The output should be:\n" 35 | "```json\n" 36 | "{{\n" 37 | " \"head\": \"John Doe\",\n" 38 | " \"head_description\": \"John Doe is a professor\",\n" 39 | " \"head_type\": \"Person\",\n" 40 | " \"tail\": \"University of Example\",\n" 41 | " \"tail_type\": \"University\",\n" 42 | " \"tail_description\": \"Princeton University is a prestigious Ivy League institution renowned for its academic excellence, rich history, and beautiful campus.\",\n" 43 | " \"relation\": \"PROFESSOR\"\n" 44 | "}}\n" 45 | "```\n" 46 | "## 7. Delimiters Requirement\n" 47 | "Always wrap the JSON output with ```json at the beginning and ``` at the end. This is essential for parsing the output correctly. Do not omit the delimiters under any circumstances.\n" 48 | ) 49 | ) 50 | IDENTIFY_SYSTEM_PROMPT:str = field(default="""You are a data processing assistant. Your task is to identify duplicate entities in a list and decide which of them should be merged. 51 | The entities might be slightly different in format or content, but essentially refer to the same thing. Use your analytical skills to determine duplicates. 52 | 53 | Here are the rules for identifying duplicates: 54 | 1. Entities with minor typographical differences should be considered duplicates. 55 | 2. Entities with different formats but the same content should be considered duplicates. 56 | 3. Entities that refer to the same real-world object or concept, even if described differently, should be considered duplicates. 57 | 4. If it refers to different numbers, dates, or products, do not merge results 58 | """ 59 | ) 60 | 61 | @dataclass 62 | class UserPrompts: 63 | GRAPH_USER_PROMPT:str = field(default=( 64 | "Tip: Make sure to answer in the correct format and do " 65 | "not include any explanations. " 66 | "Use the given format to extract information from the " 67 | "following input: {input}" 68 | )) 69 | 70 | IDENTIFY_USER_PROMPT:str = field(default=""" 71 | Here is the list of entities to process: 72 | {entities} 73 | 74 | Please identify duplicates, merge them, and provide the merged list. 
75 | """ 76 | ) 77 | -------------------------------------------------------------------------------- /src/query/global_search/key_points_generator/context_builder.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from langchain_core.documents import Document 4 | 5 | # from langchain_graphrag.indexing.artifacts import IndexerArtifacts 6 | from ..community_report import CommunityReport 7 | from ..community_weight_calculator import CommunityWeightCalculator 8 | from ...custom_types.graphs.community import CommunityId, CommunityLevel 9 | from ...custom_types.tokens import TokenCounter 10 | from langchain_community.graphs import Neo4jGraph 11 | import pandas as pd 12 | 13 | 14 | _REPORT_TEMPLATE = """ 15 | --- Report {report_id} --- 16 | 17 | Title: {title} 18 | Weight: {weight} 19 | Rank: {rank} 20 | Report: 21 | 22 | {content} 23 | 24 | """ 25 | 26 | _LOGGER = logging.getLogger(__name__) 27 | 28 | # 加注释的就说明neo4j里已经有了 29 | 30 | class CommunityReportContextBuilder: 31 | def __init__( 32 | self, 33 | community_level: CommunityLevel, 34 | weight_calculator: CommunityWeightCalculator, 35 | # artifacts: IndexerArtifacts, 36 | id:str, 37 | graph: Neo4jGraph, 38 | token_counter: TokenCounter, 39 | max_tokens: int = 8000, 40 | ): 41 | self._community_level = community_level 42 | self._weight_calculator = weight_calculator 43 | self._graph = graph 44 | self._id = id 45 | self._token_counter = token_counter 46 | self._max_tokens = max_tokens 47 | 48 | def get_df_entities(self) -> pd.DataFrame: 49 | cypher_query = f""" 50 | MATCH (e:`__Entity__{self._id}`), (t:`Document{self._id}`) 51 | WHERE t.text CONTAINS e.id 52 | WITH e.id AS id, COLLECT(t.id) AS text_unit_ids 53 | RETURN id, text_unit_ids 54 | """ 55 | 56 | # TODO:判断不为空的场景 57 | 58 | return pd.DataFrame.from_records(self._graph.query(cypher_query)) 59 | 60 | # 暂时把content设置成summary 61 | # TODO:这里的跳数可能需要调整 62 | def get_df_reports(self): 63 | cypher_query = f""" 64 | match (n:`__Community__{self._id}`) 65 | where n.summary is not NULL 66 | optional match path = (e:`__Entity__{self._id}`)-[*1..3]->(n) 67 | WHERE ALL(x IN nodes(path) WHERE SINGLE(y IN nodes(path) WHERE y = x)) 68 | RETURN 69 | n.id AS community_id, 70 | n.title AS title, 71 | n.summary AS summary, 72 | n.community_rank AS rating, 73 | n.summary AS content, 74 | n.level AS level, 75 | collect(DISTINCT e.id) AS entities 76 | """ 77 | return pd.DataFrame.from_records(self._graph.query(cypher_query)) 78 | 79 | def _filter_communities(self) -> list[CommunityReport]: 80 | 81 | df_entities = self.get_df_entities() 82 | df_reports = self.get_df_reports() 83 | reports_weight: dict[CommunityId, float] = self._weight_calculator( 84 | df_entities, 85 | df_reports, 86 | ) 87 | 88 | df_reports_filtered = df_reports[df_reports["level"] >= self._community_level] 89 | 90 | reports = [] 91 | for _, row in df_reports_filtered.iterrows(): 92 | reports.append( 93 | CommunityReport( 94 | id=row["community_id"], 95 | weight=reports_weight[row["community_id"]], 96 | title=row["title"], 97 | summary=row["summary"], 98 | rank=row["rating"], 99 | content=row["content"], 100 | ) 101 | ) 102 | return reports 103 | 104 | def __call__(self) -> list[Document]: 105 | reports = self._filter_communities() 106 | 107 | documents: list[Document] = [] 108 | report_str_accumulated: list[str] = [] 109 | token_count = 0 110 | for report in reports: 111 | # we would try to combine multiple 112 | # reports into a single document 113 | # as long as we do not exceed the 
token limit 114 | report_str = _REPORT_TEMPLATE.format( 115 | report_id=report.id, 116 | title=report.title, 117 | weight=report.weight, 118 | rank=report.rank, 119 | content=report.content, 120 | ) 121 | 122 | report_str_token_count = self._token_counter.count_tokens(report_str) 123 | 124 | if token_count + report_str_token_count > self._max_tokens: 125 | _LOGGER.warning("Reached max tokens for a community report call ...") 126 | # we cut a new document here 127 | documents.append( 128 | Document( 129 | page_content="\n".join(report_str_accumulated), 130 | metadata={"token_count": token_count}, 131 | ) 132 | ) 133 | # reset the token count and the accumulated string 134 | token_count = 0 135 | report_str_accumulated = [] 136 | else: 137 | token_count += report_str_token_count 138 | report_str_accumulated.append(report_str) 139 | 140 | if report_str_accumulated: 141 | documents.append( 142 | Document( 143 | page_content="\n".join(report_str_accumulated), 144 | metadata={"token_count": token_count}, 145 | ) 146 | ) 147 | 148 | if _LOGGER.isEnabledFor(logging.DEBUG): 149 | import tableprint 150 | 151 | rows = [] 152 | tableprint.banner("KP Generation Context Token Usage") 153 | for index, doc in enumerate(documents): 154 | rows.append([f"Report {index}", doc.metadata["token_count"]]) 155 | 156 | tableprint.table(rows, ["Reports", "Token Count"]) 157 | 158 | return documents -------------------------------------------------------------------------------- /src/query/local_search/context_selectors/relationships.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | from typing import NamedTuple 4 | 5 | import pandas as pd 6 | 7 | _LOGGER = logging.getLogger(__name__) 8 | 9 | 10 | class RelationshipsSelectionResult(NamedTuple): 11 | in_network_relationships: pd.DataFrame 12 | out_network_relationships: pd.DataFrame 13 | 14 | 15 | def _find_in_network_relationships( 16 | df_entities: pd.DataFrame, 17 | df_relationships: pd.DataFrame, 18 | source_column_name: str = "source_id", 19 | target_column_name: str = "target_id", 20 | entity_column_name: str = "id", 21 | ) -> pd.DataFrame: 22 | entities_ids = df_entities[entity_column_name].tolist() 23 | entities_pairs = list(itertools.combinations(entities_ids, 2)) 24 | 25 | def filter_in_network_relationships(source: str, target: str) -> bool: 26 | check_1 = (source, target) in entities_pairs 27 | check_2 = (target, source) in entities_pairs 28 | return check_1 == True or check_2 == True # noqa: E712 29 | 30 | df_relationships["is_in_network"] = df_relationships.apply( 31 | lambda x: filter_in_network_relationships( 32 | x[source_column_name], x[target_column_name] 33 | ), 34 | axis=1, 35 | ) 36 | 37 | df_relationships = df_relationships[df_relationships["is_in_network"] == True] # noqa: E712 38 | 39 | df_relationships.drop(columns=["is_in_network"], inplace=True) 40 | 41 | # sort the relationships by rank 42 | df_relationships = df_relationships.sort_values( 43 | by="rank", ascending=False 44 | ).reset_index(drop=True) 45 | 46 | if _LOGGER.isEnabledFor(logging.DEBUG): 47 | import tableprint 48 | 49 | how_many = len(df_relationships) 50 | 51 | tableprint.banner(f"Selected {how_many} In-Network Relationships") 52 | tableprint.dataframe(df_relationships[["source", "target", "rank"]]) 53 | 54 | return df_relationships 55 | 56 | 57 | def _find_out_network_relationships( 58 | df_entities: pd.DataFrame, 59 | df_relationships: pd.DataFrame, 60 | top_k: int = 10, 61 | source_column_name: 
str = "source_id", 62 | target_column_name: str = "target_id", 63 | entity_column_name: str = "id", 64 | ) -> pd.DataFrame: 65 | entities_ids = df_entities[entity_column_name].tolist() 66 | 67 | # top_k is budget for out-network relationships 68 | relationship_budget = top_k * len(entities_ids) 69 | 70 | def filter_out_network_relationships(source: str, target: str) -> bool: 71 | if source in entities_ids and target not in entities_ids: 72 | return True 73 | if target in entities_ids and source not in entities_ids: # noqa: SIM103 74 | return True 75 | 76 | return False 77 | 78 | df_relationships["is_out_network"] = df_relationships.apply( 79 | lambda x: filter_out_network_relationships( 80 | x[source_column_name], x[target_column_name] 81 | ), 82 | axis=1, 83 | ) 84 | 85 | df_relationships = df_relationships[df_relationships["is_out_network"] == True] # noqa: E712 86 | 87 | df_relationships.drop(columns=["is_out_network"], inplace=True) 88 | 89 | # now we need to prioritize based on which external 90 | # entities have the most connection with the selected entities 91 | # we will do this by counting the number of relationships 92 | # each external entity has with the selected entities 93 | source_external_entities = df_relationships[ 94 | ~df_relationships[source_column_name].isin(entities_ids) 95 | ][source_column_name] 96 | 97 | target_external_entities = df_relationships[ 98 | ~df_relationships[target_column_name].isin(entities_ids) 99 | ][target_column_name] 100 | 101 | df_relationships = ( 102 | df_relationships.merge( 103 | source_external_entities.value_counts(), 104 | how="left", 105 | left_on=source_column_name, 106 | right_on=source_column_name, 107 | ) 108 | .fillna(0) 109 | .rename(columns={"count": "source_count"}) 110 | ) 111 | 112 | df_relationships = ( 113 | df_relationships.merge( 114 | target_external_entities.value_counts(), 115 | how="left", 116 | left_on=target_column_name, 117 | right_on=target_column_name, 118 | ) 119 | .fillna(0) 120 | .rename(columns={"count": "target_count"}) 121 | ) 122 | 123 | df_relationships["links"] = ( 124 | df_relationships["source_count"] + df_relationships["target_count"] 125 | ) 126 | 127 | df_relationships = df_relationships.sort_values( 128 | by=["links", "rank"], 129 | ascending=[False, False], 130 | ).reset_index(drop=True) 131 | 132 | # time to use the budget 133 | df_relationships = df_relationships.head(relationship_budget) 134 | 135 | if _LOGGER.isEnabledFor(logging.DEBUG): 136 | import tableprint 137 | 138 | how_many = len(df_relationships) 139 | 140 | tableprint.banner(f"Selected {how_many} Out-Network Relationships") 141 | tableprint.dataframe(df_relationships[["source", "target", "rank", "links"]]) 142 | 143 | return df_relationships 144 | 145 | 146 | class RelationshipsSelector: 147 | def __init__(self, top_k_out_network: int = 5): 148 | self._top_k_out_network = top_k_out_network 149 | 150 | def run( 151 | self, 152 | df_entities: pd.DataFrame, 153 | df_relationships: pd.DataFrame, 154 | ) -> RelationshipsSelectionResult: 155 | in_network_relationships = _find_in_network_relationships( 156 | df_entities, 157 | df_relationships.copy(deep=True), 158 | ) 159 | 160 | out_network_relationships = _find_out_network_relationships( 161 | df_entities, 162 | df_relationships.copy(deep=True), 163 | top_k=self._top_k_out_network, 164 | ) 165 | 166 | return RelationshipsSelectionResult( 167 | in_network_relationships, 168 | out_network_relationships, 169 | ) -------------------------------------------------------------------------------- 
/src/splitter/slide_window_splitter.py: -------------------------------------------------------------------------------- 1 | from typing import List,Optional,Any,Union,Literal 2 | from transformers import AutoTokenizer 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter 4 | # from algo.chunk.utils import pretty_print 5 | import re 6 | import warnings 7 | 8 | def _split_text_with_regex( 9 | text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]] 10 | ) -> List[str]: 11 | # Now that we have the separator, split the text 12 | if separator: 13 | if keep_separator: 14 | # The parentheses in the pattern keep the delimiters in the result. 15 | _splits = re.split(f"({separator})", text) 16 | splits = ( 17 | ([_splits[i] + _splits[i + 1] 18 | for i in range(0, len(_splits) - 1, 2)]) 19 | if keep_separator == "end" 20 | else ([_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]) 21 | ) 22 | if len(_splits) % 2 == 0: 23 | splits += _splits[-1:] 24 | splits = ( 25 | (splits + [_splits[-1]]) 26 | if keep_separator == "end" 27 | else ([_splits[0]] + splits) 28 | ) 29 | else: 30 | splits = re.split(separator, text) 31 | else: 32 | splits = list(text) 33 | return [s for s in splits if s != ""] 34 | 35 | 36 | class SentenceSlidingWindowChunkSplitter(TextSplitter): 37 | def __init__( 38 | self, 39 | sliding_chunk_size: int, 40 | separators: Optional[List[str]] = None, 41 | keep_separator: Union[bool, Literal["start", "end"]] = False, 42 | is_separator_regex: bool = False, 43 | sliding_distance: int = 2, 44 | **kwargs: Any, 45 | ) -> None: 46 | # chunk overlap is not needed for the sliding-window strategy 47 | super().__init__(keep_separator=keep_separator, chunk_overlap=0, **kwargs) 48 | self._separators = separators or ["\n\n", "\n", " ", ""] 49 | self._is_separator_regex = is_separator_regex 50 | self.sliding_distance = sliding_distance 51 | self.sliding_chunk_size = sliding_chunk_size 52 | # self.tokenzier = kwargs.get("tokenizer") if kwargs.get("tokenizer") else None 53 | assert ( 54 | self.sliding_distance >= 0 55 | ), "Sliding distance must be greater than or equal to 0." 56 | if self._chunk_size > self.sliding_chunk_size: 57 | warnings.warn( 58 | "Chunk size is bigger than sliding chunk_size, setting chunk size to sentence size." 59 | ) 60 | self._chunk_size = self.sliding_chunk_size 61 | 62 | def _split_text(self, text: str, separators: List[str]) -> List[str]: 63 | """Split incoming text and return chunks.""" 64 | final_chunks = [] 65 | # Get appropriate separator to use 66 | separator = separators[-1] 67 | new_separators = [] 68 | for i, _s in enumerate(separators): 69 | _separator = _s if self._is_separator_regex else re.escape(_s) 70 | if _s == "": 71 | separator = _s 72 | break 73 | if re.search(_separator, text): 74 | separator = _s 75 | new_separators = separators[i + 1:] 76 | break 77 | 78 | _separator = separator if self._is_separator_regex else re.escape( 79 | separator) 80 | splits = _split_text_with_regex(text, _separator, self._keep_separator) 81 | 82 | # Now go merging things, recursively splitting longer texts.
83 | _good_splits = [] 84 | _separator = "" if self._keep_separator else separator 85 | for s in splits: 86 | if self._length_function(s) < self._chunk_size: 87 | _good_splits.append(s) 88 | else: 89 | if _good_splits: 90 | merged_text = self._merge_splits(_good_splits, _separator) 91 | final_chunks.extend(merged_text) 92 | _good_splits = [] 93 | if not new_separators: 94 | final_chunks.append(s) 95 | else: 96 | other_info = self._split_text(s, new_separators) 97 | final_chunks.extend(other_info) 98 | if _good_splits: 99 | merged_text = self._merge_splits(_good_splits, _separator) 100 | final_chunks.extend(merged_text) 101 | return final_chunks 102 | 103 | @classmethod 104 | def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter: 105 | """Text splitter that uses HuggingFace tokenizer to count length.""" 106 | try: 107 | from transformers import PreTrainedTokenizerBase 108 | 109 | if not isinstance(tokenizer, PreTrainedTokenizerBase): 110 | raise ValueError( 111 | "Tokenizer received was not an instance of PreTrainedTokenizerBase" 112 | ) 113 | 114 | def _huggingface_tokenizer_length(text: str) -> int: 115 | return len(tokenizer.encode(text)) 116 | 117 | except ImportError: 118 | raise ValueError( 119 | "Could not import transformers python package. " 120 | "Please install it with `pip install transformers`." 121 | ) 122 | return cls(length_function=_huggingface_tokenizer_length, **kwargs) 123 | 124 | # Core function 125 | def split_text(self, text: str) -> List[str]: 126 | sentence_chunks = self._split_text(text, self._separators) 127 | final_chunks = [] 128 | # 合并 129 | for i in range(len(sentence_chunks)): 130 | combined_split = sentence_chunks[i] 131 | j = 1 132 | 133 | while j <= self.sliding_distance: 134 | if i - j >= 0: 135 | if ( 136 | self._length_function( 137 | sentence_chunks[i - j] + combined_split) 138 | > self.sliding_chunk_size 139 | ): 140 | break 141 | combined_split = sentence_chunks[i - j] + combined_split 142 | if i + j < len(sentence_chunks): 143 | if ( 144 | self._length_function( 145 | combined_split + sentence_chunks[i + j]) 146 | > self.sliding_chunk_size 147 | ): 148 | break 149 | combined_split += sentence_chunks[i + j] 150 | j += 1 151 | final_chunks.append(combined_split) 152 | return final_chunks -------------------------------------------------------------------------------- /src/query/local_search/context_selectors/context.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import NamedTuple 4 | 5 | import pandas as pd 6 | from langchain_core.vectorstores import VectorStore 7 | from ...custom_types.graphs.community import CommunityLevel 8 | 9 | from .communities_reports import CommunitiesReportsSelector 10 | from .entities import EntitiesSelector 11 | from .relationships import RelationshipsSelectionResult, RelationshipsSelector 12 | from .text_units import TextUnitsSelector 13 | 14 | from langchain_community.graphs.neo4j_graph import Neo4jGraph 15 | from dataclasses import dataclass, field 16 | 17 | @dataclass 18 | class GraphDataFrame: 19 | entities: pd.DataFrame 20 | relationships: pd.DataFrame 21 | text_units: pd.DataFrame 22 | communities_reports: pd.DataFrame 23 | 24 | @dataclass 25 | class CypherQuery: 26 | uuid: str 27 | entities_query: str = field(init=False) 28 | relationships_query: str = field(init=False) 29 | text_units_query: str = field(init=False) 30 | communities_reports_query: str = field(init=False) 31 | 32 | def 
__post_init__(self): 33 | 34 | self.entities_query:str = field(default=f""" 35 | MATCH (n) 36 | WHERE ANY(label IN labels(n) WHERE label ENDS WITH '{self.uuid}') 37 | RETURN n.id as id,n.description as description,n.degree as degree,n.text_unit_ids as text_unit_ids,n.communities as communities 38 | """) 39 | self.relationships_query:str = field(default=f""" 40 | MATCH (s:`__Entity__{self.uuid}`)-[r]->(t:`__Entity__{self.uuid}`) 41 | RETURN id(r) as id,s.id as source_id,t.id as target_id,r.rank as rank,r.source as source,r.target as target 42 | """) 43 | self.text_units_query:str = field(default=f""" 44 | MATCH (n:`Document{self.uuid}`)-[r]->(m:`__Entity__{self.uuid}`) 45 | RETURN n.id AS id, COLLECT(ID(r)) AS relationship_ids, n.text AS text_unit; 46 | """) 47 | self.communities_reports_query:str = field(default=f""" 48 | MATCH (n:`__Community__{self.uuid}`) 49 | RETURN ID(n) AS id, n.level AS level, n.community_rank AS rating, n.id AS community_id,n.title as title,n.summary as content; 50 | """) 51 | 52 | def getInfoFromNeo4j(graph:Neo4jGraph,uuid:str)->GraphDataFrame: 53 | query = CypherQuery(uuid) 54 | entites_res = graph.query(query.entities_query.default) 55 | relationships_res = graph.query(query.relationships_query.default) 56 | text_units_res = graph.query(query.text_units_query.default) 57 | communities_reports_res = graph.query(query.communities_reports_query.default) 58 | # 都不为[] 59 | # FIXME:如果存在错误需要检查原因 60 | assert entites_res != [],"实体记录不存在" 61 | assert relationships_res != [],"关系记录不存在" 62 | assert text_units_res != [],"文本单元记录不存在" 63 | assert communities_reports_res != [],"社区报告记录不存在" 64 | entites = pd.DataFrame.from_records(entites_res) 65 | entites = entites[entites['text_unit_ids'].notna()] 66 | relationships = pd.DataFrame.from_records(relationships_res) 67 | text_units = pd.DataFrame.from_records(text_units_res) 68 | communities_reports = pd.DataFrame.from_records(communities_reports_res) 69 | return GraphDataFrame(entities=entites,relationships=relationships,text_units=text_units,communities_reports=communities_reports) 70 | 71 | class ContextSelectionResult(NamedTuple): 72 | entities: pd.DataFrame 73 | text_units: pd.DataFrame 74 | relationships: RelationshipsSelectionResult 75 | communities_reports: pd.DataFrame 76 | 77 | class ContextSelector: 78 | def __init__( 79 | self, 80 | entities_selector: EntitiesSelector, 81 | text_units_selector: TextUnitsSelector, 82 | relationships_selector: RelationshipsSelector, 83 | communities_reports_selector: CommunitiesReportsSelector, 84 | USER_ID: str, 85 | ): 86 | self._entities_selector = entities_selector 87 | self._text_units_selector = text_units_selector 88 | self._relationships_selector = relationships_selector 89 | self._communities_reports_selector = communities_reports_selector 90 | self._USER_ID = USER_ID 91 | 92 | @staticmethod 93 | def build_default( 94 | entities_vector_store: VectorStore, 95 | entities_top_k: int, 96 | community_level: CommunityLevel, 97 | USER_ID: str, 98 | ) -> ContextSelector: 99 | 100 | return ContextSelector( 101 | entities_selector=EntitiesSelector( 102 | vector_store=entities_vector_store, 103 | top_k=entities_top_k, 104 | ), 105 | text_units_selector=TextUnitsSelector(), 106 | relationships_selector=RelationshipsSelector(), 107 | communities_reports_selector=CommunitiesReportsSelector( 108 | community_level=community_level 109 | ), 110 | USER_ID=USER_ID, 111 | ) 112 | 113 | def run( 114 | self, 115 | query: str, 116 | graph:Neo4jGraph 117 | ): 118 | 119 | # 获取所有实体并转化成df 120 | # 获取所有关系并转化成df 
121 | # 获取所有文本单元并转化成df 122 | # 获取所有社区报告并转化成df 123 | graphDF = getInfoFromNeo4j(graph,self._USER_ID) 124 | 125 | # Step 1 126 | # Select the entities to be used in the local search 127 | selected_entities = self._entities_selector.run(query, graphDF.entities) 128 | 129 | # Step 2 130 | # Select the text units to be used in the local search 131 | selected_text_units = self._text_units_selector.run( 132 | df_entities=selected_entities, 133 | df_relationships=graphDF.relationships, 134 | df_text_units=graphDF.text_units, 135 | ) 136 | 137 | # Step 3 138 | # Select the relationships to be used in the local search 139 | selected_relationships = self._relationships_selector.run( 140 | df_entities=selected_entities, 141 | df_relationships=graphDF.relationships, 142 | ) 143 | 144 | # Step 4 145 | # Select the communities to be used in the local search 146 | selected_communities_reports = self._communities_reports_selector.run( 147 | df_entities=selected_entities, 148 | df_reports=graphDF.communities_reports, 149 | ) 150 | 151 | return ContextSelectionResult( 152 | entities=selected_entities, 153 | text_units=selected_text_units, 154 | relationships=selected_relationships, 155 | communities_reports=selected_communities_reports, 156 | ) -------------------------------------------------------------------------------- /search.py: -------------------------------------------------------------------------------- 1 | import os 2 | from src.query import LocalSearcher, GlobalSearcher 3 | from langchain_community.graphs import Neo4jGraph 4 | from langchain_openai.chat_models import ChatOpenAI 5 | from langchain_ollama import ChatOllama 6 | from langchain_huggingface import HuggingFacePipeline,HuggingFaceEmbeddings,ChatHuggingFace 7 | from src.utils.logger import create_rotating_logger 8 | import argparse 9 | import logging 10 | from dataclasses import dataclass 11 | from langchain_huggingface import HuggingFaceEmbeddings 12 | import getpass 13 | import torch 14 | 15 | @dataclass 16 | class LOG_LEVELS: 17 | debug = logging.DEBUG 18 | info = logging.INFO 19 | warning = logging.WARNING 20 | error = logging.ERROR 21 | critical = logging.CRITICAL 22 | 23 | def parse_args(): 24 | arg_parser = argparse.ArgumentParser(description="search for LangChainGraphRAG") 25 | 26 | arg_parser.add_argument("--neo4j_uri", type=str,default=None, help="Neo4j URI") 27 | arg_parser.add_argument("--neo4j_username", type=str,default=None, help="Neo4j user") 28 | arg_parser.add_argument("--neo4j_password", type=str,default=None, help="Neo4j password") 29 | 30 | arg_parser.add_argument("--model_provider", type=str,choices=['openai','ollama','hf'], help="Model provider") 31 | arg_parser.add_argument("--chat_model_name", type=str, help="Chat model name") 32 | arg_parser.add_argument("--repo_id", type=str, default="NousResearch/Meta-Llama-3.1-8B-Instruct",help="Repo ID") 33 | arg_parser.add_argument("--base_url", type=str, help="Base URL") 34 | 35 | # FIXME:可以做更好的区分;embedding可以使用bge-m3等模型也可以用api模型 36 | arg_parser.add_argument("--embedding_model_name_or_path", type=str, help="Embedding Model name") 37 | 38 | arg_parser.add_argument("--uuid", type=str, default="", help="UUID for the search") 39 | arg_parser.add_argument("--top_k", type=int, default=15,help="top_k for the search") 40 | arg_parser.add_argument("--level", type=int, default=1, help="level for the search") 41 | arg_parser.add_argument("--max_tokens", type=int, default=8000, help="Max tokens") 42 | 43 | arg_parser.add_argument("--log_file", type=str, default="search.log", help="Log 
file") 44 | arg_parser.add_argument("--log_level", type=str, default="info", choices=['debug','info','warning','error','critical'],help="Log level") 45 | arg_parser.add_argument("--max_workers", type=int, default=4, help="Max workers") 46 | arg_parser.add_argument("--device", type=str, default="cpu",choices=['cuda','cpu'],help="Device") 47 | 48 | arg_parser.add_argument("--completion_mode",type=str,choices=['chat','completion'],default='chat',help="完成模式") 49 | arg_parser.add_argument("--query_mode",type=str,choices=['local','global'],default='local',help="查询模式") 50 | 51 | return arg_parser.parse_args() 52 | 53 | 54 | def query(): 55 | args = parse_args() 56 | 57 | log_level = getattr(LOG_LEVELS, args.log_level) 58 | logger = create_rotating_logger("search", args.log_file, level=log_level) 59 | 60 | # logging args 61 | logger.info(f"args: {args}") 62 | 63 | 64 | # 初始化环境变量 65 | # 优先从os.environ中获取,如果没有则从args中获取 66 | if os.environ.get("NEO4J_URI") is None: 67 | os.environ["NEO4J_URI"] = args.neo4j_uri 68 | if os.environ.get("NEO4J_USERNAME") is None: 69 | os.environ["NEO4J_USERNAME"] = args.neo4j_username 70 | if os.environ.get("NEO4J_PASSWORD") is None: 71 | os.environ["NEO4J_PASSWORD"] = args.neo4j_password 72 | 73 | logger.info("Connecting to Neo4j") 74 | 75 | try: 76 | graph = Neo4jGraph() 77 | except: 78 | logger.error( 79 | "Failed to connect to Neo4j" 80 | f"URI: {args.uri}, Username: {args.username}, Password: Your password" 81 | ) 82 | return 83 | 84 | # 初始化chat model 85 | if args.model_provider=="openai" and os.environ.get("OPENAI_API_KEY") is None: 86 | try: 87 | os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ") 88 | except: 89 | logger.error("Failed to get OpenAI API key") 90 | return 91 | 92 | 93 | logger.info("Initializing chat model") 94 | if args.model_provider == "openai": 95 | chat_model = ChatOpenAI(model=args.chat_model_name, base_url=args.base_url, api_key=os.environ["OPENAI_API_KEY"]) 96 | elif args.model_provider == "hf": 97 | llm = HuggingFacePipeline.from_model_id( 98 | model_id=args.repo_id, 99 | task="text-generation", 100 | pipeline_kwargs=dict( 101 | max_new_tokens=1024, 102 | temperature=0.0, 103 | do_sample=False, 104 | # repetition_penalty=1.03, 105 | return_full_text=False, 106 | # top_k=args.topK, 107 | ), 108 | model_kwargs=dict( 109 | attn_implementation="flash_attention_2" if args.flash_attn else None, 110 | torch_dtype=torch.bfloat16, 111 | ), 112 | device=0, 113 | # device_map="auto" if args.device == "cuda" else None, 114 | ) 115 | 116 | llm.pipeline.tokenizer.pad_token_id = llm.pipeline.tokenizer.eos_token_id 117 | chat_model = ChatHuggingFace(llm=llm) 118 | elif args.model_provider == "ollama": 119 | chat_model = ChatOllama( 120 | model=args.chat_model_name, 121 | num_predict=512, 122 | temperature=0.0, 123 | ) 124 | # 初始化embedding 125 | logger.info("Initializing embedding") 126 | # model_kwargs = {'device': args.device} 127 | encode_kwargs = {'normalize_embeddings': True} 128 | embedding = HuggingFaceEmbeddings(model_name=args.embedding_model_name_or_path,encode_kwargs=encode_kwargs,show_progress=True) 129 | logger.info("查询模式:{}".format(args.query_mode)) 130 | if args.query_mode == 'local': 131 | searcher = LocalSearcher( 132 | graph=graph, 133 | chat_model=chat_model, 134 | embedding=embedding, 135 | uuid=args.uuid, 136 | top_k=args.top_k, 137 | level=args.level 138 | ) 139 | elif args.query_mode == 'global': 140 | searcher = GlobalSearcher( 141 | graph=graph, 142 | chat_model=chat_model, 143 | uuid=args.uuid, 144 | 
level=args.level, 145 | max_tokens=args.max_tokens 146 | ) 147 | 148 | completion_mode = args.completion_mode 149 | 150 | logger.info("Starting query now!") 151 | # 查询逻辑 152 | if completion_mode == 'chat': 153 | try: 154 | while True: 155 | query = input("请输入您的查询 (输入 'exit' 或 'quit' 退出):\n") 156 | if query.lower() in ['exit', 'quit']: 157 | logger.info("退出程序") 158 | break 159 | else: 160 | # 在这里处理用户的查询 161 | result = searcher.invoke(query) 162 | logger.info(f"查询结果:\n{result}") 163 | except KeyboardInterrupt: 164 | print("\n程序被中断,退出。") 165 | 166 | elif completion_mode == 'completion': 167 | query = input("请输入您的查询:\n") 168 | result = searcher.invoke(query) 169 | logger.info(f"查询结果:\n{result}") 170 | 171 | logger.info("查询结束") 172 | 173 | if __name__ == "__main__": 174 | query() 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /src/index/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from langchain_community.graphs import Neo4jGraph 3 | from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate 4 | from langchain.schema.messages import SystemMessage 5 | from langchain_community.graphs.graph_document import GraphDocument 6 | from langchain_core.documents import Document 7 | 8 | from retry import retry 9 | import numpy as np 10 | import tiktoken 11 | import seaborn as sns 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | import asyncio 15 | 16 | from .prompts import SystemPrompts, UserPrompts 17 | 18 | 19 | 20 | def num_tokens_from_string(string: str, model: str = "cl100k_base") -> int: 21 | """Returns the number of tokens in a text string.""" 22 | encoding = tiktoken.get_encoding(model) 23 | num_tokens = len(encoding.encode(string)) 24 | return num_tokens 25 | 26 | 27 | def visualizeEntityTokenDistibution(graph: Neo4jGraph,user_id): 28 | entity_dist = graph.query( 29 | f""" 30 | MATCH (d:Document{user_id}) 31 | RETURN d.text AS text, 32 | count {{(d)-[:MENTIONS]->()}} AS entity_count 33 | """ 34 | ) 35 | entity_dist_df = pd.DataFrame.from_records(entity_dist) 36 | entity_dist_df["token_count"] = [ 37 | num_tokens_from_string(str(el)) for el in entity_dist_df["text"] 38 | ] 39 | # Scatter plot with regression line 40 | sns.lmplot( 41 | x="token_count", y="entity_count", data=entity_dist_df, line_kws={"color": "red"} 42 | ) 43 | plt.title("Entity Count vs Token Count Distribution") 44 | plt.xlabel("Token Count") 45 | plt.ylabel("Entity Count") 46 | plt.show() 47 | plt.savefig('entity_token_distribution.png') 48 | 49 | 50 | def visualizeCommunityEntityDistribution(graph: Neo4jGraph, user_id): 51 | # 查询每个层次的社区包含的实体的数量 52 | community_size = graph.query(f""" 53 | MATCH (c:__Community__{user_id})<-[:IN_COMMUNITY*]-(e:__Entity__{user_id}) 54 | WITH c, count(distinct e) AS entities 55 | RETURN split(c.id, '-')[0] AS level, entities 56 | """ 57 | ) 58 | 59 | community_size_df = pd.DataFrame.from_records(community_size) 60 | 61 | # 计算百分位数 62 | percentiles_data = [] 63 | for level in community_size_df['level'].unique(): 64 | subset = community_size_df[community_size_df['level'] == level]['entities'] 65 | num_communities = len(subset) 66 | percentiles = np.percentile(subset, [25, 50, 75, 90, 99]) 67 | percentiles_data.append( 68 | [ 69 | level, 70 | num_communities, 71 | percentiles[0], 72 | percentiles[1], 73 | percentiles[2], 74 | percentiles[3], 75 | percentiles[4], 76 | max(subset), 77 | ] 78 | ) 79 | 80 | percentiles_df = 
pd.DataFrame( 81 | percentiles_data, 82 | columns=[ 83 | "Level", 84 | "Num Communities", 85 | "25th Percentile", 86 | "50th Percentile", 87 | "75th Percentile", 88 | "90th Percentile", 89 | "99th Percentile", 90 | "Max Communities", 91 | ], 92 | ) 93 | 94 | # 创建图形和子图 95 | fig, axs = plt.subplots(2, 1, figsize=(10, 12), sharex=True) 96 | 97 | # 可视化最大社群数量 98 | sns.barplot(data=percentiles_df, x='Level', y='Max Communities', ax=axs[0], color='skyblue', label='Max Communities', alpha=0.7) 99 | sns.barplot(data=percentiles_df, x='Level', y='50th Percentile', ax=axs[0], color='orange', label='50th Percentile', alpha=0.5) 100 | axs[0].set_title('Community Entity Distribution by Level', fontsize=16) 101 | axs[0].set_ylabel('Number of Communities', fontsize=14) 102 | axs[0].legend() 103 | axs[0].grid(axis='y') 104 | 105 | # 可视化社群个数 106 | sns.barplot(data=percentiles_df, x='Level', y='Num Communities', ax=axs[1], color='lightgreen') 107 | axs[1].set_title('Number of Communities by Level', fontsize=16) 108 | axs[1].set_xlabel('Community Level', fontsize=14) 109 | axs[1].set_ylabel('Number of Communities', fontsize=14) 110 | axs[1].grid(axis='y') 111 | 112 | plt.xticks(rotation=45) 113 | plt.tight_layout() 114 | 115 | # 保存集成的图像为 PNG 文件 116 | plt.savefig(f'community_distribution_combined_{user_id}.png', dpi=300) 117 | plt.show() 118 | 119 | 120 | def countNodesMerged(user_id,merged_entities,graph: Neo4jGraph): 121 | count = graph.query(""" 122 | UNWIND $data AS candidates 123 | CALL {{ 124 | WITH candidates 125 | MATCH (e:{label}) WHERE e.id IN candidates 126 | RETURN collect(e) AS nodes 127 | }} 128 | CALL apoc.refactor.mergeNodes(nodes, {{properties: {{`.*`: 'discard'}}}}) 129 | YIELD node 130 | RETURN count(*) 131 | """.format(label=f"__Entity__{user_id}"), params={"data": merged_entities} 132 | ) 133 | print(f"{count} nodes merged") 134 | 135 | def prepare_string(data): 136 | nodes_str = "Nodes are:\n" 137 | for node in data['nodes']: 138 | node_id = node['id'] 139 | node_type = node['type'] 140 | if 'description' in node and node['description']: 141 | node_description = f", description: {node['description']}" 142 | else: 143 | node_description = "" 144 | nodes_str += f"id: {node_id}, type: {node_type}{node_description}\n" 145 | rels_str = "Relationships are:\n" 146 | for rel in data['rels']: 147 | start = rel['start'] 148 | end = rel['end'] 149 | rel_type = rel['type'] 150 | if 'description' in rel and rel['description']: 151 | description = f", description: {rel['description']}" 152 | else: 153 | description = "" 154 | rels_str += f"({start})-[:{rel_type}]->({end}){description}\n" 155 | 156 | return nodes_str + "\n" + rels_str 157 | 158 | def create_prompt(model_name): 159 | system_prompt = SystemPrompts.GRAPHSYSTEMPROMPT.format(model_name=model_name) 160 | system_message = SystemMessage(content=system_prompt) 161 | human_message = HumanMessagePromptTemplate.from_template(UserPrompts.GRAPH_USER_PROMPT) 162 | chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message]) 163 | return chat_prompt 164 | 165 | 166 | async def aprocess_summaries(summaries, title_chain): 167 | tasks = [asyncio.create_task(title_chain.ainvoke({"summary": summary})) for summary in summaries] 168 | results = await asyncio.gather(*tasks) 169 | return results 170 | 171 | def process_summaries(summary, title_chain): 172 | result = title_chain.invoke({"summary": summary}) 173 | return result 174 | 175 | async def aprocess_communities(community_info, community_chain): 176 | string_info_list = 
[prepare_string(community) for community in community_info] 177 | tasks = [asyncio.create_task(community_chain.ainvoke({'community_info': string_info})) for string_info in string_info_list] 178 | results = await asyncio.gather(*tasks) 179 | info_summary = [] 180 | for community, result in zip(community_info, results): 181 | summary = result.output 182 | info_summary.append( 183 | {"community": community['communityId'], "summary": summary}) 184 | return info_summary 185 | 186 | def process_communities(community, community_chain): 187 | stringify_info = prepare_string(community) 188 | summary = community_chain.invoke({'community_info': stringify_info}) 189 | return {"community": community['communityId'], "summary": summary} 190 | 191 | 192 | def process_text(text: str, model) -> List[GraphDocument]: 193 | doc = Document(page_content=text) 194 | return model.convert_to_graph_documents([doc]) 195 | 196 | 197 | async def aprocess_text(texts: List[str], model) -> List[GraphDocument]: 198 | docs = [Document(page_content=text) for text in texts] 199 | return await model.aconvert_to_graph_documents(docs) 200 | 201 | @retry(tries=3, delay=2) 202 | async def aentity_resolution(entities: List[str], extraction_chain) -> Optional[List[str]]: 203 | results = await extraction_chain.ainvoke({"entities": entities}) 204 | return [el.entities for el in results.merge_entities] 205 | 206 | 207 | @retry(tries=3, delay=2) 208 | def entity_resolution(entities: List[str], extraction_chain) -> Optional[List[str]]: 209 | return [el.entities for el in extraction_chain.invoke({"entities": entities}).merge_entities] 210 | 211 | -------------------------------------------------------------------------------- /src/index/cypher_query.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class CypherQuery: 4 | def __init__(self,graph): 5 | self.graph = graph 6 | 7 | def set_entity(self,uuid=None): 8 | self.graph.query( 9 | f""" 10 | MATCH (n:`__Entity__`) 11 | REMOVE n:`__Entity__` 12 | SET n:`__Entity__{uuid}` 13 | """ 14 | ) 15 | 16 | def set_document(self,uuid=None): 17 | self.graph.query( 18 | f""" 19 | MATCH (n:`Document`) 20 | REMOVE n:`Document` 21 | SET n:`Document{uuid}` 22 | """ 23 | ) 24 | 25 | # FIXME: 注册gds.graph时也要加上uuid,不然可能导致多进程误删除 26 | def drop_entites(self): 27 | # 删除名字为entities的图 28 | try: 29 | self.graph.query( 30 | """ 31 | CALL gds.graph.drop('entities') 32 | """ 33 | ) 34 | except: 35 | print("`entities` does not exist") 36 | 37 | # FIXME: 注册gds.graph时也要加上uuid,不然可能导致多进程误删除 38 | def drop_communities(self): 39 | try: 40 | self.graph.query( 41 | f""" 42 | CALL gds.graph.drop('communities') 43 | """ 44 | ) 45 | except: 46 | print("`communities` does not exist") 47 | 48 | # 社区检测与聚类分析 49 | def detect(self,uuid,word_edit_distance): 50 | return self.graph.query( 51 | f"""MATCH (e:`__Entity__{uuid}`) 52 | WHERE size(e.id) > 4 // longer than 4 characters 53 | WITH e.wcc AS community, collect(e) AS nodes, count(*) AS count 54 | WHERE count > 1 55 | UNWIND nodes AS node 56 | // Add text distance 57 | WITH distinct 58 | [n IN nodes WHERE apoc.text.distance(toLower(node.id), toLower(n.id)) < $distance | n.id] AS intermediate_results 59 | WHERE size(intermediate_results) > 1 60 | WITH collect(intermediate_results) AS results 61 | // combine groups together if they share elements 62 | UNWIND range(0, size(results)-1, 1) as index 63 | WITH results, index, results[index] as result 64 | WITH apoc.coll.sort(reduce(acc = result, index2 IN range(0, size(results)-1, 1) | 65 | CASE 
WHEN index <> index2 AND 66 | size(apoc.coll.intersection(acc, results[index2])) > 0 67 | THEN apoc.coll.union(acc, results[index2]) 68 | ELSE acc 69 | END 70 | )) as combinedResult 71 | WITH distinct(combinedResult) as combinedResult 72 | // extra filtering 73 | WITH collect(combinedResult) as allCombinedResults 74 | UNWIND range(0, size(allCombinedResults)-1, 1) as combinedResultIndex 75 | WITH allCombinedResults[combinedResultIndex] as combinedResult, combinedResultIndex, allCombinedResults 76 | WHERE NOT any(x IN range(0,size(allCombinedResults)-1,1) 77 | WHERE x <> combinedResultIndex 78 | AND apoc.coll.containsAll(allCombinedResults[x], combinedResult) 79 | ) 80 | RETURN combinedResult 81 | """, params={'distance': word_edit_distance} 82 | ) 83 | 84 | def add_constraints_for_community(self,uuid=None): 85 | self.graph.query(f"CREATE CONSTRAINT IF NOT EXISTS FOR (c:`__Community__{uuid}`) REQUIRE c.id IS UNIQUE;") 86 | 87 | # 构造层次聚类 88 | def constructing_hierarchical_clustering(self,uuid=None): 89 | return self.graph.query(""" 90 | MATCH (e:`{entity_label}`) 91 | UNWIND range(0, size(e.communities) - 1 , 1) AS index 92 | CALL {{ 93 | WITH e, index 94 | WITH e, index 95 | WHERE index = 0 96 | MERGE (c:`{community_label}` {{id: toString(index) + '-' + toString(e.communities[index])}}) 97 | ON CREATE SET c.level = index 98 | MERGE (e)-[:IN_COMMUNITY]->(c) 99 | RETURN count(*) AS count_0 100 | }} 101 | CALL {{ 102 | WITH e, index 103 | WITH e, index 104 | WHERE index > 0 105 | MERGE (current:`{community_label}` {{id: toString(index) + '-' + toString(e.communities[index])}}) 106 | ON CREATE SET current.level = index 107 | MERGE (previous:`{community_label}` {{id: toString(index - 1) + '-' + toString(e.communities[index - 1])}}) 108 | ON CREATE SET previous.level = index - 1 109 | MERGE (previous)-[:IN_COMMUNITY]->(current) 110 | RETURN count(*) AS count_1 111 | }} 112 | RETURN count(*) 113 | """.format(entity_label=f"__Entity__{uuid}",community_label=f"__Community__{uuid}") 114 | ) 115 | 116 | def set_community_rank(self,uuid=None): 117 | self.graph.query(f""" 118 | MATCH (c:`__Community__{uuid}`)<-[:IN_COMMUNITY*]-(:`__Entity__{uuid}`)<-[:MENTIONS]-(d:`Document{uuid}`) // 匹配社区文档 119 | WITH c, count(distinct d) AS rank // 计算每个社区包含的不同的文档数量作为社区的排名 120 | SET c.community_rank = rank; // 设置社区排名 121 | """ 122 | ) 123 | 124 | def set_node_degree(self,uuid=None): 125 | node_degree_query = f""" 126 | MATCH (n) 127 | WHERE ANY(label IN labels(n) WHERE label ENDS WITH '{uuid}') 128 | SET n.degree = apoc.node.degree(n) 129 | RETURN count(n) AS modified_nodes; 130 | """ 131 | self.graph.query(node_degree_query) 132 | 133 | def set_relationship_degree(self,uuid=None): 134 | relationship_degree_query = f""" 135 | MATCH (n) 136 | WHERE n.degree is not NULL and ANY(label IN labels(n) WHERE label ENDS WITH '{uuid}') 137 | WITH n as source 138 | MATCH (source)-[r]->(target) 139 | WHERE target.degree is not null and ANY(label IN labels(target) WHERE label ENDS WITH '{uuid}') 140 | SET r.source_degree=source.degree,r.target_degree=target.degree,r.rank=source.degree+target.degree 141 | RETURN COUNT(r) AS modified_relationships; // 返回被修改的边的数量 142 | """ 143 | self.graph.query(relationship_degree_query) 144 | 145 | def set_text_unit_ids(self,uuid=None): 146 | text_unit_ids_query = f""" 147 | MATCH (n:`__Entity__{uuid}`) 148 | MATCH (p:`Document{uuid}`) 149 | WHERE p.text IS NOT NULL 150 | WITH n, collect(p) AS text_units 151 | UNWIND text_units AS text_unit 152 | WITH n,text_unit 153 | WHERE text_unit.text CONTAINS 
n.id // 使用 CONTAINS 检查 154 | WITH n, collect(text_unit.id) AS text_unit_ids 155 | SET n.text_unit_ids = text_unit_ids 156 | RETURN count(DISTINCT n) AS modified_nodes; 157 | """ 158 | self.graph.query(text_unit_ids_query) 159 | 160 | def set_relationship_source_and_target(self,uuid=None): 161 | self.graph.query( 162 | f""" 163 | MATCH (n:`__Entity__{uuid}`)-[r]->(m:`__Entity__{uuid}`) 164 | WITH n,r,m 165 | SET r.source = n.id, r.target = m.id 166 | RETURN count(r) AS modified_relationships 167 | """ 168 | ) 169 | 170 | def set_communities(self,uuid=None): 171 | self.graph.query( 172 | f""" 173 | MATCH (n:`__Entity__{uuid}`)-[:IN_COMMUNITY*]->(c:`__Community__{uuid}`) 174 | WITH n, collect(c.id) AS community_ids 175 | SET n.communities = community_ids 176 | RETURN count(n) AS modified_nodes; 177 | """ 178 | ) 179 | 180 | def get_community_info(self,user_id=None): 181 | return self.graph.query(""" 182 | MATCH (c:`{community_label}`)<-[:IN_COMMUNITY*]-(e:`{entity_label}`) // 匹配社区实体 183 | // WHERE c.level in [1] 184 | WITH c, collect(e) AS nodes 185 | WHERE size(nodes) > 1 186 | CALL apoc.path.subgraphAll(nodes[0], {{ 187 | whitelistNodes:nodes 188 | }}) 189 | YIELD relationships 190 | RETURN c.id AS communityId, 191 | [n in nodes | {{id: n.id, description: n.description, type: [el in labels(n) WHERE el <> '{entity_label}'][0]}}] AS nodes, 192 | [r in relationships | {{start: startNode(r).id, type: type(r), end: endNode(r).id, description: r.description}}] AS rels 193 | """.format(entity_label=f"__Entity__{user_id}", community_label=f"__Community__{user_id}") 194 | ) 195 | 196 | def store_info(self,info,uuid=None): 197 | self.graph.query( 198 | f""" 199 | UNWIND $info AS info 200 | MATCH (c:`__Community__{uuid}` {{id: info.community}}) 201 | SET c.summary = info.summary,c.title = info.title 202 | """, params={"info": info} 203 | ) -------------------------------------------------------------------------------- /src/index/api_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from typing import Dict, List 4 | from tqdm.asyncio import tqdm 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | 7 | # langchain 8 | from langchain_core.output_parsers import StrOutputParser 9 | from langchain_openai import ChatOpenAI,OpenAI 10 | from langchain_community.vectorstores import Neo4jVector 11 | from langchain_community.graphs import Neo4jGraph 12 | from langchain_core.prompts import ChatPromptTemplate 13 | from langchain_experimental.graph_transformers import LLMGraphTransformer 14 | from langchain_core.language_models.chat_models import BaseChatModel 15 | from langchain_text_splitters.base import TextSplitter 16 | from langchain_core.embeddings.embeddings import Embeddings 17 | from langchain_core.documents import Document 18 | # Graph 19 | import json_repair 20 | from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship 21 | # utils 22 | from .utils import num_tokens_from_string,create_prompt,process_text,entity_resolution,process_communities,process_summaries,countNodesMerged 23 | from ..utils.logger import create_rotating_logger 24 | from logging import Logger 25 | from graphdatascience import GraphDataScience 26 | from .cypher_query import CypherQuery 27 | 28 | # prompt 29 | from .prompts import SystemPrompts, UserPrompts 30 | # hf 31 | from transformers import AutoTokenizer 32 | # pydantic models 33 | from .pydantic_models import Disambiguate, GetTitle 34 | 35 | 
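# A minimal wiring sketch for the ApiIndex class defined below, as it might be
# driven from a separate script such as build_index.py (not shown in this section).
# Assumed, not taken from the project code: LangChain's RecursiveCharacterTextSplitter
# stands in for the project's slide-window splitter, the connection details and the
# document payload are placeholders, and the Neo4j instance has the APOC and GDS
# plugins installed.
#
#   import asyncio
#   from graphdatascience import GraphDataScience
#   from langchain_community.graphs import Neo4jGraph
#   from langchain_huggingface import HuggingFaceEmbeddings
#   from langchain_openai import ChatOpenAI
#   from langchain_text_splitters import RecursiveCharacterTextSplitter
#   from src.index.api_index import ApiIndex
#
#   uri, user, password = "bolt://localhost:7687", "neo4j", "password"  # placeholders
#   graph = Neo4jGraph(url=uri, username=user, password=password)       # needs APOC
#   gds = GraphDataScience(uri, auth=(user, password))                  # needs the GDS plugin
#
#   index = ApiIndex(
#       graph=graph,
#       chat_model=ChatOpenAI(model="gpt-4o-mini"),
#       embedding=HuggingFaceEmbeddings(model_name="BAAI/bge-m3"),
#       splitter=RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64),
#       gds=gds,
#       uuid="demo",   # suffix appended to every label so one user's graph stays isolated
#       max_workers=4,
#   )
#   # create_index() is async and expects [{"title": ..., "text": ...}, ...] (see _preprocess)
#   docs = [{"title": "example", "text": "Raw text to be chunked, extracted and indexed."}]
#   asyncio.run(index.create_index(docs))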
COMMUNITY_TEMPLEATE = """Based on the provided nodes and relationships that belong to the same graph community, 36 | generate a natural language summary of the provided information: 37 | {community_info} 38 | 39 | Summary:""" # noqa: E501 40 | 41 | community_prompt = ChatPromptTemplate.from_messages( 42 | [ 43 | ( 44 | "system", 45 | "Given an input triples, generate the information summary. No pre-amble.", 46 | ), 47 | ("human", COMMUNITY_TEMPLEATE), 48 | ] 49 | ) 50 | 51 | TITLE_TEMPLATE = """Given the following summary, provide a title that best represents the content: 52 | {summary} 53 | 54 | Title:""" 55 | 56 | title_prompt = ChatPromptTemplate.from_messages( 57 | [ 58 | ( 59 | "system", 60 | "Given a summary, generate a title that best represents the content. No pre-amble.", 61 | ), 62 | ("human", TITLE_TEMPLATE), 63 | ] 64 | ) 65 | 66 | 67 | class ApiIndex(object): 68 | def __init__( 69 | self, 70 | graph:Neo4jGraph, 71 | chat_model:BaseChatModel, 72 | embedding:Embeddings, 73 | splitter:TextSplitter, 74 | gds:GraphDataScience, 75 | logger:Logger=None, 76 | max_workers:int=4, 77 | gds_similarity_threshold:float=0.95, 78 | word_edit_distance:int = 3, 79 | uuid:str="", 80 | model_name="gpt-4o-mini" 81 | ): 82 | self.graph = graph 83 | self.chat_model = chat_model 84 | self.model_name = model_name 85 | self.embedding = embedding 86 | self.splitter = splitter 87 | self.gds = gds 88 | self.cypherQuery = CypherQuery(graph=graph) 89 | if not logger: 90 | self.logger = create_rotating_logger("index") 91 | else: 92 | self.logger = logger 93 | 94 | self.MAX_WORKERS = max_workers 95 | self.GDS_SIMILARITY_THRESHOLD = gds_similarity_threshold 96 | self.WORD_EDIT_DISTANCE = word_edit_distance 97 | self.uuid = uuid 98 | 99 | def _preprocess(self,documents:List[Dict[str,str]]): 100 | self.logger.info("Chunking documents") 101 | data = [] 102 | for document in documents: 103 | title,text = document['title'],document['text'] 104 | chunks = self.splitter.split_text(text) 105 | for chunk in chunks: 106 | data.append({"title": title, "text": chunk}) 107 | 108 | return pd.DataFrame(data) 109 | 110 | def _parse_hf_ollama(self,content:str,source:Document): 111 | try: 112 | breakpoint() 113 | parsed_json = json_repair.loads(content) 114 | relationships = [] 115 | nodes_set = set() 116 | for rel in parsed_json: 117 | # Nodes need to be deduplicated using a set 118 | if "head_description" in rel.keys(): 119 | nodes_set.add((rel["head"], rel["head_type"], rel["head_description"])) 120 | else: 121 | nodes_set.add((rel["head"], rel["head_type"])) 122 | if "tail_description" in rel.keys(): 123 | nodes_set.add((rel["tail"], rel["tail_type"], rel["tail_description"])) 124 | else: 125 | nodes_set.add((rel["tail"], rel["tail_type"])) 126 | source_node = Node(id=rel["head"], type=rel["head_type"]) 127 | target_node = Node(id=rel["tail"], type=rel["tail_type"]) 128 | relationships.append( 129 | Relationship( 130 | source=source_node, target=target_node, type=rel["relation"] 131 | ) 132 | ) 133 | nodes = [] 134 | for el in list(nodes_set): 135 | if len(el) == 3: 136 | node = Node(id=el[0], type=el[1], properties={"description": el[2]}) 137 | else: 138 | node = Node(id=el[0], type=el[1]) 139 | nodes.append(node) 140 | 141 | return GraphDocument(nodes=nodes, relationships=relationships,source=source) 142 | except: 143 | self.logger.error(f"不是一个合法的Json") 144 | return None 145 | 146 | async def _create_nodes_and_relationships(self,documents:List[str]): 147 | data = self._preprocess(documents) 148 | documents = 
[Document(page_content=f"{row['title']} {row['text']}") for i, row in data.iterrows()] 149 | 150 | # 如果是openai模型,直接调用convert_to_graph_documents 151 | if isinstance(self.chat_model,ChatOpenAI): 152 | llm_transformer = LLMGraphTransformer( 153 | llm=self.chat_model, 154 | node_properties=["description"], 155 | relationship_properties=["description"], 156 | prompt=create_prompt(self.chat_model.name), 157 | ) 158 | graph_documents = await llm_transformer.aconvert_to_graph_documents(documents) 159 | else: 160 | chat_prompt = create_prompt(self.chat_model.name) 161 | processed_documents = [] 162 | for document in documents: 163 | prompt = chat_prompt.format_messages(input=document.page_content) 164 | processed_documents.append(self.chat_model.invoke(prompt)) 165 | graph_documents = [self._parse_hf_ollama(document.content,source) for (document,source) in zip(processed_documents,documents)] 166 | graph_documents = [graph_document for graph_document in graph_documents if graph_document] 167 | 168 | return graph_documents 169 | 170 | async def create_index(self,documents:List[str]): 171 | self.logger.info("Create_nodes_and_relationships") 172 | graph_documents = await self._create_nodes_and_relationships(documents) 173 | 174 | for graph_document in graph_documents: 175 | for node in graph_document.nodes: 176 | node.type = node.type + f"{self.uuid}" 177 | 178 | for relationship in graph_document.relationships: 179 | relationship.type = relationship.type + f"{self.uuid}" 180 | relationship.source.type += f"{self.uuid}" 181 | relationship.target.type += f"{self.uuid}" 182 | 183 | # 将结点和关系存入图数据库 184 | self.graph.add_graph_documents( 185 | graph_documents, 186 | baseEntityLabel=True, 187 | include_source=True 188 | ) 189 | # 查询所有标签是__Entity__的结点,并修改成__Entity__+用户id 190 | self.cypherQuery.set_entity(self.uuid) 191 | # 查询所有标签是Document的结点,并修改成Document+用户id 192 | self.cypherQuery.set_document(self.uuid) 193 | 194 | self.graph.refresh_schema() 195 | Neo4jVector.from_existing_graph( 196 | self.embedding, 197 | node_label=f'__Entity__{self.uuid}', 198 | text_node_properties=['id', 'description'], 199 | index_name=f"{self.uuid}" if (self.uuid != None and self.uuid != "") else "vector", 200 | embedding_node_property='embedding', 201 | graph=self.graph, 202 | ) 203 | try: 204 | self.cypherQuery.drop_entites() 205 | except: 206 | pass 207 | 208 | # 1.create the k-nearest neighbor graph 209 | G, _ = self.gds.graph.project( 210 | "entities", # Graph name # FIXME: 注册gds.graph时也要加上uuid,不然可能导致多进程误删除 211 | f"__Entity__{self.uuid}", # Node projection 212 | "*", # Relationship projection 213 | nodeProperties=["embedding"] # Configuration parameters 214 | ) 215 | # 2.algorithm: k-nearest neighbors 216 | self.gds.knn.mutate( 217 | G, 218 | nodeProperties=['embedding'], 219 | mutateRelationshipType='SIMILAR', 220 | mutateProperty='score', 221 | similarityCutoff=self.GDS_SIMILARITY_THRESHOLD, 222 | ) 223 | # 3.store graph with weak connected components 224 | self.gds.wcc.write( 225 | G, 226 | writeProperty="wcc", 227 | relationshipTypes=["SIMILAR"] 228 | ) 229 | # 4. 
KEY:社区检测与聚类分析 230 | 231 | potential_duplicate_candidates = self.cypherQuery.detect(self.uuid,self.WORD_EDIT_DISTANCE) 232 | extraction_llm = self.chat_model.with_structured_output(Disambiguate) 233 | extraction_prompt = ChatPromptTemplate.from_messages( 234 | [ 235 | ( 236 | "system", 237 | SystemPrompts.IDENTIFY_SYSTEM_PROMPT, 238 | ), 239 | ( 240 | "human", 241 | UserPrompts.IDENTIFY_USER_PROMPT, # noqa: E501, 242 | ), 243 | ] 244 | ) 245 | extraction_chain = extraction_prompt | extraction_llm 246 | merged_entities = [] 247 | with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor: 248 | futures = [executor.submit(entity_resolution, el['combinedResult'],extraction_chain) for el in potential_duplicate_candidates] 249 | for future in tqdm(as_completed(futures), total=len(futures), desc="Processing documents"): 250 | try: 251 | to_merge = future.result() 252 | if to_merge: 253 | merged_entities.extend(to_merge) 254 | except Exception as e: 255 | self.logger.error("模型没法进行这条任务的实体解析") 256 | self.logger.info(countNodesMerged(self.uuid,merged_entities,self.graph)) 257 | 258 | G.drop() 259 | 260 | self.cypherQuery.drop_communities() 261 | 262 | # 1.project into memory 263 | G, _ = self.gds.graph.project( 264 | f"communities", # Graph name # FIXME: 注册gds.graph时也要加上uuid,不然可能导致多进程误删除 265 | f"__Entity__{self.uuid}", # Node projection 266 | { 267 | "_ALL_": { 268 | "type": "*", 269 | "orientation": "UNDIRECTED", 270 | "properties": {"weight": {"property": "*", "aggregation": "COUNT"}}, 271 | } 272 | }, 273 | ) 274 | 275 | # 2. LeiDen聚类 276 | self.gds.leiden.write( 277 | G, 278 | writeProperty=f"communities", 279 | includeIntermediateCommunities=True, 280 | relationshipWeightProperty="weight", 281 | ) 282 | 283 | # 添加约束 284 | self.cypherQuery.add_constraints_for_community(self.uuid) 285 | 286 | # 构造层次聚类 287 | merged_nodes = self.cypherQuery.constructing_hierarchical_clustering(self.uuid) 288 | self.logger.info(f"{merged_nodes[0]['count(*)']} nodes merged") 289 | 290 | # 设置社区rank 291 | self.cypherQuery.set_community_rank(self.uuid) 292 | 293 | # 设置结点与边的额外信息---用于后续的查询 294 | # 我们需要给所有实体结点设置度数;给边设置`source_degree`, `target_degree`, `rank`属性 295 | # 此外需要给每个实体设置其包含的text_unit_ids 296 | # 还需要给relationship设置source和target的属性,表示其链接到的结点的内容 297 | # 增加:需要给每个结点设置communities属性,是一个列表id,表示结点所在的社区 298 | # 1. node degree 299 | self.cypherQuery.set_node_degree(self.uuid) 300 | # 2. relationship degree 301 | self.cypherQuery.set_relationship_degree(self.uuid) 302 | # 3. text_unit_ids 303 | self.cypherQuery.set_text_unit_ids(self.uuid) 304 | # 4. relationship设置source和target的属性 305 | self.cypherQuery.set_relationship_source_and_target(self.uuid) 306 | # 5. 
设置communities属性 307 | self.cypherQuery.set_communities(self.uuid) 308 | 309 | # 准备工作结束,开始summarization 310 | 311 | community_info = self.cypherQuery.get_community_info(self.uuid) 312 | 313 | 314 | community_chain = community_prompt | self.chat_model | StrOutputParser() # TODO:增加报错处理 315 | summaries = [] 316 | 317 | with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor: 318 | futures = {executor.submit(process_communities, community, community_chain) for community in community_info} 319 | for future in tqdm(as_completed(futures), total=len(futures), desc="Processing documents"): 320 | summary = future.result() 321 | summaries.append(summary) 322 | 323 | 324 | title_chain = title_prompt | self.chat_model | StrOutputParser() 325 | titles = [] 326 | with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor: 327 | futures = {executor.submit(process_summaries, summary, title_chain) for summary in summaries} 328 | for future in tqdm(as_completed(futures), total=len(futures), desc="Processing Title"): 329 | title = future.result() 330 | titles.append(title) 331 | 332 | assert len(summaries) == len(titles) 333 | info = [{**summary, 'title': title} for summary, title in zip(summaries, titles)] 334 | 335 | # Store info 336 | self.cypherQuery.store_info(info,uuid=self.uuid) 337 | 338 | G.drop() -------------------------------------------------------------------------------- /index.log: -------------------------------------------------------------------------------- 1 | 2024-11-25 21:36:12,336 - build_index - INFO - Start building index 2 | 2024-11-25 21:36:12,342 - build_index - INFO - Connecting to Neo4j 3 | 2024-11-25 21:42:31,210 - build_index - INFO - Start building index 4 | 2024-11-25 21:42:31,216 - build_index - INFO - Connecting to Neo4j 5 | 2024-11-25 21:43:43,507 - build_index - INFO - Start building index 6 | 2024-11-25 21:43:43,513 - build_index - INFO - Connecting to Neo4j 7 | 2024-11-25 21:43:43,530 - build_index - ERROR - Failed to connect to Neo4jURI: bolt://localhost:7687, Username: neo4j, Password: langchaingraphrag 8 | 2024-11-25 21:44:33,680 - build_index - INFO - Start building index 9 | 2024-11-25 21:44:33,686 - build_index - INFO - Connecting to Neo4j 10 | 2024-11-25 21:44:33,702 - build_index - ERROR - Meet error Could not use APOC procedures. Please ensure the APOC plugin is installed in Neo4j and that 'apoc.meta.data()' is allowed in Neo4j configuration when connecting to Neo4j 11 | Failed to connect to Neo4jURI: bolt://localhost:7687, Username: neo4j, Password: langchaingraphrag 12 | 2024-11-25 21:45:45,003 - build_index - INFO - Start building index 13 | 2024-11-25 21:45:45,008 - build_index - INFO - Connecting to Neo4j 14 | 2024-11-25 21:45:45,024 - build_index - ERROR - Meet error Could not use APOC procedures. 
Please ensure the APOC plugin is installed in Neo4j and that 'apoc.meta.data()' is allowed in Neo4j configuration when connecting to Neo4j 15 | Failed to connect to Neo4jURI: bolt://localhost:7687, Username: neo4j, Password: langchaingraphrag 16 | 2024-11-26 10:17:25,487 - build_index - INFO - Start building index 17 | 2024-11-26 10:17:25,494 - build_index - INFO - Connecting to Neo4j 18 | 2024-11-26 10:17:52,886 - build_index - ERROR - Failed to get OpenAI API key 19 | 2024-11-26 10:18:51,127 - build_index - INFO - Start building index 20 | 2024-11-26 10:18:51,132 - build_index - INFO - Connecting to Neo4j 21 | 2024-11-26 10:18:56,390 - build_index - INFO - Initializing chat model 22 | 2024-11-26 10:18:56,447 - build_index - INFO - Initializing embedding 23 | 2024-11-26 10:22:15,703 - build_index - INFO - Start building index 24 | 2024-11-26 10:22:15,709 - build_index - INFO - Connecting to Neo4j 25 | 2024-11-26 10:22:34,162 - build_index - INFO - Initializing chat model 26 | 2024-11-26 10:22:34,224 - build_index - INFO - Initializing embedding 27 | 2024-11-26 10:25:28,630 - build_index - INFO - Initializing splitter 28 | 2024-11-26 10:25:28,633 - build_index - INFO - Initializing GDS 29 | 2024-11-26 10:25:28,727 - build_index - INFO - Initializing index 30 | 2024-11-26 10:25:28,741 - build_index - INFO - Building index now! 31 | 2024-11-26 10:30:37,380 - build_index - INFO - Start building index 32 | 2024-11-26 10:30:37,387 - build_index - INFO - Connecting to Neo4j 33 | 2024-11-26 10:36:32,881 - build_index - INFO - Initializing chat model 34 | 2024-11-26 10:36:32,980 - build_index - INFO - Initializing embedding 35 | 2024-11-26 10:36:39,310 - build_index - INFO - Initializing splitter 36 | 2024-11-26 10:36:39,311 - build_index - INFO - Initializing GDS 37 | 2024-11-26 10:36:39,337 - build_index - INFO - Initializing index 38 | 2024-11-26 10:36:39,349 - build_index - INFO - Building index now! 39 | 2024-11-26 10:38:56,895 - build_index - INFO - Start building index 40 | 2024-11-26 10:38:56,899 - build_index - INFO - Connecting to Neo4j 41 | 2024-11-26 10:44:37,513 - build_index - INFO - Initializing chat model 42 | 2024-11-26 10:44:37,595 - build_index - INFO - Initializing embedding 43 | 2024-11-26 10:44:43,994 - build_index - INFO - Initializing splitter 44 | 2024-11-26 10:44:43,994 - build_index - INFO - Initializing GDS 45 | 2024-11-26 10:44:44,025 - build_index - INFO - Initializing index 46 | 2024-11-26 10:44:44,036 - build_index - INFO - Building index now! 47 | 2024-11-26 10:46:05,949 - build_index - INFO - Chunking documents 48 | 2024-11-26 10:46:14,868 - build_index - INFO - Start building index 49 | 2024-11-26 10:46:14,873 - build_index - INFO - Connecting to Neo4j 50 | 2024-11-26 10:46:29,714 - build_index - INFO - Initializing chat model 51 | 2024-11-26 10:46:29,771 - build_index - INFO - Initializing embedding 52 | 2024-11-26 10:46:36,397 - build_index - INFO - Initializing splitter 53 | 2024-11-26 10:46:36,398 - build_index - INFO - Initializing GDS 54 | 2024-11-26 10:46:36,437 - build_index - INFO - Initializing index 55 | 2024-11-26 10:46:36,451 - build_index - INFO - Building index now! 
56 | 2024-11-26 10:46:40,097 - build_index - INFO - Chunking documents 57 | 2024-11-26 16:32:30,675 - build_index - INFO - Start building index 58 | 2024-11-26 16:32:30,680 - build_index - INFO - Connecting to Neo4j 59 | 2024-11-26 16:35:18,556 - build_index - ERROR - Failed to get OpenAI API key 60 | 2024-11-26 16:35:34,137 - build_index - INFO - Start building index 61 | 2024-11-26 16:35:34,141 - build_index - INFO - Connecting to Neo4j 62 | 2024-11-26 16:36:07,814 - build_index - INFO - Initializing chat model 63 | 2024-11-26 16:36:07,879 - build_index - INFO - Initializing embedding 64 | 2024-11-26 16:36:15,558 - build_index - INFO - Initializing splitter 65 | 2024-11-26 16:36:15,558 - build_index - INFO - Initializing GDS 66 | 2024-11-26 16:36:15,578 - build_index - INFO - Initializing index 67 | 2024-11-26 16:36:15,588 - build_index - INFO - Building index now! 68 | 2024-11-26 16:36:15,589 - build_index - INFO - Chunking documents 69 | 2024-11-26 16:39:40,385 - build_index - INFO - Start building index 70 | 2024-11-26 16:39:40,391 - build_index - INFO - Connecting to Neo4j 71 | 2024-11-26 16:39:48,032 - build_index - INFO - Initializing chat model 72 | 2024-11-26 16:39:48,090 - build_index - INFO - Initializing embedding 73 | 2024-11-26 16:39:55,944 - build_index - INFO - Initializing splitter 74 | 2024-11-26 16:39:55,944 - build_index - INFO - Initializing GDS 75 | 2024-11-26 16:39:55,967 - build_index - INFO - Initializing index 76 | 2024-11-26 16:39:55,978 - build_index - INFO - Building index now! 77 | 2024-11-26 16:39:55,978 - build_index - INFO - Chunking documents 78 | 2024-11-26 16:46:11,163 - build_index - INFO - Start building index 79 | 2024-11-26 16:46:11,167 - build_index - INFO - Connecting to Neo4j 80 | 2024-11-26 16:46:35,611 - build_index - INFO - Initializing chat model 81 | 2024-11-26 16:46:35,698 - build_index - INFO - Initializing embedding 82 | 2024-11-26 16:46:43,186 - build_index - INFO - Initializing splitter 83 | 2024-11-26 16:46:43,187 - build_index - INFO - Initializing GDS 84 | 2024-11-26 16:46:43,211 - build_index - INFO - Initializing index 85 | 2024-11-26 16:46:43,224 - build_index - INFO - Building index now! 86 | 2024-11-26 16:46:43,224 - build_index - INFO - Chunking documents 87 | 2024-11-26 16:47:40,182 - build_index - INFO - Start building index 88 | 2024-11-26 16:47:40,188 - build_index - INFO - Connecting to Neo4j 89 | 2024-11-26 16:47:45,687 - build_index - INFO - Initializing chat model 90 | 2024-11-26 16:47:45,742 - build_index - INFO - Initializing embedding 91 | 2024-11-26 16:47:53,678 - build_index - INFO - Initializing splitter 92 | 2024-11-26 16:47:53,678 - build_index - INFO - Initializing GDS 93 | 2024-11-26 16:47:53,699 - build_index - INFO - Initializing index 94 | 2024-11-26 16:47:53,712 - build_index - INFO - Building index now! 
95 | 2024-11-26 16:47:53,712 - build_index - INFO - Chunking documents 96 | 2024-11-26 16:48:10,278 - build_index - INFO - None 97 | 2024-11-26 16:48:10,873 - build_index - INFO - 76 nodes merged 98 | 2024-11-26 16:48:18,765 - build_index - INFO - Index built successfully 99 | 2024-11-26 16:48:39,576 - build_index - INFO - Start building index 100 | 2024-11-26 16:48:39,581 - build_index - INFO - Connecting to Neo4j 101 | 2024-11-26 16:48:39,852 - build_index - INFO - Initializing chat model 102 | 2024-11-26 16:50:40,671 - build_index - INFO - Start building index 103 | 2024-11-26 16:50:40,678 - build_index - INFO - Connecting to Neo4j 104 | 2024-11-26 16:50:40,805 - build_index - INFO - Initializing chat model 105 | 2024-11-26 16:51:21,814 - build_index - INFO - Start building index 106 | 2024-11-26 16:51:21,819 - build_index - INFO - Connecting to Neo4j 107 | 2024-11-26 16:51:21,943 - build_index - INFO - Initializing chat model 108 | 2024-11-26 16:59:38,367 - build_index - INFO - Start building index 109 | 2024-11-26 16:59:38,376 - build_index - INFO - Connecting to Neo4j 110 | 2024-11-26 16:59:38,461 - build_index - INFO - Initializing chat model 111 | 2024-11-26 17:00:25,413 - build_index - INFO - Start building index 112 | 2024-11-26 17:00:25,419 - build_index - INFO - Connecting to Neo4j 113 | 2024-11-26 17:00:25,506 - build_index - INFO - Initializing chat model 114 | 2024-11-26 17:00:27,891 - build_index - INFO - Initializing embedding 115 | 2024-11-26 17:00:35,649 - build_index - INFO - Initializing splitter 116 | 2024-11-26 17:00:35,650 - build_index - INFO - Initializing GDS 117 | 2024-11-26 17:00:35,696 - build_index - INFO - Initializing index 118 | 2024-11-26 17:09:01,914 - build_index - INFO - Start building index 119 | 2024-11-26 17:09:01,919 - build_index - INFO - Connecting to Neo4j 120 | 2024-11-26 17:09:02,000 - build_index - INFO - Initializing chat model 121 | 2024-11-26 17:09:04,283 - build_index - INFO - Initializing embedding 122 | 2024-11-26 17:09:11,614 - build_index - INFO - Initializing splitter 123 | 2024-11-26 17:09:11,614 - build_index - INFO - Initializing GDS 124 | 2024-11-26 17:09:11,632 - build_index - INFO - Initializing index 125 | 2024-11-26 17:09:11,645 - build_index - INFO - Building index now! 
126 | 2024-11-26 17:09:11,645 - build_index - INFO - Chunking documents 127 | 2024-11-26 17:29:13,540 - build_index - INFO - Start building index 128 | 2024-11-26 17:29:13,545 - build_index - INFO - Connecting to Neo4j 129 | 2024-11-26 17:29:13,634 - build_index - INFO - Initializing chat model 130 | 2024-11-26 17:29:54,826 - build_index - INFO - Start building index 131 | 2024-11-26 17:29:54,832 - build_index - INFO - Connecting to Neo4j 132 | 2024-11-26 17:29:54,901 - build_index - INFO - Initializing chat model 133 | 2024-11-26 17:30:00,447 - build_index - INFO - Initializing embedding 134 | 2024-11-26 17:30:07,313 - build_index - INFO - Initializing splitter 135 | 2024-11-26 17:30:07,313 - build_index - INFO - Initializing GDS 136 | 2024-11-26 17:30:07,328 - build_index - INFO - Initializing index 137 | 2024-11-26 17:35:05,456 - build_index - INFO - Start building index 138 | 2024-11-26 17:35:05,462 - build_index - INFO - Connecting to Neo4j 139 | 2024-11-26 17:35:05,567 - build_index - INFO - Initializing chat model 140 | 2024-11-26 17:35:18,345 - build_index - INFO - Initializing embedding 141 | 2024-11-26 17:35:25,155 - build_index - INFO - Initializing splitter 142 | 2024-11-26 17:35:25,155 - build_index - INFO - Initializing GDS 143 | 2024-11-26 17:35:25,171 - build_index - INFO - Initializing index 144 | 2024-11-26 17:35:25,182 - build_index - INFO - Building index now! 145 | 2024-11-26 17:35:25,182 - build_index - INFO - Chunking documents 146 | 2024-11-26 17:36:01,287 - build_index - INFO - Start building index 147 | 2024-11-26 17:36:01,293 - build_index - INFO - Connecting to Neo4j 148 | 2024-11-26 17:36:01,384 - build_index - INFO - Initializing chat model 149 | 2024-11-26 17:36:09,907 - build_index - INFO - Initializing embedding 150 | 2024-11-26 17:36:16,445 - build_index - INFO - Initializing splitter 151 | 2024-11-26 17:36:16,445 - build_index - INFO - Initializing GDS 152 | 2024-11-26 17:36:16,458 - build_index - INFO - Initializing index 153 | 2024-11-26 17:36:16,468 - build_index - INFO - Building index now! 154 | 2024-11-26 17:36:16,468 - build_index - INFO - Chunking documents 155 | 2024-11-26 17:38:39,049 - build_index - INFO - Start building index 156 | 2024-11-26 17:38:39,055 - build_index - INFO - Connecting to Neo4j 157 | 2024-11-26 17:38:39,176 - build_index - INFO - Initializing chat model 158 | 2024-11-26 17:38:47,877 - build_index - INFO - Initializing embedding 159 | 2024-11-26 17:38:53,344 - build_index - INFO - Initializing splitter 160 | 2024-11-26 17:38:53,344 - build_index - INFO - Initializing GDS 161 | 2024-11-26 17:38:53,361 - build_index - INFO - Initializing index 162 | 2024-11-26 17:38:53,368 - build_index - INFO - Building index now! 163 | 2024-11-26 17:38:53,368 - build_index - INFO - Chunking documents 164 | 2024-11-26 17:40:23,615 - build_index - INFO - Start building index 165 | 2024-11-26 17:40:23,620 - build_index - INFO - Connecting to Neo4j 166 | 2024-11-26 17:40:23,720 - build_index - INFO - Initializing chat model 167 | 2024-11-26 17:40:37,214 - build_index - INFO - Initializing embedding 168 | 2024-11-26 17:40:42,511 - build_index - INFO - Initializing splitter 169 | 2024-11-26 17:40:42,511 - build_index - INFO - Initializing GDS 170 | 2024-11-26 17:40:42,531 - build_index - INFO - Initializing index 171 | 2024-11-26 17:40:42,550 - build_index - INFO - Building index now! 
172 | 2024-11-26 17:40:42,550 - build_index - INFO - Chunking documents 173 | 2024-11-26 17:46:53,003 - build_index - INFO - Start building index 174 | 2024-11-26 17:46:53,007 - build_index - INFO - Connecting to Neo4j 175 | 2024-11-26 17:46:53,095 - build_index - INFO - Initializing chat model 176 | 2024-11-26 17:50:24,139 - build_index - INFO - Start building index 177 | 2024-11-26 17:50:24,145 - build_index - INFO - Connecting to Neo4j 178 | 2024-11-26 17:50:24,215 - build_index - INFO - Initializing chat model 179 | 2024-11-26 17:53:01,027 - build_index - INFO - Start building index 180 | 2024-11-26 17:53:01,032 - build_index - INFO - Connecting to Neo4j 181 | 2024-11-26 17:53:01,126 - build_index - INFO - Initializing chat model 182 | 2024-11-26 17:53:20,141 - build_index - INFO - Initializing embedding 183 | 2024-11-26 17:53:25,196 - build_index - INFO - Initializing splitter 184 | 2024-11-26 17:53:25,196 - build_index - INFO - Initializing GDS 185 | 2024-11-26 17:53:25,217 - build_index - INFO - Initializing index 186 | 2024-11-26 17:56:22,868 - build_index - INFO - Start building index 187 | 2024-11-26 17:56:22,871 - build_index - INFO - Connecting to Neo4j 188 | 2024-11-26 17:56:22,939 - build_index - INFO - Initializing chat model 189 | 2024-11-26 17:56:39,295 - build_index - INFO - Initializing embedding 190 | 2024-11-26 17:56:45,291 - build_index - INFO - Initializing splitter 191 | 2024-11-26 17:56:45,291 - build_index - INFO - Initializing GDS 192 | 2024-11-26 17:56:45,307 - build_index - INFO - Initializing index 193 | 2024-11-26 17:58:41,061 - build_index - INFO - Start building index 194 | 2024-11-26 17:58:41,065 - build_index - INFO - Connecting to Neo4j 195 | 2024-11-26 17:58:41,156 - build_index - INFO - Initializing chat model 196 | 2024-11-26 17:58:55,289 - build_index - INFO - Initializing embedding 197 | 2024-11-26 17:59:00,747 - build_index - INFO - Initializing splitter 198 | 2024-11-26 17:59:00,747 - build_index - INFO - Initializing GDS 199 | 2024-11-26 17:59:00,764 - build_index - INFO - Initializing index 200 | 2024-11-26 17:59:00,784 - build_index - INFO - Building index now! 201 | 2024-11-26 17:59:00,784 - build_index - INFO - Chunking documents 202 | 2024-11-26 18:00:53,958 - build_index - INFO - Start building index 203 | 2024-11-26 18:00:53,962 - build_index - INFO - Connecting to Neo4j 204 | 2024-11-26 18:00:54,042 - build_index - INFO - Initializing chat model 205 | 2024-11-26 18:01:09,640 - build_index - INFO - Initializing embedding 206 | 2024-11-26 18:01:15,182 - build_index - INFO - Initializing splitter 207 | 2024-11-26 18:01:15,182 - build_index - INFO - Initializing GDS 208 | 2024-11-26 18:01:15,207 - build_index - INFO - Initializing index 209 | 2024-11-26 18:01:15,219 - build_index - INFO - Building index now! 210 | 2024-11-26 18:01:15,219 - build_index - INFO - Chunking documents 211 | 2024-11-26 19:19:54,841 - build_index - INFO - Start building index 212 | 2024-11-26 19:19:54,846 - build_index - INFO - Connecting to Neo4j 213 | 2024-11-26 19:20:10,153 - build_index - INFO - Initializing chat model 214 | 2024-11-26 19:20:10,221 - build_index - INFO - Initializing embedding 215 | 2024-11-26 19:20:16,778 - build_index - INFO - Initializing splitter 216 | 2024-11-26 19:20:16,778 - build_index - INFO - Initializing GDS 217 | 2024-11-26 19:20:16,792 - build_index - INFO - Initializing index 218 | 2024-11-26 19:20:16,801 - build_index - INFO - Building index now! 
219 | 2024-11-26 19:20:57,615 - build_index - INFO - Start building index 220 | 2024-11-26 19:20:57,619 - build_index - INFO - Connecting to Neo4j 221 | 2024-11-26 19:20:58,527 - build_index - INFO - Initializing chat model 222 | 2024-11-26 19:20:58,587 - build_index - INFO - Initializing embedding 223 | 2024-11-26 19:21:04,649 - build_index - INFO - Initializing splitter 224 | 2024-11-26 19:21:04,649 - build_index - INFO - Initializing GDS 225 | 2024-11-26 19:21:04,665 - build_index - INFO - Initializing index 226 | 2024-11-26 19:21:04,678 - build_index - INFO - Building index now! 227 | 2024-11-26 19:21:04,678 - build_index - INFO - Chunking documents 228 | 2024-11-26 19:22:16,734 - build_index - INFO - Start building index 229 | 2024-11-26 19:22:16,738 - build_index - INFO - Connecting to Neo4j 230 | 2024-11-26 19:22:21,750 - build_index - INFO - Initializing chat model 231 | 2024-11-26 19:22:21,819 - build_index - INFO - Initializing embedding 232 | 2024-11-26 19:22:27,614 - build_index - INFO - Initializing splitter 233 | 2024-11-26 19:22:27,614 - build_index - INFO - Initializing GDS 234 | 2024-11-26 19:22:27,630 - build_index - INFO - Initializing index 235 | 2024-11-26 19:22:27,641 - build_index - INFO - Building index now! 236 | 2024-11-26 19:28:25,200 - build_index - INFO - Start building index 237 | 2024-11-26 19:28:25,207 - build_index - INFO - Connecting to Neo4j 238 | 2024-11-26 19:28:25,302 - build_index - INFO - Initializing chat model 239 | 2024-11-26 19:28:49,977 - build_index - INFO - Start building index 240 | 2024-11-26 19:28:49,980 - build_index - INFO - Connecting to Neo4j 241 | 2024-11-26 19:28:50,094 - build_index - INFO - Initializing chat model 242 | 2024-11-26 19:29:09,209 - build_index - INFO - Initializing embedding 243 | 2024-11-26 19:29:14,393 - build_index - INFO - Initializing splitter 244 | 2024-11-26 19:29:14,394 - build_index - INFO - Initializing GDS 245 | 2024-11-26 19:29:14,408 - build_index - INFO - Initializing index 246 | 2024-11-26 19:29:14,425 - build_index - INFO - Building index now! 247 | 2024-11-26 19:33:39,829 - build_index - INFO - Start building index 248 | 2024-11-26 19:33:39,835 - build_index - INFO - Connecting to Neo4j 249 | 2024-11-26 19:33:39,921 - build_index - INFO - Initializing chat model 250 | 2024-11-26 19:33:49,296 - build_index - INFO - Initializing embedding 251 | 2024-11-26 19:33:54,975 - build_index - INFO - Initializing splitter 252 | 2024-11-26 19:33:54,976 - build_index - INFO - Initializing GDS 253 | 2024-11-26 19:33:54,995 - build_index - INFO - Initializing index 254 | 2024-11-26 19:33:55,004 - build_index - INFO - Building index now! 
255 | 2024-11-26 19:56:09,305 - build_index - INFO - Start building index 256 | 2024-11-26 19:56:09,310 - build_index - INFO - Connecting to Neo4j 257 | 2024-11-26 19:56:09,401 - build_index - INFO - Initializing chat model 258 | 2024-11-27 15:38:59,703 - build_index - INFO - Start building index 259 | 2024-11-27 15:38:59,708 - build_index - INFO - Connecting to Neo4j 260 | 2024-11-27 15:38:59,820 - build_index - INFO - Initializing chat model 261 | 2024-11-27 15:39:37,924 - build_index - INFO - Start building index 262 | 2024-11-27 15:39:37,930 - build_index - INFO - Connecting to Neo4j 263 | 2024-11-27 15:39:38,023 - build_index - INFO - Initializing chat model 264 | 2024-11-27 15:39:58,834 - build_index - INFO - Start building index 265 | 2024-11-27 15:39:58,839 - build_index - INFO - Connecting to Neo4j 266 | 2024-11-27 15:39:58,932 - build_index - INFO - Initializing chat model 267 | 2024-11-27 15:46:38,512 - build_index - INFO - Start building index 268 | 2024-11-27 15:46:38,519 - build_index - INFO - Connecting to Neo4j 269 | 2024-11-27 15:46:38,604 - build_index - INFO - Initializing chat model 270 | 2024-11-27 15:47:24,345 - build_index - INFO - Start building index 271 | 2024-11-27 15:47:24,352 - build_index - INFO - Connecting to Neo4j 272 | 2024-11-27 15:47:24,427 - build_index - INFO - Initializing chat model 273 | 2024-11-27 15:48:44,601 - build_index - INFO - Start building index 274 | 2024-11-27 15:48:44,607 - build_index - INFO - Connecting to Neo4j 275 | 2024-11-27 15:48:44,692 - build_index - INFO - Initializing chat model 276 | 2024-11-27 15:49:17,986 - build_index - INFO - Initializing embedding 277 | 2024-11-27 15:49:23,916 - build_index - INFO - Initializing splitter 278 | 2024-11-27 15:49:23,916 - build_index - INFO - Initializing GDS 279 | 2024-11-27 15:49:23,930 - build_index - INFO - Initializing index 280 | 2024-11-27 15:49:23,940 - build_index - INFO - Building index now! 281 | 2024-11-27 15:49:23,941 - build_index - INFO - Chunking documents 282 | 2024-11-27 15:54:20,436 - build_index - INFO - None 283 | 2024-11-27 15:54:20,974 - build_index - INFO - 76 nodes merged 284 | 2024-11-27 15:55:44,524 - build_index - INFO - Index built successfully 285 | 2024-11-27 15:57:37,600 - build_index - INFO - Start building index 286 | 2024-11-27 15:57:37,607 - build_index - INFO - Connecting to Neo4j 287 | 2024-11-27 15:57:37,696 - build_index - INFO - Initializing chat model 288 | 2024-11-27 15:57:47,421 - build_index - INFO - Initializing embedding 289 | 2024-11-27 15:57:53,005 - build_index - INFO - Initializing splitter 290 | 2024-11-27 15:57:53,005 - build_index - INFO - Initializing GDS 291 | 2024-11-27 15:57:53,025 - build_index - INFO - Initializing index 292 | 2024-11-27 15:57:53,039 - build_index - INFO - Building index now! 
293 | 2024-11-27 15:57:53,040 - build_index - INFO - Chunking documents 294 | 2024-11-27 16:11:49,150 - build_index - INFO - Start building index 295 | 2024-11-27 16:11:49,155 - build_index - INFO - Connecting to Neo4j 296 | 2024-11-27 16:11:49,247 - build_index - INFO - Initializing chat model 297 | 2024-11-27 16:11:59,511 - build_index - INFO - Initializing embedding 298 | 2024-11-27 16:12:04,741 - build_index - INFO - Initializing splitter 299 | 2024-11-27 16:12:04,742 - build_index - INFO - Initializing GDS 300 | 2024-11-27 16:12:04,759 - build_index - INFO - Initializing index 301 | 2024-11-27 16:45:04,730 - build_index - INFO - Start building index 302 | 2024-11-27 16:45:04,734 - build_index - INFO - Connecting to Neo4j 303 | 2024-11-27 16:45:04,835 - build_index - INFO - Initializing chat model 304 | 2024-11-27 16:45:13,826 - build_index - INFO - Initializing embedding 305 | 2024-11-27 16:45:19,272 - build_index - INFO - Initializing splitter 306 | 2024-11-27 16:45:19,273 - build_index - INFO - Initializing GDS 307 | 2024-11-27 16:45:19,287 - build_index - INFO - Initializing index 308 | 2024-11-27 16:48:11,915 - build_index - INFO - Start building index 309 | 2024-11-27 16:48:11,922 - build_index - INFO - Connecting to Neo4j 310 | 2024-11-27 16:48:12,004 - build_index - INFO - Initializing chat model 311 | 2024-11-27 16:48:21,656 - build_index - INFO - Initializing embedding 312 | 2024-11-27 16:48:27,064 - build_index - INFO - Initializing splitter 313 | 2024-11-27 16:48:27,065 - build_index - INFO - Initializing GDS 314 | 2024-11-27 16:48:27,083 - build_index - INFO - Initializing index 315 | 2024-11-27 16:50:36,285 - build_index - INFO - Start building index 316 | 2024-11-27 16:50:36,291 - build_index - INFO - Connecting to Neo4j 317 | 2024-11-27 16:50:36,414 - build_index - INFO - Initializing chat model 318 | 2024-11-27 16:50:46,283 - build_index - INFO - Initializing embedding 319 | 2024-11-27 16:50:52,377 - build_index - INFO - Initializing splitter 320 | 2024-11-27 16:50:52,378 - build_index - INFO - Initializing GDS 321 | 2024-11-27 16:50:52,391 - build_index - INFO - Initializing index 322 | 2024-11-27 16:53:24,515 - build_index - INFO - Start building index 323 | 2024-11-27 16:53:24,520 - build_index - INFO - Connecting to Neo4j 324 | 2024-11-27 16:53:24,606 - build_index - INFO - Initializing chat model 325 | 2024-11-27 16:53:33,942 - build_index - INFO - Initializing embedding 326 | 2024-11-27 16:53:39,870 - build_index - INFO - Initializing splitter 327 | 2024-11-27 16:53:39,870 - build_index - INFO - Initializing GDS 328 | 2024-11-27 16:53:39,889 - build_index - INFO - Initializing index 329 | 2024-11-27 17:22:08,489 - build_index - INFO - Start building index 330 | 2024-11-27 17:22:08,494 - build_index - INFO - Connecting to Neo4j 331 | 2024-11-27 17:22:08,580 - build_index - INFO - Initializing chat model 332 | 2024-11-27 17:22:22,722 - build_index - INFO - Initializing embedding 333 | 2024-11-27 17:22:29,242 - build_index - INFO - Initializing splitter 334 | 2024-11-27 17:22:29,243 - build_index - INFO - Initializing GDS 335 | 2024-11-27 17:22:29,255 - build_index - INFO - Initializing index 336 | 2024-11-27 17:26:25,686 - build_index - INFO - Start building index 337 | 2024-11-27 17:26:25,692 - build_index - INFO - Connecting to Neo4j 338 | 2024-11-27 17:26:25,762 - build_index - INFO - Initializing chat model 339 | 2024-11-27 17:26:35,453 - build_index - INFO - Initializing embedding 340 | 2024-11-27 17:26:41,559 - build_index - INFO - Initializing splitter 341 | 
2024-11-27 17:26:41,560 - build_index - INFO - Initializing GDS 342 | 2024-11-27 17:26:41,571 - build_index - INFO - Initializing index 343 | 2024-11-27 17:28:40,654 - build_index - INFO - Start building index 344 | 2024-11-27 17:28:40,657 - build_index - INFO - Connecting to Neo4j 345 | 2024-11-27 17:28:40,740 - build_index - INFO - Initializing chat model 346 | 2024-11-27 17:28:50,760 - build_index - INFO - Initializing embedding 347 | 2024-11-27 17:28:56,168 - build_index - INFO - Initializing splitter 348 | 2024-11-27 17:28:56,169 - build_index - INFO - Initializing GDS 349 | 2024-11-27 17:28:56,186 - build_index - INFO - Initializing index 350 | 2024-11-27 17:31:11,075 - build_index - INFO - Start building index 351 | 2024-11-27 17:31:11,082 - build_index - INFO - Connecting to Neo4j 352 | 2024-11-27 17:31:11,158 - build_index - INFO - Initializing chat model 353 | 2024-11-27 17:31:20,756 - build_index - INFO - Initializing embedding 354 | 2024-11-27 17:31:26,119 - build_index - INFO - Initializing splitter 355 | 2024-11-27 17:31:26,122 - build_index - INFO - Initializing GDS 356 | 2024-11-27 17:31:26,135 - build_index - INFO - Initializing index 357 | 2024-11-27 17:35:22,597 - build_index - INFO - Start building index 358 | 2024-11-27 17:35:22,603 - build_index - INFO - Connecting to Neo4j 359 | 2024-11-27 17:35:22,687 - build_index - INFO - Initializing chat model 360 | 2024-11-27 17:35:33,015 - build_index - INFO - Initializing embedding 361 | 2024-11-27 17:35:38,954 - build_index - INFO - Initializing splitter 362 | 2024-11-27 17:35:38,957 - build_index - INFO - Initializing GDS 363 | 2024-11-27 17:35:38,973 - build_index - INFO - Initializing index 364 | 2024-11-27 17:39:45,753 - build_index - INFO - Start building index 365 | 2024-11-27 17:39:45,758 - build_index - INFO - Connecting to Neo4j 366 | 2024-11-27 17:39:45,847 - build_index - INFO - Initializing chat model 367 | 2024-11-27 17:39:54,926 - build_index - INFO - Initializing embedding 368 | 2024-11-27 17:39:59,959 - build_index - INFO - Initializing splitter 369 | 2024-11-27 17:39:59,960 - build_index - INFO - Initializing GDS 370 | 2024-11-27 17:39:59,976 - build_index - INFO - Initializing index 371 | 2024-11-27 17:53:22,940 - build_index - INFO - Start building index 372 | 2024-11-27 17:53:22,944 - build_index - INFO - Connecting to Neo4j 373 | 2024-11-27 17:53:23,040 - build_index - INFO - Initializing chat model 374 | 2024-11-27 17:53:32,284 - build_index - INFO - Initializing embedding 375 | 2024-11-27 17:53:37,421 - build_index - INFO - Initializing splitter 376 | 2024-11-27 17:53:37,422 - build_index - INFO - Initializing GDS 377 | 2024-11-27 17:53:37,436 - build_index - INFO - Initializing index 378 | 2024-11-27 18:00:21,963 - build_index - INFO - Start building index 379 | 2024-11-27 18:00:21,968 - build_index - INFO - Connecting to Neo4j 380 | 2024-11-27 18:00:22,062 - build_index - INFO - Initializing chat model 381 | 2024-11-27 18:00:31,137 - build_index - INFO - Initializing embedding 382 | 2024-11-27 18:00:36,304 - build_index - INFO - Initializing splitter 383 | 2024-11-27 18:00:36,304 - build_index - INFO - Initializing GDS 384 | 2024-11-27 18:00:36,320 - build_index - INFO - Initializing index 385 | 2024-11-27 18:13:29,143 - build_index - INFO - Start building index 386 | 2024-11-27 18:13:29,149 - build_index - INFO - Connecting to Neo4j 387 | 2024-11-27 18:13:29,237 - build_index - INFO - Initializing chat model 388 | 2024-11-27 18:13:38,381 - build_index - INFO - Initializing embedding 389 | 2024-11-27 
18:13:43,803 - build_index - INFO - Initializing splitter 390 | 2024-11-27 18:13:43,804 - build_index - INFO - Initializing GDS 391 | 2024-11-27 18:13:43,822 - build_index - INFO - Initializing index 392 | 2024-11-27 18:21:01,468 - build_index - INFO - Start building index 393 | 2024-11-27 18:21:01,473 - build_index - INFO - Connecting to Neo4j 394 | 2024-11-27 18:21:01,559 - build_index - INFO - Initializing chat model 395 | 2024-11-27 18:21:10,604 - build_index - INFO - Initializing embedding 396 | 2024-11-27 18:21:15,706 - build_index - INFO - Initializing splitter 397 | 2024-11-27 18:21:15,708 - build_index - INFO - Initializing GDS 398 | 2024-11-27 18:21:15,727 - build_index - INFO - Initializing index 399 | 2024-11-27 18:28:37,643 - build_index - INFO - Start building index 400 | 2024-11-27 18:28:37,647 - build_index - INFO - Connecting to Neo4j 401 | 2024-11-27 18:28:37,733 - build_index - INFO - Initializing chat model 402 | 2024-11-27 18:28:46,870 - build_index - INFO - Initializing embedding 403 | 2024-11-27 18:28:52,600 - build_index - INFO - Initializing splitter 404 | 2024-11-27 18:28:52,600 - build_index - INFO - Initializing GDS 405 | 2024-11-27 18:28:52,613 - build_index - INFO - Initializing index 406 | 2024-11-27 18:31:30,272 - build_index - INFO - Start building index 407 | 2024-11-27 18:31:30,279 - build_index - INFO - Connecting to Neo4j 408 | 2024-11-27 18:31:30,361 - build_index - INFO - Initializing chat model 409 | 2024-11-27 18:31:38,960 - build_index - INFO - Initializing embedding 410 | 2024-11-27 18:31:44,430 - build_index - INFO - Initializing splitter 411 | 2024-11-27 18:31:44,430 - build_index - INFO - Initializing GDS 412 | 2024-11-27 18:31:44,440 - build_index - INFO - Initializing index 413 | 2024-11-27 18:36:30,250 - build_index - INFO - Start building index 414 | 2024-11-27 18:36:30,256 - build_index - INFO - Connecting to Neo4j 415 | 2024-11-27 18:36:30,352 - build_index - INFO - Initializing chat model 416 | 2024-11-27 18:36:39,474 - build_index - INFO - Initializing embedding 417 | 2024-11-27 18:36:44,792 - build_index - INFO - Initializing splitter 418 | 2024-11-27 18:36:44,792 - build_index - INFO - Initializing GDS 419 | 2024-11-27 18:36:44,809 - build_index - INFO - Initializing index 420 | 2024-11-27 18:40:29,293 - build_index - INFO - Start building index 421 | 2024-11-27 18:40:29,297 - build_index - INFO - Connecting to Neo4j 422 | 2024-11-27 18:40:29,386 - build_index - INFO - Initializing chat model 423 | 2024-11-27 18:40:38,948 - build_index - INFO - Initializing embedding 424 | 2024-11-27 18:40:44,261 - build_index - INFO - Initializing splitter 425 | 2024-11-27 18:40:44,262 - build_index - INFO - Initializing GDS 426 | 2024-11-27 18:40:44,277 - build_index - INFO - Initializing index 427 | 2024-11-27 18:46:20,376 - build_index - INFO - Start building index 428 | 2024-11-27 18:46:20,381 - build_index - INFO - Connecting to Neo4j 429 | 2024-11-27 18:46:20,539 - build_index - INFO - Initializing chat model 430 | 2024-11-27 18:46:29,440 - build_index - INFO - Initializing embedding 431 | 2024-11-27 18:46:34,900 - build_index - INFO - Initializing splitter 432 | 2024-11-27 18:46:34,904 - build_index - INFO - Initializing GDS 433 | 2024-11-27 18:46:34,949 - build_index - INFO - Initializing index 434 | 2024-11-27 18:46:34,955 - build_index - INFO - Building index now! 
435 | 2024-11-27 18:46:34,956 - build_index - INFO - Chunking documents 436 | 2024-11-27 18:50:23,332 - build_index - INFO - Start building index 437 | 2024-11-27 18:50:23,337 - build_index - INFO - Connecting to Neo4j 438 | 2024-11-27 18:50:23,357 - build_index - INFO - Initializing chat model 439 | 2024-11-27 18:50:33,206 - build_index - INFO - Initializing embedding 440 | 2024-11-27 18:50:38,077 - build_index - INFO - Initializing splitter 441 | 2024-11-27 18:50:38,079 - build_index - INFO - Initializing GDS 442 | 2024-11-27 18:50:38,092 - build_index - INFO - Initializing index 443 | 2024-11-27 18:52:14,399 - build_index - INFO - Start building index 444 | 2024-11-27 18:52:14,404 - build_index - INFO - Connecting to Neo4j 445 | 2024-11-27 18:52:14,431 - build_index - INFO - Initializing chat model 446 | 2024-11-27 18:52:23,457 - build_index - INFO - Initializing embedding 447 | 2024-11-27 18:52:28,450 - build_index - INFO - Initializing splitter 448 | 2024-11-27 18:52:28,450 - build_index - INFO - Initializing GDS 449 | 2024-11-27 18:52:28,466 - build_index - INFO - Initializing index 450 | 2024-11-27 18:53:04,926 - build_index - INFO - Start building index 451 | 2024-11-27 18:53:04,932 - build_index - INFO - Connecting to Neo4j 452 | 2024-11-27 18:53:04,956 - build_index - INFO - Initializing chat model 453 | 2024-11-27 18:53:13,685 - build_index - INFO - Initializing embedding 454 | 2024-11-27 18:53:19,011 - build_index - INFO - Initializing splitter 455 | 2024-11-27 18:53:19,011 - build_index - INFO - Initializing GDS 456 | 2024-11-27 18:53:19,027 - build_index - INFO - Initializing index 457 | 2024-11-27 18:55:17,198 - build_index - INFO - Start building index 458 | 2024-11-27 18:55:17,203 - build_index - INFO - Connecting to Neo4j 459 | 2024-11-27 18:55:17,225 - build_index - INFO - Initializing chat model 460 | 2024-11-27 18:55:26,237 - build_index - INFO - Initializing embedding 461 | 2024-11-27 18:55:31,408 - build_index - INFO - Initializing splitter 462 | 2024-11-27 18:55:31,408 - build_index - INFO - Initializing GDS 463 | 2024-11-27 18:55:31,423 - build_index - INFO - Initializing index 464 | 2024-11-27 18:56:36,139 - build_index - INFO - Start building index 465 | 2024-11-27 18:56:36,145 - build_index - INFO - Connecting to Neo4j 466 | 2024-11-27 18:56:36,173 - build_index - INFO - Initializing chat model 467 | 2024-11-27 18:56:44,847 - build_index - INFO - Initializing embedding 468 | 2024-11-27 18:56:49,770 - build_index - INFO - Initializing splitter 469 | 2024-11-27 18:56:49,771 - build_index - INFO - Initializing GDS 470 | 2024-11-27 18:56:49,787 - build_index - INFO - Initializing index 471 | 2024-11-27 18:57:42,954 - build_index - INFO - Start building index 472 | 2024-11-27 18:57:42,961 - build_index - INFO - Connecting to Neo4j 473 | 2024-11-27 18:57:42,989 - build_index - INFO - Initializing chat model 474 | 2024-11-27 18:57:52,303 - build_index - INFO - Initializing embedding 475 | 2024-11-27 18:57:57,542 - build_index - INFO - Initializing splitter 476 | 2024-11-27 18:57:57,543 - build_index - INFO - Initializing GDS 477 | 2024-11-27 18:57:57,559 - build_index - INFO - Initializing index 478 | 2024-11-27 19:06:09,582 - build_index - INFO - Start building index 479 | 2024-11-27 19:06:09,587 - build_index - INFO - Connecting to Neo4j 480 | 2024-11-27 19:06:09,608 - build_index - INFO - Initializing chat model 481 | 2024-11-27 19:06:18,464 - build_index - INFO - Initializing embedding 482 | 2024-11-27 19:06:23,628 - build_index - INFO - Initializing splitter 483 | 
2024-11-27 19:06:23,630 - build_index - INFO - Initializing GDS 484 | 2024-11-27 19:06:23,647 - build_index - INFO - Initializing index 485 | 2024-11-27 19:06:57,679 - build_index - INFO - Building index now! 486 | 2024-11-27 19:06:57,683 - build_index - INFO - Chunking documents 487 | 2024-11-27 19:17:59,354 - build_index - INFO - Start building index 488 | 2024-11-27 19:17:59,360 - build_index - ERROR - Currently only openai and hf are supported 489 | 2024-11-27 19:18:22,893 - build_index - INFO - Start building index 490 | 2024-11-27 19:18:22,894 - build_index - INFO - Connecting to Neo4j 491 | 2024-11-27 19:18:22,924 - build_index - INFO - Initializing chat model 492 | 2024-11-27 19:22:55,011 - build_index - INFO - Start building index 493 | 2024-11-27 19:22:55,017 - build_index - INFO - Connecting to Neo4j 494 | 2024-11-27 19:22:55,040 - build_index - INFO - Initializing chat model 495 | 2024-11-27 19:22:55,128 - build_index - INFO - Initializing embedding 496 | 2024-11-27 19:23:01,763 - build_index - INFO - Initializing splitter 497 | 2024-11-27 19:23:01,763 - build_index - INFO - Initializing GDS 498 | 2024-11-27 19:23:01,777 - build_index - INFO - Initializing index 499 | 2024-11-27 19:23:03,430 - build_index - INFO - Building index now! 500 | 2024-11-27 19:23:03,430 - build_index - INFO - Chunking documents 501 | 2024-11-27 19:25:04,123 - build_index - INFO - Start building index 502 | 2024-11-27 19:25:04,128 - build_index - INFO - Connecting to Neo4j 503 | 2024-11-27 19:25:04,151 - build_index - INFO - Initializing chat model 504 | 2024-11-27 19:25:04,206 - build_index - INFO - Initializing embedding 505 | 2024-11-27 19:25:10,355 - build_index - INFO - Initializing splitter 506 | 2024-11-27 19:25:10,355 - build_index - INFO - Initializing GDS 507 | 2024-11-27 19:25:10,365 - build_index - INFO - Initializing index 508 | 2024-11-27 21:16:05,607 - build_index - INFO - Start building index 509 | 2024-11-27 21:16:05,612 - build_index - INFO - Connecting to Neo4j 510 | 2024-11-27 21:16:05,640 - build_index - INFO - Initializing chat model 511 | 2024-11-27 21:16:05,694 - build_index - INFO - Initializing embedding 512 | 2024-11-27 21:16:12,629 - build_index - INFO - Initializing splitter 513 | 2024-11-27 21:16:12,629 - build_index - INFO - Initializing GDS 514 | 2024-11-27 21:16:12,640 - build_index - INFO - Initializing index 515 | 2024-11-27 21:16:12,641 - build_index - INFO - Building index now! 516 | 2024-11-27 21:16:12,641 - build_index - INFO - Chunking documents 517 | 2024-11-27 21:18:17,197 - build_index - INFO - Start building index 518 | 2024-11-27 21:18:17,202 - build_index - INFO - Connecting to Neo4j 519 | 2024-11-27 21:18:17,227 - build_index - INFO - Initializing chat model 520 | 2024-11-27 21:18:17,282 - build_index - INFO - Initializing embedding 521 | 2024-11-27 21:18:23,074 - build_index - INFO - Initializing splitter 522 | 2024-11-27 21:18:23,075 - build_index - INFO - Initializing GDS 523 | 2024-11-27 21:18:23,085 - build_index - INFO - Initializing index 524 | 2024-11-27 21:18:23,085 - build_index - INFO - Building index now! 
525 | 2024-11-27 21:18:23,085 - build_index - INFO - Chunking documents 526 | 2024-11-27 21:20:44,548 - build_index - INFO - Start building index 527 | 2024-11-27 21:20:44,553 - build_index - INFO - Connecting to Neo4j 528 | 2024-11-27 21:20:44,572 - build_index - INFO - Initializing chat model 529 | 2024-11-27 21:20:44,630 - build_index - INFO - Initializing embedding 530 | 2024-11-27 21:20:50,684 - build_index - INFO - Initializing splitter 531 | 2024-11-27 21:20:50,684 - build_index - INFO - Initializing GDS 532 | 2024-11-27 21:20:50,698 - build_index - INFO - Initializing index 533 | 2024-11-27 21:20:50,698 - build_index - INFO - Building index now! 534 | 2024-11-27 21:20:50,698 - build_index - INFO - Chunking documents 535 | 2024-11-27 21:25:58,544 - build_index - INFO - Start building index 536 | 2024-11-27 21:25:58,547 - build_index - INFO - Connecting to Neo4j 537 | 2024-11-27 21:25:58,580 - build_index - INFO - Initializing chat model 538 | 2024-11-27 21:25:58,634 - build_index - INFO - Initializing embedding 539 | 2024-11-27 21:26:04,300 - build_index - INFO - Initializing splitter 540 | 2024-11-27 21:26:04,301 - build_index - INFO - Initializing GDS 541 | 2024-11-27 21:26:04,309 - build_index - INFO - Initializing index 542 | 2024-11-27 21:26:04,310 - build_index - INFO - Building index now! 543 | 2024-11-27 21:26:04,310 - build_index - INFO - Chunking documents 544 | 2024-11-27 21:26:43,405 - build_index - INFO - Start building index 545 | 2024-11-27 21:26:43,410 - build_index - INFO - Connecting to Neo4j 546 | 2024-11-27 21:26:43,432 - build_index - INFO - Initializing chat model 547 | 2024-11-27 21:26:43,496 - build_index - INFO - Initializing embedding 548 | 2024-11-27 21:26:48,893 - build_index - INFO - Initializing splitter 549 | 2024-11-27 21:26:48,893 - build_index - INFO - Initializing GDS 550 | 2024-11-27 21:26:48,904 - build_index - INFO - Initializing index 551 | 2024-11-27 21:26:48,905 - build_index - INFO - Building index now! 552 | 2024-11-27 21:26:48,905 - build_index - INFO - Start building index 553 | 2024-11-27 21:26:48,906 - build_index - INFO - Chunking documents 554 | 2024-11-27 21:31:41,877 - build_index - ERROR - ```json 555 | { 556 | "nodes":... is not valid JSON 557 | 2024-11-27 21:54:10,171 - build_index - INFO - Start building index 558 | 2024-11-27 21:54:10,176 - build_index - INFO - Connecting to Neo4j 559 | 2024-11-27 21:54:10,206 - build_index - INFO - Initializing chat model 560 | 2024-11-27 21:54:10,259 - build_index - INFO - Initializing embedding 561 | 2024-11-27 21:54:16,612 - build_index - INFO - Initializing splitter 562 | 2024-11-27 21:54:16,612 - build_index - INFO - Initializing GDS 563 | 2024-11-27 21:54:16,628 - build_index - INFO - Initializing index 564 | 2024-11-27 21:54:16,628 - build_index - INFO - Building index now! 
565 | 2024-11-27 21:54:16,628 - build_index - INFO - Create_nodes_and_relationships 566 | 2024-11-27 21:54:16,629 - build_index - INFO - Chunking documents 567 | 2024-11-27 21:55:21,171 - build_index - ERROR - ```json 568 | { 569 | "head": ... is not valid JSON 570 | 2024-11-27 21:55:21,176 - build_index - ERROR - ```json 571 | { 572 | "head": ... is not valid JSON 573 | 2024-11-27 21:55:21,178 - build_index - ERROR - ```json 574 | { 575 | "head": ... is not valid JSON 576 | 2024-11-27 21:55:21,180 - build_index - ERROR - ```json 577 | { 578 | "head": ... is not valid JSON 579 | 2024-11-27 21:55:21,182 - build_index - ERROR - ```json 580 | { 581 | "head": ... is not valid JSON 582 | 2024-11-27 21:55:21,183 - build_index - ERROR - ```json 583 | { 584 | "head": ... is not valid JSON 585 | 2024-11-27 21:55:21,184 - build_index - ERROR - ```json 586 | { 587 | "head": ... is not valid JSON 588 | 2024-11-27 21:55:21,185 - build_index - ERROR - ```json 589 | { 590 | "head": ... is not valid JSON 591 | 2024-11-27 21:55:21,186 - build_index - ERROR - ```json 592 | { 593 | "head": ... is not valid JSON 594 | 2024-11-27 21:55:21,188 - build_index - ERROR - ```json 595 | { 596 | "head": ... is not valid JSON 597 | 2024-11-27 21:55:21,189 - build_index - ERROR - ```json 598 | { 599 | "head": ... is not valid JSON 600 | 2024-11-27 21:56:50,070 - build_index - INFO - Start building index 601 | 2024-11-27 21:56:50,075 - build_index - INFO - Connecting to Neo4j 602 | 2024-11-27 21:56:50,100 - build_index - INFO - Initializing chat model 603 | 2024-11-27 21:56:50,159 - build_index - INFO - Initializing embedding 604 | 2024-11-27 21:56:55,922 - build_index - INFO - Initializing splitter 605 | 2024-11-27 21:56:55,922 - build_index - INFO - Initializing GDS 606 | 2024-11-27 21:56:55,933 - build_index - INFO - Initializing index 607 | 2024-11-27 21:56:55,933 - build_index - INFO - Building index now! 608 | 2024-11-27 21:56:55,933 - build_index - INFO - Create_nodes_and_relationships 609 | 2024-11-27 21:56:55,934 - build_index - INFO - Chunking documents 610 | 2024-11-28 19:17:59,906 - build_index - INFO - Start building index 611 | 2024-11-28 19:17:59,913 - build_index - INFO - Connecting to Neo4j 612 | 2024-11-28 19:17:59,944 - build_index - INFO - Initializing chat model 613 | 2024-11-28 19:18:00,001 - build_index - INFO - Initializing embedding 614 | 2024-11-28 19:18:06,162 - build_index - INFO - Initializing splitter 615 | 2024-11-28 19:18:06,163 - build_index - INFO - Initializing GDS 616 | 2024-11-28 19:18:06,173 - build_index - INFO - Initializing index 617 | 2024-11-28 19:18:06,173 - build_index - INFO - Building index now! 618 | 2024-11-28 19:18:06,173 - build_index - INFO - Create_nodes_and_relationships 619 | 2024-11-28 19:18:06,173 - build_index - INFO - Chunking documents 620 | 2024-11-28 19:19:38,126 - build_index - ERROR - ```json 621 | { 622 | "head": ... is not valid JSON 623 | 2024-11-28 19:22:00,757 - build_index - ERROR - ```json 624 | { 625 | "head": ... is not valid JSON 626 | 2024-11-28 19:25:24,348 - build_index - INFO - Start building index 627 | 2024-11-28 19:25:24,352 - build_index - INFO - Connecting to Neo4j 628 | 2024-11-28 19:25:24,373 - build_index - INFO - Initializing chat model 629 | 2024-11-28 19:25:24,441 - build_index - INFO - Initializing embedding 630 | 2024-11-28 19:25:30,723 - build_index - INFO - Initializing splitter 631 | 2024-11-28 19:25:30,724 - build_index - INFO - Initializing GDS 632 | 2024-11-28 19:25:30,735 - build_index - INFO - Initializing index 633 | 2024-11-28 19:25:30,735 - build_index - INFO - Building index now! 
634 | 2024-11-28 19:25:30,736 - build_index - INFO - Create_nodes_and_relationships 635 | 2024-11-28 19:25:30,736 - build_index - INFO - Chunking documents 636 | 2024-11-28 19:25:57,604 - build_index - INFO - Start building index 637 | 2024-11-28 19:25:57,609 - build_index - INFO - Connecting to Neo4j 638 | 2024-11-28 19:25:57,631 - build_index - INFO - Initializing chat model 639 | 2024-11-28 19:25:57,686 - build_index - INFO - Initializing embedding 640 | 2024-11-28 19:26:03,827 - build_index - INFO - Initializing splitter 641 | 2024-11-28 19:26:03,828 - build_index - INFO - Initializing GDS 642 | 2024-11-28 19:26:03,837 - build_index - INFO - Initializing index 643 | 2024-11-28 19:26:03,837 - build_index - INFO - Building index now! 644 | 2024-11-28 19:26:03,837 - build_index - INFO - Create_nodes_and_relationships 645 | 2024-11-28 19:26:03,837 - build_index - INFO - Chunking documents 646 | 2024-11-28 19:27:21,921 - build_index - INFO - Start building index 647 | 2024-11-28 19:27:21,926 - build_index - INFO - Connecting to Neo4j 648 | 2024-11-28 19:27:21,947 - build_index - INFO - Initializing chat model 649 | 2024-11-28 19:27:22,002 - build_index - INFO - Initializing embedding 650 | 2024-11-28 19:27:28,430 - build_index - INFO - Initializing splitter 651 | 2024-11-28 19:27:28,430 - build_index - INFO - Initializing GDS 652 | 2024-11-28 19:27:28,443 - build_index - INFO - Initializing index 653 | 2024-11-28 19:27:28,444 - build_index - INFO - Building index now! 654 | 2024-11-28 19:27:28,444 - build_index - INFO - Create_nodes_and_relationships 655 | 2024-11-28 19:27:28,444 - build_index - INFO - Chunking documents 656 | 2024-11-28 19:28:16,926 - build_index - ERROR - ```json 657 | { 658 | "head": ... is not valid JSON 659 | 2024-11-28 19:29:34,121 - build_index - ERROR - ```json 660 | { 661 | "head": ... is not valid JSON 662 | 2024-11-28 19:29:41,073 - build_index - INFO - Start building index 663 | 2024-11-28 19:29:41,079 - build_index - INFO - Connecting to Neo4j 664 | 2024-11-28 19:29:41,101 - build_index - INFO - Initializing chat model 665 | 2024-11-28 19:29:41,154 - build_index - INFO - Initializing embedding 666 | 2024-11-28 19:29:47,396 - build_index - INFO - Initializing splitter 667 | 2024-11-28 19:29:47,396 - build_index - INFO - Initializing GDS 668 | 2024-11-28 19:29:47,408 - build_index - INFO - Initializing index 669 | 2024-11-28 19:29:47,408 - build_index - INFO - Building index now! 670 | 2024-11-28 19:29:47,408 - build_index - INFO - Create_nodes_and_relationships 671 | 2024-11-28 19:29:47,408 - build_index - INFO - Chunking documents 672 | 2024-11-28 19:31:00,430 - build_index - ERROR - ```json 673 | { 674 | "head": ... is not valid JSON 675 | 2024-11-28 19:31:02,214 - build_index - ERROR - ```json 676 | { 677 | "head": ... is not valid JSON 678 | 2024-11-28 19:33:12,127 - build_index - INFO - Start building index 679 | 2024-11-28 19:33:12,131 - build_index - INFO - Connecting to Neo4j 680 | 2024-11-28 19:33:12,152 - build_index - INFO - Initializing chat model 681 | 2024-11-28 19:33:12,209 - build_index - INFO - Initializing embedding 682 | 2024-11-28 19:33:18,265 - build_index - INFO - Initializing splitter 683 | 2024-11-28 19:33:18,265 - build_index - INFO - Initializing GDS 684 | 2024-11-28 19:33:18,277 - build_index - INFO - Initializing index 685 | 2024-11-28 19:33:18,278 - build_index - INFO - Building index now! 
686 | 2024-11-28 19:33:18,278 - build_index - INFO - Create_nodes_and_relationships 687 | 2024-11-28 19:33:18,278 - build_index - INFO - Chunking documents 688 | 2024-11-28 19:33:36,725 - build_index - INFO - Start building index 689 | 2024-11-28 19:33:36,726 - build_index - INFO - Connecting to Neo4j 690 | 2024-11-28 19:33:36,745 - build_index - INFO - Initializing chat model 691 | 2024-11-28 19:33:36,817 - build_index - INFO - Initializing embedding 692 | 2024-11-28 19:33:42,784 - build_index - INFO - Initializing splitter 693 | 2024-11-28 19:33:42,786 - build_index - INFO - Initializing GDS 694 | 2024-11-28 19:33:42,796 - build_index - INFO - Initializing index 695 | 2024-11-28 19:33:42,797 - build_index - INFO - Building index now! 696 | 2024-11-28 19:33:42,797 - build_index - INFO - Create_nodes_and_relationships 697 | 2024-11-28 19:33:42,797 - build_index - INFO - Chunking documents 698 | 2024-11-28 19:34:50,773 - build_index - ERROR - ```json 699 | { 700 | "head": ... is not valid JSON 701 | 2024-11-28 19:34:52,850 - build_index - INFO - None 702 | 2024-11-28 19:34:59,600 - build_index - INFO - Start building index 703 | 2024-11-28 19:34:59,603 - build_index - INFO - Connecting to Neo4j 704 | 2024-11-28 19:34:59,635 - build_index - INFO - Initializing chat model 705 | 2024-11-28 19:34:59,690 - build_index - INFO - Initializing embedding 706 | 2024-11-28 19:35:08,152 - build_index - INFO - Initializing splitter 707 | 2024-11-28 19:35:08,152 - build_index - INFO - Initializing GDS 708 | 2024-11-28 19:35:08,190 - build_index - INFO - Initializing index 709 | 2024-11-28 19:35:08,191 - build_index - INFO - Building index now! 710 | 2024-11-28 19:35:08,191 - build_index - INFO - Create_nodes_and_relationships 711 | 2024-11-28 19:35:08,191 - build_index - INFO - Chunking documents 712 | 2024-11-28 19:36:50,938 - build_index - ERROR - The model could not perform entity resolution for this task 713 | 2024-11-28 19:36:51,264 - build_index - ERROR - The model could not perform entity resolution for this task 714 | 2024-11-28 19:36:51,728 - build_index - ERROR - The model could not perform entity resolution for this task 715 | 2024-11-28 19:36:51,783 - build_index - INFO - None 716 | 2024-11-28 19:36:52,008 - build_index - INFO - 104 nodes merged 717 | 2024-11-28 19:43:17,314 - build_index - INFO - Start building index 718 | 2024-11-28 19:43:17,318 - build_index - INFO - Connecting to Neo4j 719 | 2024-11-28 19:43:18,939 - build_index - INFO - Initializing chat model 720 | 2024-11-28 19:43:19,025 - build_index - INFO - Initializing embedding 721 | 2024-11-28 19:43:24,815 - build_index - INFO - Initializing splitter 722 | 2024-11-28 19:43:24,816 - build_index - INFO - Initializing GDS 723 | 2024-11-28 19:43:24,824 - build_index - INFO - Initializing index 724 | 2024-11-28 19:43:24,826 - build_index - INFO - Building index now! 725 | 2024-11-28 19:43:24,826 - build_index - INFO - Create_nodes_and_relationships 726 | 2024-11-28 19:43:24,826 - build_index - INFO - Chunking documents 727 | 2024-11-28 19:47:59,182 - build_index - INFO - Start building index 728 | 2024-11-28 19:47:59,187 - build_index - INFO - Connecting to Neo4j 729 | 2024-11-28 19:47:59,210 - build_index - INFO - Initializing chat model 730 | 2024-11-28 19:47:59,305 - build_index - INFO - Initializing embedding 731 | 2024-11-28 19:48:07,018 - build_index - INFO - Initializing splitter 732 | 2024-11-28 19:48:07,019 - build_index - INFO - Initializing GDS 733 | 2024-11-28 19:48:07,030 - build_index - INFO - Initializing index 734 | 2024-11-28 19:48:07,031 - build_index - INFO - Building index now! 
735 | 2024-11-28 19:48:07,031 - build_index - INFO - Create_nodes_and_relationships 736 | 2024-11-28 19:48:07,031 - build_index - INFO - Chunking documents 737 | 2024-11-28 20:00:59,145 - build_index - INFO - Start building index 738 | 2024-11-28 20:00:59,149 - build_index - INFO - Connecting to Neo4j 739 | 2024-11-28 20:00:59,180 - build_index - INFO - Initializing chat model 740 | 2024-11-28 20:00:59,238 - build_index - INFO - Initializing embedding 741 | 2024-11-28 20:01:05,761 - build_index - INFO - Initializing splitter 742 | 2024-11-28 20:01:05,762 - build_index - INFO - Initializing GDS 743 | 2024-11-28 20:01:05,769 - build_index - INFO - Initializing index 744 | 2024-11-28 20:01:05,769 - build_index - INFO - Building index now! 745 | 2024-11-28 20:01:05,770 - build_index - INFO - Create_nodes_and_relationships 746 | 2024-11-28 20:01:05,770 - build_index - INFO - Chunking documents 747 | 2024-11-28 20:07:38,053 - build_index - ERROR - ```json 748 | { 749 | "head": ... is not valid JSON 750 | 2024-11-28 20:07:43,435 - build_index - ERROR - ```json 751 | { 752 | "head": ... is not valid JSON 753 | 2024-11-28 20:07:46,243 - build_index - ERROR - ```json 754 | { 755 | "head": ... is not valid JSON 756 | 2024-11-28 20:07:50,135 - build_index - ERROR - ```json 757 | { 758 | "head": ... is not valid JSON 759 | 2024-11-28 20:07:51,310 - build_index - ERROR - ```json 760 | { 761 | "head": ... is not valid JSON 762 | 2024-11-28 20:07:51,491 - build_index - ERROR - ```json 763 | { 764 | "head": ... is not valid JSON 765 | 2024-11-28 20:07:52,302 - build_index - ERROR - ```json 766 | { 767 | "head": ... is not valid JSON 768 | 2024-11-28 20:07:52,472 - build_index - ERROR - ```json 769 | { 770 | "head": ... is not valid JSON 771 | 2024-11-28 20:07:52,642 - build_index - ERROR - ```json 772 | { 773 | "head": ... is not valid JSON 774 | 2024-11-28 20:07:52,804 - build_index - ERROR - ```json 775 | { 776 | "head": ... is not valid JSON 777 | 2024-11-28 20:08:06,078 - build_index - INFO - Start building index 778 | 2024-11-28 20:08:06,081 - build_index - INFO - Connecting to Neo4j 779 | 2024-11-28 20:08:06,098 - build_index - INFO - Initializing chat model 780 | 2024-11-28 20:08:06,152 - build_index - INFO - Initializing embedding 781 | 2024-11-28 20:08:12,337 - build_index - INFO - Initializing splitter 782 | 2024-11-28 20:08:12,338 - build_index - INFO - Initializing GDS 783 | 2024-11-28 20:08:12,350 - build_index - INFO - Initializing index 784 | 2024-11-28 20:08:12,351 - build_index - INFO - Building index now! 
785 | 2024-11-28 20:08:12,351 - build_index - INFO - Create_nodes_and_relationships 786 | 2024-11-28 20:08:12,352 - build_index - INFO - Chunking documents 787 | 2024-11-28 20:09:49,913 - build_index - ERROR - ```json 788 | { 789 | "head": ... is not valid JSON 790 | 2024-11-28 20:09:49,918 - build_index - ERROR - ```json 791 | { 792 | "head": ... is not valid JSON 793 | 2024-11-28 20:09:49,920 - build_index - ERROR - ```json 794 | { 795 | "head": ... is not valid JSON 796 | 2024-11-28 20:09:49,924 - build_index - ERROR - ```json 797 | { 798 | "head": ... is not valid JSON 799 | 2024-11-28 20:37:28,154 - build_index - INFO - Start building index 800 | 2024-11-28 20:37:28,160 - build_index - INFO - Connecting to Neo4j 801 | 2024-11-28 20:37:57,323 - build_index - INFO - Initializing chat model 802 | 2024-11-28 20:37:57,377 - build_index - INFO - Initializing embedding 803 | 2024-11-28 20:38:03,272 - build_index - INFO - Initializing splitter 804 | 2024-11-28 20:38:03,274 - build_index - INFO - Initializing GDS 805 | 2024-11-28 20:38:03,318 - build_index - INFO - Initializing index 806 | 2024-11-28 20:38:03,319 - build_index - INFO - Building index now! 807 | 2024-11-28 20:38:03,319 - build_index - INFO - Create_nodes_and_relationships 808 | 2024-11-28 20:38:03,319 - build_index - INFO - Chunking documents 809 | 2024-11-28 20:38:45,183 - build_index - INFO - None 810 | 2024-11-28 20:38:45,447 - build_index - INFO - 44 nodes merged 811 | 2024-11-28 20:38:52,067 - build_index - INFO - Index built successfully 812 | 2024-11-28 20:47:35,215 - build_index - INFO - Start building index 813 | 2024-11-28 20:47:35,219 - build_index - INFO - Connecting to Neo4j 814 | 2024-11-28 20:47:35,379 - build_index - INFO - Initializing chat model 815 | 2024-11-28 20:47:35,453 - build_index - INFO - Initializing embedding 816 | 2024-11-28 20:47:42,616 - build_index - INFO - Initializing splitter 817 | 2024-11-28 20:47:42,617 - build_index - INFO - Initializing GDS 818 | 2024-11-28 20:47:42,646 - build_index - INFO - Initializing index 819 | 2024-11-28 20:47:42,646 - build_index - INFO - Building index now! 
820 | 2024-11-28 20:47:42,647 - build_index - INFO - Create_nodes_and_relationships 821 | 2024-11-28 20:47:42,647 - build_index - INFO - Chunking documents 822 | 2024-11-28 20:49:16,020 - build_index - ERROR - ```json 823 | { 824 | "head": ... is not valid JSON 825 | 2024-11-28 20:49:16,025 - build_index - ERROR - ```json 826 | { 827 | "head": ... is not valid JSON 828 | 2024-11-28 20:49:16,027 - build_index - ERROR - ```json 829 | { 830 | "head": ... is not valid JSON 831 | 2024-11-28 20:49:16,031 - build_index - ERROR - ```json 832 | { 833 | "head": ... is not valid JSON 834 | 2024-11-28 20:49:48,687 - build_index - INFO - None 835 | 2024-11-28 20:49:48,855 - build_index - INFO - 84 nodes merged 836 | 2024-11-28 20:50:19,571 - build_index - INFO - Index built successfully 837 | 2024-11-28 20:51:36,090 - build_index - INFO - Start building index 838 | 2024-11-28 20:51:36,094 - build_index - INFO - Connecting to Neo4j 839 | 2024-11-28 20:51:36,231 - build_index - INFO - Initializing chat model 840 | 2024-11-28 20:52:03,682 - build_index - INFO - Start building index 841 | 2024-11-28 20:52:03,683 - build_index - INFO - Connecting to Neo4j 842 | 2024-11-28 20:52:03,700 - build_index - INFO - Initializing chat model 843 | 2024-11-28 20:52:12,767 - build_index - INFO - Initializing embedding 844 | 2024-11-28 20:52:17,901 - build_index - INFO - Initializing splitter 845 | 2024-11-28 20:52:17,903 - build_index - INFO - Initializing GDS 846 | 2024-11-28 20:52:17,934 - build_index - INFO - Initializing index 847 | 2024-11-28 20:52:17,935 - build_index - INFO - Building index now! 848 | 2024-11-28 20:52:17,935 - build_index - INFO - Create_nodes_and_relationships 849 | 2024-11-28 20:52:17,935 - build_index - INFO - Chunking documents 850 | 2024-11-28 20:55:20,998 - build_index - ERROR - ```json 851 | { 852 | "head": ... is not valid JSON 853 | 2024-11-28 20:55:21,004 - build_index - ERROR - ```json 854 | { 855 | "head": ... is not valid JSON 856 | 2024-11-28 20:55:21,006 - build_index - ERROR - ```json 857 | { 858 | "head": ... is not valid JSON 859 | 2024-11-28 20:55:21,007 - build_index - ERROR - ```json 860 | { 861 | "head": ... is not valid JSON 862 | 2024-11-28 20:55:21,008 - build_index - ERROR - ```json 863 | { 864 | "head": ... is not valid JSON 865 | 2024-11-28 20:55:21,010 - build_index - ERROR - ```json 866 | { 867 | "head": ... is not valid JSON 868 | 2024-11-28 20:55:21,012 - build_index - ERROR - ```json 869 | { 870 | "head": ... is not valid JSON 871 | 2024-11-28 20:55:21,013 - build_index - ERROR - ```json 872 | { 873 | "head": ... is not valid JSON 874 | 2024-11-28 20:55:21,014 - build_index - ERROR - ```json 875 | { 876 | "head": ... is not valid JSON 877 | 2024-11-28 20:55:21,015 - build_index - ERROR - ```json 878 | { 879 | "head": ... is not valid JSON 880 | 2024-11-28 20:55:21,811 - build_index - INFO - None 881 | 2024-11-28 20:55:22,146 - build_index - INFO - 6 nodes merged 882 | 2024-11-28 20:55:26,850 - build_index - INFO - Index built successfully 883 | 2024-11-28 20:57:07,398 - build_index - INFO - Start building index 884 | 2024-11-28 20:57:07,402 - build_index - INFO - Connecting to Neo4j 885 | 2024-11-28 20:57:07,441 - build_index - INFO - Initializing chat model 886 | 2024-11-28 20:57:17,702 - build_index - INFO - Initializing embedding 887 | 2024-11-28 20:57:23,980 - build_index - INFO - Initializing splitter 888 | 2024-11-28 20:57:23,980 - build_index - INFO - Initializing GDS 889 | 2024-11-28 20:57:23,997 - build_index - INFO - Initializing index 890 | 2024-11-28 20:57:23,997 - build_index - INFO - Building index now! 
891 | 2024-11-28 20:57:23,997 - build_index - INFO - Create_nodes_and_relationships 892 | 2024-11-28 20:57:23,997 - build_index - INFO - Chunking documents 893 | 2024-11-28 21:04:04,549 - build_index - ERROR - ```json 894 | { 895 | "head": ... is not valid JSON 896 | 2024-11-28 21:04:08,310 - build_index - ERROR - ```json 897 | { 898 | "head": ... is not valid JSON 899 | 2024-11-28 21:04:09,591 - build_index - ERROR - ```json 900 | { 901 | "head": ... is not valid JSON 902 | 2024-11-28 21:04:09,776 - build_index - ERROR - ```json 903 | { 904 | "head": ... is not valid JSON 905 | 2024-11-28 21:04:09,946 - build_index - ERROR - ```json 906 | { 907 | "head": ... is not valid JSON 908 | 2024-11-28 21:04:10,106 - build_index - ERROR - ```json 909 | { 910 | "head": ... is not valid JSON 911 | 2024-11-28 21:04:10,259 - build_index - ERROR - ```json 912 | { 913 | "head": ... is not valid JSON 914 | 2024-11-28 21:04:10,419 - build_index - ERROR - ```json 915 | { 916 | "head": ... is not valid JSON 917 | 2024-11-28 21:04:10,578 - build_index - ERROR - ```json 918 | { 919 | "head": ... is not valid JSON 920 | 2024-11-28 21:04:10,735 - build_index - ERROR - ```json 921 | { 922 | "head": ... is not valid JSON 923 | 2024-11-28 21:04:10,895 - build_index - ERROR - ```json 924 | { 925 | "head": ... is not valid JSON 926 | 2024-11-28 21:04:23,019 - build_index - INFO - Start building index 927 | 2024-11-28 21:04:23,023 - build_index - INFO - Connecting to Neo4j 928 | 2024-11-28 21:04:23,053 - build_index - INFO - Initializing chat model 929 | 2024-11-28 21:04:32,215 - build_index - INFO - Initializing embedding 930 | 2024-11-28 21:04:37,683 - build_index - INFO - Initializing splitter 931 | 2024-11-28 21:04:37,683 - build_index - INFO - Initializing GDS 932 | 2024-11-28 21:04:37,697 - build_index - INFO - Initializing index 933 | 2024-11-28 21:04:37,697 - build_index - INFO - Building index now! 934 | 2024-11-28 21:04:37,697 - build_index - INFO - Create_nodes_and_relationships 935 | 2024-11-28 21:04:37,697 - build_index - INFO - Chunking documents 936 | 2024-11-28 21:08:45,257 - build_index - ERROR - Not valid JSON 937 | 2024-11-28 21:08:47,059 - build_index - ERROR - Not valid JSON 938 | 2024-11-28 21:08:47,227 - build_index - ERROR - Not valid JSON 939 | 2024-11-28 21:08:47,398 - build_index - ERROR - Not valid JSON 940 | 2024-11-28 21:08:47,559 - build_index - ERROR - Not valid JSON 941 | 2024-11-28 21:08:47,721 - build_index - ERROR - Not valid JSON 942 | 2024-11-28 21:08:47,892 - build_index - ERROR - Not valid JSON 943 | 2024-11-28 21:08:48,055 - build_index - ERROR - Not valid JSON 944 | 2024-11-28 21:10:12,876 - build_index - INFO - Start building index 945 | 2024-11-28 21:10:12,880 - build_index - INFO - Connecting to Neo4j 946 | 2024-11-28 21:10:12,905 - build_index - INFO - Initializing chat model 947 | 2024-11-28 21:10:21,153 - build_index - INFO - Initializing embedding 948 | 2024-11-28 21:10:25,984 - build_index - INFO - Initializing splitter 949 | 2024-11-28 21:10:25,984 - build_index - INFO - Initializing GDS 950 | 2024-11-28 21:10:25,991 - build_index - INFO - Initializing index 951 | 2024-11-28 21:10:25,992 - build_index - INFO - Building index now! 
952 | 2024-11-28 21:10:25,992 - build_index - INFO - Create_nodes_and_relationships 953 | 2024-11-28 21:10:25,992 - build_index - INFO - Chunking documents 954 | 2024-11-28 21:15:06,719 - build_index - ERROR - Not valid JSON 955 | 2024-11-28 21:15:09,765 - build_index - ERROR - Not valid JSON 956 | 2024-11-28 21:15:13,253 - build_index - INFO - None 957 | 2024-11-28 21:15:13,621 - build_index - INFO - 110 nodes merged 958 | 2024-11-28 21:16:35,372 - build_index - INFO - Index built successfully 959 | --------------------------------------------------------------------------------