├── .devcontainer
    └── devcontainer.json
├── .gitignore
├── README.md
├── app.py
├── llm_utils.py
├── requirements.txt
└── utils.py


/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Python 3",
 3 |   // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
 4 |   "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
 5 |   "customizations": {
 6 |     "codespaces": {
 7 |       "openFiles": [
 8 |         "README.md",
 9 |         "app.py"
10 |       ]
11 |     },
12 |     "vscode": {
13 |       "settings": {},
14 |       "extensions": [
15 |         "ms-python.python",
16 |         "ms-python.vscode-pylance"
17 |       ]
18 |     }
19 |   },
20 |   "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
21 |   "postAttachCommand": {
22 |     "server": "streamlit run app.py --server.enableCORS false --server.enableXsrfProtection false"
23 |   },
24 |   "portsAttributes": {
25 |     "8501": {
26 |       "label": "Application",
27 |       "onAutoForward": "openPreview"
28 |     }
29 |   },
30 |   "forwardPorts": [
31 |     8501
32 |   ]
33 | }


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # UV
 98 | #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #uv.lock
102 | 
103 | # poetry
104 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
106 | #   commonly ignored for libraries.
107 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 | 
110 | # pdm
111 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112 | #pdm.lock
113 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114 | #   in version control.
115 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116 | .pdm.toml
117 | .pdm-python
118 | .pdm-build/
119 | 
120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 | 
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 | 
127 | # SageMath parsed files
128 | *.sage.py
129 | 
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 | 
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 | 
143 | # Rope project settings
144 | .ropeproject
145 | 
146 | # mkdocs documentation
147 | /site
148 | 
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 | 
154 | # Pyre type checker
155 | .pyre/
156 | 
157 | # pytype static type analyzer
158 | .pytype/
159 | 
160 | # Cython debug symbols
161 | cython_debug/
162 | 
163 | # PyCharm
164 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
167 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | 
170 | # PyPI configuration file
171 | .pypirc
172 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Knowledge Graph Generator
  2 | 
  3 | 基于大语言模型LLM 的知识图谱生成工具，支持从文本中自动提取实体关系并可视化展示。
  4 | ## demo
  5 | 试用网址：https://knowledgegraph-app.streamlit.app/  
  6 | 介绍文章：[使用大语言模型 + streamlit-agraph 生成和可视化知识图谱](https://mp.weixin.qq.com/s/HAIzAcODJZLL20eshMf3qQ)
  7 | 
  8 | ## 项目介绍
  9 | 
 10 | 本项目是一个基于大语言模型的知识图谱生成工具，可以自动从输入文本中提取实体和关系，并生成可视化的知识图谱。目前支持智谱 AI 和 Azure OpenAI 两种模型供应商。
 11 | 
 12 | ### 主要特性
 13 | 
 14 | - 🤖 支持多个 LLM 供应商
 15 |   - 智谱 AI (GLM-4)
 16 |   - Azure OpenAI
 17 | - 📊 知识图谱可视化
 18 |   - 交互式图形界面
 19 |   - 节点和边的自定义样式
 20 |   - 支持图谱缩放和拖拽
 21 | - 🎯 简单易用的 Web 界面
 22 |   - Streamlit 构建的直观界面
 23 |   - 实时反馈和错误提示
 24 |   - 支持大段文本输入
 25 | 
 26 | ## 安装说明
 27 | 
 28 | ### 环境要求
 29 | 
 30 | - Python 3.8+
 31 | - pip
 32 | - langchain==0.2.16
 33 | - langchain-community==0.2.16
 34 | - langchain-core==0.2.38
 35 | - langchain-text-splitters==0.2.4
 36 | - langdetect==1.0.9
 37 | - streamlit==1.38.0
 38 | - streamlit-agraph==0.0.45
 39 | - zhipuai==2.1.5.20230904
 40 | 
 41 | ### 安装步骤
 42 | 
 43 | 1. 克隆项目
 44 | ```bash
 45 | git clone https://github.com/yourusername/KnowledgeGraph-Streamlit.git
 46 | cd KnowledgeGraph-Streamlit
 47 | ```
 48 | 
 49 | 2. 安装依赖
 50 | ```bash
 51 | pip install -r requirements.txt
 52 | ```
 53 | 
 54 | 3. 运行应用
 55 | ```bash
 56 | streamlit run app.py
 57 | ```
 58 | 
 59 | ## 使用指南
 60 | 
 61 | ### 配置 API
 62 | 
 63 | 1. 智谱 AI
 64 |    - 访问 [智谱 AI 开放平台](https://open.bigmodel.cn/)
 65 |    - 注册账号并创建应用
 66 |    - 获取 API 密钥
 67 | 
 68 | 2. Azure OpenAI
 69 |    - 访问 [Azure Portal](https://portal.azure.com/)
 70 |    - 创建 Azure OpenAI 服务
 71 |    - 获取 API 密钥和配置信息
 72 | 
 73 | ### 使用步骤
 74 | 
 75 | 1. 启动应用后，在侧边栏：
 76 |    - 选择 LLM 供应商（默认为智谱 AI）
 77 |    - 输入对应的 API 密钥
 78 | 
 79 | 2. 在主界面：
 80 |    - 在文本框中输入待分析文本
 81 |    - 点击"Extract Knowledge"按钮
 82 |    - 等待知识图谱生成
 83 |    - 查看可视化结果
 84 | 
 85 | ### 示例输入
 86 | 
 87 | ```text
 88 | 人工智能是计算机科学的一个分支，它包含机器学习和深度学习两个重要领域。
 89 | 机器学习使用统计方法来让计算机系统逐步改善性能，而深度学习则是基于神经网络的一种特殊机器学习方法。
 90 | ```
 91 | 
 92 | ## 项目结构
 93 | 
 94 | ```
 95 | knowledge-graph-generator/
 96 | ├── app.py              # 主应用程序
 97 | ├── llm_utils.py        # LLM 调用工具
 98 | ├── utils.py            # 通用工具函数
 99 | ├── requirements.txt    # 项目依赖
100 | └── README.md           # 项目文档
101 | ```
102 | 
103 | ## 技术栈
104 | 
105 | - **前端框架**: Streamlit
106 | - **图形可视化**: streamlit-agraph
107 | - **LLM 接口**: 
108 |   - langchain-community
109 |   - Azure OpenAI
110 | - **数据处理**: Python 标准库
111 | 
112 | ## 注意事项
113 | 
114 | 1. API 密钥安全
115 |    - 不要在代码中硬编码 API 密钥
116 |    - 建议使用环境变量管理密钥
117 |    - 定期更换 API 密钥
118 | 
119 | 2. 使用限制
120 |    - 注意 API 调用频率限制
121 |    - 单次文本长度限制
122 |    - 知识图谱节点数量至少需要 3 个
123 | 
124 | 3. 性能优化
125 |    - 大文本建议分段处理
126 |    - 适当调整模型参数
127 |    - 缓存常用结果
128 | 
129 | ## 常见问题
130 | 
131 | 1. Q: 为什么图谱生成失败？
132 |    A: 常见原因：
133 |    - API 密钥无效或过期
134 |    - 输入文本过短或实体关系不足
135 |    - 网络连接问题
136 | 
137 | 2. Q: 如何提高图谱质量？
138 |    A: 建议：
139 |    - 使用结构化的输入文本
140 |    - 调整模型温度参数
141 |    - 增加输入文本的实体密度
142 | 
143 | ## 贡献指南
144 | 
145 | 
146 | 
147 | ## 致谢
148 | 
149 | 感谢以下项目和工具：
150 | - [Streamlit](https://streamlit.io/)
151 | - [智谱 AI](https://open.bigmodel.cn/)
152 | - [Azure OpenAI](https://azure.microsoft.com/products/cognitive-services/openai-service/)
153 | ```
154 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import json
  3 | from utils import generate_graph_data
  4 | from llm_utils import call_llm
  5 | from streamlit_agraph import agraph, Node, Edge, Config
  6 | 
  7 | st.set_page_config(page_title="Knowledge Graph Generator", layout="wide")
  8 | # Set page title
  9 | st.title("Knowledge Graph Generator")
 10 | 
 11 | # Add description
 12 | st.write("Please enter the text below for knowledge graph extraction.")
 13 | 
 14 | # Add text input area
 15 | text_input = st.text_area("Input Text", height=200)
 16 | 
 17 | # Initialize session state
 18 | if 'graph_data' not in st.session_state:
 19 |     st.session_state.graph_data = None
 20 | if 'agraph_config' not in st.session_state:
 21 |     st.session_state.agraph_config = None
 22 | if 'graph_ready' not in st.session_state:
 23 |     st.session_state.graph_ready = False
 24 | 
 25 | # Initialize session state for API key, supplier and temperature
 26 | if 'api_key' not in st.session_state:
 27 |     st.session_state.api_key = ''
 28 | if 'current_supplier' not in st.session_state:
 29 |     st.session_state.current_supplier = 'zhipu'  # Default to zhipu
 30 | if 'temperature' not in st.session_state:
 31 |     st.session_state.temperature = 0.1  # Default temperature
 32 | 
 33 | def prepare_graph_visualization(nodes_data, edges_data):
 34 |     """Prepare graph visualization data and config"""
 35 |     # Convert to agraph format nodes and edges
 36 |     nodes = [
 37 |         Node(
 38 |             id=str(node['id']),  # Ensure id is string
 39 |             label=str(node['label']),
 40 |             size=25,
 41 |             color=f"#{hash(str(node['group'])) % 0xFFFFFF:06x}"
 42 |         ) for node in nodes_data
 43 |     ]
 44 |     
 45 |     edges = [
 46 |         Edge(
 47 |             source=str(edge['from']),  # Ensure source is string
 48 |             target=str(edge['to']),    # Ensure target is string
 49 |             label=str(edge['label'])
 50 |         ) for edge in edges_data
 51 |     ]
 52 | 
 53 |     # Configure graph display
 54 |     config = Config(
 55 |         width=1000,
 56 |         height=500,
 57 |         directed=True,
 58 |         physics=True,
 59 |         hierarchical=True,
 60 |         nodeHighlightBehavior=True,
 61 |         highlightColor="#F7A7A6",
 62 |         collapsible=True,
 63 |         node={'labelProperty': 'label'},
 64 |         link={'labelProperty': 'label', 'renderLabel': True}
 65 |     )
 66 |     
 67 |     return nodes, edges, config
 68 | 
 69 | # Add extract button
 70 | def extract_knowledge():
 71 |     if not text_input:
 72 |         st.warning("Please enter text first.")
 73 |     else:
 74 |         # with st.spinner('Extracting knowledge graph...'):
 75 |         try:
 76 |             # Call OpenAI API to generate nodes and edges
 77 |             nodes_data, edges_data = generate_graph_data(text_input)
 78 |             
 79 |             if not nodes_data or not edges_data:
 80 |                 st.warning("Failed to extract valid knowledge graph. Please modify your input text.")
 81 |                 st.session_state.graph_ready = False
 82 |             else:
 83 |                 # Store graph data in session state
 84 |                 st.session_state.graph_data = (nodes_data, edges_data)
 85 |                 # Prepare visualization data
 86 |                 nodes, edges, config = prepare_graph_visualization(nodes_data, edges_data)
 87 |                 st.session_state.agraph_config = {
 88 |                     'nodes': nodes,
 89 |                     'edges': edges,
 90 |                     'config': config
 91 |                 }
 92 |                 st.session_state.graph_ready = True
 93 |                 # st.success("Knowledge extraction completed! Click 'Show Graph' to view the result.")
 94 |                 
 95 |         except Exception as e:
 96 |             st.error(f"Error during knowledge extraction: {str(e)}")
 97 |             st.session_state.graph_ready = False
 98 |     return nodes_data, edges_data
 99 | 
100 | 
101 | if st.button("Extract Knowledge"):
102 |     with st.spinner('Extracting knowledge graph...'):
103 |         nodes_data, edges_data = extract_knowledge()
104 | 
105 | # if st.button("Show Graph") or st.session_state.graph_ready:
106 |     st.write("Showing graph...")
107 |     with st.expander("Graph", expanded=True):
108 |         st.write("Nodes:")
109 |         st.write(nodes_data)
110 |         st.write("Edges:")
111 |         st.write(edges_data)
112 |         # st.write(st.session_state.agraph_config['nodes'])
113 |         # st.write(st.session_state.agraph_config['edges'])
114 |         # st.write(st.session_state.agraph_config['config'])
115 |     try:
116 |         # Display graph using stored configuration
117 |         return_value = agraph(
118 |             nodes=st.session_state.agraph_config['nodes'],
119 |             edges=st.session_state.agraph_config['edges'],
120 |             config=st.session_state.agraph_config['config']
121 |         )
122 |             
123 |     except Exception as e:
124 |         st.error(f"Error displaying knowledge graph: {str(e)}")
125 | 
126 | if not st.session_state.graph_ready:
127 |     st.warning("Please extract knowledge first.")
128 | 
129 | def setup_sidebar():
130 |     """Setup sidebar for API key inputs"""
131 |     with st.sidebar:
132 |         st.header("API Configuration")
133 |         
134 |         # Add supplier selection dropdown
135 |         supplier = st.selectbox(
136 |             "Select LLM Provider",
137 |             options=["zhipu", "azure"],
138 |             index=0,  # Default to zhipu
139 |             key="supplier_select"
140 |         )
141 |         st.session_state.current_supplier = supplier
142 |         
143 |         # Add temperature slider
144 |         temperature = st.slider(
145 |             "Temperature",
146 |             min_value=0.0,
147 |             max_value=1.0,
148 |             value=st.session_state.temperature,
149 |             step=0.1,
150 |             help="Higher values make the output more random, lower values make it more focused and deterministic"
151 |         )
152 |         st.session_state.temperature = temperature
153 |         
154 |         st.markdown("---")  # Add a divider
155 |         
156 |         # Single API key input
157 |         api_key = st.text_input(
158 |             f"Enter {supplier.upper()} API Key",
159 |             type="password",
160 |             value=st.session_state.api_key,
161 |             key="api_key_input"
162 |         )
163 |         if api_key:
164 |             st.session_state.api_key = api_key
165 | 
166 | def main():
167 |     setup_sidebar()
168 |     # Rest of your application code...
169 | 
170 | if __name__ == "__main__":
171 |     main()
172 | 
173 | 
174 | 


--------------------------------------------------------------------------------
/llm_utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import streamlit as st
 4 | from langchain.chat_models import AzureChatOpenAI
 5 | from langchain_community.chat_models import ChatZhipuAI
 6 | from langchain_core.messages import SystemMessage, HumanMessage
 7 | 
 8 | # Initialize session state for API key, supplier and temperature
 9 | if 'api_key' not in st.session_state:
10 |     st.session_state.api_key = ''
11 | if 'current_supplier' not in st.session_state:
12 |     st.session_state.current_supplier = 'zhipu'  # Default to zhipu
13 | if 'temperature' not in st.session_state:
14 |     st.session_state.temperature = 0.1  # Default temperature
15 | 
16 | # Azure OpenAI Configuration
17 | AZURE_CONFIGS = {
18 |     "xxx"
19 | }
20 | 
21 | def call_llm(system_msg, user_msg, supplier=st.session_state.current_supplier):
22 |     st.info(system_msg, icon="🔥")
23 |     st.info(user_msg, icon="🔥")
24 |     if not st.session_state.api_key:
25 |         raise ValueError(f"{supplier.upper()} API Key not set. Please enter it in the sidebar.")
26 | 
27 |     try:
28 |         if supplier == "azure":
29 |             os.environ["AZURE_OPENAI_API_KEY"] = st.session_state.api_key
30 |             os.environ["AZURE_OPENAI_ENDPOINT"] = AZURE_CONFIGS["base_url"]
31 |             
32 |             llm = AzureChatOpenAI(
33 |                 openai_api_version=AZURE_CONFIGS['api_version'],
34 |                 azure_endpoint=AZURE_CONFIGS["base_url"],
35 |                 azure_deployment=AZURE_CONFIGS["model_deployment"],
36 |                 model=AZURE_CONFIGS["model_name"],
37 |                 validate_base_url=False,
38 |                 temperature=st.session_state.temperature,  # Use temperature from session state
39 |                 max_tokens=None,
40 |                 timeout=None,
41 |                 max_retries=2
42 |             )
43 | 
44 |             messages = [
45 |                 ('system', system_msg),
46 |                 ('human', user_msg)
47 |             ]
48 |         elif supplier == "zhipu":
49 |             llm = ChatZhipuAI(
50 |                 model='glm-4-flash',
51 |                 temperature=st.session_state.temperature,  # Use temperature from session state
52 |                 api_key=st.session_state.api_key
53 |             )
54 |             
55 |             messages = [
56 |                 SystemMessage(system_msg),
57 |                 HumanMessage(user_msg)
58 |             ]
59 |         else:
60 |             raise ValueError(f'Invalid LLM supplier: {supplier}')
61 | 
62 |         # Call LLM
63 |         res = llm.invoke(messages)
64 |         if not res or not res.content:
65 |             raise ValueError("API returned empty response")
66 |             
67 |         output = res.content
68 |         
69 |         # Validate JSON format and structure
70 |         data = json.loads(output)
71 |         if not isinstance(data, dict):
72 |             raise ValueError("API response is not a valid JSON object")
73 |         
74 |         if 'nodes' not in data or 'edges' not in data:
75 |             raise ValueError("API response missing required 'nodes' or 'edges' fields")
76 |             
77 |         if not isinstance(data['nodes'], list) or not isinstance(data['edges'], list):
78 |             raise ValueError("'nodes' and 'edges' must be arrays")
79 |             
80 |         if len(data['nodes']) < 3:
81 |             raise ValueError("At least 3 nodes are required")
82 |             
83 |         # Print output for debugging
84 |         st.write("Extraction result:")
85 |         st.info(output, icon="🎯")
86 |         return output
87 |             
88 |     except json.JSONDecodeError as je:
89 |         st.error(f"Invalid JSON format in API response: {str(je)}")
90 |         return '{"nodes": [], "edges": []}'
91 |     except ValueError as ve:
92 |         st.error(str(ve))
93 |         return '{"nodes": [], "edges": []}'
94 |     except Exception as e:
95 |         st.error(f"API call error: {str(e)}")
96 |         return '{"nodes": [], "edges": []}' 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | langchain==0.2.16
2 | langchain-community==0.2.16
3 | langchain-core==0.2.38
4 | langchain-text-splitters==0.2.4
5 | langdetect==1.0.9
6 | streamlit==1.38.0
7 | streamlit-agraph==0.0.45
8 | zhipuai==2.1.5.20230904
9 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import json
  3 | from llm_utils import call_llm
  4 | 
  5 | def detect_language(text):
  6 |     """Detect the primary language of the input text"""
  7 |     # Simple language detection based on character sets
  8 |     chinese_chars = len([c for c in text if '\u4e00' <= c <= '\u9fff'])
  9 |     english_chars = len([c for c in text if c.isascii() and c.isalpha()])
 10 |     
 11 |     return 'chinese' if chinese_chars > english_chars else 'english'
 12 | 
 13 | def get_system_prompt(language):
 14 |     """Get system prompt based on language"""
 15 |     if language == 'chinese':
 16 |         return """You are a professional text analysis assistant. Please analyze the input text and extract key concepts and their relationships.
 17 | 
 18 | You must output ONLY a JSON object in the following format, with NO additional text or explanation:
 19 | 
 20 | {
 21 |     "nodes": [
 22 |         {
 23 |             "id": "1",           // Must be a unique string
 24 |             "label": "概念1",     // Concept name in Chinese
 25 |             "group": "类别1"      // Category in Chinese
 26 |         }
 27 |     ],
 28 |     "edges": [
 29 |         {
 30 |             "from": "1",         // Must match an existing node id
 31 |             "to": "2",           // Must match an existing node id
 32 |             "label": "包含"       // Relationship description in Chinese
 33 |         }
 34 |     ]
 35 | }
 36 | 
 37 | Requirements:
 38 | 1. Output ONLY the JSON object, no other text
 39 | 2. All node IDs must be unique strings
 40 | 3. All 'from' and 'to' in edges must reference existing node IDs
 41 | 4. All labels and descriptions MUST be in Chinese
 42 | 5. The output must be valid JSON format
 43 | 6. Extract at least 3 key concepts and their relationships
 44 | 7. Group similar concepts under the same category
 45 | 8. Use natural and idiomatic Chinese expressions
 46 | 9. Ensure relationship descriptions are clear and meaningful
 47 | 
 48 | DO NOT include any explanations or markdown formatting in the output."""
 49 |     else:
 50 |         return """You are a professional text analysis assistant. Please analyze the input text and extract key concepts and their relationships.
 51 | 
 52 | You must output ONLY a JSON object in the following format, with NO additional text or explanation:
 53 | 
 54 | {
 55 |     "nodes": [
 56 |         {
 57 |             "id": "1",           // Must be a unique string
 58 |             "label": "Concept1", // Concept name in English
 59 |             "group": "Group1"    // Category in English
 60 |         }
 61 |     ],
 62 |     "edges": [
 63 |         {
 64 |             "from": "1",         // Must match an existing node id
 65 |             "to": "2",           // Must match an existing node id
 66 |             "label": "contains"  // Relationship description in English
 67 |         }
 68 |     ]
 69 | }
 70 | 
 71 | Requirements:
 72 | 1. Output ONLY the JSON object, no other text
 73 | 2. All node IDs must be unique strings
 74 | 3. All 'from' and 'to' in edges must reference existing node IDs
 75 | 4. All labels and descriptions MUST be in English
 76 | 5. The output must be valid JSON format
 77 | 6. Extract at least 3 key concepts and their relationships
 78 | 7. Group similar concepts under the same category
 79 | 8. Use natural and idiomatic English expressions
 80 | 9. Ensure relationship descriptions are clear and meaningful
 81 | 
 82 | DO NOT include any explanations or markdown formatting in the output."""
 83 | 
 84 | def generate_graph_data(text):
 85 |     """Call OpenAI API to generate graph nodes and edges data"""
 86 |     
 87 |     # Detect the language of input text
 88 |     language = detect_language(text)
 89 |     
 90 |     # Get appropriate system prompt based on language
 91 |     system_msg = get_system_prompt(language)
 92 |     
 93 |     user_msg = "Please analyze the following text and generate relationship graph data:\n" + text
 94 |     
 95 |     try:
 96 |         # Call OpenAI API
 97 |         output = call_llm(system_msg, user_msg)
 98 |         if not output:
 99 |             raise ValueError("API returned empty response")
100 |             
101 |         # Clean potential extra content from output
102 |         output = output.strip()
103 |         if output.startswith("```json"):
104 |             output = output[7:]
105 |         if output.endswith("```"):
106 |             output = output[:-3]
107 |         output = output.strip()
108 |         
109 |         # Parse JSON data
110 |         result = json.loads(output)
111 |         
112 |         # Validate data format
113 |         if not isinstance(result, dict):
114 |             raise ValueError("Response is not a JSON object")
115 |         if 'nodes' not in result or 'edges' not in result:
116 |             raise ValueError("Missing required 'nodes' or 'edges' fields")
117 |         if not isinstance(result['nodes'], list) or not isinstance(result['edges'], list):
118 |             raise ValueError("'nodes' or 'edges' is not an array")
119 |         if len(result['nodes']) < 3:
120 |             raise ValueError("At least 3 nodes are required")
121 |             
122 |         # Validate nodes and edges data
123 |         node_ids = set()
124 |         groups = set()
125 |         for node in result['nodes']:
126 |             if not all(k in node for k in ('id', 'label', 'group')):
127 |                 raise ValueError("Invalid node format - missing required fields")
128 |             if not all(isinstance(node[k], str) for k in ('id', 'label', 'group')):
129 |                 raise ValueError("Node fields must be strings")
130 |             if str(node['id']) in node_ids:
131 |                 raise ValueError(f"Duplicate node ID found: {node['id']}")
132 |             node_ids.add(str(node['id']))
133 |             groups.add(node['group'])
134 |             
135 |         if len(groups) < 2:
136 |             raise ValueError("Nodes should be categorized into at least 2 groups")
137 |             
138 |         for edge in result['edges']:
139 |             if not all(k in edge for k in ('from', 'to', 'label')):
140 |                 raise ValueError("Invalid edge format - missing required fields")
141 |             if not all(isinstance(edge[k], str) for k in ('from', 'to', 'label')):
142 |                 raise ValueError("Edge fields must be strings")
143 |             if str(edge['from']) not in node_ids:
144 |                 raise ValueError(f"Edge references non-existent source node: {edge['from']}")
145 |             if str(edge['to']) not in node_ids:
146 |                 raise ValueError(f"Edge references non-existent target node: {edge['to']}")
147 |         
148 |         return result['nodes'], result['edges']
149 |         
150 |     except json.JSONDecodeError as je:
151 |         st.error(f"JSON parsing error: {str(je)}\nActual output: {output}")
152 |         return [], []
153 |     except Exception as e:
154 |         st.error(f"Error generating graph data: {str(e)}")
155 |         return [], [] 


--------------------------------------------------------------------------------