├── .gitignore
├── LICENSE
├── README.md
├── all_tables_schema.txt
├── assets
│   ├── all_tables_schema.txt
│   ├── question.json
│   └── question_en.json
├── baseline
│   ├── bus_baseline
│   │   ├── README.md
│   │   ├── requirements.txt
│   │   └── run.ipynb
│   ├── howard_baseline
│   │   ├── .env-template
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── agents.py
│   │   ├── assets.ipynb
│   │   ├── config.py
│   │   ├── database.ipynb
│   │   ├── llms.py
│   │   ├── main.py
│   │   ├── manual.ipynb
│   │   ├── requirements.txt
│   │   ├── src
│   │   │   ├── __init__.py
│   │   │   ├── agent.py
│   │   │   ├── database.py
│   │   │   ├── llm.py
│   │   │   ├── log.py
│   │   │   ├── teamwork.py
│   │   │   ├── utils.py
│   │   │   └── workflow.py
│   │   ├── utils.py
│   │   └── workflows.py
│   ├── sample
│   │   ├── README.md
│   │   ├── main.py
│   │   ├── prepare_dataset.py
│   │   ├── prompt.py
│   │   ├── requirements.txt
│   │   └── utils.py
│   └── soldier_baseline
│       ├── README.md
│       ├── requirements.txt
│       └── soldier_baseline.ipynb
└── pyproject.toml
/.gitignore:
--------------------------------------------------------------------------------
1 | *__pycache__/
2 | *.DS_Store
3 | *.idea
4 | output*
5 | test*
6 | *.ipynb_checkpoints
7 | venv
8 | *.jsonl
9 | *.csv
10 | *.xlsx
11 | *.xls
12 | *.doc
13 | *.docx
14 | zr_baseline
15 | check
16 | *venv
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2025 @ FinGLM Project of ZhipuAI
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FinGLM
2 |
3 | ## 仓库介绍
4 |
5 | 本仓库是关于 [2024金融行业·大模型挑战赛](https://competitions.zhipuai.cn/matchDetail?id=120241202000000003) 的开源代码汇总,
6 | 旨在探索大语言模型在金融行业的应用潜力。 本仓库存放了数个比赛团队的竞赛原始代码,均经过整理并开源。
7 |
8 | ## 项目更新
9 |
10 | - **News**: ```2024/12/11```: 本仓库提供了 Baseline 的提交要求和示例。
11 |
12 | ## 初赛 Baseline 提交说明
13 |
14 | ### 评测环境说明
15 |
16 | - Python 3.12.7 / Python 3.10.13
17 | - Ubuntu 22.04 操作系统, 每个参赛选手会分配到16G运行内存。
18 | - **不提供** GPU / NPU,仅保证`UTF-8` 编码格式能正常解码。
19 | - 仅能访问智谱AI API,pip清华源仓库, 比赛数据库, 不提供联网服务。评测环境提供API KEY,但不具备联网功能,**禁止** 流式输出。
20 | > 如果你提交的代码依赖源码安装的pip包,请在README中写明。主办方将在审核后从github拉取对应pip包主分支。
21 | >
22 | > 你所使用的pip包必须是在2024年12月1日之前发布的版本,主办方将有权对你使用的pip包提问其作用。
23 |
24 | ### 提交规范
25 |
26 | - 不允许提交`.idea, .DS_Store` 等无效文件和本地缓存文件。
27 | - 本仓库会提供比赛的参考材料,放置在[assets](assets),
28 | 所有的预处理的数据工作必须基于这些材料进行,不允许提交明文信息参考材料,比如公司股票信息,公司介绍等用于辅助回答答案的内容,无论这些参考材料用于何处。
29 | - 提交代码文件请使用`ruff`格式刷。确保满足`PEP 8`规范。规范文件请参考这里: [规范文件](pyproject.toml)
30 | - 必须提交一个`README.md`用以解释你的思路,确保复现人员能够理解你的代码, 以及根据`requirements.txt`来安装依赖。若运行失败,则提交无效。
31 | - 必须提交一个`jupyter notebook`文件,用以展示你的代码运行结果。确保代码能够正常运行,否则提交无效。
32 | - 所有`API_KEY`必须用环境变量,或者外部传递变量的方式传递。
33 |
34 | 请不要直接贡献到这个仓库,而是通过比赛链接上传到比赛官方进行评测。不满足提交规范的作品将不被收录。你可以参考 [例子](baseline/sample/README.md)。
35 | - 每个组最多提交三次。和已有的已经开源的 Baseline 方案相似的方案将不会被收录,包括思维方式相似、预处理方式相似、工作流相似。因此,请在README中明确说明你的创新点。
36 |
37 | ###
38 |
39 | ## 开源协议
40 |
41 | 本代码中无特殊说明或者无注明额外协议的,均使用 [Apache 2.0](LICENSE) 协议。
42 |
--------------------------------------------------------------------------------
/baseline/bus_baseline/README.md:
--------------------------------------------------------------------------------
1 | # FinanceFast_Bus
2 |
3 | ***
4 | 成绩验证分数: 42.36分
5 |
6 | 成绩验证时间: 2024年12月18日 9:00 PM
7 |
8 | 完整运行时长: 3小时12分钟
9 |
10 | 完整运行Token消耗: GLM-4-Plus 620万 Tokens
11 | ***
12 |
13 | ## 队伍简介
14 |
15 | FinanceFast_Bus 提供频繁、可靠且迅速的服务,帮助您在不同地点之间转移金融资源,就如同您的财务的公共公交系统一样。
16 |
17 | 作者: **公交车的轮子转啊转**
18 |
19 | ## 方案简介
20 |
21 | 本方案通过多轮交互式对话,结合GLM模型与数据库技术,实现高效、精准的命名实体识别、SQL语句生成与优化、数据库查询,以及结果的交互式处理。旨在为用户提供快速、灵活且智能化的问答体验。
22 |
23 | ### 方案亮点
24 |
25 | 1. **多轮交互,精准回答**:支持与模型进行多轮对话,动态生成SQL语句并查询数据库,逐步优化查询,直至输出完整、准确的答案。
26 | 2. **SQL简单优化**:自动识别并优化SQL语句,解决常见的`date`格式不匹配等问题,提高查询效率与兼容性。
27 | 3. **召回列名和中文注释简单优化**:在召回列名和中文注释时进行智能化处理,确保结果更加贴合用户需求,减少理解障碍。
28 |
29 | ## 主要功能
30 |
31 | ### 1. **问答流程**
32 |
33 | - 读取用户问题,结合数据库内容与表结构,生成相应SQL语句进行查询。
34 | - 支持多轮对话交互,在每一步都对结果进行验证和后续查询。
35 |
36 | ### 2. **SQL优化**
37 |
38 | - 对生成的SQL语句进行优化,包括日期格式化、条件处理等,使SQL语句在指定数据库中能高效运行。
39 | - 支持对语句进行语法检查,防止执行错误。
40 |
41 | ### 3. **命名实体识别**
42 |
43 | - 自动识别问题中的关键实体,包括公司名称、基金名称、证券代码等。
44 | - 根据实体类型映射到数据库表,生成适配的SQL查询语句。
45 |
46 | ### 4. **数据查询与结果输出**
47 |
48 | - 提供对数据库的查询接口,通过SQL获取表数据,并进行后续处理。
49 | - 输出标准化、结构化的JSON格式结果,并可直接保存为文件。
50 |
51 | ---
52 |
53 | ## 运行复现
54 |
55 | 1. 运行以下命令以安装所需依赖:
56 |
57 | ```bash
58 | pip install -r requirements.txt
59 | ```
60 |
61 | 2. 打开run.ipynb文件。确保所有依赖库已正确安装。依次运行文件中的各个单元格,系统将自动完成问答流程。运行完成后,程序会将结果保存到
62 | **`result.json`** 文件中。如果你只想运行部分问题,可以调整`start_idx`和`end_idx`的值。具体位置详见 jupyter
63 | notebook最后一个代码块。
64 |
65 | 3. 本方案均使用题目数据,你可以在主仓库的`assets`文件夹中找到这些数据。
66 |
67 | ## 核心功能详解
68 |
69 | ### **1. 工具函数**
70 |
71 | 项目内实现了一系列工具函数,主要功能包括:
72 |
73 | - **`create_chat_completion`**:与模型交互,生成对话结果。
74 | - **`replace_date_with_day`**:优化SQL语句中的日期格式。
75 | - **`filter_table_comments`**:从数据库表注释中提取与问题相关的内容。
76 | - **`process_company_name` / `process_code`**:根据公司名称或代码查询数据库。
77 |
78 | ### **2. 多轮交互**
79 |
80 | - 项目支持与模型多轮交互,每轮生成SQL语句并查询数据库,直到获取完整答案。
81 | - 提供 `run_conversation_until_complete` 函数,实现动态查询。
82 |
83 | ### **3. 命名实体识别**
84 |
85 | - 使用示例和模板对问题中的关键实体进行抽取。
86 | - 支持公司名称(中英文全称、简称)、基金名称、证券代码等多种实体类型。
87 |
88 | ---
89 |
90 | ## 输出结果示例
91 |
92 | 设置参数:
93 | ```
94 | start_idx = 63 # 起始问题索引
95 | end_idx = 64 # 结束问题索引
96 | ```
97 |
98 | **输入问题**:
99 |
100 | ```json
101 | [
102 | {
103 | "tid": "tttt----64",
104 | "team": [
105 | {
106 | "id": "tttt----64----36-4-1",
107 | "question": "最新更新的2021年度报告中,机构持有无限售流通A股数量合计最多的公司简称是?"
108 | },
109 | {
110 | "id": "tttt----64----36-4-2",
111 | "question": "在这份报告中,该公司机构持有无限售流通A股比例合计是多少,保留2位小数?"
112 | },
113 | {
114 | "id": "tttt----64----36-4-3",
115 | "question": "该公司前十大股东持股比例合计是多少?"
116 | }
117 | ]
118 | }
119 | ]
120 | ```
121 |
122 | **输出结果**:
123 |
124 | ```json
125 | [
126 | {
127 | "tid": "tttt----64",
128 | "team": [
129 | {
130 | "id": "tttt----64----36-4-1",
131 | "question": "最新更新的2021年度报告中,机构持有无限售流通A股数量合计最多的公司简称是?",
132 | "answer": "公司简称 帝尔激光"
133 | },
134 | {
135 | "id": "tttt----64----36-4-2",
136 | "question": "在这份报告中,该公司机构持有无限售流通A股比例合计是多少,保留2位小数?",
137 | "answer": "机构持有无限售流通A股比例合计(%) 10.1309"
138 | },
139 | {
140 | "id": "tttt----64----36-4-3",
141 | "question": "该公司前十大股东持股比例合计是多少?",
142 | "answer": "Top10StockholdersProp 64.51"
143 | }
144 | ]
145 | }
146 | ]
147 | ```
148 |
149 | ---
150 |
151 | ## 注意事项
152 |
153 | 1. **数据库连接**:确保数据库接口可用,并正确设置访问令牌(`ACCESS_TOKEN`、`ZhipuAI_API_KEY`)。
154 | 2. **SQL语句优化**:确保 SQL 语句符合目标数据库的语法规则。
155 |
156 | ---
157 |
158 | ## 写在最后
159 |
160 | `命名实体识别` 函数得到了 **@躺躺不想动了** 老师的大力支持,**@开源专家zR** 对代码进行了辛勤整理,在此表示衷心的感谢。
161 |
162 | 希望我的开源方案能够成为大家的一点灵感和参考。如果其中有不足之处,还请多多包涵!
163 |
164 | 衷心期望它能为大家的夺冠之路增添一丝助力!💪✨
165 |
166 | 希望你喜欢这个项目! 😊
167 |
168 |
169 |
170 |
--------------------------------------------------------------------------------
/baseline/bus_baseline/requirements.txt:
--------------------------------------------------------------------------------
1 | jieba>=0.42.1
2 | zhipuai>=2.1.5.20241204
3 | transformers>=4.47.0
4 | pandas>=2.2.3
5 | openpyxl>=3.1.5
6 | tqdm>=4.67.1
--------------------------------------------------------------------------------
/baseline/howard_baseline/.env-template:
--------------------------------------------------------------------------------
1 | OLLAMA_HOST=xxx
2 | ZHIPU_API_KEY=xxx
3 | ZHIPU_ACCESS_TOKEN=xxx
4 | OPENAI_API_KEY=xxx
5 | OPENAI_BASE_URL=xxx
6 | # HTTPS_PROXY=
7 | # HTTP_PROXY=
--------------------------------------------------------------------------------
/baseline/howard_baseline/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | src/__pycache__
3 | __pycache__
4 | .env
--------------------------------------------------------------------------------
/baseline/howard_baseline/README.md:
--------------------------------------------------------------------------------
1 | # Howard-baseline
2 |
3 | ***
4 | 成绩验证分数: 52.3分
5 |
6 | 成绩验证时间: 2025年2月18日 9:00 AM
7 |
8 | 完整运行时长: 7小时10分钟
9 |
10 | 完整运行Token消耗: GLM-4-Plus 1030万 Tokens
11 |
12 | ***
13 |
14 | ### 准备环境变量
15 |
16 | 这个文件存储了所有的Keys和API接口,你需要在这里填写 GLM API Keys 和 比赛平台 API。
17 |
18 | ```
19 | cp .env-template .env
20 | ```
21 |
22 | ### 准备数据
23 |
24 | 执行 assets.ipynb 里的代码,会在 assets 目录下生成以下几个文件:
25 |
26 | - db_info.json
27 | - db_table.json
28 | - table_column.json
29 |
30 | 它们是从比赛原数据里提取出来结构化成json的数据,因为后续跑的过程,会按照
31 | `“选择数据库”->“选择数据表”->“选择字段”` 三个步骤逐层提取跟题目相关的数据库信息。 另外,为了节省tokens消耗,借助了LLM进行了一些文字归纳。
32 |
33 | ### 配置
34 |
35 | 在 config.py 文件里可以设定一些配置项。
36 |
37 | ```
38 | MAX_ITERATE_NUM = 20 # 配置SQL Query工作流的最大迭代次数
39 | MAX_SQL_RESULT_ROWS = 100 # 配置智谱SQL查询接口的LIMIT参数
40 |
41 | START_INDEX = [0, 0] # 起始下标 [team_index, question_idx]
42 | END_INDEX = [len(all_question)-1, len(all_question[-1]["team"])-1] # 结束下标 [team_index, question_idx] (包含)
43 | SAVE_FILE_SUBFIX = ""
44 |
45 | llm_plus = llms.llm_glm_4_plus # 配置使用的LLM
46 | ```
47 |
48 | 其中 START_INDEX 和 END_INDEX 配置要跑的题目范围。
49 |
50 | ### 执行命令
51 |
52 | ```
53 | PYTHONUNBUFFERED=1 python main.py | tee -a output/main.log
54 | ```
55 |
56 | 执行以上命令,就可以开始跑了。
57 |
58 |
59 | ### DEBUG
60 |
61 | 在 manual.ipynb 里,可以针对某道题手工跑,精调prompt。请注意,这仅作为调试使用。
62 |
63 | ### 输出
64 |
65 | + 输出目录在 output 目录下,结构为 `ttt----题目编号----题目大题.log`
66 | + `all_question.json` 里会缓存一些中间数据。
67 | + 运行结束后,得到 `Eva_Now_result.json` 作为最终提交的答案。
68 |
69 |
70 | ### 思路
71 |
72 | 1. 召回跟题目相关的数据库信息
73 |
74 | 由于数据库表总共有77张,涉及到的数据库结构信息非常多,全部给到LLM,一来浪费token,二来可能带来信息丢失的问题,
75 | 三来也非常没必要。
76 | 本方案的思路是,从数据库->数据表->表字段这三层的顺序,逐层去找。但这个思路严重依赖两点:上层的信息归纳准确度、完备度、区分度;LLM的理解能力。
77 |
78 | 2. 模拟人机交互,使LLM自问自答,最后找到想要的答案。
79 | 这个重点有两个:
80 | - 保留数据库查询到的结果,继承到后面的LLM的上下文。
81 | - 每一轮跟LLM的问答里,模拟user的问题,使得LLM可以继续思考下去。总之,就是prompt工程。
82 |
83 |
84 | ### 改进思路
85 |
86 | 数据库信息召回这个,实践下来,发现本方案的三层召回法,效果并不太理想,需要大量的人工精调prompt。
87 | 可以考虑构建向量库进行搜索,并且对外链字段做关联召回。
--------------------------------------------------------------------------------
/baseline/howard_baseline/agents.py:
--------------------------------------------------------------------------------
1 | """
2 | This module defines various agents for processing SQL queries and rewriting questions.
3 | Each agent is configured with specific roles, constraints, and output formats.
4 | """
5 |
6 | from src.agent import Agent, AgentConfig
7 | from utils import extract_company_code
8 | import config
9 |
10 | agent_rewrite_question = Agent(
11 | AgentConfig(
12 | name="rewrite_question",
13 | role=("""你的工作是,根据要求和已有信息,重写用户的问题,让问题清晰明确,把必要的前述含义加进去。"""),
14 | constraint=(
15 | """- 不改变原意,不要遗漏信息,特别是时间、回答的格式要求,只返回问题。\n"""
16 | """- 如果有历史对话,那么根据历史对话,将原问题中模糊的实体(公司、文件、时间等)替换为具体的表述。\n"""
17 | """- 要注意主语在历史对答中存在继承关系,不能改变了,例如:"问:A的最大股东是谁?答:B。问:有多少股东?"改写后应该是"A有多少股东?"\n"""
18 | """- 如果原问题里存在"假设xxx"这种表述,请一定要保留到重写的问题里,因为它代表了突破某种既定的规则限制,设立了新规则,这是重要信息\n"""
19 | """- 如果原问题里的时间很模糊,那么考虑是否值得是前一个问答里发生的事件的时间\n"""
20 | ),
21 | output_format=("""要求只返回重写后的问题,不要有其他任何多余的输出\n"""),
22 | llm=config.llm_plus,
23 | stream=False,
24 | )
25 | )
26 | agent_extract_company = Agent(
27 | AgentConfig(
28 | llm=config.llm_plus,
29 | name="extract_company",
30 | role="接受用户给的一段文字,提取里面的实体(如公司名、股票代码、拼音缩写等)。",
31 | output_format=(
32 | """```json
33 | ["实体名_1", "实体名_2", ...]
34 | ```
35 | 注意,有可能识别结果为空。"""
36 | ),
37 | post_process=extract_company_code,
38 | enable_history=False,
39 | stream=False,
40 | )
41 | )
42 | agent_extract_company.add_system_prompt_kv(
43 | {
44 | "ENTITY EXAMPLE": (
45 | "居然之家",
46 | "ABCD",
47 | ),
48 | }
49 | )
50 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/assets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 预处理assets,整理出后续要用的数据"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "metadata": {
13 | "ExecuteTime": {
14 | "end_time": "2025-02-17T06:24:12.796941Z",
15 | "start_time": "2025-02-17T06:24:11.726134Z"
16 | }
17 | },
18 | "source": [
19 | "from dotenv import load_dotenv\n",
20 | "load_dotenv()\n",
21 | "import os\n",
22 | "import pandas as pd\n",
23 | "import json\n",
24 | "import llms\n",
25 | "os.environ[\"DEBUG\"] = \"1\"\n",
26 | "os.environ[\"SHOW_LLM_INPUT_MSG\"] = \"1\"\n"
27 | ],
28 | "outputs": [],
29 | "execution_count": 1
30 | },
31 | {
32 | "cell_type": "code",
33 | "metadata": {
34 | "ExecuteTime": {
35 | "end_time": "2025-02-17T06:24:13.054823Z",
36 | "start_time": "2025-02-17T06:24:12.804268Z"
37 | }
38 | },
39 | "source": [
40 | "# Preprocess the competition questions here\n",
41 | "root_dir = os.getcwd()\n",
42 | "question_data_path = root_dir + \"/../../assets/question.json\"\n",
43 | "df1 = pd.read_excel(root_dir + \"/../../assets/data_dictionary.xlsx\", sheet_name=\"库表关系\")\n",
44 | "df2 = pd.read_excel(root_dir + \"/../../assets/data_dictionary.xlsx\", sheet_name=\"表字段信息\")\n",
45 | "file_path = root_dir + \"/../../assets/all_tables_schema.txt\""
46 | ],
47 | "outputs": [],
48 | "execution_count": 2
49 | },
50 | {
51 | "cell_type": "code",
52 | "metadata": {
53 | "ExecuteTime": {
54 | "end_time": "2025-02-17T06:24:13.151625Z",
55 | "start_time": "2025-02-17T06:24:13.137908Z"
56 | }
57 | },
58 | "source": [
59 | "# 假设 df1 的列名为 '库名英文', '库名中文', '表英文', '表中文', '表描述'\n",
60 | "db_table = (\n",
61 | " df1.groupby(\"库名英文\")\n",
62 | " .apply(\n",
63 | " lambda x: {\n",
64 | " \"库名中文\": x[\"库名中文\"].iloc[0],\n",
65 | " \"表\": x[[\"表英文\", \"表中文\", \"表描述\"]]\n",
66 | " .apply(lambda y: y.str.lower() if y.name == \"表英文\" else y)\n",
67 | " .to_dict(\"records\"),\n",
68 | " }\n",
69 | " )\n",
70 | " .to_dict()\n",
71 | ")\n",
72 | "db_table = {k.lower(): v for k, v in db_table.items()}\n",
73 | "\n",
74 | "os.makedirs(root_dir + \"/assets\", exist_ok=True)\n",
75 | "with open(root_dir + \"/assets/db_table.json\", \"w\", encoding=\"utf-8\") as json_file:\n",
76 | " json.dump(db_table, json_file, ensure_ascii=False, indent=2)"
77 | ],
78 | "outputs": [
79 | {
80 | "name": "stderr",
81 | "output_type": "stream",
82 | "text": [
83 | "/var/folders/cz/76xt997d6kl_rwv3_7mx_nm40000gn/T/ipykernel_42959/1447885678.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
84 | " .apply(\n"
85 | ]
86 | }
87 | ],
88 | "execution_count": 3
89 | },
90 | {
91 | "cell_type": "code",
92 | "metadata": {
93 | "ExecuteTime": {
94 | "end_time": "2025-02-17T06:24:13.271121Z",
95 | "start_time": "2025-02-17T06:24:13.203292Z"
96 | }
97 | },
98 | "source": [
99 | "# 假设 df2 的列名为 'table_name', 'column_name', 'column_description', '注释'\n",
100 | "table_column = (\n",
101 | " df2.groupby(\"table_name\")\n",
102 | " .apply(\n",
103 | " lambda x: [\n",
104 | " {\n",
105 | " \"column\": row[\"column_name\"],\n",
106 | " \"desc\": f\"{row['column_description']}\"\n",
107 | " + (f\";{row['注释']}\" if pd.notna(row[\"注释\"]) and row[\"注释\"] else \"\").strip(),\n",
108 | " }\n",
109 | " for _, row in x.iterrows()\n",
110 | " if pd.notna(row[\"column_name\"])\n",
111 | " and row[\"column_name\"] not in [\"JSID\", \"UpdateTime\", \"InsertTime\", \"ID\", \"XGRQ\"]\n",
112 | " ]\n",
113 | " )\n",
114 | " .to_dict()\n",
115 | ")\n",
116 | "table_column = {k.lower(): v for k, v in table_column.items()}\n",
117 | "\n",
118 | "# 打印结果以验证\n",
119 | "with open(root_dir + \"/assets/table_column.json\", \"w\", encoding=\"utf-8\") as json_file:\n",
120 | " json.dump(table_column, json_file, ensure_ascii=False, indent=2)"
121 | ],
122 | "outputs": [
123 | {
124 | "name": "stderr",
125 | "output_type": "stream",
126 | "text": [
127 | "/var/folders/cz/76xt997d6kl_rwv3_7mx_nm40000gn/T/ipykernel_42959/3674784635.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
128 | " .apply(\n"
129 | ]
130 | }
131 | ],
132 | "execution_count": 4
133 | },
134 | {
135 | "cell_type": "code",
136 | "metadata": {
137 | "ExecuteTime": {
138 | "end_time": "2025-02-17T06:28:46.858688Z",
139 | "start_time": "2025-02-17T06:24:13.277618Z"
140 | }
141 | },
142 | "source": [
143 | "for db_name, db in db_table.items():\n",
144 | " for table in db[\"表\"]:\n",
145 | " table_name = table[\"表英文\"]\n",
146 | " print(f\"{db_name}.{table_name}\")\n",
147 | " column_list = []\n",
148 | " for column in table_column[table_name]:\n",
149 | " column_name = column[\"column\"]\n",
150 | " column_simple_desc = str(column[\"desc\"]).split(\";\")[0]\n",
151 | " column_list.append(f\"{column_simple_desc}({column_name})\")\n",
152 | " all_cols = \",\".join(column_list)\n",
153 | " table[\"all_cols\"] = all_cols\n",
154 | " cols_summary, token, ok = llms.llm_glm_4_plus.generate_response(\n",
155 | " system='''你善于对数据表的字段信息进行总结,把同类信息归类,比如\"联系人电话、联系人传真\"等总结为\"联系方式如电话、传真等。\n",
156 | " 输出一段文字,不换行。\"''',\n",
157 | " messages=[\n",
158 | " {\n",
159 | " \"role\": \"user\",\n",
160 | " \"content\": f\"下面是一个数据表的所有表字段,请帮我为这个数据表写一段介绍,把字段信息压缩进去:\\n{all_cols}\",\n",
161 | " }\n",
162 | " ],\n",
163 | " stream=False,\n",
164 | " )\n",
165 | " table[\"cols_summary\"] = cols_summary\n",
166 | " if not ok:\n",
167 | " print(f\"err: {db_name}.{table_name}\")"
168 | ],
169 | "outputs": [
170 | {
171 | "name": "stdout",
172 | "output_type": "stream",
173 | "text": [
174 | "astockbasicinfodb.lc_stockarchives\n",
175 | "该数据表涵盖公司基本信息、联系方式、注册及办公地址、证券信息、高管及顾问详情、经营范围及特殊标识等。包括公司代码、国别、董事会秘书及其联系方式、证券事务代表及其联系方式、公司及董秘授权代表的电话、传真、邮箱,注册及办公地址及邮编,公司联系地址及邮编,公司网址、信息披露渠道,成立日期、注册地、法人代表、总经理、法律顾问、会计师事务所,证监会行业分类,主营兼营范围,A股、B股、H股及CDR证券简称及代码,公司简介、中文名称、营业执照号,地区代码,以及尚未盈利、特殊表决权、协议控制架构、红筹企业等特殊标识和所属区县信息。\n",
176 | "astockbasicinfodb.lc_namechange\n",
177 | "该数据表涵盖公司基本信息及股东大会决议情况,包括公司代码、信息发布日期及来源、股东大会决议公告日期、决议是否通过、公司全称及简称的更改日期,以及中英文全称和缩写。\n",
178 | "astockbasicinfodb.lc_business\n",
179 | "该数据表涵盖公司基本信息及业务范围,包括公司代码、信息发布日期及来源、股东大会决议公告日期、否决情况、主营及兼营范围、主要业务与产品名称、行业相关代码及类别、涉足行业以及简称变更原因等关键信息。\n",
180 | "astockeventsdb.lc_warrant\n",
181 | "该数据表涵盖公司事件相关信息,包括公司代码、信息发布日期及来源、公告类型与披露方式、事件内容描述及进展状态、事件主体及进程、行为方式、货币单位、主体与对象名称及编号、关联关系、协议签署及终止情况、担保原因及金额、担保企业及资产、反担保信息、借贷银行及期限、担保期限及解除详情、违规与逾期情况、事项编码等。\n",
182 | "astockeventsdb.lc_credit\n",
183 | "该数据表详细记录了公司借贷及相关事件的全面信息,包括公司代码、信息发布日期及来源、公告类型与披露方式、事件内容与行为描述、最新进展及主体信息、事件进程与行为方式、货币单位、主体与对象名称及编号、关联关系、协议签署及终止情况、借贷条件、借入借出方及担保方信息、抵押置押资产、借贷金额与还款情况、年利率及期限、借贷与担保起止日期等关键财务与事件数据。\n",
184 | "astockeventsdb.lc_suitarbitration\n",
185 | "该数据表涵盖公司事件相关信息,包括基础标识(ID、公司代码)、信息发布时间(首次及最新发布日期)、信息来源与公告类型、披露方式及事件详情(内容、行为描述、最新进展、进程、方式)。涉及主体信息(名称、编号、关联关系)、交易对象(名称、编号、关联关系)、协议情况(签署日期、终止状态)。同时记录诉讼仲裁相关数据,如金额(首次及最新)、各方信息(原告、被告、连带责任人、其他方及其关联关系)、案由描述及状态(审理机构、仲裁及各级诉讼状态)、财产执行情况(被执行财产、归属及判决执行状态)。备注字段补充额外信息。\n",
186 | "astockeventsdb.lc_entrustinv\n",
187 | "该数据表涵盖公司代码、信息发布相关日期(首次及最新发布日期)、信息来源及公告类型,披露方式及事件详情(包括事件内容、行为描述、最新进展及进程),涉及事件主体和交易对象的基本信息(名称、企业编号及与上市公司关联关系),协议签署情况及状态(是否终止),委托融资相关数据(金额、期限、起始及截止日期、约定及实际收益),并提供备注栏供额外信息记录。\n",
188 | "astockeventsdb.lc_regroup\n",
189 | "该数据表涵盖公司事件相关信息,包括公司代码、事件发布日期(首次及更新)、信息来源与公告类型、披露方式及事件内容描述。涉及事件主体(名称、编号、与上市公司关系)、交易对象(编号、与上市公司关系)及行为方式与进程。财务数据方面,记录资产帐面与评估价值、出售与转让收益、置出置入资产详情及债务重组金额。其他字段涵盖协议签署日期、终止状态、备注、事项类型与编码、序号等。\n",
190 | "astockeventsdb.lc_majorcontract\n",
191 | "该数据表详细记录了公司事件相关信息,包括公司代码、信息发布日期(首次及最新)、来源及公告类型;事件内容涵盖行为描述、最新进展状态、主体及进程、行为方式;涉及财务信息如货币单位、金额;主体及交易对象信息包括名称、企业编号、与上市公司关联关系;合同相关数据包括签署日期、标的、获得方式、中标日期、起始及截止日、期限、对公司影响;并提供备注说明。\n",
192 | "astockeventsdb.lc_investorra\n",
193 | "本数据表记录了证券相关信息发布与接待活动的详细情况,包括信息发布日期、证券内部编码及编号等基本信息;接待活动的具体日期、时间、序号及类别;参与单位及人员、地点、上市公司接待人员等参与信息;活动主要内容;相关附件及其格式;信息标题、链接地址;以及接待日期的截止日等。\n",
194 | "astockeventsdb.lc_investordetail\n",
195 | "该数据表记录投资者关系活动相关信息,包括活动唯一标识(RID)、参与调研的机构名称及编码(Participant、ParticipantID)、调研人员姓名及编码(PersonalName、PersonalID)以及人员职位名称(PostName)和活动序号(SerialNumber)。\n",
196 | "astockfinancedb.lc_ashareseasonednewissue\n",
197 | "该数据表涵盖证券发行全流程信息,包括内部编码、各阶段发布日期(如首次信息、预案、股东大会决议、招股公告等)、预案有效期、发行细节(股票类型、定价方式、发行量、发行对象等)、价格区间及上下限、承销商指导价、发行量上下限、超额配售权、发行及承销日期、A股除权情况、股权登记及除权日、停牌时间、老股东及各类投资者配售信息(日期、比例、代码等)、网上网下发行详情(申购代码、上限、单位等)、各类费用(承销、会计师、评估、律师等)、募集资金及到账情况、上市日期及流通股数、回拨及配售股数、盈利预测、承销方式、方案变动、市盈率、股东认购情况、国资委及证监会批准日期、事件进程、发行对象类型、最新发行价及量调整、发行目的、定价基准、增发类别、有效性、认购方式、控股股东认购详情、预计募集资金、费用合计、面值单位、折扣率、认购邀请日、价格比例、追加认购日期、验资日、最新预案公布日及简易程序标识。\n",
198 | "astockfinancedb.lc_ashareplacement\n",
199 | "该数据表详细记录了配股相关信息,包括证券内部编码、首次信息发布日期、配股年度、发行股票类型、预案及决议公告日期、定价方式及说明、预案有效期、计划及实际配股比例和数量、配股价格上下限、决案及说明书刊登日期、配股简称和代码、股本基数、面值、每股配股价格、转配比及费用、零股处理、募集资金及费用明细、股权登记及除权日、缴费时间、资金到账情况、上市日、大股东认配说明、方案变更情况、承销方式及余股包销、公众股东认配情况、计划及最新配股比例和数量、承销保荐及审计评估费用、各级审批公告日、发行方式、事件进程、最新公告日期、配股说明及结果公告日、配股对象类别及基准日、大股东持股及认配情况、配股后预计每股净资产和收益等。\n",
200 | "astockfinancedb.lc_dividend\n",
201 | "该数据表详细记录了证券分红的各项信息,包括证券内部编码、关键日期(如预案发布、股东大会决议公告、股权登记、除权除息、送转股上市、红利到账、最后交易、红利发放起止、分红实施公告等)、分红情况(是否分红、每股收益、送转股比例、派现金额及币种、分红股本基数、送转后总股本等)、分红对象及金额明细(公司合计及A股、B股派现金额)、方案变动情况(是否变更、变动说明及原因、变更前分红情况等)、利润分配情况(分配次数、不分配说明、分配形式及上下限等)、事件进程及描述、除权除息参考价、分红意向公布日、分红派息股本基准日、议案编号及最新信息发布日期等。\n",
202 | "astockfinancedb.lc_capitalinvest\n",
203 | "该数据表涵盖公司募资及收购兼并相关信息,包括公司代码、信息发布日期及来源、募资方式及投向(项目名称、内容、金额、行业、领域)、实际投入情况(金额、股票数量、截至日期)、项目进展及收益、改投详情(是否改投、新项目、金额)、证券内部编码、收购兼并类型、收购资产价值及价格、权益比例、出让方信息(名称、关联关系、相关股票、企业性质)及收购标的信息(名称、企业性质)。\n",
204 | "astockfinancedb.lc_balancesheetall\n",
205 | "该数据表涵盖公司财务报表详细信息,包括信息发布日期、来源及公告类别;公司代码及会计相关日期;调整与合并状态;会计准则及企业类型;流动资产(如现金、存款、应收款项等)、非流动资产(如长期投资、固定资产、无形资产等)及其特殊与调整项目;负债(如短期借款、应付账款等)、所有者权益(如资本公积、盈余公积等)及其特殊与调整项目;其他财务项目(如投资性房地产、衍生金融资产等)及特殊字段说明。此外,还涉及资产、负债及权益的合计与细分项,如优先股、永续债等,以及各类应收、应付款项及金融投资细节。\n",
206 | "astockfinancedb.lc_incomestatementall\n",
207 | "该数据表涵盖公司财务信息,包括发布日期、来源及公告类别等基本信息;公司代码及会计准则等企业属性;营业收入、成本、利润等经营数据,细分至利息、手续费、保费等收入支出项;非经营性收益、投资收益等财务指标;以及净利润、每股收益等盈利能力数据。此外,还涉及特殊项目调整、综合收益、权益工具投资等信息,全面反映公司财务状况及经营成果。\n",
208 | "astockfinancedb.lc_cashflowstatementall\n",
209 | "该数据表涵盖公司财务及现金流量详细信息,包括信息发布日期、来源及公告类别,公司代码及财务日期,调整与合并标志,会计准则及企业类型。涉及经营活动的现金流入如销售商品、税费返还等,流出如购买商品、支付税费等,并细分各现金流量调整项目。投资活动现金流入包括收回投资、处置资产等,流出如购建资产、投资支付等。筹资活动现金流入涵盖吸收投资、发行债券等,流出如偿还债务、分配股利等。此外,还包括汇率变动影响、净利润及各类财务调整项目,如资产减值、折旧摊销等,以及特殊字段说明和完整标志,全面反映公司财务状况及现金流量变动。\n",
210 | "astockfinancedb.lc_intassetsdetail\n",
211 | "该数据表涵盖公司研发投入及相关人员信息,包括内部编码、公司代码、信息发布与截止日期、来源编码等基础信息;是否合并及调整状态;费用化与资本化研发投入及其合计金额;研发投入占营业收入比例及资本化投入占比;研发人员及核心技术人员数量与占比;核心技术营业收入及其占比等关键财务与人力资源指标。\n",
212 | "astockfinancedb.lc_mainoperincome\n",
213 | "该数据表涵盖公司经营与财务信息,包括公司代码、日期及类型、信息来源及编码、合并与调整标志、分类与层级、显示序号、经营项目及所属行业、地区与业务性质、主营业务收入/成本/利润及其同比与上年同期数据、毛利率及其同比与增减情况、信息发布日期、货币单位、上级科目名称等关键字段。\n",
214 | "astockfinancedb.lc_operatingstatus\n",
215 | "该数据表涵盖公司代码、统计截止日期、信息来源、经营述评及信息发布日期等关键信息,用于综合反映公司经营状况及相关信息披露情况。\n",
216 | "astockfinancedb.lc_auditopinion\n",
217 | "该数据表涵盖公司审计相关信息,包括公司代码、审计截止日期、信息来源、会计师事务所及所属机构、注册会计师、审计意见类型及全文、信息发布日期和审计报告类型等关键字段。\n",
218 | "astockindustrydb.lc_exgindustry\n",
219 | "该数据表涵盖公司基本信息及行业分类,包括公司代码、信息发布日期及来源,行业划分标准及多级(一至四级)行业代码与名称,并记录是否执行及取消日期等执行状态。\n",
220 | "astockindustrydb.lc_exgindchange\n",
221 | "该数据表涵盖公司基本信息及行业分类,包括公司代码、信息发布日期及来源,行业划分标准及多级(一至四级)行业代码与名称,并记录信息执行状态及取消日期。\n",
222 | "astockindustrydb.lc_industryvaluation\n",
223 | "该数据表涵盖行业内部编码、交易日、统计类型及板块、行业分类标准与代码,包含总市值、A股流通及自由流通市值,涉及滚动与静态市盈率、市净率、股息率、市现率及市销率等多维度财务指标,并附行业名称、分类及上市证券数量等详细信息。\n",
224 | "astockindustrydb.lc_indfinindicators\n",
225 | "该数据表涵盖行业基本信息、统计数据及财务指标,包括行业编码、名称、级别、代码及划分标准,统计类型与板块,信息发布及截止日期,数据标志,上市证券数量,营业收入、成本、利润、净利润及其TTM值,净资产、总资产、总股本,每股收益及多种净资产收益率、总资产净利率、销售毛利率与净利率,财务及销售费用比率,营业成本占比,投入资本回报率,流动与速动比率,利息保障倍数,净利润与毛利润增长率,存货与应收账款周转率及天数,总资产周转率,经营现金流占比,资产负债率,有形资产净值占比,对外投资占比,预收及应收账款占比等。\n",
226 | "astockindustrydb.lc_coconcept\n",
227 | "该数据表涵盖证券内部编码、概念代码、调入调出日期、状态、备注及信息发布日期等关键信息,用于记录证券概念归属及变动情况。\n",
228 | "astockindustrydb.lc_conceptlist\n",
229 | "该数据表涵盖概念分类信息,包括各级概念代码及名称(如1级概念代码及名称、2级概念代码及名称)、概念板块编码及名称、起止日期、状态、备注,以及信息发布日期和概念英文名称。\n",
230 | "astockmarketquotesdb.cs_stockcapflowindex\n",
231 | "该数据表详细记录了证券交易的多维度数据,包括证券内部编码、交易日期及时间区间,涵盖小单、中单、大单、超大单及全单的流入、流出、净流入额(元)和量(股),以及相应的笔数、买入率、卖出率、净买入率等指标,并进一步细分主动流入、流出情况及其比率,同时提供与流通市值和股本比的关联分析,全面反映市场交易动态和资金流向。\n",
232 | "astockmarketquotesdb.cs_turnovervoltecindex\n",
233 | "该数据表涵盖证券市场交易数据,包括证券内部编码(InnerCode)、聚源代码(GilCode)、交易日期(TradingDay)及指标周期(IndexCycle)。涉及证券市场(SecuMarket)的成交额和成交量多周期简单移动平均(AMA5/10/20/30/60/120/250日, VMA5/10/20/30/60/120/250日)及成交量指数平滑异同平均(VMACD_EMA12/26, VMACD_DIFF/DEA/MACD)。还包括量比(VolumeRatio)、VOSC成交量震荡指标(VOSC)、TAPI加权指数成交量(TAPI_TAPI/TAPIMA)、成交量标准差(VSTD)、量变动速率(VROC)、成交量比率(VR)及量相对强弱(VRSI)等指标。\n",
234 | "astockmarketquotesdb.cs_stockpatterns\n",
235 | "该数据表涵盖证券市场交易信息,包括证券内部编码、交易日、聚源代码及市场分类;记录是否创近期(周、月、季、半年、年及上市以来)新高/新低的价格、收盘价、成交量及成交金额;统计各时段创历史新高/新低的次数;追踪连涨/跌天数及连续放量/缩量天数;分析均线突破情况(5日、10日、20日、60日);并提供N天M板及均线多空头排列看涨看跌等市场趋势指标。\n",
236 | "astockmarketquotesdb.qt_dailyquote\n",
237 | "该数据表记录了证券交易的相关信息,包括证券内部编码、交易日、各类价格(如昨收盘、今开盘、最高价、最低价、收盘价,单位均为元)、成交量(单位为股)、成交金额(单位为元)及成交笔数(单位为笔)。\n",
238 | "astockmarketquotesdb.qt_stockperformance\n",
239 | "该数据表涵盖证券交易核心数据,包括证券内部编码、交易日、价格信息(如昨收盘、今开盘、最高价、最低价、收盘价)、成交量与成交金额、涨跌幅、振幅、换手率、均价及是否停牌状态。此外,提供周、月、季度、半年、年度及今年以来等多时间维度的成交与价格统计,涵盖成交金额、成交量、涨跌幅、振幅、换手率、均价、最高/最低价及收盘价极值等。还涉及市值数据(总市值、流通市值)、基于自由流通股本的换手率统计、Beta与阿尔法值、波动率、夏普比率及市场收益率等风险与收益指标,以及上市以来复权价格极值及其对应时间。全面反映证券交易动态及市场表现。\n",
240 | "astockmarketquotesdb.lc_suspendresumption\n",
241 | "该数据表记录证券停复牌相关信息,包括证券内部编码、信息发布日期及来源、停牌日期时间及原因、停牌事项说明、停牌期限及类型,以及复牌日期时间及事项说明。\n",
242 | "astockoperationsdb.lc_suppcustdetail\n",
243 | "该数据表记录了企业关联交易信息,包括发布日期、公司代码、信息来源及编码、截止日期、关系类型、序号、关联企业名称及代码、供应商/客户属性、交易标的名称及代码、交易金额及占比、备注等关键信息。\n",
244 | "astockoperationsdb.lc_staff\n",
245 | "该数据表涵盖公司代码、统计截止日期、信息来源及合并状态,涉及员工分类方式、类别名称及代码,记录员工年龄范围、数量及占比,并附备注说明与信息发布日期。\n",
246 | "astockoperationsdb.lc_rewardstat\n",
247 | "该数据表涵盖公司薪酬信息,包括公司代码、信息发布日期及来源、统计截止日期;涉及报酬总额及管理层人数、前三名董事及高管报酬;特别记录独立董事津贴详情及总额;并提供备注说明以供参考。\n",
248 | "astockshareholderdb.lc_shtypeclassifi\n",
249 | "该数据表涵盖股东基本信息及分类情况,包括股东代码(SHID、SHCode)、名称(SHName)、所属性质(SHAttribute),以及基于特定标准(Standard)的多级分类代码(一级至四级:FirstLvCode、SecondLvCode、ThirdLvCode、FourthLvCode)。\n",
250 | "astockshareholderdb.lc_mainshlistnew\n",
251 | "该数据表详细记录了公司股东信息及相关股权情况,包括公司代码、日期、信息发布及来源、信息类别、股东排名及序号、股东名称及性质、股东类别、所属券商、证券代码及简称、持股数量及比例、有限售及无限售股数、持股变动及幅度、持有A股、B股、H股及其他股数量及比例、股本性质、股权质押及冻结情况、股东关联关系及说明、一致行动人说明、备注、股东ID及属性、各类普通股持股情况、持股变动类型、有投票权优先股及表决权情况等。\n",
252 | "astockshareholderdb.lc_shnumber\n",
253 | "该数据表涵盖公司股东信息,包括公司代码、信息发布日期及来源、统计截止日期;股东户数及持股情况,涉及总股东、A股、B股、H股、职工股及CDR股东的具体户数、户均持股数及其比例;各类股东户均持股数的季度、半年增长率及其比例变化;特别区分无限售A股和CDR的相应数据;此外还标记了近似数值描述字段。\n",
254 | "astockshareholderdb.lc_mshareholder\n",
255 | "本数据表涵盖公司股权结构相关信息,包括公司代码、信息发布日期及来源,股东名称、持股比例、地位及性质,股权获取方式,法人代表及注册资本,主要业务及经济性质,背景介绍,存在状态,公告类别,国籍及永久境外居留权情况,实际控制人结构图,报告文件格式,截止日期,货币单位及股东ID等关键数据。\n",
256 | "astockshareholderdb.lc_actualcontroller\n",
257 | "本数据表涵盖公司基本信息及实际控制人相关数据,包括公司代码、信息发布与截止日期、实际控制人代码及姓名、经济性质、国籍代码及描述、永久境外居留权情况以及实际控制人所属性质等关键字段。\n",
258 | "astockshareholderdb.lc_sharestru\n",
259 | "该数据表详细记录了公司股权结构及相关信息,包括公司代码、信息来源、日期等基本信息;涵盖各类股份如未流通A股、发起人股、国家股、境内/外资法人股等,及其细分如募集法人股、自然人法人股、职工股等;涉及流通股本、已上市流通A股、高管股、战略投资者配售持股等流通股份详情;还包括B股、H股、S股、N股等不同类型股份,以及总股本、股本变动原因等变动信息;特别标注了国有法人股、有限售流通股等特定股份情况;并记录了信息发布日期、每股面值、外资持股等财务数据;最后涵盖了一些特殊股份如GDR代表基础股票、其他未流通股等。整体上,该表全面反映了公司股权分布及变动情况。\n",
260 | "astockshareholderdb.lc_stockholdingst\n",
261 | "该数据表详细记录了各类机构对特定证券的持股情况,涵盖证券内部编码、公司代码、统计日期及信息来源等基本信息,以及机构持有无限售流通A股、公募基金、券商、券商理财产品、QFII、保险公司、社保基金、企业年金、信托公司、财务公司及其他机构的持股数量和比例,同时包括各类机构持有A股和总股本的数量及比例,前十大股东及无限售股东的持股情况,以及机构、基金、券商等持股户数统计,还包括私募基金、银行、外资机构的持股数量、比例及户数信息。\n",
262 | "astockshareholderdb.lc_sharetransfer\n",
263 | "该数据表详细记录了公司股权转让的相关信息,包括公司代码、信息发布日期及来源、协议签署及批准日期、股权变动日期等时间节点;涉及股权出让方与受让方的名称、经济性质、持股数量及比例变化等详细信息;股权转让方式、涉及股数、交易价格及金额等交易细节;以及生效条件、事项描述、是否终止实施等进展情况。此外,还包括股东序号、证券内部编码、限售股情况、首次信息发布日期、各方属性编码等辅助信息。\n",
264 | "astockshareholderdb.lc_sharefp\n",
265 | "该数据表记录公司股权冻结质押相关信息,包括公司代码、信息发布及起始结束日期、来源与类别;涉及股东名称、ID及属性,质押方名称、ID及属性;具体股数、占比及原因;区分限售与无限售股数,并提供事项编码及备注说明。\n",
266 | "astockshareholderdb.lc_sharefpsta\n",
267 | "该数据表记录股权冻结质押相关信息,包括冻结质押编号(FPCode)、截止日期(EndDate)、信息来源(InfoSource)、类别选择(Category)、公司代码(CompanyCode)、股权被冻结质押股东名称(FPSHName)、累计冻结质押股数(股)(AccuFPShares)、累计占冻结质押方持股数比例(AccuPCTOfPled)、累计占总股本比例(AccuProportion及计算值AccuProportionCalc)、股权被冻结质押股东所属性质(SHAttribute)、股东ID(SHID)及无描述字段(AccuFPSharesCalc)。\n",
268 | "astockshareholderdb.lc_buyback\n",
269 | "该数据表详细记录了公司股份回购的相关信息,包括公司代码、首次信息发布日期、信息来源及股份类别等基本信息;预案发布、股东大会通过、回购并注销股份公告书发布、回购协议签署等关键日期;股份被回购方、回购股数及占总股本比例等回购详情;回购定价方式、价格及总金额等财务数据;回购起始与结束日期、支付方式及资金划出日等时间节点;全称更改、工商变更登记等公司变动信息;回购数量与价格上下限、拟回购资金总额上下限等计划范围;待偿期限、事件进程及其描述等进展情况;股份回购方式代码与描述、资金来源与回购目的说明等补充说明。部分字段如BuybackPurpose、CurrencyUnit、OverruledDate暂无描述。\n",
270 | "astockshareholderdb.lc_buybackattach\n",
271 | "该数据表记录股份回购相关信息,包括回购ID、发布及生效日期、货币单位、回购股数及占比、累计回购数量及占比、价格区间、本次及累计回购资金等关键数据。\n",
272 | "astockshareholderdb.lc_legaldistribution\n",
273 | "该数据表详细记录了证券配售相关信息,包括证券内部编码、发布日期及来源、配售股数及原因、获配企业及其性质、所属券商信息、证券代码及账户、持股时间及配售性质、流通日期、发行价及申购退款情况、首次发布日期、补款及锁定期配售股数、投资者信息(名称、类型、编号、分类)、战略配售参与详情(高管员工、保荐机构、其他计划的数量、金额及占比)、配售对象及获配金额等标准名称及类型。\n",
274 | "astockshareholderdb.lc_nationalstockholdst\n",
275 | "该数据表记录了公司股东持股情况,包括内部编码、公司代码、截止日期等基本信息,以及股东ID、名称、持有A股总数及分类(有限售和无限售A股数)、占总股本和流通A股比例、持股数量变动及变动幅度等详细持股数据。\n",
276 | "astockshareholderdb.cs_foreignholdingst\n",
277 | "该数据表涵盖证券交易相关信息,包括证券内部编码、交易日期、外资持股总数(单位:万股)及外资持股比例(百分比)。\n",
278 | "astockshareholderdb.lc_esop\n",
279 | "该数据表涵盖公司股权激励计划的详细信息,包括内部编码、公司代码及关键日期(如首次信息发布、董事会和股东大会公告日)。字段涉及事件进程、序号及实施细节(是否分期、数据统计区间、首次实施公告日)。股票相关信息包括来源方式、规模上下限(万股)、资金总额上下限(万元)。期限数据含麦考利久期、锁定期及释放期(月)。资金和管理方面涵盖资金来源、参与单位及人员、管理模式及机构、资产管理计划全称及杠杆比例。参与情况详细记录购买股票价格、总参与人数、高管及员工参与人数、认购份额及比例。最后,记录持股完成日、锁定起始日及计划情况说明。\n",
280 | "astockshareholderdb.lc_esopsummary\n",
281 | "该数据表涵盖证券内部编码、公司代码及关键公告日期(包括首次信息发布、董事会和股东大会公告),记录事件进程与序号,并详细说明是否分期实施、股票及资金规模上下限(单位分别为万股和万元),另附备注说明和分期情况,全面反映相关业务细节。\n",
282 | "astockshareholderdb.lc_transferplan\n",
283 | "该数据表记录公司承诺事项相关信息,包括公司代码、信息发布日期及来源、承诺主体及事项类型、有效性及事件进程;涉及股东信息如序号、名称;详细描述增减持计划类别、时间、期限、价格及规模,涵盖具体股份数量、比例及投入资金上下限;同时涵盖不减持承诺期限及交易方式相关描述。\n",
284 | "astockshareholderdb.lc_smattendinfo\n",
285 | "该数据表详细记录了公司股东大会的相关信息,包括公司代码、信息发布日期(首次及最新)、会议日期及时间、股权登记日、会议登记起止日、公告及取消日期、召开地址及有效性。涵盖会议类型、投票方式、年度届次、网络投票通道及代码、投票简称及起止日。涉及主持人及职务、见证律所及律师信息。详细统计了股东出席类别、人数(包括A股、H股、其他及A/B类普通股)、代表股份及占比(总体、A股、H股、其他及A/B类普通股、中小股东、优先股),全面反映股东大会的参与情况和股权分布。\n",
286 | "constantdb.secumain\n",
287 | "该数据表涵盖证券基本信息,包括内部编码、公司及证券代码、中英文名称及其缩写、证券简称及拼音、扩位简称及拼音、市场及类别、上市日期、板块、状态及ISIN代码。\n",
288 | "constantdb.hk_secumain\n",
289 | "该数据表涵盖证券基本信息,包括证券内部编码、公司代码、证券代码等唯一标识;中英文名称及其缩写、证券简称及拼音简称等名称信息;所属证券市场、类别、上市日期、板块、状态及曾用名等上市情况;退市日期、买卖单位、交易货币类别及ISIN代码等交易相关数据。\n",
290 | "constantdb.ct_systemconst\n",
291 | "该数据表包含常量分类信息及其相关数值,具体包括常量分类的编码(LB)与名称(LBMC)、描述(MS)、代码(DM),以及对应的浮点值(FVALUE)、整型值(IVALUE)、日期值(DVALUE)和字符值(CVALUE),各字段重复记录以确保数据完整性。\n",
292 | "constantdb.qt_tradingdaynew\n",
293 | "该数据表记录了证券市场的交易日信息,包括日期、是否为交易日、所属证券市场,以及是否为周、月、季、年的最后交易日等关键时间节点标识。\n",
294 | "constantdb.lc_areacode\n",
295 | "该数据表涵盖地区编码信息,包括地区内部编码、行政编码及一、二级区划代码;地区名称信息,含中英文名称及其缩写;层级关系信息,涉及父节点代码及名称;状态信息,如有效性标识、取消日期;以及变更记录与备注信息。\n",
296 | "constantdb.us_secumain\n",
297 | "该数据表涵盖证券基本信息,包括证券内部编码、代码及简称(含拼音)、类别、市场及上市板块,记录上市日期、状态及ISIN代码,关联公司代码,并标注退市日期及公司中英文名称。\n",
298 | "creditdb.lc_violatiparty\n",
299 | "该数据表记录了违规事件的详细信息,包括事件标识(RID)、事件代码(EventCode)、当事人信息(名称(PartyName)、性质(PartyType)、编码(PartyCode))、时间范围(起始日期(BeginDate)、结束日期(EndDate))、违规详情(条款(ViolationClause)、说明(ViolationStatement))、处罚情况(机构(PenalOrg)、类型(PenalType、PenalTypeNew)、金额(AmountInvolved)、计价货币(CurrencyCode)、说明(PenalStatement))、处罚机构编码(PenalOrgCode)、关联上市公司(RelataCompany)及公告标识(AnnID)。\n",
300 | "hkstockdb.hk_employeechange\n",
301 | "该数据表记录证券相关信息,包括证券内部代码、信息发布日期及来源,涉及股东大会公告日期及有效性状态,涵盖生效与失效日期,并对比变更前后员工数量。\n",
302 | "hkstockdb.hk_stockarchives\n",
303 | "该公司数据表涵盖基本信息、联系方式、行业归属及财务概况:包括公司代码、成立日期、注册地、业务范围、港交所及恒生行业分类、高管信息(主席、秘书、会计师)、办公地址(注册及总办)、股份过户处、联系方式(电话、传真、邮箱、网址)、简介、企业类别及描述、中文名称、审计机构、注册资本及货币单位。\n",
304 | "hkstockdb.cs_hkstockperformance\n",
305 | "该数据表涵盖证券交易核心数据,包括证券内部编码、交易日、价格信息(如昨收盘、今开盘、最高价、最低价、收盘价)、货币代码、成交量与金额、涨跌及涨跌幅、振幅、换手率、均价、总市值与流通市值等。此外,还提供近一周、本周以来、近一月、本月以来、近三个月、近六个月、近一年以及今年以来等多时间维度的同类统计数据,如成交金额、成交量、涨跌、涨跌幅、振幅、换手率、均价、最高价、最低价、收盘价、日均成交金额、日均换手率、日均成交量、日均涨跌幅、日均振幅、日均总市值、日均流通市值等。另包含上市以来复权最高价及日期、复权最低价及日期等历史数据。\n",
306 | "indexdb.lc_indexbasicinfo\n",
307 | "该数据表涵盖指数详细信息,包括编码(IndexCode)、行业规范(IndustryStandard、IndustryType)、发布与编制机构(PubOrgName、CreatIndexOrgName)、发布与基日(PubDate、BaseDate)、基点及加权(BasePoint、WAMethod)、指数分类(IndexType、PubIndexType、IndexSeries、IndexPriceType、IndexDesignType)、主指数关联(Relationship、RelaMainIndexCode、RelaMainCode)、成份证券属性(ComponentType、SecuMarket、ComponentSum、ComponentAdPeriod)、币种(CurrencyCode)及摘要介绍(IndexAbstract、IndexRemark),并记录停用日期(EndDate)。\n",
308 | "indexdb.lc_indexcomponent\n",
309 | "该数据表涵盖指数成分股信息,包括指数及证券内部编码(IndexInnerCode、SecuInnerCode)、成分股调入调出日期(InDate、OutDate)、成分标志(Flag)及所属市场代码(SecuMarket)。\n",
310 | "institutiondb.lc_instiarchive\n",
311 | "该数据表涵盖公司基本信息、联系方式、管理层信息、注册及经营情况等。包括公司代码、所属公司、上市公司代码、基金管理人及托管人名称、中英文全称及简称、注册资本及货币单位、成立日期、经济及企业性质、类别、注册及办公地址、邮编、城市、邮箱、网址、法人代表、总经理等负责人、联系人及电话传真、公司简介、主营范围、所属行业、存续期限及原因、存在状态、评级机构代码、公司属性、统一社会信用代码、注册地及登记机构状态等详尽资料。\n",
312 | "institutiondb.ps_eventstru\n",
313 | "该数据表涵盖事件基本信息,包括事件名称、唯一标识的事件代码及其父级事件代码,反映事件重要性的事件级别,以及标识事件当前状态的有效性。\n",
314 | "institutiondb.ps_newssecurity\n",
315 | "该数据表记录了证券相关事件信息,包括唯一标识(RID)、证券内部编码(InnerCode)、公司代码(CompanyCode)、事件类别(EventType)、事件名称(EventName)、发生时间(EventDate)、情感方向(EmotionDirection)及情感重要度(EmotionImportance)。\n",
316 | "publicfunddb.mf_fundarchives\n",
317 | "该数据表涵盖基金基本信息、编码体系、投资特性、运作方式、收益分配、存续期限、基金经理及机构、申赎规则等多个维度。包括基金内部编码、转型编码、申购代码、证券代码、运作方式、性质、投资风格、类别、方向、目标、范围、业绩基准、风险收益特征、收益分配原则、发售方式、设立规模及日期、存续期、清算日期、保本及货币基金特有信息、基金经理及管理人、托管人、注册机构、申赎金额及份额限制、巨额赎回认定、确认日及到账天数、转托管市场、是否发起式、养老目标或FOF基金等详尽字段。\n",
318 | "publicfunddb.mf_fundprodname\n",
319 | "该数据表涵盖基金相关信息,包括基金内部编码、信息发布日期及来源、类别、披露名称、生效与失效日期、有效性状态、备注,以及拼音证券简称和基金转型统一编码等核心字段。\n",
320 | "publicfunddb.mf_investadvisoroutline\n",
321 | "该数据表涵盖基金公司基本信息,包括公司代码、名称及简称、法人代表与总经理等高管信息、成立日期及组织形式、注册资本、注册及办公地址、邮编、电子邮箱、联系方式(电话、传真、网址、联系地址、联系人及客服热线)、联系人简历、所属地区、存续截止日及注册登记与证监会标识码等。\n",
322 | "publicfunddb.mf_dividend\n",
323 | "该数据表涵盖基金分红信息,包括基金编码(转型统一编码、内部编码)、信息发布(日期、来源)、分红实施(公告日、截止日期、基准日)、收益分配(单位基金收益、未分配收益、是否分红、派现比例及金额)、权益登记与除息(登记日、除息日、场内外发放日)、红利再投资(再投资日、份额到账及可赎回日)、基准日分配利润及金额、方案变更、事件进程(代码及描述)、发放范围、年度累计分红(单位分红、总额、次数)及历史累计分红(总额、次数)。\n",
324 | "usstockdb.us_companyinfo\n",
325 | "该数据表涵盖公司基本信息,包括公司代码、中英文名称及其缩写、地址(含城市、省份、邮编、国家)、联系方式(电话、传真)、公司简介(中英文)、成立日期及精度、公司类型、注册地(国家、省份/州)、总部标识及链接地址等字段。\n",
326 | "usstockdb.us_dailyquote\n",
327 | "本数据表涵盖证券交易核心信息,包括交易日期、证券代码等基础标识;开盘、最高、最低、收盘价及涨跌情况等价格数据;成交量、成交额等交易规模指标;每股收益、总市值、总股本等财务指标,全面反映证券市场动态。\n"
328 | ]
329 | }
330 | ],
331 | "execution_count": 5
332 | },
333 | {
334 | "cell_type": "code",
335 | "metadata": {
336 | "ExecuteTime": {
337 | "end_time": "2025-02-17T06:28:46.906031Z",
338 | "start_time": "2025-02-17T06:28:46.898508Z"
339 | }
340 | },
341 | "source": [
342 | "with open(root_dir + \"/assets/db_table.json\", \"w\", encoding=\"utf-8\") as json_file:\n",
343 | " json.dump(db_table, json_file, ensure_ascii=False, indent=2)"
344 | ],
345 | "outputs": [],
346 | "execution_count": 6
347 | },
348 | {
349 | "cell_type": "code",
350 | "metadata": {
351 | "ExecuteTime": {
352 | "end_time": "2025-02-17T06:30:08.560743Z",
353 | "start_time": "2025-02-17T06:28:46.927184Z"
354 | }
355 | },
356 | "source": [
357 | "db_info = []\n",
358 | "for db_name, db in db_table.items():\n",
359 | " print(db_name)\n",
360 | " db_json = json.dumps(db, ensure_ascii=False)\n",
361 | " db_summary, token, ok = llms.llm_glm_4_plus.generate_response(\n",
362 | " system='''你善于对数据库的表信息进行总结,根据它包含的数据表和字段信息,描述这个数据库,如\"本库名为xxx,记录了xxx;涵盖了xxx;方便用户xxx\"。\n",
363 | "输出一段文字,不换行。''',\n",
364 | " messages=[\n",
365 | " {\n",
366 | " \"role\": \"user\",\n",
367 | " \"content\": f\"下面是一个数据库的所有表和字段信息,请帮我为这个数据库写一段介绍,把表和字段信息压缩进去:\\n{db}\",\n",
368 | " }\n",
369 | " ],\n",
370 | " stream=False,\n",
371 | " )\n",
372 | " db_info.append(\n",
373 | " {\n",
374 | " \"db_name\": db_name,\n",
375 | " \"db_desc\": db_summary,\n",
376 | " }\n",
377 | " )"
378 | ],
379 | "outputs": [
380 | {
381 | "name": "stdout",
382 | "output_type": "stream",
383 | "text": [
384 | "astockbasicinfodb\n",
385 | "本库名为“上市公司基本资料”,记录了上市公司的核心信息及变更情况;涵盖了公司概况、名称更改状况、经营范围与行业变更三大板块。其中,“公司概况”表收录了上市公司的基本信息、联系方式、注册及办公地址、证券信息、高管及顾问详情、经营范围及特殊标识等;“公司名称更改状况”表记录了公司名称历次变更的详细信息;“公司经营范围与行业变更”表则详细展示了公司业务范围及行业类别的变动情况。方便用户全面了解上市公司的基本信息、历史变更及业务发展动态,为投资决策和行业研究提供有力支持。\n",
386 | "astockeventsdb\n",
387 | "本库名为“上市公司公告资讯/重大事项”,记录了上市公司公告中披露的各类重大事项信息;涵盖了公司担保明细、公司借贷明细、公司诉讼仲裁明细、重大事项委托理财、公司资产重组明细、公司重大经营合同明细、投资者关系活动及调研明细等多个方面;方便用户全面了解上市公司的财务状况、法律诉讼、资产重组、经营合同及投资者关系等重要信息,支持深入分析和决策。\n",
388 | "astockfinancedb\n",
389 | "本库名为上市公司财务指标/财务报表/融资与分红,记录了A股增发、A股配股、公司分红、资金投向说明、资产负债表_新会计准则、利润分配表_新会计准则、现金流量表_新会计准则、公司研发投入与产出、公司主营业务构成、公司经营情况述评、公司历年审计意见等数据,涵盖了证券发行全流程信息、配股相关信息、分红情况、募资及收购兼并相关信息、财务报表详细信息、财务信息、现金流量详细信息、研发投入及相关人员信息、经营与财务信息、经营状况及相关信息披露情况、审计相关信息等,方便用户全面了解上市公司的财务状况、经营成果、现金流量变动、研发投入、主营业务构成、经营情况、审计意见等信息。\n",
390 | "astockindustrydb\n",
391 | "本库名为“上市公司行业板块”,记录了上市公司在多种行业划分标准下的所属行业情况及其变更历史,涵盖了公司行业划分、行业变更、行业估值指标、行业财务指标以及公司所属概念等信息;方便用户进行公司数据回测、行业估值、财务分析及概念板块研究。具体包括公司行业划分表(lc_exgindustry)记录多级行业代码与名称,公司行业变更表(lc_exgindchange)追踪行业变化,行业估值指标表(lc_industryvaluation)提供市盈率、市净率等财务指标,行业财务指标表(lc_indfinindicators)展示成长、偿债、盈利等能力,概念所属公司表(lc_coconcept)及概念板块常量表(lc_conceptlist)记录公司概念归属及市场热点概念信息。\n",
392 | "astockmarketquotesdb\n",
393 | "本库名为上市公司股票行情,记录了境内股票交易资金流向指标、境内股票成交量技术指标、股票技术形态表、日行情表、股票行情表现(新)、停牌复牌表;涵盖了证券交易的多维度数据,包括证券内部编码、交易日期及时间区间,涵盖小单、中单、大单、超大单及全单的流入、流出、净流入额(元)和量(股),以及相应的笔数、买入率、卖出率、净买入率等指标,并进一步细分主动流入、流出情况及其比率,同时提供与流通市值和股本比的关联分析,全面反映市场交易动态和资金流向。\n",
394 | "astockoperationsdb\n",
395 | "本库名为“上市公司产品供销/人力资源”,记录了A股上市公司的主要供应商与客户清单、职工构成情况及管理层报酬统计;涵盖了公司关联交易信息、员工分类及年龄分布、薪酬总额及高管报酬等关键数据;方便用户全面了解上市公司供销关系、人力资源结构及管理层薪酬状况,为投资决策和企业管理提供有力支持。\n",
396 | "astockshareholderdb\n",
397 | "本库名为上市公司股东与股本/公司治理,记录了上市公司的股东信息、股本结构、公司治理等方面的数据。涵盖了股东类型分类、股东名单、股东户数、大股东介绍、公司实际控制人、公司股本结构变动、股东持股统计、股东股权变动、股东股权冻结和质押、股东股权冻结和质押统计、股份回购、股份回购关联表、法人配售与战略投资者、A股国家队持股统计、外资持股统计、员工持股计划、员工持股计划概况、股东增减持计划表、股东大会出席信息等数据表。方便用户了解上市公司的股权结构、股东情况、公司治理等信息。\n",
398 | "constantdb\n",
399 | "本库名为常量库,记录了证券、系统常量及地区编码等基础信息;涵盖了证券主表(含股票、基金、债券的代码、简称、上市信息)、港股及美股证券主表(含证券基本信息、上市情况及交易数据)、系统常量表(含常量分类及数值)、交易日表(含各市场交易日及关键时间节点)、国家城市代码表(含地区编码、名称及层级关系);方便用户查询证券基础数据、常量值、交易日信息及地区编码,支持金融分析和地域研究。\n",
400 | "creditdb\n",
401 | "本库名为诚信数据库,记录了违规当事人处罚的详细信息;涵盖了事件标识、事件代码、当事人信息(名称、性质、编码)、时间范围(起始日期、结束日期)、违规详情(条款、说明)、处罚情况(机构、类型、金额、计价货币、说明)、处罚机构编码、关联上市公司及公告标识等指标;方便用户查询和分析2014年至今由交易所、上市公司公告、证监会等来源提供的违规事件及其处罚情况。\n",
402 | "hkstockdb\n",
403 | "本库名为港股数据库,记录了港股公司员工数量变动、公司概况及行情表现等全面数据;涵盖了员工变动历史、公司基本信息、联系方式、行业归属、财务概况以及多时间维度的行情统计数据,如价格信息、成交量、涨跌幅、振幅、换手率、市值等;方便用户深入分析港股公司运营状况、市场表现及趋势,为投资决策提供有力支持。\n",
404 | "indexdb\n",
405 | "本库名为指数数据库,记录了市场上主要指数的基本情况及成份证券构成情况;涵盖了指数类别、成份证券类别、发布机构、发布日期、基期基点、指数发布的币种等信息,以及成份证券的市场代码、入选日期、删除日期及成份标志等详细数据;方便用户全面了解指数详细信息,包括编码、行业规范、发布与编制机构、发布与基日、基点及加权、指数分类、主指数关联、成份证券属性、币种及摘要介绍,并追踪指数成份股的变动情况,适用于指数研究、投资决策及市场分析等领域。\n",
406 | "institutiondb\n",
407 | "本库名为机构数据库,记录了市场上重要机构如证券公司、信托公司、保险公司的基本资料及全网披露的证券舆情信息;涵盖了机构基本资料表(lc_instiarchive),包括公司代码、注册资本、法人代表等详尽资料,事件体系指引表(ps_eventstru),涉及事件名称、代码及级别,以及证券舆情表(ps_newssecurity),记录证券相关事件的情感方向及重要度;方便用户全面了解机构背景、事件分类及市场舆情,助力投资决策及风险管理。\n",
408 | "publicfunddb\n",
409 | "本库名为“公募基金数据库”,记录了公募基金的综合信息;涵盖了基金概况、产品名称、管理人概况及分红详情等多个方面。具体包括基金的基本情况、编码体系、投资特性、运作方式、收益分配、存续期限、基金经理及机构信息、申赎规则、名称类信息、管理人背景、分红比例及登记日等详尽字段;方便用户全面了解基金产品、管理公司及分红情况,支持投资决策和数据分析。\n",
410 | "usstockdb\n",
411 | "本库名为美股数据库,记录了美国市场上市公司的基本情况和证券日收盘行情;涵盖了公司概况(包括公司代码、中英文名称、地址、联系方式、公司简介、成立日期、公司类型、注册地、总部标识及链接地址等)以及日行情数据(涉及交易日期、证券代码、开盘价、最高价、最低价、收盘价、涨跌幅、成交量、成交额、每股收益、总市值、总股本等);方便用户全面了解美股公司信息及市场动态,进行投资分析和决策。\n"
412 | ]
413 | }
414 | ],
415 | "execution_count": 7
416 | },
417 | {
418 | "cell_type": "code",
419 | "metadata": {
420 | "ExecuteTime": {
421 | "end_time": "2025-02-17T06:30:08.590375Z",
422 | "start_time": "2025-02-17T06:30:08.586232Z"
423 | }
424 | },
425 | "source": [
426 | "with open(root_dir + \"/assets/db_info.json\", \"w\", encoding=\"utf-8\") as json_file:\n",
427 | " json.dump(db_info, json_file, ensure_ascii=False, indent=2)"
428 | ],
429 | "outputs": [],
430 | "execution_count": 8
431 | }
432 | ],
433 | "metadata": {
434 | "kernelspec": {
435 | "display_name": "py312",
436 | "language": "python",
437 | "name": "python3"
438 | },
439 | "language_info": {
440 | "codemirror_mode": {
441 | "name": "ipython",
442 | "version": 3
443 | },
444 | "file_extension": ".py",
445 | "mimetype": "text/x-python",
446 | "name": "python",
447 | "nbconvert_exporter": "python",
448 | "pygments_lexer": "ipython3",
449 | "version": "3.12.2"
450 | }
451 | },
452 | "nbformat": 4,
453 | "nbformat_minor": 2
454 | }
455 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/config.py:
--------------------------------------------------------------------------------
1 | """This module handles the configuration and data loading for the application."""
2 |
3 | import json
4 | import os
5 | import llms
6 |
ROOT_DIR = os.getcwd()

# Pre-generated description assets (produced by the notebooks in this baseline):
#   db_info.json      -> one summary paragraph per database (kept as raw text)
#   db_table.json     -> per-database table lists with table summaries
#   table_column.json -> per-table column lists with column descriptions
with open(ROOT_DIR + "/assets/db_info.json", encoding="utf-8") as file:
    dbs_info = file.read()
with open(ROOT_DIR + "/assets/db_table.json", encoding="utf-8") as file:
    db_table = json.load(file)
with open(ROOT_DIR + "/assets/table_column.json", encoding="utf-8") as file:
    table_column = json.load(file)
# NOTE(review): unlike the assets above, the question set is read relative to
# the current working directory's grandparent -- confirm the process is
# launched from baseline/howard_baseline so this resolves to the repo assets.
with open("../../assets/question.json", "r", encoding="utf-8") as file:
    all_question = json.load(file)


# Enrich the "SHKind" column description with its known enumeration values so
# downstream prompts can map natural-language shareholder kinds to exact
# stored values.
for cols in table_column.values():
    for col in cols:
        # col["desc"] = re.sub(r'(?<=;)[^;]*?与[^;]*?关联', '', col["desc"])
        if col["column"] == "SHKind":
            col["desc"] += (
                "枚举值:资产管理公司,一般企业,投资、咨询公司,风险投资公司,自然人,其他金融产品,信托公司集合信托计划,金融机构—证券公司,保险投资组合,开放式投资基金,企业年金,信托公司单一证券信托,社保基金、社保机构,金融机构—银行,金融机构—期货公司,基金专户理财,国资局,券商集合资产管理计划,基本养老保险基金,金融机构—信托公司,院校—研究院,金融机构—保险公司,公益基金,保险资管产品,财务公司,基金管理公司,金融机构—金融租赁公司"
            )

# Build "db.table" -> {column name -> short description}, keeping only the
# text before the first ';' of each description.
column_mapping = {}
for db_name, db in db_table.items():  # db_table is already a dict; no copy needed
    for table in db["表"]:
        table_name = table["表英文"]
        column_mapping[f"{db_name}.{table_name}"] = {}
        for col in table_column[table_name]:
            column_mapping[f"{db_name}.{table_name}"][col["column"]] = str(col["desc"]).split(";", maxsplit=1)[0]

# Column names that identify entities / link tables; kept as a set for
# fast membership tests by the agents.
import_column_names = {
    "InnerCode",
    "CompanyCode",
    "SecuCode",
    "ChiNameAbbr",
    "ChiSpelling",
    "ConceptCode",
    "FirstIndustryCode",
    "SecondIndustryCode",
    "ThirdIndustryCode",
    "FourthIndustryCode",
    "IndustryNum",
    "IndexCode",
    "IndexInnerCode",
    "SecuInnerCode",
    "FirstPublDate",
}

# Per-table subset of columns whose description embeds enumerated values
# (detected by the marker "具体描述" inside the description text).
enum_columns = {}
for t_name, table in table_column.items():
    filtered_columns = {col["column"]: col["desc"] for col in table if "具体描述" in col["desc"]}
    if filtered_columns:
        enum_columns[t_name] = filtered_columns

MAX_ITERATE_NUM = 20  # max agent iterations per question
MAX_SQL_RESULT_ROWS = 100  # row cap applied to SQL query results

START_INDEX = [0, 0]  # 起始下标 [team_index, question_idx]
END_INDEX = [len(all_question) - 1, len(all_question[-1]["team"]) - 1]  # 结束下标 [team_index, question_idx] (包含)
SAVE_FILE_SUBFIX = ""

# LLM used throughout the pipeline.
llm_plus = llms.llm_glm_4_plus
66 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/database.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### 数据库操作"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 3,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "{\n",
20 | " \"success\": true,\n",
21 | " \"data\": [\n",
22 | " {\n",
23 | " \"EventContent\": \" 公司拟通过全资子公司隆平种业向嘉穗种业转让隆平种业全资子公司湖南兴隆种业有限公司51%股权。\\r\\n 2020年03月31日公告:截至公告披露日,嘉穗种业受让的标的股权已登记至嘉穗种业名下;隆平种业已收到嘉穗种业所支付的本次标的股权转让款的51%(9,368.19万元),嘉穗种业因自身资金周转原因,尚未向隆平种业支付标的股权转让款的49%(9,000.81万元)。\",\n",
24 | " \"EventProcedure\": 1019,\n",
25 | " \"IfEnded\": null\n",
26 | " },\n",
27 | " {\n",
28 | " \"EventContent\": \" 公司于2020年6月11日与三井物产株式会社签订《股权转让协议》,转让控股子公司湖南湘研种业有限公司20%股权,并放弃优先购买湘研种业10.4%股权的权利。\",\n",
29 | " \"EventProcedure\": 1004,\n",
30 | " \"IfEnded\": 0\n",
31 | " }\n",
32 | " ],\n",
33 | " \"count\": 2\n",
34 | "}\n"
35 | ]
36 | }
37 | ],
38 | "source": [
39 | "import requests\n",
40 | "import json\n",
41 | "\n",
42 | "access_token = \"xxx\"\n",
43 | "\n",
44 | "url = \"https://comm.chatglm.cn/finglm2/api/query\"\n",
45 | "headers = {\"Content-Type\": \"application/json\", \"Authorization\": f\"Bearer {access_token}\"}\n",
46 | "\n",
47 | "\n",
48 | "sql = \"\"\"\n",
49 | "SELECT EventContent, EventProcedure, IfEnded\n",
50 | "FROM astockeventsdb.lc_regroup\n",
51 | "WHERE CompanyCode = 549\n",
52 | "AND YEAR(InfoPublDate) = '2020'\n",
53 | "\"\"\"\n",
54 | "\n",
55 | "response = requests.post(url, headers=headers, json={\"sql\": sql, \"limit\": 100})\n",
56 | "print(json.dumps(response.json(), indent=2, ensure_ascii=False))"
57 | ]
58 | }
59 | ],
60 | "metadata": {
61 | "kernelspec": {
62 | "display_name": "py312",
63 | "language": "python",
64 | "name": "python3"
65 | },
66 | "language_info": {
67 | "codemirror_mode": {
68 | "name": "ipython",
69 | "version": 3
70 | },
71 | "file_extension": ".py",
72 | "mimetype": "text/x-python",
73 | "name": "python",
74 | "nbconvert_exporter": "python",
75 | "pygments_lexer": "ipython3",
76 | "version": "3.12.2"
77 | }
78 | },
79 | "nbformat": 4,
80 | "nbformat_minor": 2
81 | }
82 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/llms.py:
--------------------------------------------------------------------------------
"""
This module initializes various language models.

Each instance wraps one concrete backend (Zhipu, Ollama, or an
OpenAI-compatible endpoint); API keys and hosts are read from environment
variables.
"""

import os
from src.llm import OllamaLLM, ZhipuLLM, OpenAILLM, extract_answer_from_r1

## Model used to validate the submitted version.
llm_glm_4_plus = ZhipuLLM(api_key=os.getenv("ZHIPU_API_KEY"), model="glm-4-plus")


## Other large models, used for experiments (original note was left unfinished).
# NOTE(review): post_process=extract_answer_from_r1 presumably strips the R1
# reasoning section from the raw output -- confirm in src/llm.py.
llm_deepseek_r1 = OllamaLLM(
    host=os.getenv("OLLAMA_HOST"), model="deepseek-r1:14b", post_process=extract_answer_from_r1
)
llm_gpt_4o_mini = OpenAILLM(
    api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o-mini", base_url=os.getenv("OPENAI_BASE_URL")
)
llm_deepseek_v3 = OpenAILLM(
    api_key=os.getenv("OPENAI_API_KEY"),
    model="deepseek/deepseek-chat",
    base_url=os.getenv("OPENAI_BASE_URL"),
    default_stream=True,  # this endpoint is called in streaming mode by default
)
25 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import copy
4 | import logging
5 | import time
6 | from dotenv import load_dotenv
7 |
8 | os.environ["DEBUG"] = "0"
9 | os.environ["SHOW_LLM_INPUT_MSG"] = "1"
10 |
11 | load_dotenv()
12 |
13 | from src.log import setup_logger, get_logger
14 | import config
15 | from agents import agent_rewrite_question, agent_extract_company
16 | from workflows import sql_query, check_db_structure
17 | from utils import ajust_org_question
18 |
19 |
def process_question(question_team: dict, team_idx: int) -> dict:
    """
    Processes a team of questions, extracting facts and generating answers.

    For each question: extract entities, rewrite the question to be
    self-contained using the dialogue history, inject known facts and
    history into the agents' system prompts, resolve the relevant database
    structure, then run the SQL-query workflow to produce the answer.
    Answers, facts, SQL results, token usage and timing are written back
    into each question item, and the full question set is checkpointed to
    disk after every answered question.

    Args:
        question_team (dict): A dictionary containing a list of questions to process.
        team_idx (int): Index of the team inside config.all_question.

    Returns:
        dict: The processed question team with answers and usage tokens.
    """
    debug_mode = os.getenv("DEBUG", "0") == "1"
    facts = []  # accumulated facts, shared across the whole team dialogue
    qas = []  # running dialogue history: [{"role": ..., "content": ...}, ...]
    sql_query.clear_history_facts()
    for q_idx, question_item in enumerate(question_team["team"]):
        qid: str = question_item["id"].strip()
        question = ajust_org_question(question_item["question"])
        # Questions before START_INDEX were answered in a previous run:
        # replay their cached answers/facts into the history and skip.
        if team_idx == config.START_INDEX[0] and q_idx < config.START_INDEX[1]:
            qas.extend(
                [
                    {"role": "user", "content": question},
                    {"role": "assistant", "content": question_item["answer"]},
                ]
            )
            if "facts" in question_item:
                facts = question_item["facts"]
            if "sql_results" in question_item:
                sql_query.history_facts = copy.deepcopy(question_item["sql_results"])
            print(f">>>>> 【SKIP】id: {qid}")
            continue
        # Past END_INDEX: stop processing this (final) team entirely.
        if team_idx == config.END_INDEX[0] and q_idx > config.END_INDEX[1]:
            print("----- EXIT -----\n")
            return question_team
        start_time = time.time()
        # One dedicated log file per question, truncated before use.
        log_file_path = config.ROOT_DIR + f"/output/{qid}.log"
        open(log_file_path, "w", encoding="utf-8").close()
        setup_logger(
            log_file=log_file_path,
            log_level=logging.DEBUG,
        )
        logger = get_logger()

        print(f">>>>> id: {qid}")
        print(f">>>>> Original Question: {question_item['question']}")
        logger.debug("\n>>>>> Original Question: %s\n", question_item["question"])

        # Extract entities (company names, stock codes, pinyin abbreviations...)
        # and record the result as a known fact.
        agent_extract_company.clear_history()
        answer, _ = agent_extract_company.answer(
            (
                """提取下面这段文字中的实体(如公司名、股票代码、拼音缩写等),如果识别结果是空,那么就回复No Entities."""
                f'''"{question}"'''
            )
        )
        if answer != "" and answer not in facts:
            facts.append(answer)

        # Rewrite the question so it can be understood without the history.
        agent_rewrite_question.clear_history()
        qas_content = [
            f"Question: {qa['content']}" if qa["role"] == "user" else f"Answer: {qa['content']}" for qa in qas
        ]
        new_question, _ = agent_rewrite_question.answer(
            (
                "历史问答:无。\n"
                if len(qas_content) == 0
                else "下面是顺序的历史问答:\n'''\n" + "\n".join(qas_content) + "\n'''\n"
            )
            + f"现在用户继续提问,请根据已知信息,理解当前这个问题的完整含义,并重写这个问题使得单独拿出来看仍然能够正确理解:{question}"
        )
        print(f">>>>> Rewrited Question: {new_question}")

        # Inject the known facts into every agent's system prompt
        # (or remove the key when there are none).
        key_facts = "已知事实"
        if len(facts) > 0:
            kv = {key_facts: "\n---\n".join(facts)}
            sql_query.agent_master.add_system_prompt_kv(kv)
            check_db_structure.agent_table_selector.add_system_prompt_kv(kv)
            check_db_structure.agent_column_selector.add_system_prompt_kv(kv)
        else:
            sql_query.agent_master.del_system_prompt_kv(key_facts)
            check_db_structure.agent_table_selector.del_system_prompt_kv(key_facts)
            check_db_structure.agent_column_selector.del_system_prompt_kv(key_facts)
        if debug_mode:
            print(f"\n>>>>> {key_facts}:\n" + "\n---\n".join(facts))
            logger.debug("\n>>>>> %s:\n%s", key_facts, "\n---\n".join(facts))

        # Inject the dialogue history likewise.
        key_qas = "历史对话"
        if len(qas_content) > 0:
            kv = {key_qas: "\n".join(qas_content)}
            sql_query.agent_master.add_system_prompt_kv(kv)
            check_db_structure.agent_table_selector.add_system_prompt_kv(kv)
            check_db_structure.agent_column_selector.add_system_prompt_kv(kv)
        else:
            sql_query.agent_master.del_system_prompt_kv(key_qas)
            check_db_structure.agent_table_selector.del_system_prompt_kv(key_qas)
            check_db_structure.agent_column_selector.del_system_prompt_kv(key_qas)

        # Resolve which databases/tables/columns are relevant to the question.
        check_db_structure.clear_history()
        res = check_db_structure.run(inputs={"messages": [{"role": "user", "content": new_question}]})
        db_info = res["content"]

        # Answer via the SQL-query workflow, primed with the DB structure.
        sql_query.clear_history()

        res = sql_query.run(
            inputs={
                "messages": [
                    {"role": "assistant", "content": db_info},
                    {"role": "user", "content": new_question},
                ]
            }
        )
        question_item["answer"] = res["content"]
        # Cache this round into the dialogue history for follow-up questions.
        qas.extend(
            [
                {"role": "user", "content": question},
                {"role": "assistant", "content": question_item["answer"]},
            ]
        )
        elapsed_time = time.time() - start_time
        question_item["usage_tokens"] = {
            agent_extract_company.name: agent_extract_company.usage_tokens,
            agent_rewrite_question.name: agent_rewrite_question.usage_tokens,
            check_db_structure.name: check_db_structure.usage_tokens,
            sql_query.name: sql_query.usage_tokens,
        }
        minutes, seconds = divmod(elapsed_time, 60)
        question_item["use_time"] = f"{int(minutes)}m {int(seconds)}s"
        question_item["facts"] = copy.deepcopy(facts)
        question_item["rewrited_question"] = new_question
        question_item["sql_results"] = copy.deepcopy(sql_query.history_facts)

        print(f">>>>> Answer: {question_item['answer']}")
        print(f">>>>> Used Time: {int(minutes)}m {int(seconds)}s\n")
        # Checkpoint the whole annotated question set after every question.
        with open(config.ROOT_DIR + "/assets/question.json", "w", encoding="utf-8") as file:
            json.dump(config.all_question, file, ensure_ascii=False, indent=4)
    # BUGFIX: previously printed the module-level loop variable `i`, which
    # only worked because the caller happened to define it globally; use the
    # function's own parameter instead.
    print(f"----- Completed Team Index {team_idx} -----\n")
    return question_team
161 |
162 |
163 | for i in range(config.START_INDEX[0], config.END_INDEX[0] + 1):
164 | print(f"----- Processing Team Index {i} ... -----\n")
165 | try:
166 | process_question(config.all_question[i], i)
167 | except Exception as exc:
168 | print(f"\n***** Team Index {i} generated an exception: {exc} *****\n")
169 |
# Aggregate per-agent token usage across every answered question.
# Only the four known agent names are tallied; any other key found in a
# question's "usage_tokens" dict is deliberately ignored.
total_usage_tokens = dict.fromkeys(
    (
        agent_extract_company.name,
        agent_rewrite_question.name,
        check_db_structure.name,
        sql_query.name,
    ),
    0,
)

for q_team in config.all_question:
    for q_item in q_team["team"]:
        # .get(..., {}) handles questions that never ran (no usage recorded)
        for key, used in q_item.get("usage_tokens", {}).items():
            if key in total_usage_tokens:
                total_usage_tokens[key] += used

print(json.dumps(total_usage_tokens, ensure_ascii=False, indent=4))

# Grand total across all agents, printed for a quick cost overview.
total_tokens = sum(total_usage_tokens.values())
print(f"所有tokens数: {total_tokens}")
188 |
# Strip per-question debug/bookkeeping fields before exporting, so the
# final result file only carries the fields the evaluation expects
# (id / question / answer).
_TRANSIENT_KEYS = (
    "usage_tokens",
    "use_time",
    "iterate_num",
    "facts",
    "rewrited_question",
    "sql_results",
)
for q_team in config.all_question:
    for q_item in q_team["team"]:
        for key in _TRANSIENT_KEYS:
            # pop() with a default removes the key if present, no-op otherwise
            q_item.pop(key, None)

with open(config.ROOT_DIR + f"/output/Eva_Now_result{config.SAVE_FILE_SUBFIX}.json", "w", encoding="utf-8") as f:
    json.dump(config.all_question, f, ensure_ascii=False, indent=4)
206 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/manual.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/plain": [
11 | "True"
12 | ]
13 | },
14 | "execution_count": 1,
15 | "metadata": {},
16 | "output_type": "execute_result"
17 | }
18 | ],
19 | "source": [
20 | "from dotenv import load_dotenv\n",
21 | "import os\n",
22 | "\n",
23 | "os.environ[\"DEBUG\"] = \"1\"\n",
24 | "os.environ[\"SHOW_LLM_INPUT_MSG\"] = \"1\"\n",
25 | "\n",
26 | "# 加载 .env 文件\n",
27 | "load_dotenv()"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "import json\n",
37 | "import logging\n",
38 | "import time\n",
39 | "import copy\n",
40 | "\n",
41 | "from src.log import setup_logger, get_logger\n",
42 | "import config\n",
43 | "from agents import agent_rewrite_question, agent_extract_company\n",
44 | "from workflows import sql_query, check_db_structure\n",
45 | "from src.utils import show\n",
46 | "from utils import get_constant_column_list, ajust_org_question"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "name": "stdout",
56 | "output_type": "stream",
57 | "text": [
58 | "{\n",
59 | " \"extract_company\": 0,\n",
60 | " \"rewrite_question\": 0,\n",
61 | " \"Check_db_structure\": 0,\n",
62 | " \"Sql_query\": 0\n",
63 | "}\n",
64 | "所有tokens数: 0\n"
65 | ]
66 | }
67 | ],
68 | "source": [
69 | "# 计算tokens数\n",
70 | "total_usage_tokens = {\n",
71 | " agent_extract_company.name: 0,\n",
72 | " agent_rewrite_question.name: 0,\n",
73 | " check_db_structure.name: 0,\n",
74 | " sql_query.name: 0,\n",
75 | "}\n",
76 | "\n",
77 | "for q_team in config.all_question:\n",
78 | " for q_item in q_team[\"team\"]:\n",
79 | " if \"usage_tokens\" in q_item:\n",
80 | " for key in q_item[\"usage_tokens\"]:\n",
81 | " if key in total_usage_tokens:\n",
82 | " total_usage_tokens[key] += q_item[\"usage_tokens\"][key]\n",
83 | "\n",
84 | "print(json.dumps(total_usage_tokens, ensure_ascii=False, indent=4))\n",
85 | "\n",
86 | "total_tokens = sum(total_usage_tokens.values())\n",
87 | "print(f\"所有tokens数: {total_tokens}\")"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "## 跑指定问题"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "name": "stdout",
104 | "output_type": "stream",
105 | "text": [
106 | "{\n",
107 | " \"id\": \"tttt----1----3-1-1\",\n",
108 | " \"question\": \"湘电股份的信披网址是哪个网站?\"\n",
109 | "}\n",
110 | ">>>>> id: tttt----1----3-1-1\n",
111 | ">>>>> Original Question: 湘电股份的信披网址是哪个网站?\n",
112 | "\n",
113 | "\n",
114 | ">>>>> 【user】 Said:\n",
115 | "提取下面这段文字中的实体(如公司名、股票代码、拼音缩写等),如果识别结果是空,那么就回复No Entities.\"湘电股份的信披网址是哪个网站?\"\n",
116 | "\n",
117 | "\n",
118 | ">>>>> Agent【extract_company】 Said:\n",
119 | "```json\n",
120 | "[\"湘电股份\"]\n",
121 | "```\n",
122 | "\n",
123 | ">>>>> 查询ql:\n",
124 | "SELECT 'constantdb.secumain' AS TableName, InnerCode, CompanyCode,\n",
125 | " ChiName, EngName, SecuCode, ChiNameAbbr, EngNameAbbr, SecuAbbr, ChiSpelling\n",
126 | "FROM constantdb.secumain \n",
127 | "WHERE SecuCode = '湘电股份'\n",
128 | " OR ChiName LIKE '%湘电股份%'\n",
129 | " OR ChiNameAbbr LIKE '%湘电股份%'\n",
130 | " OR EngName LIKE '%湘电股份%'\n",
131 | " OR EngNameAbbr LIKE '%湘电股份%'\n",
132 | " OR SecuAbbr LIKE '%湘电股份%'\n",
133 | " OR ChiSpelling LIKE '%湘电股份%'\n",
134 | "UNION ALL\n",
135 | "SELECT 'constantdb.hk_secumain' AS TableName, InnerCode, CompanyCode,\n",
136 | "ChiName, EngName, SecuCode, ChiNameAbbr, EngNameAbbr, SecuAbbr, ChiSpelling\n",
137 | "FROM constantdb.hk_secumain \n",
138 | "WHERE SecuCode = '湘电股份'\n",
139 | " OR ChiName LIKE '%湘电股份%'\n",
140 | " OR ChiNameAbbr LIKE '%湘电股份%'\n",
141 | " OR EngName LIKE '%湘电股份%'\n",
142 | " OR EngNameAbbr LIKE '%湘电股份%'\n",
143 | " OR SecuAbbr LIKE '%湘电股份%'\n",
144 | " OR FormerName LIKE '%湘电股份%'\n",
145 | " OR ChiSpelling LIKE '%湘电股份%'\n",
146 | "UNION ALL\n",
147 | "SELECT 'constantdb.us_secumain' AS TableName, InnerCode, CompanyCode,\n",
148 | "ChiName, EngName, SecuCode, null as ChiNameAbbr, null as EngNameAbbr, SecuAbbr, ChiSpelling\n",
149 | "FROM constantdb.us_secumain \n",
150 | "WHERE SecuCode = '湘电股份'\n",
151 | " OR ChiName LIKE '%湘电股份%'\n",
152 | " OR EngName LIKE '%湘电股份%'\n",
153 | " OR SecuAbbr LIKE '%湘电股份%'\n",
154 | " OR ChiSpelling LIKE '%湘电股份%';\n",
155 | "查询结果:\n",
156 | "[{\"TableName\": \"constantdb.secumain\", \"InnerCode\": 1551, \"CompanyCode\": 1387, \"ChiName\": \"湘潭电机股份有限公司\", \"EngName\": \"Xiangtan Electric Manufacturing Co., Ltd.\", \"SecuCode\": \"600416\", \"ChiNameAbbr\": \"湘电股份\", \"EngNameAbbr\": \"XEMC\", \"SecuAbbr\": \"湘电股份\", \"ChiSpelling\": \"XDGF\"}]\n",
157 | "\n",
158 | "\n",
159 | ">>>>> 【user】 Said:\n",
160 | "历史问答:无。\n",
161 | "现在用户继续提问,请根据已知信息,理解当前这个问题的完整含义,并重写这个问题使得单独拿出来看仍然能够正确理解:湘电股份的信披网址是哪个网站?\n",
162 | "\n",
163 | "\n",
164 | ">>>>> Agent【rewrite_question】 Said:\n",
165 | "湘电股份的信息披露网址是哪个网站?\n",
166 | ">>>>> Rewrited Question: 湘电股份的信息披露网址是哪个网站?\n",
167 | "\n",
168 | ">>>>> 已知事实:\n",
169 | "湘电股份的关联信息有:[所在数据表是constantdb.secumain;InnerCode(证券内部编码)是1551;CompanyCode(公司代码)是1387;ChiName(中文名称)是湘潭电机股份有限公司;EngName(英文名称)是Xiangtan Electric Manufacturing Co., Ltd.;SecuCode(证券代码)是600416;ChiNameAbbr(中文名称缩写)是湘电股份;EngNameAbbr(英文名称缩写)是XEMC;SecuAbbr(证券简称)是湘电股份;ChiSpelling(拼音证券简称)是XDGF;]\n",
170 | "\n",
171 | "\n",
172 | ">>>>> 【user】 Said:\n",
173 | "请选择db,务必遵循输出的格式要求。\n",
174 | "\n",
175 | "\n",
176 | ">>>>> Agent【Check_db_structure.db_selector】 Said:\n",
177 | "【分析】\n",
178 | "用户询问的是“湘电股份的信息披露网址”,这属于上市公司的基本信息范畴。根据知识库中的描述,上市公司的基本信息(如联系方式、注册信息等)记录在“astockbasicinfodb”数据库中。\n",
179 | "\n",
180 | "【选中的数据库】\n",
181 | "- astockbasicinfodb: 这个数据库包含上市公司的基本信息,包括信息披露网址。\n",
182 | "\n",
183 | "【选中的数据库的清单】\n",
184 | "```json\n",
185 | "[\"astockbasicinfodb\"]\n",
186 | "```\n",
187 | "补充选择db: [\"hkstockdb\", \"usstockdb\"]\n",
188 | "\n",
189 | "\n",
190 | ">>>>> 【user】 Said:\n",
191 | "数据库表信息如下:\n",
192 | "[{\"表名\": \"astockbasicinfodb.lc_stockarchives\", \"说明\": \"该数据表主要记录了公司的基本信息、联系方式、经营信息及证券信息等内容。其中,公司基本信息包括公司代码、国别、中文名称、法人代表、总经理、法律顾问、会计师事务所、公司成立日期、首次注册登记地点、企业法人营业执照注册号、所属区县及地区代码;联系方式涵盖联系人电话、传真、电子邮箱,公司注册地址、办公地址、联系地址及其对应的邮编,董秘及证券事务代表的电话、传真、电子邮件,以及公司邮箱和网址;经营信息涉及公司所属证监会行业、主营及兼营范围;证券信息包括A股、B股、H股及CDR证券的简称和代码,扩位简称,以及尚未盈利标识、特殊表决权标识、协议控制架构标识、红筹企业标识;此外,还包括信息披露相关内容如披露网址、披露报纸,以及公司简介等。\"}, {\"表名\": \"astockbasicinfodb.lc_namechange\", \"说明\": \"该数据表主要记录公司基本信息及相关公告,包括公司代码、中文及英文名称(全称与缩写)、信息来源和发布日期等基本信息;还涵盖重要事件如股东大会决议公告日期、是否否决决议、全称更改日期等关键时间节点,便于追踪企业动态与重要决策信息。\"}, {\"表名\": \"astockbasicinfodb.lc_business\", \"说明\": \"该数据表主要用于记录公司基本信息及相关业务情况,包括公司代码、信息发布日期和来源等基础信息;股东大会决议公告日期及是否否决的决议结果等决策信息;经营范围及主要业务、产品名称等经营信息;涉及行业的分类信息,如行业代码、行业类别及涉足行业等;此外,还包括简称变更原因等补充说明字段。\"}, {\"表名\": \"hkstockdb.hk_employeechange\", \"说明\": \"该数据表主要记录证券相关信息,包括证券内部代码、信息发布日期与来源等基本信息;股东大会公告日期及相关生效与失效日期等时间信息;是否有效的状态标识;以及变更前后员工数量的对比等数据。\"}, {\"表名\": \"hkstockdb.hk_stockarchives\", \"说明\": \"该数据表主要记录公司基本信息及相关属性,涵盖了公司标识信息如公司代码、中文名称、企业类别及描述等;成立及注册信息如公司成立日期、注册地、注册资本及其货币单位等;行业分类信息如所属行业(港交所、恒生)等;管理层信息如主席、公司秘书、合资格会计师等;办公及运营信息如注册办事处、总办事处及主要营业地点、股份过户处(香港)等;联系方式如电话、传真、邮箱、公司网址等;以及其他信息如公司业务、公司简介、审计机构等,全面反映了公司的基本情况和运营特征。\"}, {\"表名\": \"hkstockdb.cs_hkstockperformance\", \"说明\": \"该数据表主要用于记录证券交易及相关指标的详细信息,涵盖了基础信息如证券内部编码、交易日、货币代码等;价格信息包括昨收盘、今开盘、最高价、最低价、收盘价、均价、复权最高价及最低价等;成交信息如成交量、成交金额、换手率等;涨跌信息包括涨跌金额、涨跌幅、振幅等;市值信息如总市值、流通市值等;周期指标覆盖近一周、本周、近一月、本月、近三个月、近六个月、近一年及今年以来的成交、涨跌、振幅、换手率、均价、市值等详细数据,且提供各周期的日均指标;此外,还包含上市以来的复权价格及其对应日期等。\"}, {\"表名\": \"usstockdb.us_companyinfo\", \"说明\": \"该数据表主要记录公司的基本信息,包括公司标识信息如公司代码、英文名称及其缩写、中文名称;地址信息如公司地址、城市、省份、邮编、国家及国家代码;联系方式如电话、传真;公司描述信息如公司简介、英文公司简介、英文业务简介;注册地信息如注册地国家、注册地省份/州;其他信息如成立日期及其精度、公司类型、是否公司总部,以及链接地址等。\"}, {\"表名\": \"usstockdb.us_dailyquote\", \"说明\": \"该数据表记录了证券交易的详细信息,包括交易日和证券内部代码用于标识交易记录,开盘价、最高价、最低价和收盘价用于描述当天的价格波动,成交量和成交额反映市场活跃程度,每股收益TTM和证券总市值提供了盈利能力和规模的财务指标,证券总股本用于计算市场相关比率,昨收盘和日涨跌则用于分析价格变动趋势,涨跌幅百分比和均价提供了额外的市场表现信息。\"}]\n",
193 | "\n",
194 | "请选择table,务必遵循输出的格式要求。\n",
195 | "\n",
196 | "\n",
197 | ">>>>> Agent【Check_db_structure.table_selector】 Said:\n",
198 | "【分析】\n",
199 | "用户询问的是湘电股份的信息披露网址,这属于公司的基本信息范畴。根据已知事实,湘电股份的相关信息存储在`constantdb.secumain`表中,但该表不包含信息披露网址。我们需要找到一个包含公司基本信息和信息披露网址的表。\n",
200 | "\n",
201 | "【选中的数据表】\n",
202 | "- astockbasicinfodb.lc_stockarchives: 这个数据表包含公司的基本信息、联系方式、经营信息及证券信息等内容,其中包括信息披露网址。\n",
203 | "\n",
204 | "【选中的数据库表的清单】\n",
205 | "```json\n",
206 | "[\"astockbasicinfodb.lc_stockarchives\"]\n",
207 | "```\n",
208 | "\n",
209 | "补充选择table: [\"constantdb.lc_areacode\", \"hkstockdb.hk_stockarchives\", \"usstockdb.us_companyinfo\"]\n",
210 | "\n",
211 | "\n",
212 | ">>>>> 【user】 Said:\n",
213 | "已取得可用的数据表的字段信息如下:\n",
214 | "[{\"表名\": \"astockbasicinfodb.lc_stockarchives\", \"表字段\": [{\"column\": \"CompanyCode\", \"desc\": \"公司代码;公司代码(CompanyCode):与“证券主表(SecuMain)”中的“公司代码(CompanyCode)”关联,得到上市公司的交易代码、简称等。\"}, {\"column\": \"State\", \"desc\": \"国别;省份(State):与“国家城市代码表(LC_AreaCode)”中的“地区内部编码(AreaInnerCode)”关联,得到省份具体信息。\"}, {\"column\": \"SecretaryBD\", \"desc\": \"董事会秘书\"}, {\"column\": \"SecuAffairsRepr\", \"desc\": \"证券/股证事务代表\"}, {\"column\": \"AuthReprSBD\", \"desc\": \"董秘授权代表\"}, {\"column\": \"ContactTel\", \"desc\": \"联系人电话\"}, {\"column\": \"ContactFax\", \"desc\": \"联系人传真\"}, {\"column\": \"ContactEmail\", \"desc\": \"联系人电子邮箱\"}, {\"column\": \"RegAddr\", \"desc\": \"公司注册地址\"}, {\"column\": \"RegZipCode\", \"desc\": \"公司注册地址邮编\"}, {\"column\": \"OfficeAddr\", \"desc\": \"公司办公地址\"}, {\"column\": \"OfficeZipCode\", \"desc\": \"公司办公地址邮编\"}, {\"column\": \"ContactAddr\", \"desc\": \"公司联系地址\"}, {\"column\": \"ConatactZipCode\", \"desc\": \"公司联系地址邮编\"}, {\"column\": \"Email\", \"desc\": \"邮箱\"}, {\"column\": \"Website\", \"desc\": \"公司网址\"}, {\"column\": \"DisclosureWebsites\", \"desc\": \"信息披露网址\"}, {\"column\": \"DisclosurePapers\", \"desc\": \"信息披露报纸\"}, {\"column\": \"EstablishmentDate\", \"desc\": \"公司成立日期\"}, {\"column\": \"IRegPlace\", \"desc\": \"首次注册登记地点\"}, {\"column\": \"LegalRepr\", \"desc\": \"法人代表\"}, {\"column\": \"GeneralManager\", \"desc\": \"总经理\"}, {\"column\": \"LegalConsultant\", \"desc\": \"法律顾问\"}, {\"column\": \"AccountingFirm\", \"desc\": \"会计师事务所\"}, {\"column\": \"InduCSRC\", \"desc\": \"公司所属证监会行业(聚源);与(CT_IndustryType)表中的\\\"行业内部编码(IndustryNum)\\\"字段关联,当Standard=1时,LB=1;当Standard=22时,LB=22;当Standard=25时,LB=25;当Standard=26时,LB=26。\"}, {\"column\": \"BusinessMajor\", \"desc\": \"经营范围-主营\"}, {\"column\": \"BusinessMinor\", \"desc\": \"经营范围-兼营\"}, {\"column\": \"AShareAbbr\", \"desc\": \"A股证券简称\"}, {\"column\": \"AStockCode\", \"desc\": \"A股证券代码\"}, {\"column\": \"BShareAbbr\", \"desc\": \"B股证券简称\"}, {\"column\": \"BStockCode\", \"desc\": \"B股证券代码\"}, 
{\"column\": \"HShareAbbr\", \"desc\": \"H股证券简称\"}, {\"column\": \"HStockCode\", \"desc\": \"H股证券代码\"}, {\"column\": \"BriefIntroText\", \"desc\": \"公司简介\"}, {\"column\": \"ChiName\", \"desc\": \"中文名称\"}, {\"column\": \"BusinessRegNumber\", \"desc\": \"企业法人营业执照注册号\"}, {\"column\": \"SecretaryBDTel\", \"desc\": \"董秘电话\"}, {\"column\": \"SecretaryBDFax\", \"desc\": \"董秘传真\"}, {\"column\": \"SecretaryBDEmail\", \"desc\": \"董秘电子邮件\"}, {\"column\": \"SecuAffairsReprTel\", \"desc\": \"证券事务代表电话\"}, {\"column\": \"SecuAffairsReprFax\", \"desc\": \"证券事务代表传真\"}, {\"column\": \"SecuAffairsReprEmail\", \"desc\": \"证券事务代表电子邮件\"}, {\"column\": \"CityCode\", \"desc\": \"地区代码;地区代码(CityCode):与“国家城市代码表(LC_AreaCode)”中的“地区内部编码(AreaInnerCode)”关联,得到城市具体信息。\"}, {\"column\": \"CDRShareAbbr\", \"desc\": \"CDR证券简称\"}, {\"column\": \"CDRStockCode\", \"desc\": \"CDR证券代码\"}, {\"column\": \"ExtendedAbbr\", \"desc\": \"扩位简称\"}, {\"column\": \"UnprofitableMark\", \"desc\": \"尚未盈利标识;尚未盈利标识(UnprofitableMark):在上市时发行人尚未盈利的,其股票或存托凭证的特别标识为“U”;发行人首次实现盈利的,该特别标识取消,数据值为空。\"}, {\"column\": \"SpecialVoteMark\", \"desc\": \"特殊表决权标识;特殊表决权标识(SpecialVoteMark):在上市时发行人具有表决权差异安排的,其股票或存托凭证的特别标识为“W”;上市后不再具有表决权差异安排的,该特别标识取消,数据值为空。\"}, {\"column\": \"VIEMark\", \"desc\": \"协议控制架构标识;协议控制架构标识(VIEMark):在上市时发行人具有协议控制架构或者类似特殊安排的,其股票或存托凭证的特别标识为“V”;上市后不再具有相关安排的,该特别标识取消,数据值为空。\"}, {\"column\": \"RedChipMark\", \"desc\": \"红筹企业标识;红筹企业标识(RedChipMark):发行人属于红筹企业,则数据值=”是“;空值则指无此标识。\"}, {\"column\": \"RegArea\", \"desc\": \"所属区县;所属区县(RegArea):与“国家城市代码表(LC_AreaCode)”中的“地区内部编码(AreaInnerCode)”关联,得到所属区县具体信息。\"}]}, {\"表名\": \"constantdb.lc_areacode\", \"表字段\": [{\"column\": \"AreaInnerCode\", \"desc\": \"地区内部编码;地区内部编码(AreaInnerCode):聚源内部设置的地区代码,共9位数。\"}, {\"column\": \"AreaCode\", \"desc\": \"地区行政编码;地区行政编码(AreaCode):国家层级代码来自ISO全球国家代码标准;我国代码来自于国家行政区划网。\"}, {\"column\": \"FirstLevelCode\", \"desc\": \"一级区划代码;一级区划代码(FirstLevelCode):与(CT_SystemConst)表中的DM字段关联,令LB = 1961 and DM LIKE 
'%000',得到一级区划代码的具体描述:1000-中国省级行政区划,2000-中国地级行政区划,3000-中国县级行政区划,4000-其他,5000-中国乡镇级行政区划,9000-国家,10000-大洲与海洋,11000-美国州级行政区划,12000-海外国家城市,100000000-国家级经济区域,200000000-中国海关,300000000-海关经济区划。\"}, {\"column\": \"SecondLevelCode\", \"desc\": \"二级区划代码;二级区划代码(SecondLevelCode):与(CT_SystemConst)表中的DM字段关联,令LB = 1961,得到二级区划代码的具体描述:1000-中国省级行政区划,1001-直辖市,1002-省,1003-自治区,1004-特别行政区,2000-中国地级行政区划,2001-省会,2002-地级市,2003-盟,2004-自治州,2005-地区,2006-直辖市市辖区,2007-直辖市市辖县,3000-中国县级行政区划,3001-市辖区,3002-县,3003-县级市,3004-自治县,3005-旗,3006-自治旗,3007-林区,3008-特区,3009-县级乡镇,3010-县级街道,3011-县级经济开发区,3012-县级其他,4000-其他,4001-中国地理划分(按自然地理),4002-中国地理划分(按经济区域),4010-中国国家级城市群,4011-中国经济合作城市群,4100-欧洲地理划分(按自然地理),4150-非洲地理划分(按自然地理),4800-非标准地区,4900-其他地区划分(按自然地理),4950-政治类联合体、联盟或组织,4951-经济类联合体、联盟或组织,4999-其他划分,5000-中国乡镇级行政区划,5001-乡镇,5002-街道,5003-乡镇级经济开发区,5004-乡镇其他,9000-国家,10000-大洲与海洋,10001-大洲,10002-海洋,11000-美国州级行政区划,11001-特区,11002-州,12000-海外国家城市,100000000-国家级经济区域,100000001-国家级经济特区,100000002-国家级新区,100000003-国家综合配套改革试验区,100000004-高新技术产业开发区,100000005-经济技术开发区,100000006-海关特殊监管区域,100000007-边境/跨境经济合作区,100000008-其他类型开发区,200000000-中国海关,200000001-海关总署,200000002-直属海关,200000003-隶属海关,300000000-海关经济区划,300000001-保税港区/综合保税区,300000002-保税区,300000003-保税物流园区,300000004-保税物流中心,300000005-出口加工区/珠澳跨境工业园区,300000006-高新技术产业开发区,300000007-国际边境合作中心,300000008-经济技术开发区,300000009-经济特区,300000010-一般经济区域,300000011-综合实验区,300000099-其他经济区划。\"}, {\"column\": \"AreaChiName\", \"desc\": \"地区中文名称\"}, {\"column\": \"AreaEngName\", \"desc\": \"地区英文名称\"}, {\"column\": \"AreaEngNameAbbr\", \"desc\": \"地区英文名称缩写\"}, {\"column\": \"ParentNode\", \"desc\": \"父节点代码\"}, {\"column\": \"ParentName\", \"desc\": \"父节点名称\"}, {\"column\": \"IfEffected\", \"desc\": \"是否有效;是否有效(IfEffected):该字段固定以下常量:1-是 2-否。\"}, {\"column\": \"CancelDate\", \"desc\": \"取消日期\"}, {\"column\": \"ChangeNote\", \"desc\": \"变更内容\"}, {\"column\": \"Remark\", \"desc\": \"备注\"}]}, {\"表名\": \"hkstockdb.hk_stockarchives\", \"表字段\": [{\"column\": \"CompanyCode\", \"desc\": \"公司代码\"}, {\"column\": 
\"EstablishmentDate\", \"desc\": \"公司成立日期\"}, {\"column\": \"RegAbbr\", \"desc\": \"注册地;注册地点(RegAbbr)与(CT_SystemConst)表中的DM字段关联,令LB = 1023,得到注册地点的具体描述:3-中国港澳台,4-中东地区,7-国际,100-亚洲,101-阿富汗,102-巴林,103-孟加拉国,104-不丹,105-文莱,106-缅甸,107-柬埔寨,108-塞浦路斯,109-朝鲜,110-中国香港,111-印度,112-印度尼西亚,113-伊朗,114-伊拉克,115-以色列,116-日本,117-约旦,118-科威特,119-老挝,120-黎巴嫩,121-中国澳门,122-马来西亚,123-马尔代夫,124-蒙古,125-尼泊尔,126-阿曼,127-巴基斯坦,128-巴勒斯坦,129-菲律宾,130-卡塔尔,131-沙特阿拉伯,132-新加坡,133-韩国,134-斯里兰卡,135-叙利亚,136-泰国,137-土耳其,138-阿拉伯联合酋长国,139-也门共和国,141-越南,142-中国,143-中国台湾,144-东帝汶,199-亚洲其他,200-非洲,201-阿尔及利亚,202-安哥拉,203-贝宁,204-博茨瓦那,205-布隆迪,206-喀麦隆,207-加那利群岛,208-佛得角,209-中非,210-塞卜泰(休达),211-乍得,212-科摩罗,213-刚果,214-吉布提,215-埃及,216-赤道几内亚,217-埃塞俄比亚,218-加蓬,219-冈比亚,220-加纳,221-几内亚,222-几内亚(比绍),223-科特迪瓦,224-肯尼亚,225-利比里亚,226-利比亚,227-马达加斯加,228-马拉维,229-马里,230-毛里塔尼亚,231-毛里求斯,232-摩洛哥,233-莫桑比克,234-纳米比亚,235-尼日尔,236-尼日利亚,237-留尼汪,238-卢旺达,239-圣多美和普林西比,240-塞内加尔,241-塞舌尔,242-塞拉利昂,243-索马里,244-南非,245-西撒哈拉,246-苏丹,247-坦桑尼亚,248-多哥,249-突尼斯,250-乌干达,251-布基纳法索,252-扎伊尔,253-赞比亚,254-津巴布韦,255-莱索托,256-梅利利亚,257-斯威士兰,258-厄立特里亚,259-马约特岛,260-刚果民主共和国,299-非洲其他,300-欧洲,301-比利时,302-丹麦,303-英国,304-德国,305-法国,306-爱尔兰,307-意大利,308-卢森堡,309-荷兰,310-希腊,311-葡萄牙,312-西班牙,313-阿尔巴尼亚,314-安道尔,315-奥地利,316-保加利亚,318-芬兰,320-直布罗陀,321-匈牙利,322-冰岛,323-列支敦士登,324-马耳他,325-摩纳哥,326-挪威,327-波兰,328-罗马尼亚,329-圣马力诺,330-瑞典,331-瑞士,334-爱沙尼亚,335-拉脱维亚,336-立陶宛,337-格鲁吉亚,338-亚美尼亚,339-阿塞拜疆,340-白俄罗斯,341-哈萨克斯坦,342-吉尔吉斯,343-摩尔多瓦,344-俄罗斯,345-塔吉克斯坦,346-土库曼斯坦,347-乌克兰,348-乌兹别克斯坦,349-南斯拉夫联盟共和国,350-斯洛文尼亚共和国,351-克罗地亚共和国,352-捷克共和国,353-斯洛伐克共和国,354-前南斯拉夫马其顿共和国,355-波斯尼亚-黑塞哥维那共和国,356-法罗群岛,357-梵蒂冈城国,358-塞尔维亚和黑山,359-塞尔维亚共和国,360-黑山共和国,361-苏联,399-欧洲其他,400-拉丁美洲,401-安提瓜和巴布达,402-阿根廷,403-阿鲁巴岛,404-巴哈马,405-巴巴多斯,406-伯利兹,408-玻利维亚,409-博内尔,410-巴西,411-开曼群岛,412-智利,413-哥伦比亚,414-多米尼克,415-哥斯达黎加,416-古巴,417-库腊索岛,418-多米尼加共和国,419-厄瓜多尔,420-法属圭亚那,421-格林纳达,422-瓜德罗普,423-危地马拉,424-圭亚那,425-海地,426-洪都拉斯,427-牙买加,428-马提尼克,429-墨西哥,430-蒙特塞拉特,431-尼加拉瓜,432-巴拿马,433-巴拉圭,434-秘鲁,435-波多黎各,436-萨巴,437-圣卢西亚,438-圣马丁岛,439-圣文森特和格林纳丁斯,440-萨尔瓦多,441-苏里南,442-特立尼达和多巴哥,443-特克斯和凯科斯群岛,444-乌拉
圭,445-委内瑞拉,446-英属维尔京群岛,447-圣其茨--尼维斯,448-荷属安地列斯群岛,499-拉丁美洲其他,500-北美洲,501-加拿大,502-美国,503-格陵兰,504-百慕大,505-泽西岛,599-北美洲其他,600-大洋洲,601-澳大利亚,602-库克群岛,603-斐济,604-盖比群岛,605-马克萨斯群岛,606-瑙鲁,607-新喀里多尼亚,608-瓦努阿图,609-新西兰,610-诺福克岛,611-巴布亚新几内亚,612-社会群岛,613-所罗门群岛,614-汤加,615-土阿莫土群岛,616-土布艾群岛,617-萨摩亚,618-基里巴斯,619-图瓦卢,620-密克罗尼西亚联邦,621-马绍尔群岛共和国,622-贝劳共和国,623-帕劳共和国,624-瓦利斯和浮图纳,625-法属波利尼西亚,626-圣皮埃尔和密克隆,699-大洋洲其他,701-国(地)别不详的,702-联合国和其他国际组织,703-亚太经济合作组织,704-东南亚国家联盟,705-欧洲联盟,706-独立国家联合体,707-经济合作与发展组织(OECD),708-经合组织北美,709-经合组织亚洲大洋洲,710-经合组织欧洲,711-欧英EFTA,901-欧元区,903-欧盟15国,905-欧盟25国,906-欧盟27国,907-欧盟其他,909-全球。\"}, {\"column\": \"Business\", \"desc\": \"公司业务\"}, {\"column\": \"InduCHKE\", \"desc\": \"所属行业-港交所;所属行业-港交所(InduCHKE):目前字段在该表已经不维护。\"}, {\"column\": \"InduCHS\", \"desc\": \"所属行业-恒生;所属行业-恒生(InduCHS):目前字段在该表已经不维护,可以在 港股公司行业划分表HK_ExgIndustry获取到对应的行业分类。\"}, {\"column\": \"Chairman\", \"desc\": \"主席\"}, {\"column\": \"CompanySecretary\", \"desc\": \"公司秘书\"}, {\"column\": \"CertifiedAccountant\", \"desc\": \"合资格会计师\"}, {\"column\": \"RegisteredOffice\", \"desc\": \"注册办事处\"}, {\"column\": \"GeneralOffice\", \"desc\": \"总办事处及主要营业地点\"}, {\"column\": \"Registrars\", \"desc\": \"股份过户处(香港)\"}, {\"column\": \"Tel\", \"desc\": \"电话\"}, {\"column\": \"Fax\", \"desc\": \"传真\"}, {\"column\": \"Eail\", \"desc\": \"邮箱\"}, {\"column\": \"Website\", \"desc\": \"公司网址\"}, {\"column\": \"BriefIntroduction\", \"desc\": \"公司简介\"}, {\"column\": \"CompanyType\", \"desc\": \"企业类别;公司类别(CompanyType)与(CT_SystemConst)表中的DM字段关联,令LB = 1501,得到公司类别的具体描述:1-境内注册内地国资控制,2-境内注册内地个人控制,5-境外注册内地国资控制,6-境外注册内地个人控制,9-其他。\"}, {\"column\": \"CompanyTypeDesc\", \"desc\": \"公司类别描述\"}, {\"column\": \"ChiName\", \"desc\": \"中文名称\"}, {\"column\": \"AuditInstitution\", \"desc\": \"审计机构\"}, {\"column\": \"RegCapital\", \"desc\": \"注册资本(元)\"}, {\"column\": \"RegCapitalCurrency\", \"desc\": \"注册资本货币单位;注册资本货币单位(RegCapitalCurrency)与(CT_SystemConst)表中的DM字段关联,令LB = 
1548,得到注册资本货币单位的具体描述:1000-美元,1100-港元,1110-印度卢比,1120-印度尼西亚卢比,1130-伊朗里亚尔,1140-波兰兹罗提,1150-匈牙利福林,1160-日本元,1161-欧洲日元(离岸),1170-约旦第纳尔,1180-科威特第纳尔,1190-阿联酋迪拉姆,1200-亚美尼亚德拉姆,1210-澳门元,1220-马来西亚林吉特,1230-安第列斯群岛盾,1240-安哥拉宽扎,1250-尼泊尔卢比,1260-哈萨克斯坦坚戈,1270-巴基斯坦卢比,1280-阿鲁巴岛弗罗林,1290-菲律宾比索,1300-阿塞拜疆马纳特,1310-波斯尼亚马克,1320-新加坡元,1330-韩国元,1340-孟加拉塔卡,1350-百慕大元,1360-泰国铢,1370-沙特里亚尔,1380-文莱林吉特,1390-不丹努尔特鲁姆,1400-博茨瓦纳普拉,1410-白俄罗斯卢布,1420-人民币元,1430-台湾元,1440-伯利兹元,1450-南苏丹镑,1460-世界投资报告法郎,1470-佛得角埃斯库多,1480-厄立特里亚纳克法,1490-埃塞俄比亚比尔,1500-福克兰群岛镑,1510-格鲁吉亚拉里,1520-直布罗陀镑,1530-克罗地亚库纳,1540-以色列新谢克尔,1550-吉尔吉斯斯坦索姆,1560-开曼群岛元,1570-莱索托马洛蒂,1580-摩尔多瓦列伊,1590-马其顿第纳尔,1600-蒙古图格里克,1610-马拉维克瓦查,1620-梅蒂卡尔,1630-纳米比亚元,1640-巴布亚新几内亚基那,1650-塞尔维亚第纳尔,1660-圣赫勒拿群岛磅,1670-圣多美和普林西比多布拉,1680-太平洋法郎,1690-非共体法郎,1700-斯威士兰里兰吉尼,1710-塔吉克斯坦索莫尼,1720-土库曼斯坦马纳特,1730-汤加潘加,1740-乌克兰格里夫纳,1750-乌兹别克斯坦苏姆,1760-瓦努阿图瓦图,1770-萨摩亚塔拉,1780-中非金融合作法郎,1790-东加勒比元,1800-哈萨克斯坦腾格(废弃),1810-人民币(离岸),2470-坦桑尼亚先令,3000-欧元,3010-比利时法郎,3020-丹麦克朗,3030-英镑,3040-德国马克,3050-法国法郎,3070-意大利里拉,3090-荷兰盾,3120-西班牙比塞塔,3150-奥地利先令,3180-芬兰马克,3260-挪威克朗,3300-瑞典克朗,3310-瑞士法郎,3311-记帐瑞士法郎,3313-清算瑞士法郎,3440-俄罗斯卢布,4100-巴西雷亚尔,4290-墨西哥比索,5010-加拿大元,6010-澳大利亚元,6090-新西兰元,7101-刚果法郎,7103-尼日利亚奈拉,7105-越南盾,7107-肯尼亚先令,7109-卢森堡法郎,7111-摩洛哥迪拉姆,7113-南非兰特,7115-斯里兰卡卢比,7117-新苏丹磅,7119-也门里亚尔(废弃),7121-爱尔兰镑,8000-阿尔巴尼亚列克,8010-阿尔及利亚第纳尔,8020-阿富汗尼,8030-阿根廷比索,8040-也门里亚尔,8050-阿曼里亚尔,8060-埃及镑,8070-巴巴多斯元,8080-巴哈马元,8090-巴拉圭瓜拉尼,8100-巴林第纳尔,8110-巴拿马巴波亚,8120-保加利亚列弗,8130-冰岛克朗,8140-波兰兹罗提(废弃),8150-玻利维亚诺,8160-布隆迪法郎,8170-朝鲜圆,8180-赤道几内亚埃奎勒,8190-多米尼加比索,8200-厄瓜多尔苏克雷,8210-斐济元,8220-冈比亚法拉西,8230-哥伦比亚比索,8240-哥斯达黎加科朗,8250-古巴比索,8260-圭亚那元,8270-海地古德,8280-洪都拉斯伦皮拉,8290-吉布提法郎,8300-几内亚法郎,8310-几内亚比索,8320-加纳塞地,8330-柬埔寨瑞尔,8340-捷克克朗,8350-津巴布韦元,8360-卡塔尔里亚尔,8370-科摩罗法郎,8380-老挝基普,8390-黎巴嫩镑,8400-利比里亚元,8410-利比亚第纳尔,8420-卢旺达法郎,8430-罗马尼亚列伊,8440-马达加斯加阿里亚里,8450-马尔代夫卢比,8460-马耳他镑,8470-毛里求斯卢比,8480-毛里塔尼亚乌吉亚,8490-秘鲁新索尔,8500-缅甸元,8510-也门第纳尔,8520-南斯拉夫新第纳尔,8530-尼加拉瓜科多巴,8540-埃斯库多,8550-萨尔瓦多科朗,8560-塞拉里昂利昂,8570-塞浦路斯镑,8580-塞舌尔卢比,8590-沙特阿拉伯亚尔(废弃),8600-苏里南元,8610-所罗门元,8620-索马里先令,8630-特立尼达多巴哥元,8640-
突尼斯第纳尔,8650-土耳其里拉,8660-危地马拉格查尔,8670-委内瑞拉玻利瓦尔,8680-乌干达先令,8690-乌拉圭新比索,8700-希腊德拉马克,8710-匈牙利福林(废弃),8720-叙利亚镑,8730-牙买加元,8740-伊拉克第纳尔,8750-赞比亚克瓦查,8760-扎伊尔,8770-智利比索,8780-玻利维亚Mvdol基金,8790-智利CUF基金,8800-哥伦比亚实际价值单位,8810-古巴可兑换比索,8820-墨西哥UDI基金,8830-莫桑比克梅蒂卡尔(废弃),8840-东帝汶埃斯库多,9000-本币(废弃),9900-其他货币,9901-本地货币,9990-特别提款权,9999-各币种折合美元。\"}]}, {\"表名\": \"usstockdb.us_companyinfo\", \"表字段\": [{\"column\": \"CompanyCode\", \"desc\": \"公司代码\"}, {\"column\": \"EngName\", \"desc\": \"英文名称\"}, {\"column\": \"EngNameAbbr\", \"desc\": \"英文名称缩写\"}, {\"column\": \"ChiName\", \"desc\": \"中文名称\"}, {\"column\": \"PEOAddress\", \"desc\": \"公司地址\"}, {\"column\": \"PEOCity\", \"desc\": \"城市\"}, {\"column\": \"PEOState\", \"desc\": \"省份\"}, {\"column\": \"PEOZip\", \"desc\": \"邮编\"}, {\"column\": \"PEOStatus\", \"desc\": \"国家\"}, {\"column\": \"PEOTel\", \"desc\": \"电话\"}, {\"column\": \"BusinessDcrp\", \"desc\": \"公司简介\"}, {\"column\": \"BriefIntroText\", \"desc\": \"公司简介\"}, {\"column\": \"EstablishmentDate\", \"desc\": \"成立日期\"}, {\"column\": \"CompanyType\", \"desc\": \"公司类型;公司类型(CompanyType)与(CT_SystemConst)表中的DM字段关联,令LB = 2261,得到公司类型的具体描述:1-美国联邦存款保险公司(FDIC)的银行分支,2-高等院校,3-融资子公司,4-政府,5-控股公司,6-合营企业,7-非盈利性组织,8-上市公司,9-非上市公司,10-子公司,11-已停止经营解散的实体。\"}, {\"column\": \"BriefIntroTextEng\", \"desc\": \"英文公司简介\"}, {\"column\": \"Fax\", \"desc\": \"传真\"}, {\"column\": \"RegCountry\", \"desc\": \"注册地国家;注册地国家(RegCountry):与“国家城市代码表(LC_AreaCode)”中的“地区内部编码(AreaInnerCode)”关联,得到注册地国家的相关信息。\"}, {\"column\": \"RegState\", \"desc\": \"注册地省份/州\"}, {\"column\": \"BusinessDcrpEng\", \"desc\": \"英文业务简介\"}, {\"column\": \"IfHeadOffice\", \"desc\": \"是否公司总部;是否公司总部(IfHeadOffice)与(CT_SystemConst)表中的DM字段关联,令LB=999 AND DM IN (1,2),得到是否公司总部的具体描述:1-是,2-否。\"}, {\"column\": \"LinkAddress\", \"desc\": \"链接地址\"}, {\"column\": \"CountryCode\", \"desc\": \"国家代码;与“国家城市代码表(LC_AreaCode)”中的“地区内部编码(AreaInnerCode)”关联,得到国家的相关信息。\"}, {\"column\": \"EstablishmentDatePreci\", \"desc\": 
\"成立日期精度;成立日期精度(EstablishmentDatePreci)与(CT_SystemConst)表中的DM字段关联,令LB=102 AND DM in (27,28,29),得到成立日期精度的具体描述:27-年,28-月,29-日。\"}]}]\n",
215 | "\n",
216 | "请选择column,务必遵循输出的格式要求。\n",
217 | "\n",
218 | "\n",
219 | ">>>>> Agent【Check_db_structure.columns_selector】 Said:\n",
220 | "【分析】\n",
221 | "用户询问湘电股份的信息披露网址。\n",
222 | "\n",
223 | "【当前的表之间相互关联的字段】\n",
224 | "表astockbasicinfodb.lc_stockarchives和constantdb.secumain之间通过CompanyCode关联。\n",
225 | "\n",
226 | "【信息所在字段】\n",
227 | "- astockbasicinfodb.lc_stockarchives.DisclosureWebsites: 这个字段可能包含信息披露网址的信息,对应用户提问中的“信息披露网址”。\n",
228 | "\n",
229 | "【筛选条件所在字段】\n",
230 | "- constantdb.secumain.CompanyCode: 这个字段可能包含公司代码的信息,用于筛选湘电股份的相关信息。\n",
231 | "\n",
232 | "【选中的字段的清单】\n",
233 | "```json\n",
234 | "{\"astockbasicinfodb.lc_stockarchives\": [\"DisclosureWebsites\"], \"constantdb.secumain\": [\"CompanyCode\"]}\n",
235 | "```\n",
236 | "\n",
237 | "\n",
238 | ">>>>> 【user】 Said:\n",
239 | "湘电股份的信息披露网址是哪个网站?\n",
240 | "\n",
241 | "\n",
242 | ">>>>> Agent【Sql_query.master】 Said:\n",
243 | "【已知信息】\n",
244 | "- 湘电股份的关联信息有:所在数据表是constantdb.secumain;InnerCode(证券内部编码)是1551;CompanyCode(公司代码)是1387;ChiName(中文名称)是湘潭电机股份有限公司;EngName(英文名称)是Xiangtan Electric Manufacturing Co., Ltd.;SecuCode(证券代码)是600416;ChiNameAbbr(中文名称缩写)是湘电股份;EngNameAbbr(英文名称缩写)是XEMC;SecuAbbr(证券简称)是湘电股份;ChiSpelling(拼音证券简称)是XDGF。\n",
245 | "\n",
246 | "【当前阶段要获取的信息】\n",
247 | "- 湘电股份的信息披露网址\n",
248 | "\n",
249 | "【信息所在字段】\n",
250 | "- DisclosureWebsites\n",
251 | "\n",
252 | "【筛选条件所在字段】\n",
253 | "- CompanyCode\n",
254 | "\n",
255 | "【SQL语句的思路】\n",
256 | "- 从astockbasicinfodb.lc_stockarchives表中查询DisclosureWebsites字段\n",
257 | "- 使用CompanyCode作为筛选条件,值为1387\n",
258 | "\n",
259 | "【执行SQL语句】\n",
260 | "```exec_sql\n",
261 | "SELECT DisclosureWebsites \n",
262 | "FROM astockbasicinfodb.lc_stockarchives \n",
263 | "WHERE CompanyCode = 1387\n",
264 | "```\n",
265 | "\n",
266 | "【上述SQL语句的含义】\n",
267 | "- 从astockbasicinfodb.lc_stockarchives表中获取CompanyCode为1387的记录的DisclosureWebsites字段值,即湘电股份的信息披露网址。\n",
268 | "\n",
269 | ">>>>> 查询ql:\n",
270 | "SELECT DisclosureWebsites \n",
271 | "FROM astockbasicinfodb.lc_stockarchives \n",
272 | "WHERE CompanyCode = 1387;\n",
273 | "查询结果:\n",
274 | "[{\"DisclosureWebsites\": \"http://www.sse.com.cn\"}]\n",
275 | "\n",
276 | "\n",
277 | ">>>>> 【user】 Said:\n",
278 | "查询SQL:\n",
279 | "SELECT DisclosureWebsites \n",
280 | "FROM astockbasicinfodb.lc_stockarchives \n",
281 | "WHERE CompanyCode = 1387;\n",
282 | "查询结果:\n",
283 | "[{\"DisclosureWebsites\": \"http://www.sse.com.cn\"}]\n",
284 | "\n",
285 | "请理解查询结果\n",
286 | "\n",
287 | "\n",
288 | ">>>>> Agent【Sql_query.understand_query_result】 Said:\n",
289 | "查询结果表明: 该查询从表astockbasicinfodb.lc_stockarchives中选取了字段DisclosureWebsites,条件是CompanyCode等于1387。结果显示,公司代码为1387的公司的信息披露网址为http://www.sse.com.cn。\n",
290 | "\n",
291 | "\n",
292 | ">>>>> 【user】 Said:\n",
293 | "查询SQL:\n",
294 | "SELECT DisclosureWebsites \n",
295 | "FROM astockbasicinfodb.lc_stockarchives \n",
296 | "WHERE CompanyCode = 1387;\n",
297 | "查询结果:\n",
298 | "[{\"DisclosureWebsites\": \"http://www.sse.com.cn\"}]\n",
299 | "\n",
300 | "查询结果表明: 该查询从表astockbasicinfodb.lc_stockarchives中选取了字段DisclosureWebsites,条件是CompanyCode等于1387。结果显示,公司代码为1387的公司的信息披露网址为http://www.sse.com.cn。\n",
301 | "\n",
302 | "请检查筛选条件是否存在问题,比如时间日期字段没有用DATE()或YEAR()格式化?当然,如果没问题,那么就根据结果考虑下一步;那么当前掌握的信息是否能够回答\"湘电股份的信息披露网址是哪个网站?\"?还是要继续执行下一阶段SQL查询?\n",
303 | "\n",
304 | "\n",
305 | ">>>>> Agent【Sql_query.master】 Said:\n",
306 | "【已知信息】\n",
307 | "- 湘电股份的关联信息有:所在数据表是constantdb.secumain;InnerCode(证券内部编码)是1551;CompanyCode(公司代码)是1387;ChiName(中文名称)是湘潭电机股份有限公司;EngName(英文名称)是Xiangtan Electric Manufacturing Co., Ltd.;SecuCode(证券代码)是600416;ChiNameAbbr(中文名称缩写)是湘电股份;EngNameAbbr(英文名称缩写)是XEMC;SecuAbbr(证券简称)是湘电股份;ChiSpelling(拼音证券简称)是XDGF。\n",
308 | "- 查询结果:湘电股份的信息披露网址为http://www.sse.com.cn。\n",
309 | "\n",
310 | "【当前阶段要获取的信息】\n",
311 | "- 无\n",
312 | "\n",
313 | "【信息所在字段】\n",
314 | "- 无\n",
315 | "\n",
316 | "【筛选条件所在字段】\n",
317 | "- 无\n",
318 | "\n",
319 | "【SQL语句的思路】\n",
320 | "- 无\n",
321 | "\n",
322 | "【执行SQL语句】\n",
323 | "```exec_sql\n",
324 | "无\n",
325 | "```\n",
326 | "\n",
327 | "【上述SQL语句的含义】\n",
328 | "- 无\n",
329 | "\n",
330 | "当前掌握的信息已经能够回答\"湘电股份的信息披露网址是哪个网站?\"的问题。湘电股份的信息披露网址是http://www.sse.com.cn,无需继续执行下一阶段SQL查询。\n",
331 | "\n",
332 | "\n",
333 | ">>>>> 【user】 Said:\n",
334 | "充分尊重前面给出的结论,回答问题:\"湘电股份的信息披露网址是哪个网站?\"\n",
335 | "\n",
336 | "\n",
337 | ">>>>> Agent【Sql_query.summary】 Said:\n",
338 | "湘电股份的信息披露网址是http://www.sse.com.cn\n",
339 | ">>>>> Answer: 湘电股份的信息披露网址是http://www.sse.com.cn\n",
340 | ">>>>> Used Time: 1m 2s\n",
341 | "\n",
342 | "{\n",
343 | " \"id\": \"tttt----1----3-1-1\",\n",
344 | " \"question\": \"湘电股份的信披网址是哪个网站?\",\n",
345 | " \"answer\": \"湘电股份的信息披露网址是http://www.sse.com.cn\",\n",
346 | " \"usage_tokens\": {\n",
347 | " \"extract_company\": 132,\n",
348 | " \"rewrite_question\": 280,\n",
349 | " \"Check_db_structure\": 12552,\n",
350 | " \"Sql_query\": 7815\n",
351 | " },\n",
352 | " \"use_time\": \"1m 2s\",\n",
353 | " \"facts\": [\n",
354 | " \"湘电股份的关联信息有:[所在数据表是constantdb.secumain;InnerCode(证券内部编码)是1551;CompanyCode(公司代码)是1387;ChiName(中文名称)是湘潭电机股份有限公司;EngName(英文名称)是Xiangtan Electric Manufacturing Co., Ltd.;SecuCode(证券代码)是600416;ChiNameAbbr(中文名称缩写)是湘电股份;EngNameAbbr(英文名称缩写)是XEMC;SecuAbbr(证券简称)是湘电股份;ChiSpelling(拼音证券简称)是XDGF;]\"\n",
355 | " ],\n",
356 | " \"rewrited_question\": \"湘电股份的信息披露网址是哪个网站?\",\n",
357 | " \"sql_results\": [\n",
358 | " \"查询结果表明: 该查询从表astockbasicinfodb.lc_stockarchives中选取了字段DisclosureWebsites,条件是CompanyCode等于1387。结果显示,公司代码为1387的公司的信息披露网址为http://www.sse.com.cn。\"\n",
359 | " ]\n",
360 | "}\n"
361 | ]
362 | }
363 | ],
364 | "source": [
365 | "t_idx = 0 # team index\n",
366 | "q_idx = 0 # question index in team\n",
367 | "sql_query.clear_history_facts()\n",
368 | "question_team = config.all_question[t_idx]\n",
369 | "question_item = question_team[\"team\"][q_idx]\n",
370 | "show(question_item)\n",
371 | "facts = [] # pylint: disable=invalid-name\n",
372 | "qas = [] # pylint: disable=invalid-name\n",
373 | "qid: str = question_item[\"id\"].strip() # 声明qid的类型为str\n",
374 | "question = ajust_org_question(question_item[\"question\"])\n",
375 | "for i in range(q_idx):\n",
376 | " qas.extend(\n",
377 | " [\n",
378 | " {\"role\": \"user\", \"content\": ajust_org_question(question_team[\"team\"][i][\"question\"])},\n",
379 | " {\"role\": \"assistant\", \"content\": question_team[\"team\"][i][\"answer\"]},\n",
380 | " ]\n",
381 | " )\n",
382 | " if \"facts\" in question_team[\"team\"][i]:\n",
383 | " facts = question_team[\"team\"][i][\"facts\"]\n",
384 | " if \"sql_results\" in question_team[\"team\"][i]:\n",
385 | " sql_query.history_facts = copy.deepcopy(question_team[\"team\"][i][\"sql_results\"])\n",
386 | "\n",
387 | "start_time = time.time()\n",
388 | "log_file_path = config.ROOT_DIR + f\"/output/{qid}.log\"\n",
389 | "open(log_file_path, \"w\", encoding=\"utf-8\").close()\n",
390 | "setup_logger(\n",
391 | " log_file=log_file_path,\n",
392 | " log_level=logging.DEBUG,\n",
393 | ")\n",
394 | "logger = get_logger()\n",
395 | "\n",
396 | "print(f\">>>>> id: {qid}\")\n",
397 | "print(f\">>>>> Original Question: {question}\")\n",
398 | "logger.debug(\"\\n>>>>> Original Question: %s\\n\", question)\n",
399 | "\n",
400 | "# 获取实体内部代码\n",
401 | "agent_extract_company.clear_history()\n",
402 | "answer, _ = agent_extract_company.answer(\n",
403 | " (\n",
404 | " \"\"\"提取下面这段文字中的实体(如公司名、股票代码、拼音缩写等),如果识别结果是空,那么就回复No Entities.\"\"\"\n",
405 | " f'''\"{question}\"'''\n",
406 | " )\n",
407 | ")\n",
408 | "if answer != \"\" and answer not in facts:\n",
409 | " facts.append(answer)\n",
410 | "\n",
411 | "# rewrite question\n",
412 | "agent_rewrite_question.clear_history()\n",
413 | "qas_content = [f\"Question: {qa['content']}\" if qa[\"role\"] == \"user\" else f\"Answer: {qa['content']}\" for qa in qas]\n",
414 | "new_question, _ = agent_rewrite_question.answer(\n",
415 | " (\n",
416 | " \"历史问答:无。\\n\"\n",
417 | " if len(qas_content) == 0\n",
418 | " else \"下面是顺序的历史问答:\\n'''\\n\" + \"\\n\".join(qas_content) + \"\\n'''\\n\"\n",
419 | " )\n",
420 | " + f\"现在用户继续提问,请根据已知信息,理解当前这个问题的完整含义,并重写这个问题使得单独拿出来看仍然能够正确理解:{question}\"\n",
421 | ")\n",
422 | "print(f\">>>>> Rewrited Question: {new_question}\")\n",
423 | "\n",
424 | "# 注入已知事实\n",
425 | "key_facts = \"已知事实\"\n",
426 | "if len(facts) > 0:\n",
427 | " kv = {key_facts: \"\\n---\\n\".join(facts)}\n",
428 | " sql_query.agent_master.add_system_prompt_kv(kv)\n",
429 | " check_db_structure.agent_table_selector.add_system_prompt_kv(kv)\n",
430 | " check_db_structure.agent_column_selector.add_system_prompt_kv(kv)\n",
431 | "else:\n",
432 | " sql_query.agent_master.del_system_prompt_kv(key_facts)\n",
433 | " check_db_structure.agent_table_selector.del_system_prompt_kv(key_facts)\n",
434 | " check_db_structure.agent_column_selector.del_system_prompt_kv(key_facts)\n",
435 | "print(f\"\\n>>>>> {key_facts}:\\n\" + \"\\n---\\n\".join(facts))\n",
436 | "logger.debug(\"\\n>>>>> %s:\\n%s\", key_facts, \"\\n---\\n\".join(facts))\n",
437 | "\n",
438 | "# 注入历史对话\n",
439 | "key_qas = \"历史对话\"\n",
440 | "if len(qas_content) > 0:\n",
441 | " kv = {key_qas: \"\\n\".join(qas_content)}\n",
442 | " sql_query.agent_master.add_system_prompt_kv(kv)\n",
443 | " check_db_structure.agent_table_selector.add_system_prompt_kv(kv)\n",
444 | " check_db_structure.agent_column_selector.add_system_prompt_kv(kv)\n",
445 | "else:\n",
446 | " sql_query.agent_master.del_system_prompt_kv(key_qas)\n",
447 | " check_db_structure.agent_table_selector.del_system_prompt_kv(key_qas)\n",
448 | " check_db_structure.agent_column_selector.del_system_prompt_kv(key_qas)\n",
449 | "\n",
450 | "# 搜索相关数据库结构\n",
451 | "check_db_structure.clear_history()\n",
452 | "res = check_db_structure.run(inputs={\"messages\": [{\"role\": \"user\", \"content\": new_question}]})\n",
453 | "db_info = res[\"content\"]\n",
454 | "\n",
455 | "# 查询数据库回答用户问题\n",
456 | "sql_query.clear_history()\n",
457 | "res = sql_query.run(\n",
458 | " inputs={\n",
459 | " \"messages\": [\n",
460 | " {\"role\": \"assistant\", \"content\": db_info},\n",
461 | " {\"role\": \"user\", \"content\": new_question},\n",
462 | " ]\n",
463 | " }\n",
464 | ")\n",
465 | "question_item[\"answer\"] = res[\"content\"]\n",
466 | "\n",
467 | "# Caching\n",
468 | "qas.extend(\n",
469 | " [\n",
470 | " {\"role\": \"user\", \"content\": question},\n",
471 | " {\"role\": \"assistant\", \"content\": question_item[\"answer\"]},\n",
472 | " ]\n",
473 | ")\n",
474 | "elapsed_time = time.time() - start_time\n",
475 | "question_item[\"usage_tokens\"] = {\n",
476 | " agent_extract_company.name: agent_extract_company.usage_tokens,\n",
477 | " agent_rewrite_question.name: agent_rewrite_question.usage_tokens,\n",
478 | " check_db_structure.name: check_db_structure.usage_tokens,\n",
479 | " sql_query.name: sql_query.usage_tokens,\n",
480 | "}\n",
481 | "minutes, seconds = divmod(elapsed_time, 60)\n",
482 | "question_item[\"use_time\"] = f\"{int(minutes)}m {int(seconds)}s\"\n",
483 | "question_item[\"facts\"] = copy.deepcopy(facts)\n",
484 | "question_item[\"rewrited_question\"] = new_question\n",
485 | "question_item[\"sql_results\"] = copy.deepcopy(sql_query.history_facts)\n",
486 | "\n",
487 | "print(f\">>>>> Answer: {question_item['answer']}\")\n",
488 | "print(f\">>>>> Used Time: {int(minutes)}m {int(seconds)}s\\n\")\n",
489 | "show(question_item)"
490 | ]
491 | },
492 | {
493 | "cell_type": "markdown",
494 | "metadata": {},
495 | "source": [
496 | "## 保存结果"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": 14,
502 | "metadata": {},
503 | "outputs": [],
504 | "source": [
505 | "with open(config.ROOT_DIR + \"/output/all_question.json\", \"w\", encoding=\"utf-8\") as f:\n",
506 | " json.dump(config.all_question, f, ensure_ascii=False, indent=4) # 添加缩进以便于阅读\n",
507 | "\n",
508 | "result_commit = copy.deepcopy(config.all_question)\n",
509 | "for q_team in result_commit:\n",
510 | " for q_item in q_team[\"team\"]:\n",
511 | " if \"usage_tokens\" in q_item:\n",
512 | " del q_item[\"usage_tokens\"]\n",
513 | " if \"use_time\" in q_item:\n",
514 | " del q_item[\"use_time\"]\n",
515 | " if \"iterate_num\" in q_item:\n",
516 | " del q_item[\"iterate_num\"]\n",
517 | " if \"facts\" in q_item:\n",
518 | " del q_item[\"facts\"]\n",
519 | " if \"rewrited_question\" in q_item:\n",
520 | " del q_item[\"rewrited_question\"]\n",
521 | " if \"sql_results\" in q_item:\n",
522 | " del q_item[\"sql_results\"]\n",
523 | "\n",
524 | "with open(config.ROOT_DIR + \"/output/Eva_Now_result.json\", \"w\", encoding=\"utf-8\") as f:\n",
525 | " json.dump(result_commit, f, ensure_ascii=False, indent=4) # 添加缩进以便于阅读"
526 | ]
527 | }
528 | ],
529 | "metadata": {
530 | "kernelspec": {
531 | "display_name": "Python 3 (ipykernel)",
532 | "language": "python",
533 | "name": "python3"
534 | },
535 | "language_info": {
536 | "codemirror_mode": {
537 | "name": "ipython",
538 | "version": 3
539 | },
540 | "file_extension": ".py",
541 | "mimetype": "text/x-python",
542 | "name": "python",
543 | "nbconvert_exporter": "python",
544 | "pygments_lexer": "ipython3",
545 | "version": "3.12.2"
546 | }
547 | },
548 | "nbformat": 4,
549 | "nbformat_minor": 4
550 | }
551 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/requirements.txt:
--------------------------------------------------------------------------------
1 | zhipuai>=2.1.5.20241204
2 | pandas>=2.2.3
3 | tqdm>=4.67.1
4 | ollama>=0.4.6
5 | colorama>=0.4.6
6 | mysql-connector-python>=9.2.0
7 | jieba>=0.42.1
--------------------------------------------------------------------------------
/baseline/howard_baseline/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MetaGLM/FinGLM2/39cf08c54128e641eef3284f2b68d6d4c0fadb5d/baseline/howard_baseline/src/__init__.py
--------------------------------------------------------------------------------
/baseline/howard_baseline/src/agent.py:
--------------------------------------------------------------------------------
1 | """
2 | This module provides implementations of agent
3 | and their interactions with various APIs.
4 | """
5 |
6 | import os
7 | from dataclasses import dataclass, field
8 | from typing import Optional, Callable, Tuple, List, Dict
9 | from src.llm import LLM, DEBUG_OPTION_PRINT_TOOL_CALL_RESULT
10 | from src.log import get_logger
11 |
12 |
@dataclass
class AgentConfig:
    """Configuration settings for the Agent class."""

    llm: LLM  # language model backend used to generate responses
    name: str  # agent name, used in debug/log output
    role: str  # role description, becomes the "角色描述" section of the system prompt
    constraint: Optional[str] = None  # optional "约束要求" section of the system prompt
    output_format: Optional[str] = None  # optional "输出格式" section of the system prompt
    knowledge: Optional[str] = None  # optional "知识库" section of the system prompt
    tools: Optional[List[Dict]] = None  # tool schemas forwarded to the LLM
    funcs: Optional[List[Callable]] = None  # tool implementations; keyed by __name__ inside Agent
    retry_limit: int = 3  # max attempts per chat() call before giving up
    enable_history: bool = True  # keep conversation history between answer() calls
    temperature: Optional[float] = None  # sampling temperature; stored under options["temperature"]
    top_p: Optional[float] = None  # nucleus sampling value; stored under options["top_p"]
    stream: Optional[bool] = None  # stream LLM output; None lets the LLM pick its default
    debug_tool_call_result: bool = True  # print/log tool-call results when debugging
    system_prompt_kv: Optional[Dict] = field(default_factory=dict)  # extra "## key\nvalue" prompt sections
    pre_process: Optional[Callable[["Agent", dict], None]] = None  # hook run on messages before each chat()
    post_process: Optional[Callable[[str], str]] = None  # hook run on a successful response
    max_history_num: int = 30  # history length beyond which older turns get compressed
35 |
36 |
class Agent:
    """Represents an agent that interacts with various APIs using a language model.

    The agent wraps an LLM with a structured system prompt (role, constraints,
    output format, knowledge, extra key/value sections), retry logic, optional
    conversation history with automatic compression, and token accounting.
    """

    def __init__(self, config: AgentConfig):
        """Initialize the agent from an AgentConfig; see AgentConfig for field meanings."""
        self.name = config.name
        self.role = config.role
        self.llm = config.llm
        self.constraint = config.constraint
        self.output_format = config.output_format
        self.knowledge = config.knowledge
        self.tools = config.tools
        self.history = []
        self.max_history_num = config.max_history_num
        self.usage_tokens = 0  # total number of tokens consumed so far
        self.retry_limit = config.retry_limit
        self.enable_history = config.enable_history
        # Only explicitly-configured sampling options are forwarded to the LLM.
        self.options = {}
        if config.temperature is not None:
            self.options["temperature"] = config.temperature
        if config.top_p is not None:
            self.options["top_p"] = config.top_p
        self.stream = config.stream
        self.debug_tool_call_result = config.debug_tool_call_result
        # Map tool functions by name so the LLM layer can resolve tool calls.
        if config.funcs is not None:
            self.funcs = {func.__name__: func for func in config.funcs}
        else:
            self.funcs = None
        if config.system_prompt_kv is not None:
            self.system_prompt_kv = config.system_prompt_kv
        else:
            self.system_prompt_kv = {}
        self.pre_process = config.pre_process
        self.post_process = config.post_process

    def clear_history(self):
        """Clears the agent's conversation history and resets token counts."""
        self.history = []
        self.usage_tokens = 0

    def add_system_prompt_kv(self, kv: dict):
        """Sets (or overwrites) system prompt key-value pairs for the agent."""
        for k, v in kv.items():
            self.system_prompt_kv[k] = v

    def del_system_prompt_kv(self, key: str):
        """Deletes the specified key from the system prompt key-value pairs for the agent."""
        if key in self.system_prompt_kv:
            del self.system_prompt_kv[key]

    def clear_system_prompt_kv(self):
        """
        Clear the agent's additional system prompt settings.
        """
        self.system_prompt_kv = {}

    def get_system_prompt(self):
        """Generates and returns the system prompt based on the agent's attributes.

        Sections are emitted in a fixed order: role, constraint, output format,
        knowledge, then any extra key/value sections, each under a "## key" header.
        """
        system_prompt = f"## 角色描述\n{self.role}"
        if self.constraint is not None:
            system_prompt += f"\n\n## 约束要求\n{self.constraint}"
        if self.output_format is not None:
            system_prompt += f"\n\n## 输出格式\n{self.output_format}"
        if self.knowledge is not None:
            system_prompt += f"\n\n## 知识库\n{self.knowledge}"
        for key, value in self.system_prompt_kv.items():
            system_prompt += f"\n\n## {key}\n{value}"
        return system_prompt

    def chat(self, messages: list[dict]) -> Tuple[str, int]:
        """Attempts to generate a response from the language model, retrying if necessary.

        Retries up to retry_limit times; on total failure returns an error string
        and 0 extra tokens. On success, optionally records history and compresses
        the older half of it once it exceeds max_history_num entries.
        return:
        - str: assistant's answer
        - int: usage_tokens
        """
        debug_mode = os.getenv("DEBUG", "0") == "1"
        show_llm_input_msg = os.getenv("SHOW_LLM_INPUT_MSG", "0") == "1"
        logger = get_logger()

        if self.pre_process is not None:
            self.pre_process(self, messages)
        usage_tokens = 0
        for attempt in range(self.retry_limit):
            if attempt > 0:
                if debug_mode:
                    print(f"\n重试第 {attempt} 次...\n")
                logger.info("\n重试第 %d 次...\n", attempt)
            # NOTE(review): `response` is reset to "" BEFORE the retry messages
            # are built below, so on attempt > 0 the appended assistant message
            # is always empty rather than the previous failed answer — looks
            # like the previous response was meant to be fed back; confirm.
            response = ""
            try:
                msgs = (
                    messages
                    if attempt == 0
                    else messages
                    + [{"role": "assistant", "content": response}, {"role": "user", "content": "请修正后重试"}]
                )
                if show_llm_input_msg:
                    if debug_mode:
                        print(f"\n\n>>>>> 【{msgs[-1]['role']}】 Said:\n{msgs[-1]['content']}")
                    logger.debug("\n\n>>>>> 【%s】 Said:\n%s", msgs[-1]["role"], msgs[-1]["content"])
                if debug_mode:
                    print(f"\n\n>>>>> Agent【{self.name}】 Said:")
                logger.debug("\n\n>>>>> Agent【%s】 Said:\n", self.name)
                response, token_count, ok = self.llm.generate_response(
                    system=self.get_system_prompt(),
                    messages=msgs,
                    tools=self.tools,
                    funcs=self.funcs,
                    options=self.options,
                    stream=self.stream,
                    debug_options={DEBUG_OPTION_PRINT_TOOL_CALL_RESULT: self.debug_tool_call_result},
                )
                usage_tokens += token_count
                self.usage_tokens += token_count
                if ok and self.post_process is not None:
                    response = self.post_process(response)
            except Exception as e:
                if debug_mode:
                    print(f"\n发生异常:{str(e)}")
                logger.debug("\n发生异常:%s", str(e))
                ok = False
                response += f"\n发生异常:{str(e)}"
            if ok:  # generation succeeded — stop retrying
                break
        else:
            # for-else: every attempt failed, so return an error marker.
            response, token_count = f"发生异常:{response}", 0
            return response, token_count

        if self.enable_history:
            self.history = messages + [{"role": "assistant", "content": response}]
            if len(self.history) > self.max_history_num:
                half = len(self.history) // 2 + 1
                # Compress the older half of the history into one summary turn.
                if debug_mode:
                    print(f"\n\n>>>>> Agent【{self.name}】 Compress History:")
                logger.debug("\n\n>>>>> Agent【%s】 Compress History:\n", self.name)
                try:
                    compressed_msg, token_count, ok = self.llm.generate_response(
                        system="请你把所有历史对话浓缩成一段话,必须保留重要的信息,不要换行,不要有任何markdown格式",
                        messages=self.history[:half],
                        stream=self.stream,
                    )
                    usage_tokens += token_count
                    self.usage_tokens += token_count
                    if ok:
                        self.history = [{"role": "assistant", "content": compressed_msg}] + self.history[half:]
                except Exception as e:
                    # Compression is best-effort; a failure keeps the full history.
                    if debug_mode:
                        print(f"\n发生异常:{str(e)}")
                    logger.debug("\n发生异常:%s", str(e))
        return response, usage_tokens

    def answer(self, message: str) -> Tuple[str, int]:
        """Generates a response to a user's message using the agent's history.
        return:
        - str: assistant's answer
        - int: usage_tokens
        """
        messages = self.history + [{"role": "user", "content": message}]
        return self.chat(messages=messages)
195 |
196 |
class AgentTemplate:
    """Factory that stamps out Agent instances from one shared configuration."""

    def __init__(self, config: AgentConfig):
        # Keep the configuration that every created agent will be built from.
        self.config = config

    def create_agent_instance(self) -> Agent:
        """Build and return a fresh Agent driven by the stored configuration."""
        new_agent = Agent(self.config)
        return new_agent
206 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/src/database.py:
--------------------------------------------------------------------------------
1 | """
2 | This module provides a MySQLConnector class for executing SQL queries
3 | and returning results in JSON format.
4 | """
5 |
6 | import json
7 | import datetime
8 | import mysql.connector
9 |
10 |
class MySQLConnector:
    """A class to connect to a MySQL database and execute SQL queries."""

    def __init__(self, host: str, user: str, password: str, database: str):
        """Open a single connection that is shared by all queries on this instance."""
        self.connection = mysql.connector.connect(host=host, user=user, password=password, database=database)

    def execute_sql_query(self, sql: str) -> str:
        """Executes a SQL query and returns the result in JSON format.

        Args:
            sql: SQL statement to run; literal "\\n" sequences are replaced
                with spaces before execution.

        Returns:
            A JSON array string of row dicts (date/datetime values serialized
            as ISO-8601 strings), or an "Error: ..." string if the query fails.
        """
        sql = sql.replace("\\n", " ")
        cursor = None
        try:
            # 连接到MySQL数据库
            cursor = self.connection.cursor()
            cursor.execute(sql)
            columns = [column[0] for column in cursor.description]
            result = cursor.fetchall()
            # Convert date objects to strings so json.dumps can serialize them.
            result_dict = [
                {
                    column: (value.isoformat() if isinstance(value, (datetime.date, datetime.datetime)) else value)
                    for column, value in zip(columns, row)
                }
                for row in result
            ]
            return json.dumps(result_dict, ensure_ascii=False)
        except mysql.connector.Error as err:
            return f"Error: {err}"
        finally:
            # BUGFIX: the original guarded on a local `connection` variable that
            # was initialized to None and never assigned, so the cursor was
            # never closed (cursor leak on every query) — and had it been
            # assigned, the shared connection would have been closed after the
            # first query. Close only the cursor; keep the connection open for reuse.
            if cursor is not None:
                cursor.close()
43 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/src/llm.py:
--------------------------------------------------------------------------------
1 | """
2 | This module provides implementations of large language models (LLM)
3 | and their interactions with various APIs.
4 | """
5 |
6 | from abc import ABC, abstractmethod
7 | from typing import Optional, Callable
8 | import os
9 | import re
10 | import json
11 | from ollama import Client
12 | from zhipuai import ZhipuAI
13 | from openai import OpenAI
14 | from src.log import get_logger
15 |
# Keys recognized in the `options` dict passed to LLM.generate_response().
CHAT_OPTION_TEMPERATURE = "temperature"
CHAT_OPTION_TOP_K = "top_k"
CHAT_OPTION_MAX_TOKENS = "max_tokens"

# Key in `debug_options` controlling whether tool-call results are printed/logged.
DEBUG_OPTION_PRINT_TOOL_CALL_RESULT = "print_tool_call_result"
21 |
22 |
class LLM(ABC):
    """Abstract base class for language models."""

    @abstractmethod
    def generate_response(
        self,
        system: str,
        messages: list,
        tools: Optional[list[dict]] = None,
        funcs: Optional[dict[str, Callable]] = None,
        options: Optional[dict] = None,
        stream: Optional[bool] = None,
        debug_options: Optional[dict] = None,
        tool_choice: Optional[bool] = None,
    ) -> tuple[str, int, bool]:
        """Generate a response; every LLM implementation must provide this.

        tool_choice (currently only supported by the OpenAI implementation):
            - None  means "auto": the LLM decides whether to call a tool
            - False means "none": no tool is called
            - True  means "required": a tool call is mandatory

        The returned tuple contains:
            - str: the LLM's answer
            - int: total number of input + output tokens used
            - bool: whether the answer was generated successfully

        Parameters:
            messages: a list of dicts, each with "role" and "content", e.g.:
                [
                    {"role": "user", "content": "你好"},
                    {"role": "assistant", "content": "你好!有什么我可以帮助你的吗?"}
                ]
                Supported roles are "user" and "assistant".
        """
56 |
57 |
class OllamaLLM(LLM):
    """Concrete implementation of LLM using the Ollama API."""

    def __init__(self, host: str, model: str, post_process: Optional[Callable[[str], str]] = None):
        """Create an Ollama-backed LLM.

        Args:
            host: Base URL of the Ollama server.
            model: Name of the model to run.
            post_process: Optional hook applied to the final content on success.
        """
        self.host = host
        self.model = model
        self.post_process = post_process
        # Initialize the other required members.
        self.client = Client(host)

    def generate_response(
        self,
        system: str,
        messages: list,
        tools: Optional[list[dict]] = None,
        funcs: Optional[dict[str, Callable]] = None,
        options: Optional[dict] = None,
        stream: Optional[bool] = None,
        debug_options: Optional[dict] = None,
        tool_choice: Optional[bool] = None,  # no use yet
    ) -> tuple[str, int, bool]:
        """Call the Ollama chat API, collect the reply and run any tool calls.

        Returns:
            (content, token_count, ok): assistant text with tool-call results
            appended, total prompt+eval tokens reported, and a success flag.
        """
        debug_mode = os.getenv("DEBUG", "0") == "1"
        logger = get_logger()
        if options is None:
            options = {}
        if debug_options is None:
            debug_options = {}
        # NOTE(review): "max_tokens" is remapped onto Ollama's "num_ctx", which
        # is the context-window size rather than an output-token cap — confirm
        # this mapping is intended.
        if CHAT_OPTION_MAX_TOKENS in options:
            options["num_ctx"] = options[CHAT_OPTION_MAX_TOKENS]
            options.pop(CHAT_OPTION_MAX_TOKENS)
        options.setdefault("num_ctx", 5120)
        if stream is None:
            stream = True
        # fortest
        # print(system)
        response = self.client.chat(
            model=self.model,
            messages=[{"role": "system", "content": system}] + messages,
            options=options,
            tools=tools,
            stream=stream,
        )
        content = ""
        tool_calls = []
        token_count = 0
        if stream:
            # Accumulate streamed pieces: text deltas, tool calls, token counts.
            for piece in response:
                if piece.message.content is not None:
                    content += piece.message.content
                    if debug_mode:
                        print(piece.message.content, end="")
                    logger.debug("%s", piece.message.content)
                if piece.message.tool_calls is not None:
                    tool_calls.extend(piece.message.tool_calls)
                if piece.prompt_eval_count is not None:
                    token_count += piece.prompt_eval_count
                if piece.eval_count is not None:
                    token_count += piece.eval_count
            if debug_mode and content != "":
                print()  # newline so the debug output stays readable
            if content != "":
                logger.debug("\n")
        else:
            if response.message.content is not None:
                content = response.message.content
            if response.message.tool_calls is not None:
                tool_calls = response.message.tool_calls
            if response.prompt_eval_count is not None:
                token_count = response.prompt_eval_count
            if response.eval_count is not None:
                token_count += response.eval_count
            if debug_mode and content != "":
                print(content)
            if content != "":
                logger.debug("%s\n", content)
        ok = True
        for tool_call in tool_calls:
            function_call = tool_call["function"]
            function_name = function_call["name"]
            arguments = function_call["arguments"]
            # Ollama may deliver JSON-encoded argument values as strings; decode
            # the ones that parse, keep the rest as-is.
            for key, value in arguments.items():
                if isinstance(value, str):
                    try:
                        arguments[key] = json.loads(value)
                    except json.JSONDecodeError:
                        arguments[key] = value  # keep the original value
            if debug_mode:
                print(f"调用函数 {function_name}({arguments})")
            logger.debug("调用函数 %s(%s)\n", function_name, arguments)
            # Resolve the tool implementation: caller-supplied funcs take priority.
            function = (
                (funcs.get(function_name) if funcs is not None else None)
                or globals().get(function_name)
                or locals().get(function_name)
            )
            if function:
                try:
                    content += "\n调用结果:\n" + function(**arguments)
                except Exception as e:
                    if debug_mode:
                        print(f"\n调用结果:\n执行函数{function_name}时发生错误: {str(e)}")
                    logger.debug("\n调用结果:\n执行函数%s时发生错误: %s", function_name, str(e))
                    content += f"\n调用结果:\n执行函数{function_name}时发生错误: {str(e)}"
                    ok = False
            else:
                content += f"\n调用结果:\n未找到名为 {function_name} 的函数, context: {tool_call}"
                ok = False
            if debug_mode and debug_options.get(DEBUG_OPTION_PRINT_TOOL_CALL_RESULT, True):
                print(content)
            if debug_options.get(DEBUG_OPTION_PRINT_TOOL_CALL_RESULT, True):
                logger.debug("%s\n", content)
            if not ok:
                break
        if ok and self.post_process is not None:
            content = self.post_process(content)
        return content.strip(), token_count, ok
173 |
174 |
class ZhipuLLM(LLM):
    """Concrete implementation of LLM using the ZhipuAI API."""

    def __init__(self, api_key: str, model: str, post_process: Optional[Callable[[str], str]] = None):
        """Create a ZhipuAI-backed LLM.

        Args:
            api_key: ZhipuAI API key.
            model: Model identifier used for chat completions.
            post_process: Optional hook applied to the final content on success.
        """
        self.api_key = api_key
        self.model = model
        self.post_process = post_process
        self.client = ZhipuAI(api_key=api_key)

    def generate_response(
        self,
        system: str,
        messages: list,
        tools: Optional[list[dict]] = None,
        funcs: Optional[dict[str, Callable]] = None,
        options: Optional[dict] = None,
        stream: Optional[bool] = None,
        debug_options: Optional[dict] = None,
        tool_choice: Optional[bool] = None,  # no use yet
    ) -> tuple[str, int, bool]:
        """Call the ZhipuAI chat API, collect the reply and run any tool calls.

        Args:
            system: System prompt prepended to `messages`.
            messages: Conversation so far (dicts with "role"/"content").
            tools: Tool schemas forwarded to the API.
            funcs: Mapping of tool name -> Python callable for tool execution.
            options: Sampling options ("temperature", "top_p", "max_tokens").
            stream: Stream the reply piecewise; defaults to True when None.
            debug_options: Debug flags (see DEBUG_OPTION_PRINT_TOOL_CALL_RESULT).
            tool_choice: Unused by this implementation.

        Returns:
            (content, token_count, ok): assistant text with tool-call results
            appended, total tokens reported by the API, and a success flag.
        """
        debug_mode = os.getenv("DEBUG", "0") == "1"
        logger = get_logger()
        if options is None:
            options = {}
        if debug_options is None:
            debug_options = {}
        if stream is None:
            stream = True
        # fortest
        # show(system)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "system", "content": system}] + messages,
            # BUGFIX: Agent stores its nucleus-sampling setting under the key
            # "top_p", but this call previously read only CHAT_OPTION_TOP_K
            # ("top_k"), so a configured top_p was silently ignored. Prefer
            # "top_p" and keep the old "top_k" lookup as a backward-compatible
            # fallback.
            top_p=options.get("top_p", options.get(CHAT_OPTION_TOP_K, 0.5)),
            temperature=options.get(CHAT_OPTION_TEMPERATURE, 0.5),
            max_tokens=options.get(CHAT_OPTION_MAX_TOKENS, None),
            stream=stream,
            tools=tools,
        )
        content = ""
        tool_calls = []
        token_count = 0
        if stream:
            # Accumulate streamed deltas; usage arrives on the trailing chunk(s).
            for piece in response:
                if len(piece.choices) > 0:
                    if piece.choices[0].delta.content is not None:
                        content += piece.choices[0].delta.content
                        if debug_mode:
                            print(piece.choices[0].delta.content, end="")
                        logger.debug("%s", piece.choices[0].delta.content)
                    if piece.choices[0].delta.tool_calls is not None:
                        tool_calls.extend(piece.choices[0].delta.tool_calls)
                if piece.usage is not None:
                    token_count += piece.usage.total_tokens
            if debug_mode and content != "":
                print()  # newline so the debug output stays readable
            if content != "":
                logger.debug("\n")
        else:
            if response.choices[0].message.content is not None:
                content = response.choices[0].message.content
            if response.choices[0].message.tool_calls is not None:
                tool_calls = response.choices[0].message.tool_calls
            if response.usage is not None:
                token_count = response.usage.total_tokens
            if debug_mode and content != "":
                print(content)
            if content != "":
                logger.debug("%s\n", content)
        ok = True
        for tool_call in tool_calls:
            function_call = tool_call.function
            function_name = function_call.name
            arguments = json.loads(function_call.arguments)
            if debug_mode:
                print(f"调用函数 {function_name}({arguments})")
            logger.debug("调用函数 %s(%s)\n", function_name, arguments)
            # Resolve the tool implementation: caller-supplied funcs take priority.
            function = (
                (funcs.get(function_name) if funcs is not None else None)
                or globals().get(function_name)
                or locals().get(function_name)
            )
            if function:
                try:
                    content += "\n调用结果:\n" + function(**arguments)
                except Exception as e:
                    if debug_mode:
                        print(f"\n调用结果:\n执行函数{function_name}时发生错误: {str(e)}")
                    logger.debug("\n调用结果:\n执行函数%s时发生错误: %s", function_name, str(e))
                    content += f"\n调用结果:\n执行函数{function_name}时发生错误: {str(e)}"
                    ok = False
            else:
                content += f"\n调用结果:\n未找到名为 {function_name} 的函数, context: {tool_call}"
                ok = False
            if debug_mode and debug_options.get(DEBUG_OPTION_PRINT_TOOL_CALL_RESULT, True):
                print(content)
            if debug_options.get(DEBUG_OPTION_PRINT_TOOL_CALL_RESULT, True):
                logger.debug("%s\n", content)
            if not ok:
                break
        if ok and self.post_process is not None:
            content = self.post_process(content)
        return content.strip(), token_count, ok
278 |
279 |
def extract_answer_from_r1(text) -> str:
    """
    Removes content enclosed by <think> and </think> tags from the given text.

    DeepSeek-R1 style models wrap their chain-of-thought in <think>...</think>;
    only the text outside those tags is the final answer.

    Parameters:
        text (str): The input text possibly containing <think> blocks.

    Returns:
        str: The text with <think> blocks and their content removed.
    """
    # BUGFIX: the original pattern was r".*?", which only yields empty matches
    # and therefore removed nothing (the function was a no-op); the <think>
    # tags had evidently been lost from the pattern. DOTALL lets the block
    # span multiple lines.
    text_without_think = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    return text_without_think
293 |
294 |
class OpenAILLM(LLM):
    """Concrete implementation of LLM using the OpenAI API."""

    def __init__(
        self,
        api_key: str,
        model: str,
        post_process: Optional[Callable[[str], str]] = None,
        base_url: Optional[str] = None,
        default_stream: Optional[bool] = False,
    ):
        """Create an OpenAI-backed LLM.

        Args:
            api_key: OpenAI API key.
            model: Model identifier; names starting with "o" are treated as
                o-series models whose system prompt uses the "developer" role.
            post_process: Optional hook applied to the final content on success.
            base_url: Optional alternative API endpoint (for compatible servers).
            default_stream: Streaming default used when a call passes stream=None.
        """
        self.api_key = api_key
        self.model = model
        self.post_process = post_process
        # Initialize the other required members.
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        # o-series models expect the system prompt under the "developer" role.
        if self.model.startswith("o"):
            self.system_role = "developer"
        else:
            self.system_role = "system"
        self.default_stream = default_stream

    def generate_response(
        self,
        system: str,
        messages: list,
        tools: Optional[list[dict]] = None,
        funcs: Optional[dict[str, Callable]] = None,
        options: Optional[dict] = None,
        stream: Optional[bool] = None,
        debug_options: Optional[dict] = None,
        tool_choice: Optional[bool] = None,  # "none", "auto", "required"
    ) -> tuple[str, int, bool]:
        """Call the OpenAI chat API, collect the reply and run any tool calls.

        Returns:
            (content, token_count, ok): assistant text with tool-call results
            appended, total tokens reported by the API, and a success flag.
        """
        debug_mode = os.getenv("DEBUG", "0") == "1"
        logger = get_logger()
        if options is None:
            options = {}
        if debug_options is None:
            debug_options = {}
        if stream is None:
            stream = self.default_stream
        # Map the tri-state tool_choice onto the API's string values.
        tool_choice_str = "auto" if tool_choice is None else "required" if tool_choice is True else "none"
        # fortest
        # print(system)
        # NOTE(review): o-series models reject `temperature`/`max_tokens` in
        # recent API versions even though the developer role is used — confirm
        # against the models actually configured.
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": self.system_role, "content": system}] + messages,
            temperature=options.get(CHAT_OPTION_TEMPERATURE, 0.5),
            max_tokens=options.get(CHAT_OPTION_MAX_TOKENS, 5120),
            stream=stream,
            tools=tools,
            tool_choice=None if tools is None else tool_choice_str,
        )
        # fortest
        # print(response)

        content = ""
        tool_calls = []
        token_count = 0
        if stream:
            # NOTE(review): with OpenAI streaming, `piece.usage` is only
            # populated when stream_options={"include_usage": True} is passed,
            # so token_count may stay 0 in stream mode — confirm.
            for piece in response:
                if len(piece.choices) > 0:
                    if piece.choices[0].delta.content is not None:
                        content += piece.choices[0].delta.content
                        if debug_mode:
                            print(piece.choices[0].delta.content, end="")
                        logger.debug("%s", piece.choices[0].delta.content)
                    if piece.choices[0].delta.tool_calls is not None:
                        tool_calls.extend(piece.choices[0].delta.tool_calls)
                if piece.usage is not None:
                    token_count += piece.usage.total_tokens
            if debug_mode and content != "":
                print()  # newline so the debug output stays readable
            if content != "":
                logger.debug("\n")
        else:
            if response.choices[0].message.content is not None:
                content = response.choices[0].message.content
            if response.choices[0].message.tool_calls is not None:
                tool_calls = response.choices[0].message.tool_calls
            if response.usage is not None:
                token_count = response.usage.total_tokens
            if debug_mode and content != "":
                print(content)
            if content != "":
                logger.debug("%s\n", content)
        ok = True
        for tool_call in tool_calls:
            function_call = tool_call.function
            function_name = function_call.name
            arguments = json.loads(function_call.arguments)
            if debug_mode:
                print(f"调用函数 {function_name}({arguments})")
            logger.debug("调用函数 %s(%s)\n", function_name, arguments)
            # Resolve the tool implementation: caller-supplied funcs take priority.
            function = (
                (funcs.get(function_name) if funcs is not None else None)
                or globals().get(function_name)
                or locals().get(function_name)
            )
            if function:
                try:
                    content += "\n调用结果:\n" + function(**arguments)
                except Exception as e:
                    if debug_mode:
                        print(f"\n调用结果:\n执行函数{function_name}时发生错误: {str(e)}")
                    logger.debug("\n调用结果:\n执行函数%s时发生错误: %s", function_name, str(e))
                    content += f"\n调用结果:\n执行函数{function_name}时发生错误: {str(e)}"
                    ok = False
            else:
                content += f"\n调用结果:\n未找到名为 {function_name} 的函数, context: {tool_call}"
                ok = False
            if debug_mode and debug_options.get(DEBUG_OPTION_PRINT_TOOL_CALL_RESULT, True):
                print(content)
            if debug_options.get(DEBUG_OPTION_PRINT_TOOL_CALL_RESULT, True):
                logger.debug("%s\n", content)
            if not ok:
                break
        if ok and self.post_process is not None:
            content = self.post_process(content)
        return content.strip(), token_count, ok
415 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/src/log.py:
--------------------------------------------------------------------------------
1 | """
2 | This module provides custom logging handlers that write log messages without newlines.
3 | """
4 |
5 | import logging
6 |
# Name of the logger used when callers do not specify one.
DEFAULT_LOGGER = "default_logger"
8 |
9 |
class NoNewlineStreamHandler(logging.StreamHandler):
    """
    A stream handler that emits each record verbatim, without the trailing
    newline the default StreamHandler would append.
    """

    def emit(self, record):
        # Format and write the record as-is, then push it through immediately.
        formatted = self.format(record)
        self.stream.write(formatted)
        self.flush()
20 |
21 |
class NoNewlineFileHandler(logging.FileHandler):
    """
    A file handler that appends each record verbatim — no trailing newline —
    by reopening the target file for every emitted record.
    """

    def emit(self, record):
        text = self.format(record)
        # Reopen the file per record so each write lands and flushes on its own.
        with open(self.baseFilename, self.mode, encoding=self.encoding) as fh:
            fh.write(text)
            fh.flush()
32 |
33 |
def get_logger(logger_name: str = DEFAULT_LOGGER):
    """
    Fetch the logger registered under the given name.

    :param logger_name: Name of the logger to look up (default is 'default_logger').
    :return: The logging.Logger instance for that name.
    """
    return logging.getLogger(logger_name)
42 |
43 |
def setup_logger(log_file: str, log_level: int = logging.INFO, logger_name: str = DEFAULT_LOGGER):
    """
    Point the named logger at a fresh log file.

    Any file handlers already attached to the logger are detached first, then a
    NoNewlineFileHandler appending to ``log_file`` is installed with a
    bare-message format.

    :param log_file: File path for logging output.
    :param log_level: Log level to apply (default is 'logging.INFO').
    :param logger_name: Name of the logger to update (default is 'default_logger').
    """
    target = logging.getLogger(logger_name)
    target.setLevel(log_level)

    # Detach every file-based handler currently on this logger.
    stale_handlers = [h for h in target.handlers if isinstance(h, logging.FileHandler)]
    for stale in stale_handlers:
        target.removeHandler(stale)

    # Install the replacement newline-free file handler.
    file_handler = NoNewlineFileHandler(log_file, mode="a", encoding="utf-8")
    file_handler.setFormatter(logging.Formatter("%(message)s"))
    target.addHandler(file_handler)
65 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/src/teamwork.py:
--------------------------------------------------------------------------------
1 | """
2 | This module defines the Teamwork class,
3 | which coordinates teamwork to solve problems using LLM agents and workflows.
4 | """
5 |
6 | import os
7 |
8 | # import re
9 | import json
10 | from typing import Tuple, Optional
11 | from src.llm import LLM
12 | from src.agent import Agent, AgentConfig, AgentTemplate
13 | from src.workflow import Workflow
14 | from src.utils import generate_markdown_table, extract_last_json
15 | from src.log import get_logger
16 |
17 |
class Teamwork:
    """Coordinates teamwork to solve problems using LLM agents and workflows.

    A coordinator agent decides which registered agent to call next (via a
    CALL_AGENT JSON block); a deliver agent phrases the final answer, which is
    detected by the `Final Answer is:` marker.
    """

    def __init__(
        self, llm_coordinator: LLM, llm_deliver: LLM, default_context_len: int = 10, name: Optional[str] = None
    ):
        """Builds the built-in deliver and coordinator agents.

        Args:
            llm_coordinator (LLM): LLM backing the coordinator agent.
            llm_deliver (LLM): LLM backing the deliver (final-answer) agent.
            default_context_len (int, optional): Default context length. Defaults to 10.
            name (Optional[str], optional): Team name; defaults to "Teamwork".
        """
        self.name = "Teamwork" if name is None else name
        self.knowledge = []
        self.context = []  # conversation history: list[dict(role, content)]
        self.default_context_len = default_context_len
        # Marker string the deliver agent prepends; solve() loops until it appears.
        self.final_answer_mark = "Final Answer is:"
        self.agent_list_title_map = {"name": "Agent Name", "backstory": "Backstory", "usecase": "Usecase"}
        self.agent_deliver = Agent(
            AgentConfig(
                name=self.name + ".deliver",
                role="你负责根据context,把针对Problem的回复重新组织语言输出。",
                output_format="回答要简洁。",
                llm=llm_deliver,
                post_process=lambda x: f"Final Answer is: {x}",
            )
        )
        self.agent_map = {
            self.agent_deliver.name: self.agent_deliver,
        }
        self.agent_list = [
            {
                "name": self.agent_deliver.name,
                "backstory": "这是一个负责做最终回复的agent。",
                "usecase": (
                    """如果已经能够对Problem给出最后回复或者要结束任务了,就调用这个agent。\n"""
                    """当需要结束任务时,请务必调用这个agent!!!\n"""
                ),
            }
        ]
        self.agent_coordinator = Agent(
            AgentConfig(
                name=self.name + ".coordinator",
                role=(
                    """你是理智且聪明的会议主持,基于待解决的Problem和参与协作Agent List,安排合适的agent发表意见。"""
                    """你总是call tool来唤起agent工作,直至能够针对Problem给出Final Answer。"""
                ),
                output_format=(
                    """如果某个agent需要被唤起,请在以下格式中输出调用:\n\n"""
                    """CALL_AGENT:\n"""
                    """```json\n"""
                    """{\"agent_name\": \"\", \"instruction\": \"\"}\n"""
                    """```\n"""
                    """\n\n其中,{}是代理人的名字,是可选的指令。"""
                ),
                constraint=("""- 一次仅能调用一个agent工作\n"""),
                llm=llm_coordinator,
                enable_history=True,
                # temperature = 0.5,
                # top_p = 0.5,
            )
        )

    def register_agent(self, agent: Agent | AgentTemplate | Workflow, name: str, backstory: str, usecase: str):
        """Registers an agent or workflow with the given details.

        Args:
            agent (Agent|AgentTemplate|Workflow): The agent or workflow to register.
            name (str): The name of the agent.
            backstory (str): The backstory of the agent.
            usecase (str): The use case for the agent.

        Raises:
            ValueError: If the name is already registered.
        """
        if name in self.agent_map:
            raise ValueError(f"agent_name: {name} has been registered")
        if isinstance(agent, AgentTemplate):
            # Templates are instantiated once at registration time.
            self.agent_map[name] = agent.create_agent_instance()
        else:
            self.agent_map[name] = agent
        self.agent_list.append(
            {
                "name": name,
                "backstory": backstory,
                "usecase": usecase,
            }
        )

    def clear_history(self):
        """Clears the history for all registered agents and the coordinator."""
        for agent in self.agent_map.values():
            agent.clear_history()
        self.agent_coordinator.clear_history()
        self.context = []

    def final_answer(self, answer: str) -> str:
        """Formats and returns the final answer, optionally printing it in debug mode."""
        debug_mode = os.getenv("DEBUG", "0") == "1"
        if debug_mode:
            print(f"\n\n>>>>> Final Answer: {answer}")
        logger = get_logger()
        logger.debug("\n\n>>>>> Final Answer: %s\n", answer)
        return f"{self.final_answer_mark} {answer}"

    def call_agent(self, agent_name: str, *args, instruction: Optional[str] = None, **kwargs) -> str:
        """Calls the specified agent to provide an answer based on the context.

        Args:
            agent_name (str): The name of the agent to call.
            instruction (Optional[str]): Optional directive guiding the agent's work.
            *args: Additional arguments. Not used, kept for interface compatibility.
            **kwargs: Additional keyword arguments. Not used, kept for interface compatibility.

        Returns:
            str: The answer provided by the agent, or an error string for an
            unknown agent name.
        """
        _ = args
        _ = kwargs
        if agent_name in self.agent_map:
            agent = self.agent_map[agent_name]
            if agent_name == self.agent_deliver.name:
                messages = self.context + [{"role": "user", "content": "请针对Problem给出最终答复"}]
            elif instruction is not None:
                messages = self.context + [{"role": "user", "content": f"{instruction}\n请按指示行动"}]
            else:
                messages = self.context + [{"role": "user", "content": "请按指示行动"}]

            if isinstance(agent, Agent):
                # Agent: a plain chat over the shared context.
                answer, _ = agent.chat(messages)
            elif isinstance(agent, Workflow):
                # Workflow: run it and take the "content" field of its result.
                response = agent.run(inputs={"messages": messages})
                answer = response["content"]
            else:
                raise TypeError(f"未知类型: {type(agent)}")
            return answer
        else:
            return f"unknown agent_name: {agent_name}"

    def extract_args_for_call_agent(self, text: str) -> Optional[dict]:
        """Extracts agent name and instruction from the given text.

        Args:
            text (str): The text containing the agent call information.

        Returns:
            Optional[dict]:
                A dictionary with 'agent_name' and 'instruction' if found,
                otherwise None (including when the model emitted malformed or
                non-object JSON, which previously raised and crashed solve()).
        """
        if "CALL_AGENT:" in text:
            args_json = extract_last_json(text=text)
            if args_json is not None:
                try:
                    parsed = json.loads(args_json)
                except json.JSONDecodeError:
                    # Malformed JSON block from the model: treat as "no call".
                    return None
                if isinstance(parsed, dict):
                    return parsed
        return None

    def add_system_prompt_kv(self, kv: dict):
        """Adds a key-value pair to the system prompt for all agents and the coordinator.

        Args:
            kv (dict): The key-value pair to add.
        """
        self.agent_coordinator.add_system_prompt_kv(kv)
        for agent in self.agent_map.values():
            agent.add_system_prompt_kv(kv)

    def del_system_prompt_kv(self, key: str):
        """Deletes the specified key from the system prompt key-value pairs for the agent."""
        self.agent_coordinator.del_system_prompt_kv(key)
        for agent in self.agent_map.values():
            agent.del_system_prompt_kv(key)

    def clear_system_prompt_kv(self):
        """Clears all key-value pairs from the system prompt for all agents and the coordinator."""
        self.agent_coordinator.clear_system_prompt_kv()
        for agent in self.agent_map.values():
            agent.clear_system_prompt_kv()

    def solve(self, problem: str, max_iterate_num: int = 10) -> Tuple[str, int]:
        """Solve a problem using the registered agents,
        iterating until a final answer is found or the maximum iterations are reached.

        Args:
            problem (str): The problem to be solved.
            max_iterate_num (int, optional): The maximum number of iterations. Defaults to 10.

        Returns:
            str: The final answer provided by the agents.
            int: iterate_num
        """
        debug_mode = os.getenv("DEBUG", "0") == "1"
        logger = get_logger()
        # start
        if debug_mode:
            print(f"\n\n>>>>> 【Problem】: {problem}")
        logger.debug("\n\n>>>>> 【Problem】: %s\n", problem)

        self.agent_coordinator.add_system_prompt_kv({"Problem": problem})
        self.agent_coordinator.add_system_prompt_kv(
            {
                # "Agent List": generate_markdown_table(
                #     self.agent_list, self.agent_list_title_map,
                # )
                "Agent List": json.dumps(self.agent_list, ensure_ascii=False),
            }
        )
        self.agent_deliver.add_system_prompt_kv({"Problem": problem})

        self.context.append({"role": "user", "content": f"我们开始解决这个problem:\n{problem}"})

        answer, _ = self.agent_coordinator.chat(
            messages=self.context[-1:]
            + [
                {
                    "role": "user",
                    "content": "现在是否已经能够解决Problem了?请你判断下一个要找哪个agent来回答。务必遵循call agent的格式要求。",
                }
            ],
        )
        args = self.extract_args_for_call_agent(answer)
        if args is not None:
            try:
                answer = self.call_agent(**args)
            except Exception as e:
                answer = f"发生异常:{str(e)}"
            # Bug fix: args may lack 'agent_name' (model output); .get avoids an
            # uncaught KeyError after call_agent's TypeError was swallowed above.
            self.context.append({"role": "assistant", "content": f"{args.get('agent_name', 'unknown_agent')} Said:\n{answer}"})

        iterate_num = 1
        while self.final_answer_mark not in answer and iterate_num < max_iterate_num:
            iterate_num += 1
            answer, _ = self.agent_coordinator.chat(
                messages=[
                    {
                        "role": "user",
                        "content": (
                            self.context[-1]["content"]
                            + "\n\n现在是否已经能够解决Problem了?请你判断下一个要找哪个agent来回答。务必遵循call agent的格式要求。"
                        ),
                    }
                ],
            )
            args = self.extract_args_for_call_agent(answer)
            if args is not None:
                try:
                    answer = self.call_agent(**args)
                except Exception as e:
                    answer = f"发生异常:{str(e)}"
                self.context.append({"role": "assistant", "content": f"{args.get('agent_name', 'unknown_agent')} Said:\n{answer}"})
        if self.final_answer_mark not in answer:
            # Iteration budget exhausted without a marked answer: force delivery.
            messages = self.context + [{"role": "user", "content": "请针对Problem给出最终答复"}]
            answer, _ = self.agent_deliver.chat(messages)
        return answer.split(self.final_answer_mark, 1)[-1].strip(), iterate_num
262 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/src/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | This module provides utility functions.
3 | """
4 |
5 | import re
6 | import json
7 | from typing import Optional
8 |
# Marker phrase tagging a message as carrying table-column schema information;
# workflows scan message contents for this string to separate schema notes from chat.
COLUMN_LIST_MARK = "数据表的字段信息如下"
10 |
11 |
def generate_markdown_table(data_list, key_title_map):
    """
    Render a list of row dicts as a Markdown table.

    :param data_list: list of dicts, one per table row.
    :param key_title_map: dict mapping row-dict keys to column titles; its
        order determines the column order.
    :return: the Markdown table as a string (trailing newline included).
    """
    keys = list(key_title_map)
    # Header row and separator row.
    lines = [
        "| " + " | ".join(key_title_map[k] for k in keys) + " |",
        "| " + " | ".join(["---"] * len(keys)) + " |",
    ]
    # Data rows; embedded newlines are escaped so each row stays on one line.
    for row in data_list:
        cells = (row[k].replace("\n", "\\n") for k in keys)
        lines.append("| " + " | ".join(cells) + " |")
    return "\n".join(lines) + "\n"
31 |
32 |
def get_column_list(db_table, table_column, tables: list[str]) -> str:
    """
    Build the column listing for the given tables.

    :param db_table: mapping of database name -> metadata containing a "表" list.
    :param table_column: mapping of table name -> column descriptions.
    :param tables: list of table names, format is database_name.table_name
    :return: formatted string carrying COLUMN_LIST_MARK plus a JSON payload.
    :raises ValueError: when a table name is not `db.table` shaped.
    :raises KeyError: when the database name is unknown.
    """
    collected = []
    for full_name in tables:
        if full_name.count(".") != 1:
            raise ValueError(f"发生异常: 表名`{full_name}`格式不正确,应该为`database_name.table_name`")
        db_name, table_name = full_name.split(".")
        if db_name not in db_table:
            raise KeyError(f"发生异常: 数据库名`{db_name}`不存在")
        # Only emit tables actually declared under this database.
        if any(entry["表英文"] == table_name for entry in db_table[db_name]["表"]):
            collected.append({"表名": full_name, "表字段": table_column[table_name]})
    return f"已取得可用的{COLUMN_LIST_MARK}:\n" + json.dumps(collected, ensure_ascii=False) + "\n"
53 |
54 |
def extract_last_sql(query_string: str, block_mark: str) -> Optional[str]:
    """
    Extract the last SQL statement from the last fenced code block, with
    `--` line comments removed.

    :param query_string: text possibly containing ```<block_mark> ... ``` blocks.
    :param block_mark: fence label of the SQL code block.
    :return: the final SQL statement terminated by ";", or None when no block
        or no non-empty statement exists.
    """
    fence = re.compile(rf"(?s)```{re.escape(block_mark)}\s+(.*?)\s+```")
    blocks = fence.findall(query_string)
    if not blocks:
        return None
    body = blocks[-1].strip()
    # Drop `--` comments while keeping the semicolons that delimit statements.
    body = re.sub(r"--.*(?=\n)|--.*$", "", body)
    statements = [part.strip() for part in body.split(";") if part.strip()]
    if not statements:
        return None
    return statements[-1] + ";"
76 |
77 |
def count_total_sql(query_string: str, block_mark: str) -> int:
    """
    Count all SQL statements found across every fenced code block.

    :param query_string: text possibly containing ```<block_mark> ... ``` blocks.
    :param block_mark: fence label of the SQL code blocks.
    :return: total number of non-empty, semicolon-delimited statements.
    """
    fence = re.compile(rf"(?s)```{re.escape(block_mark)}\s+(.*?)\s+```")
    total = 0
    for block in fence.findall(query_string):
        # Strip `--` comments first so commented-out text is not counted.
        cleaned = re.sub(r"--.*(?=\n)|--.*$", "", block)
        total += sum(1 for stmt in cleaned.split(";") if stmt.strip())
    return total
97 |
98 |
def extract_last_json(text: str) -> Optional[str]:
    """
    Extract the content of the last ```json ... ``` fenced block.

    Args:
        text (str): Text possibly containing JSON code blocks.

    Returns:
        Optional[str]: The stripped content of the last block, or None when
        no block is present.
    """
    found = re.findall(r"```json(.*?)```", text, re.DOTALL)
    if not found:
        return None
    return found[-1].strip()
112 |
113 |
def show(obj):
    """
    Pretty-print an object as indented JSON when possible.

    dicts and lists are dumped directly; strings that look like JSON
    ("{"/"[" prefix) are parsed and re-dumped, falling back to a raw print
    when parsing fails; everything else is printed as-is.

    :param obj: the object to display.
    """
    if isinstance(obj, (dict, list)):
        print(json.dumps(obj, ensure_ascii=False, indent=2))
    elif isinstance(obj, str):
        if obj.startswith(("{", "[")):
            try:
                # Bug fix: the original called json.loads(str) — parsing the
                # builtin `str` type — which always raised and silently fell
                # back to raw printing. Parse the actual string instead.
                parsed = json.loads(obj)
                print(json.dumps(parsed, ensure_ascii=False, indent=2))
            except Exception:
                # Not valid JSON after all: best-effort raw print.
                print(obj)
        else:
            print(obj)
    else:
        # ints, floats, and any other type: plain print.
        print(obj)
135 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/src/workflow.py:
--------------------------------------------------------------------------------
1 | """
2 | This module defines the Workflow abstract base class and its implementation, RecallDbInfo,
3 | which handles recalling database information using various agents.
4 | """
5 |
6 | import json, os, copy
7 | from abc import ABC, abstractmethod
8 | from typing import Callable, Optional
9 |
10 | from src.log import get_logger
11 | from src.llm import LLM
12 | from src.agent import Agent, AgentConfig
13 | from src.utils import generate_markdown_table, extract_last_sql, extract_last_json, COLUMN_LIST_MARK, count_total_sql
14 |
15 |
class Workflow(ABC):
    """
    Abstract base class defining the basic interface for workflows.
    """

    @abstractmethod
    def run(self, inputs: dict) -> dict:
        """
        Run the workflow and return its result.

        :param inputs: workflow-specific input payload.
        :return: dict with
            - content: str, result content
            - usage_tokens: int, number of tokens consumed
        """

    @abstractmethod
    def clear_history(self):
        """
        Clear the chat history of the agents inside the workflow.
        """

    @abstractmethod
    def add_system_prompt_kv(self, kv: dict):
        """
        Add key-value settings to the inner agents' system prompts.
        """

    @abstractmethod
    def del_system_prompt_kv(self, key: str):
        """Deletes the specified key from the system prompt key-value pairs for the agent."""

    @abstractmethod
    def clear_system_prompt_kv(self):
        """
        Clear the extra key-value settings from the inner agents' system prompts.
        """
52 |
53 |
class SqlQuery(Workflow):
    """
    Implements the functionality to write and execute sql to fetch data, inheriting from Workflow.
    """

    def __init__(
        self,
        execute_sql_query: Callable[[str], str],
        llm: LLM,
        max_iterate_num: int = 5,
        name: Optional[str] = None,
        specific_column_desc: Optional[dict] = None,
        cache_history_facts: Optional[bool] = False,
        default_sql_limit: Optional[int] = None,
    ):
        """
        Args:
            execute_sql_query: callable that runs a SQL string and returns the
                result as a JSON string (run() feeds it to json.loads).
            llm: LLM powering the three inner agents.
            max_iterate_num: maximum write-SQL/execute/understand rounds.
            name: workflow name; defaults to "Sql_query".
            specific_column_desc: optional {table: {column: description}}
                hints, revealed to the agents the first time a column is used.
            cache_history_facts: when True, facts understood from results are
                kept across run() calls and prepended to the next question.
            default_sql_limit: row limit the executor silently applies; used to
                warn the agent that a result of exactly this size may be truncated.
        """
        self.name = "Sql_query" if name is None else name
        self.execute_sql_query = execute_sql_query
        self.max_iterate_num = max_iterate_num
        self.usage_tokens = 0
        self.is_cache_history_facts = cache_history_facts
        self.history_facts = []
        # Only the most recent schema messages are kept in the system prompt.
        self.max_db_struct_num = 1
        self.specific_column_desc = specific_column_desc if specific_column_desc is not None else {}
        self.default_sql_limit = default_sql_limit
        # Master agent: plans and emits exactly one ```exec_sql block per turn.
        self.agent_master = Agent(
            AgentConfig(
                name=self.name + ".master",
                role=(
                    """你是一个严谨的数据库专家,擅长通过分步拆解的方式获取数据。你遵循以下原则:\n"""
                    """**Core Principles**\n"""
                    """1. 采用分步执行策略:先执行基础查询 → 分析结果 → 执行后续查询\n"""
                    """2. 每个交互周期仅执行单条SQL语句,确保可维护性和性能\n"""
                    """3. 已经尝试过的方案不要重复尝试,如果没有更多可以尝试的方案,就说明情况并停止尝试。\n"""
                    """**!!绝对执行规则!!**\n"""
                    """- 每次响应有且仅有一个 ```exec_sql 代码块\n"""
                    """- 即使需要多步操作,也必须分次请求执行\n"""
                    """- 出现多个SQL语句将触发系统级阻断\n"""
                    """- 不使用未知的表名和字段名\n"""
                    """- 获取任何实体或概念,如果它在同一张表里存在唯一编码,要顺便把它查询出来备用\n"""
                ),
                constraint=(
                    """- 时间日期过滤必须对字段名进行格式化:`DATE(column_name) (op) 'YYYY-MM-DD'` 或 `YEAR(column_name) (op) 'YYYY'`\n"""
                    """- 表名必须完整格式:database_name.table_name(即使存在默认数据库)\n"""
                    """- 字符串搜索总是采取模糊搜索,总是优先用更短的关键词去搜索,增加搜到结果的概率\n"""
                    """- 若所需表/字段未明确存在,必须要求用户确认表结构\n"""
                    """- 当遇到空结果时,请检查是否存在下述问题:\n"""
                    """  1. 时间日期字段是否使用DATE()或YEAR()进行了格式化\n"""
                    """  2. 字段跟值并不匹配,比如把股票代码误以为公司代码\n"""
                    """  3. 字段语言版本错配,比如那中文的字串去跟英文的字段匹配\n"""
                    """  4. 可以通过SELECT * FROM database_name.table_name LIMIT 1;了解所有字段的值是什么形式\n"""
                    """  5. 是否可以把时间范围放宽了解一下具体情况\n"""
                    """  6. 关键词模糊匹配是否可以把关键词改短后再事实?\n"""
                    """- 如果确认查找的方式是正确的,那么可以接受空结果!!!\n"""
                    """- 每次交互只处理一个原子查询操作\n"""
                    """- 连续步骤必须显式依赖前序查询结果\n"""
                    """- 如果总是执行失败,尝试更换思路,拆解成简单SQL,逐步执行确认\n"""
                    """- 擅于使用DISTINCT,尤其当发现获取的结果存在重复,去重后不满足期望的数量的时候,比如要查询前10个结果,但是发现结果里存在重复,那么就要考虑使用DISTINCT重新查询\n"""
                    """- 在MySQL查询中,使用 WHERE ... IN (...) 不能保持传入列表的顺序,可通过 ORDER BY FIELD(列名, 值1, 值2, 值3, ...) 强制按指定顺序排序。"""
                ),
                output_format=(
                    """分阶段输出模板:\n"""
                    """【已知信息】\n"""
                    """(这里写当前已知的所有事实信息)\n"""
                    """【当前阶段要获取的信息】\n"""
                    """(如果无需继续执行SQL,那么这里写"无")\n"""
                    """【信息所在字段】\n"""
                    """(如果无需继续执行SQL,那么这里写"无")\n"""
                    """(如果已知字段里缺少需要的字段,那么用`SELECT * FROM database_name.table_name LIMIT 1;`来了解这个表的字段值的形式)\n"""
                    """【筛选条件所在字段】\n"""
                    """(如果无需继续执行SQL,那么这里写"无")\n"""
                    """(认真检查,时间日期过滤必须对字段名进行格式化:`DATE(column_name) (op) 'YYYY-MM-DD'` 或 `YEAR(column_name) (op) 'YYYY'`)\n"""
                    """【SQL语句的思路】\n"""
                    """(接着要执行的SQL的思路)\n"""
                    """(如果无需继续执行SQL,那么这里写"无")\n"""
                    """(这里必须使用已知的数据库表和字段,不能假设任何数据表或字典)\n"""
                    """【执行SQL语句】\n"""
                    """(唯一允许的SQL代码块,如果当前阶段无需继续执行SQL,那么这里写"无")\n"""
                    """(如果涉及到数学运算即便是用已知的纯数字做计算,也可以通过SQL语句来进行,保证计算结果的正确性,如`SELECT 1+1 AS a`)\n"""
                    """(这里必须使用已知的数据库表和字段,不能假设任何数据表或字典)\n"""
                    """```exec_sql\n"""
                    """SELECT [精准字段] \n"""
                    """FROM [完整表名] \n"""
                    """WHERE [条件原子化] \n"""
                    """LIMIT [强制行数]\n"""
                    """```\n"""
                    """【上述SQL语句的含义】\n"""
                    """(如果当前阶段无执行SQL,那么这里写"无")\n"""
                ),
                llm=llm,
                enable_history=True,
                # temperature = 0.8,
                # top_p = 0.7,
                stream=False,
            )
        )
        # Interpreter agent: turns a raw query result into a prose fact.
        self.agent_understand_query_result = Agent(
            AgentConfig(
                name=self.name + ".understand_query_result",
                role="你是优秀的数据库专家和数据分析师,负责根据已知的数据库结构说明,以及用户提供的SQL语句,理解这个SQL的查询结果。",
                output_format=(
                    "输出模板:\n"
                    "查询结果表明:\n"
                    "(一段话描述查询结果,不遗漏重要信息,不捏造事实,没有任何markdown格式,务必带上英文字段名)\n"
                ),
                llm=llm,
                enable_history=False,
                stream=False,
            )
        )
        # Summary agent: produces the final single-paragraph answer.
        self.agent_summary = Agent(
            AgentConfig(
                name=self.name + ".summary",
                role="你负责根据当前已知的事实信息,回答用户的提问。",
                constraint=("""- 根据上下文已知的事实信息回答,不捏造事实\n"""),
                output_format=("""- 用一段文字来回答,不要有任何markdown格式,不要有换行\n"""),
                llm=llm,
                enable_history=False,
                stream=False,
            )
        )
        self.agent_lists = [
            self.agent_master,
            self.agent_summary,
            self.agent_understand_query_result,
        ]

    def clear_history(self):
        """Resets the token counter and clears every inner agent's chat history."""
        self.usage_tokens = 0
        for agent in self.agent_lists:
            agent.clear_history()

    def clear_history_facts(self):
        """Clears the facts cached from previous query rounds."""
        self.history_facts = []

    def add_system_prompt_kv(self, kv: dict):
        """Adds the given key-value pairs to every inner agent's system prompt."""
        for agent in self.agent_lists:
            agent.add_system_prompt_kv(kv=kv)

    def del_system_prompt_kv(self, key: str):
        """Deletes the specified key from the system prompt key-value pairs for the agent."""
        for agent in self.agent_lists:
            agent.del_system_prompt_kv(key=key)

    def clear_system_prompt_kv(self):
        """Removes all extra key-value settings from every inner agent's system prompt."""
        for agent in self.agent_lists:
            agent.clear_system_prompt_kv()

    def run(self, inputs: dict) -> dict:
        """
        Iteratively let the master agent write SQL, execute it, and feed the
        (understood) result back, until the agent stops emitting SQL or the
        iteration budget is exhausted; then the summary agent answers.

        inputs:
            - messages: list[dict] # message list; each element is a dict with role and content

        Returns:
            dict with "content" (final answer) and "usage_tokens" (tokens used
            by this call).
        """
        debug_mode = os.getenv("DEBUG", "0") == "1"
        logger = get_logger()
        usage_tokens = 0
        same_sqls = {}  # SQL already executed -> its result, to block repeats
        told_specific_columns = set()  # "table.column" hints already revealed

        if "messages" not in inputs:
            raise KeyError("发生异常: inputs缺少'messages'字段")

        # Split incoming messages: schema notes go to the system prompt,
        # everything else becomes the chat transcript.
        db_structs = []
        messages = []
        for msg in inputs["messages"]:
            if COLUMN_LIST_MARK in msg["content"]:
                db_structs.append(msg["content"])
                if len(db_structs) > self.max_db_struct_num:
                    db_structs.pop(0)
                self.agent_master.add_system_prompt_kv({"KNOWN DATABASE STRUCTURE": "\n\n---\n\n".join(db_structs)})
                self.agent_understand_query_result.add_system_prompt_kv(
                    {"KNOWN DATABASE STRUCTURE": "\n\n---\n\n".join(db_structs)}
                )
            else:
                messages.append(msg)
        local_db_structs = copy.deepcopy(db_structs)

        # assumes at least one non-schema message is present — an all-schema
        # input would raise IndexError here (TODO confirm with callers).
        first_user_msg = messages[-1]["content"]
        if len(self.history_facts) > 0:
            messages[-1]["content"] = (
                "之前已查询到信息如下:\n" + "\n---\n".join(self.history_facts) + "\n\n请问:" + first_user_msg
            )

        iterate_num = 0
        is_finish = False
        answer = ""
        while iterate_num < self.max_iterate_num:
            iterate_num += 1
            answer, tkcnt_1 = self.agent_master.chat(messages=messages)
            usage_tokens += tkcnt_1

            if "```exec_sql" in answer and ("SELECT " in answer or "SHOW " in answer):
                sql_cnt = count_total_sql(
                    query_string=answer,
                    block_mark="exec_sql",
                )
                if sql_cnt > 1:
                    # More than one statement: push back with the single-SQL rule.
                    emphasize = "一次仅允许给出一组待执行的SQL写到代码块```exec_sql ```中"
                    if emphasize not in messages[-1]["content"]:
                        messages[-1]["content"] += f"\n\n{emphasize}"
                else:
                    sql = extract_last_sql(
                        query_string=answer,
                        block_mark="exec_sql",
                    )
                    if sql is None:
                        emphasize = "请务必需要把待执行的SQL写到代码块```exec_sql ```中"
                        if emphasize not in messages[-1]["content"]:
                            messages[-1]["content"] += f"\n\n{emphasize}"
                    else:
                        messages.append(
                            {
                                "role": "assistant",
                                "content": answer,
                            }
                        )
                        if sql in same_sqls:
                            # Exact repeat: replay the cached result and ask for a new idea.
                            emphasize = (
                                f"下面的sql已经执行过:\n{sql}\n结果是:\n{same_sqls[sql]}\n"
                                "请不要重复执行,考虑其它思路:\n"
                                "如果遇到字段不存在的错误,可以用`SELECT * FROM database_name.table_name LIMIT 1;`来查看这个表的字段值的形式;\n"
                                "如果原SQL过于复杂,可以考虑先查询简单SQL获取必要信息再逐步推进;\n"
                            )
                            messages.append(
                                {
                                    "role": "user",
                                    "content": emphasize,
                                }
                            )
                        else:
                            # Reveal column-description hints for columns first used in this SQL.
                            need_tell_cols = []
                            for t_name, cols in self.specific_column_desc.items():
                                if t_name in sql:
                                    for col_name in cols:
                                        if (
                                            col_name in sql
                                            and f"{t_name}.{col_name}" not in told_specific_columns
                                            and not any(col_name in db_struct for db_struct in db_structs)
                                        ):
                                            need_tell_cols.append({col_name: cols[col_name]})
                                            told_specific_columns.add(f"{t_name}.{col_name}")
                            if len(need_tell_cols) > 0:
                                local_db_structs.append(json.dumps(need_tell_cols, ensure_ascii=False))
                                self.agent_master.add_system_prompt_kv(
                                    {"KNOWN DATABASE STRUCTURE": "\n\n---\n\n".join(local_db_structs)}
                                )
                                self.agent_understand_query_result.add_system_prompt_kv(
                                    {"KNOWN DATABASE STRUCTURE": "\n\n---\n\n".join(local_db_structs)}
                                )
                            try:
                                data = self.execute_sql_query(sql=sql)
                                rows = json.loads(data)
                                if len(rows) == 0:  # empty result
                                    messages.append(
                                        {
                                            "role": "user",
                                            "content": (
                                                f"查询SQL:\n{sql}\n查询结果:\n{data}\n"
                                                + (
                                                    ""
                                                    if len(need_tell_cols) == 0
                                                    else "\n补充字段说明如下:\n"
                                                    + json.dumps(need_tell_cols, ensure_ascii=False)
                                                )
                                                + "\n请检查筛选条件是否存在问题,比如时间日期字段没有用DATE()或YEAR()格式化?当然,如果没问题,那么就根据结果考虑下一步"
                                            ),
                                        }
                                    )
                                elif self.default_sql_limit is not None and len(rows) == self.default_sql_limit:
                                    # Result size equals the silent limit: warn about possible truncation.
                                    messages.append(
                                        {
                                            "role": "user",
                                            "content": (
                                                f"查询SQL:\n{sql}\n查询结果:\n{data}\n"
                                                + (
                                                    ""
                                                    if len(need_tell_cols) == 0
                                                    else "\n补充字段说明如下:\n"
                                                    + json.dumps(need_tell_cols, ensure_ascii=False)
                                                )
                                                + f"\n请注意,这里返回的不一定是全部结果,因为默认限制了只返回{self.default_sql_limit}个,你可以根据现在看到的情况,采取子查询的方式去进行下一步"
                                            ),
                                        }
                                    )
                                else:
                                    # Normal result: have the interpreter agent restate it as facts.
                                    facts, tkcnt_1 = self.agent_understand_query_result.answer(
                                        (
                                            f"查询SQL:\n{sql}\n查询结果:\n{data}\n"
                                            + (
                                                ""
                                                if len(need_tell_cols) == 0
                                                else "\n补充字段说明如下:\n"
                                                + json.dumps(need_tell_cols, ensure_ascii=False)
                                            )
                                            + "\n请理解查询结果"
                                        )
                                    )
                                    if self.is_cache_history_facts:
                                        self.history_facts.append(facts)
                                    usage_tokens += tkcnt_1
                                    messages.append(
                                        {
                                            "role": "user",
                                            "content": (
                                                f"查询SQL:\n{sql}\n查询结果:\n{data}\n"
                                                + (
                                                    ""
                                                    if len(need_tell_cols) == 0
                                                    else "\n补充字段说明如下:\n"
                                                    + json.dumps(need_tell_cols, ensure_ascii=False)
                                                )
                                                + (f"\n{facts}\n" if facts != "" else "\n")
                                                + "\n请检查筛选条件是否存在问题,比如时间日期字段没有用DATE()或YEAR()格式化?当然,如果没问题,那么就根据结果考虑下一步;"
                                                + f'那么当前掌握的信息是否能够回答"{first_user_msg}"?还是要继续执行下一阶段SQL查询?'
                                            ),
                                        }
                                    )

                                same_sqls[sql] = data
                            except Exception as e:
                                messages.append(
                                    {
                                        "role": "user",
                                        "content": (
                                            f"查询SQL:\n{sql}\n查询发生异常:{str(e)}\n"
                                            + (
                                                ""
                                                if len(need_tell_cols) == 0
                                                else "\n补充字段说明如下:\n"
                                                + json.dumps(need_tell_cols, ensure_ascii=False)
                                            )
                                            + "\n请修正"
                                        ),
                                    }
                                )
                                same_sqls[sql] = f"查询发生异常:{str(e)}"
            else:
                # No SQL emitted: the master agent considers the task done.
                messages.append(
                    {
                        "role": "assistant",
                        "content": answer,
                    }
                )
                is_finish = True
                break
        if not is_finish:
            if debug_mode:
                print(f"Workflow【{self.name}】迭代次数超限({self.max_iterate_num}),中断并退出")
                logger.debug("Workflow【%s】迭代次数超限(%d),中断并退出", self.name, self.max_iterate_num)

        # The summary agent always produces the final answer from the last exchange.
        answer, tkcnt_1 = self.agent_summary.chat(
            messages[-2:] + [{"role": "user", "content": f'''充分尊重前面给出的结论,回答问题:"{first_user_msg}"'''}]
        )
        usage_tokens += tkcnt_1

        self.usage_tokens += usage_tokens
        return {
            "content": answer,
            "usage_tokens": usage_tokens,
        }
412 |
413 |
414 | class CheckDbStructure(Workflow):
415 | """
416 | Implements the functionality to check database structure, inheriting from Workflow.
417 | """
418 |
    def __init__(
        self,
        dbs_info: str,
        db_table: dict,
        table_column: dict,
        db_selector_llm: LLM,
        table_selector_llm: LLM,
        column_selector_llm: LLM,
        name: Optional[str] = None,
        db_select_post_process: Optional[Callable[[list], list]] = None,
        table_select_post_process: Optional[Callable[[list], list]] = None,
        import_column_names: Optional[set] = None,
        foreign_key_hub: Optional[dict] = None,
    ):
        """
        Args:
            dbs_info: database overview, injected as the db-selector's knowledge.
            db_table: mapping of database name -> metadata with a "表" list.
            table_column: mapping of table name -> column descriptions.
            db_selector_llm: LLM for the database-selection agent.
            table_selector_llm: LLM for the table-selection agent.
            column_selector_llm: LLM for the column-selection agent.
            name: workflow name; defaults to "Check_db_structure".
            db_select_post_process: optional hook applied to the selected db list.
            table_select_post_process: optional hook applied to the selected table list.
            import_column_names: column names always kept when filtering columns.
            foreign_key_hub: {table: columns} always appended so joins stay possible.
        """
        self.name = "Check_db_structure" if name is None else name
        self.dbs_info = dbs_info
        self.db_table = db_table
        self.table_column = table_column
        self.usage_tokens = 0
        # NOTE(review): falls back to {} (an empty dict, not a set); membership
        # tests still work, but consider set() for consistency with the annotation.
        self.import_column_names = import_column_names if import_column_names is not None else {}
        self.db_select_post_process = db_select_post_process
        self.table_select_post_process = table_select_post_process
        self.foreign_key_hub = foreign_key_hub if foreign_key_hub is not None else {}

        # Selector agent: picks the relevant database(s) for a question.
        self.agent_db_selector = Agent(
            AgentConfig(
                name=self.name + ".db_selector",
                role=(
                    """你是一个数据分析专家。根据用户的提问,从已知的数据库中,选出一个或多个数据库名,"""
                    """判断可以从这些库中获取到用户所需要的信息。"""
                    """请选择能最快获取到用户所需信息的数据库名,不要舍近求远。只需要说明思考过程并给出数据库名即可。"""
                ),
                output_format=(
                    """输出模板示例:\n"""
                    """【分析】\n"""
                    """分析用户的提问\n"""
                    """【选中的数据库】\n"""
                    """(选出必要的数据库,不是越多越好)\n"""
                    """- database_name: 这个数据库包含哪些会被用到的信息\n"""
                    """【选中的数据库的清单】\n"""
                    """```json\n"""
                    """["database_name", "database_name"]\n"""
                    """```\n"""
                ),
                llm=db_selector_llm,
                knowledge=json.dumps(dbs_info, ensure_ascii=False),
                enable_history=False,
                stream=False,
            )
        )
        # Selector agent: picks the relevant tables within the chosen databases.
        self.agent_table_selector = Agent(
            AgentConfig(
                name=self.name + ".table_selector",
                role=(
                    """你是一个数据分析专家,从已知的数据表中,根据需要选出一个或多个表名。"""
                    """请尽可能选择能最合适的表名。"""
                ),
                output_format=(
                    """输出模板示例:\n"""
                    """【分析】\n"""
                    """分析用户的提问\n"""
                    """【选中的数据表】\n"""
                    """(选出必要的数据表,不是越多越好)\n"""
                    """- database_name.table_name: 这个数据表包含哪些会被用到的信息\n"""
                    """【选中的数据库表的清单】\n"""
                    """```json\n"""
                    """["database_name.table_name", "database_name.table.name"]\n"""
                    """```\n"""
                    """给出的表名应该是库名和表名的组合(database_name.table_name)"""
                ),
                llm=table_selector_llm,
                enable_history=False,
                stream=False,
            )
        )
        # Selector agent: picks the relevant columns within the chosen tables.
        self.agent_column_selector = Agent(
            AgentConfig(
                name=self.name + ".columns_selector",
                role=(
                    """你是一个数据分析专家,从已知的数据表字段中,根据用户的问题,找出所有相关的字段名。"""
                    """请不要有遗漏!"""
                ),
                output_format=(
                    """输出模板示例:\n"""
                    """【分析】\n"""
                    """分析用户的提问\n"""
                    """【当前的表之间相互关联的字段】\n"""
                    """(考虑表之间的关联,把关联的字段选出来)\n"""
                    """表A和表B之间: ...\n"""
                    """表A和表C之间: ...\n"""
                    """【信息所在字段】\n"""
                    """(选出跟用户提问相关的信息字段,没有遗漏)\n"""
                    """- database_name.table_name.column_name: 这个字段可能包含xx信息,对应用户提问中的xxx\n"""
                    """【筛选条件所在字段】\n"""
                    """(选出跟用户提问相关的条件字段,没有遗漏)\n"""
                    """(跟条件字段有外键关联的字段冗余选上,因为联表查询要用到)\n"""
                    """- database_name.table_name.column_name: 这个字段可能包含xx信息,对应用户提问中的xxx\n"""
                    """【选中的字段的清单】\n"""
                    """(把同一个表的字段聚合在这个表名[database_name.table_name]下面)\n"""
                    """```json\n"""
                    """{"database_name.table_name": ["column_name", "column_name"],"database_name.table_name": ["column_name", "column_name"]}\n"""
                    """```\n"""
                ),
                llm=column_selector_llm,
                enable_history=False,
                stream=False,
            )
        )
        self.agent_lists = [
            self.agent_db_selector,
            self.agent_table_selector,
            self.agent_column_selector,
        ]
532 |
533 | def get_table_list(self, dbs: list[str]) -> str:
534 | """
535 | Retrieves a list of tables for each specified database.
536 |
537 | Parameters:
538 | dbs (list[str]): A list of database names.
539 |
540 | Returns:
541 | str: A formatted string containing the table information for each database.
542 | """
543 | table_list = []
544 | for db_name in dbs:
545 | if db_name not in self.db_table:
546 | raise KeyError(f"发生异常: 数据库名`{db_name}`不存在")
547 | for table in self.db_table[db_name]["表"]:
548 | table_list.append({"表名": f"{db_name}.{table['表英文']}", "说明": table["cols_summary"]})
549 | result = "数据库表信息如下:\n" + json.dumps(table_list, ensure_ascii=False) + "\n"
550 | return result
551 |
552 | def get_column_list(self, tables: list[str]) -> str:
553 | """
554 | tables: list of table names, format is database_name.table_name
555 | """
556 |
557 | column_lists = []
558 | for table in tables:
559 | if "." not in table or table.count(".") != 1:
560 | raise ValueError(f"发生异常: 表名`{table}`格式不正确,应该为database_name.table_name")
561 | db_name, table_name = table.split(".")
562 | if db_name not in self.db_table:
563 | raise KeyError(f"发生异常: 数据库名`{db_name}`不存在")
564 | if any(t["表英文"] == table_name for t in self.db_table[db_name]["表"]):
565 | column_lists.append(
566 | {
567 | "表名": table,
568 | "表字段": self.table_column[table_name],
569 | }
570 | )
571 | result = f"已取得可用的{COLUMN_LIST_MARK}:\n" + json.dumps(column_lists, ensure_ascii=False) + "\n"
572 | return result
573 |
574 | def filter_column_list(self, tables: list[str], column_filter: dict) -> str:
575 | """
576 | tables: list of table names, format is database_name.table_name
577 | column_filter: dict{"table_name":["col1", "col2"]}
578 | """
579 | column_lists = []
580 | for table in tables:
581 | if "." not in table or table.count(".") != 1:
582 | raise ValueError(f"发生异常: 表名`{table}`格式不正确,应该为database_name.table_name")
583 | if table not in column_filter:
584 | continue
585 | db_name, table_name = table.split(".")
586 | if db_name not in self.db_table:
587 | raise KeyError(f"发生异常: 数据库名`{db_name}`不存在")
588 | if any(t["表英文"] == table_name for t in self.db_table[db_name]["表"]):
589 | column_list = {
590 | "表名": table,
591 | "表字段": [],
592 | }
593 | for col in self.table_column[table_name]:
594 | if col["column"] in column_filter[table] or col["column"] in self.import_column_names:
595 | column_list["表字段"].append(col)
596 | column_lists.append(column_list)
597 | for table, cols in self.foreign_key_hub.items():
598 | if table not in tables:
599 | column_list = {
600 | "表名": table,
601 | "表字段": [],
602 | }
603 | db_name, table_name = table.split(".")
604 | for col in self.table_column[table_name]:
605 | if col["column"] in cols or col["column"] in self.import_column_names:
606 | column_list["表字段"].append(col)
607 | column_lists.append(column_list)
608 | result = f"已取得可用的{COLUMN_LIST_MARK}:\n" + json.dumps(column_lists, ensure_ascii=False) + "\n"
609 | return result
610 |
611 | def clear_history(self):
612 | self.usage_tokens = 0
613 | for agent in self.agent_lists:
614 | agent.clear_history()
615 |
616 | def add_system_prompt_kv(self, kv: dict):
617 | for agent in self.agent_lists:
618 | agent.add_system_prompt_kv(kv=kv)
619 |
620 | def del_system_prompt_kv(self, key: str):
621 | """Deletes the specified key from the system prompt key-value pairs for the agent."""
622 | for agent in self.agent_lists:
623 | agent.del_system_prompt_kv(key=key)
624 |
625 | def clear_system_prompt_kv(self):
626 | for agent in self.agent_lists:
627 | agent.clear_system_prompt_kv()
628 |
629 | def run(self, inputs: dict) -> dict:
630 | """
631 | inputs:
632 | - messages: list[dict] # 消息列表,每个元素是一个dict,包含role和content
633 | """
634 |
635 | debug_mode = True
636 | logger = get_logger()
637 | usage_tokens = 0
638 |
639 | if "messages" not in inputs:
640 | raise KeyError("发生异常: inputs缺少'messages'字段")
641 |
642 | messages = []
643 | for msg in inputs["messages"]:
644 | if COLUMN_LIST_MARK not in msg["content"]:
645 | messages.append(msg)
646 |
647 | for _ in range(3):
648 | try:
649 | answer, tk_cnt = self.agent_db_selector.chat(
650 | messages=messages + [{"role": "user", "content": "请选择db,务必遵循输出的格式要求。"}]
651 | )
652 | usage_tokens += tk_cnt
653 | args_json = extract_last_json(answer)
654 | if args_json is not None:
655 | dbs = json.loads(args_json)
656 | if self.db_select_post_process is not None:
657 | dbs = self.db_select_post_process(dbs)
658 | table_list = self.get_table_list(dbs=dbs)
659 | break
660 | except Exception as e:
661 | if debug_mode:
662 | print(f"\nagent_db_selector 遇到问题: {str(e)}, 现在重试...\n")
663 | logger.debug("\nagent_db_selector 遇到问题: %s, 现在重试...\n", str(e))
664 |
665 | # 选择数据表
666 | for _ in range(3):
667 | try:
668 | answer, tk_cnt = self.agent_table_selector.chat(
669 | messages=messages
670 | + [{"role": "user", "content": f"{table_list}\n请选择table,务必遵循输出的格式要求。"}]
671 | )
672 | usage_tokens += tk_cnt
673 | args_json = extract_last_json(answer)
674 | if args_json is not None:
675 | tables = json.loads(args_json)
676 | if self.table_select_post_process is not None:
677 | tables = self.table_select_post_process(tables)
678 | column_list = self.get_column_list(tables=tables)
679 | break
680 | except Exception as e:
681 | if debug_mode:
682 | print(f"\n遇到问题: {str(e)}, 现在重试...\n")
683 | logger.debug("\nagent_table_selector 遇到问题: %s, 现在重试...\n", str(e))
684 |
685 | # 筛选字段
686 | for _ in range(3):
687 | try:
688 | answer, tk_cnt = self.agent_column_selector.chat(
689 | messages=messages
690 | + [{"role": "user", "content": f"{column_list}\n请选择column,务必遵循输出的格式要求。"}]
691 | )
692 | usage_tokens += tk_cnt
693 | args_json = extract_last_json(answer)
694 | if args_json is not None:
695 | column_filter = json.loads(args_json)
696 | column_list = self.filter_column_list(tables=tables, column_filter=column_filter)
697 | break
698 | except Exception as e:
699 | if debug_mode:
700 | print(f"\n遇到问题: {str(e)}, 现在重试...\n")
701 | logger.debug("\nagent_column_selector 遇到问题: %s, 现在重试...\n", str(e))
702 |
703 | self.usage_tokens += usage_tokens
704 | return {
705 | "content": column_list,
706 | "usage_tokens": usage_tokens,
707 | }
708 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | This module provides specific functions for Game Finglm
3 | """
4 |
5 | import os
6 | import re
7 | import jieba
8 | import json
9 | import requests
10 | from src.log import get_logger
11 | from src.agent import Agent
12 | from src.utils import extract_last_sql, extract_last_json
13 | from src.workflow import COLUMN_LIST_MARK
14 | import config
15 |
16 |
def execute_sql_query(sql: str) -> str:
    """
    Executes an SQL query using the specified API endpoint and returns the result as a string.

    Args:
        sql (str): The SQL query to be executed.

    Returns:
        str: JSON-encoded rows returned by the query service.

    Raises:
        RuntimeError: if the request times out or the service reports a failure.
        SyntaxError: if multiple SQL statements were submitted at once.
    """
    debug_mode = os.getenv("DEBUG", "0") == "1"
    sql = sql.replace("\\n", " ")  # flatten literal "\n" sequences the LLM may emit
    url = "https://comm.chatglm.cn/finglm2/api/query"
    access_token = os.getenv("ZHIPU_ACCESS_TOKEN", "")
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {access_token}"}
    logger = get_logger()
    logger.info("\n>>>>> 查询sql:\n%s\n", sql)
    if debug_mode:
        # Fixed typo: message previously read "查询ql".
        print(f"\n>>>>> 查询sql:\n{sql}")
    try:
        response = requests.post(
            url, headers=headers, json={"sql": sql, "limit": config.MAX_SQL_RESULT_ROWS}, timeout=30
        )
    except requests.exceptions.Timeout as exc:
        logger.info("请求超时,无法执行SQL查询,请优化SQL")
        if debug_mode:
            print("请求超时,无法执行SQL查询,请优化SQL")
        raise RuntimeError("执行SQL查询超时,请优化SQL后重试。") from exc
    result = response.json()
    if result.get("success") is True:
        data = json.dumps(result["data"], ensure_ascii=False)
        logger.info("查询结果:\n%s\n", data)
        if debug_mode:
            print(f"查询结果:\n{data}")
        return data
    # Robustness: an unexpected error payload may lack "detail"; fall back to
    # the raw body instead of raising a KeyError here.
    detail = result.get("detail", str(result))
    logger.info("查询失败: %s\n", detail)
    if debug_mode:
        print("查询失败:" + detail)
    if "Commands out of sync" in detail:
        raise SyntaxError("不能同时执行多组SQL: " + detail)
    raise RuntimeError(detail)
58 |
59 |
def keep_db_column_info(agent: Agent, messages: list[dict]) -> None:
    """Stores knowledge from messages into the agent.

    Scans the chat messages and, for every one that carries the column-list
    marker, saves its content into the agent's system prompt under the key
    "Known Database Structure" so later turns retain the schema.

    Args:
        agent: the agent whose system prompt receives the schema text.
        messages: chat messages; each dict is expected to have a "content" key.
    """
    for msg in messages:
        if COLUMN_LIST_MARK in msg["content"]:
            agent.add_system_prompt_kv(kv={"Known Database Structure": msg["content"]})
65 |
66 |
def extract_and_execute_sql(message: str) -> str:
    """
    Extracts SQL from a message and executes it, returning the result.

    Args:
        message (str): The message containing the SQL query.

    Returns:
        str: The original message followed by the executed SQL and its result,
            or the message unchanged when it contains no SQL at all.

    Raises:
        RuntimeError: when SELECT text is present but not wrapped in a
            ```sql``` code block.
    """
    sql = extract_last_sql(query_string=message, block_mark="sql")
    if sql is not None:
        result = execute_sql_query(sql=sql)
        return f"{message}\n执行SQL:\n{sql}查询结果是:\n{result}"
    if "SELECT" in message:
        raise RuntimeError("请把sql写到代码块```sql```中")
    return message
88 |
89 |
def get_constant_column_list(table_column: dict) -> list:
    """
    Retrieves a list of basic columns for constant tables based on the provided table column data.

    Args:
        table_column (dict): Maps bare table names (without database prefix) to
            their column definition dicts (each having a "column" key).

    Returns:
        list: One dict per constant table, holding the full table name ("表名")
            and the subset of its columns ("表字段") found in the whitelist.
    """
    # Whitelisted "basic" columns per constant table.
    constant_tables = {
        "constantdb.secumain": {
            "InnerCode",
            "CompanyCode",
            "SecuCode",
            "ChiName",
            "ChiNameAbbr",
            "EngName",
            "EngNameAbbr",
            "SecuAbbr",
        },
        "constantdb.hk_secumain": {
            "InnerCode",
            "CompanyCode",
            "SecuCode",
            "ChiName",
            "ChiNameAbbr",
            "EngName",
            "EngNameAbbr",
            "SecuAbbr",
            "FormerName",
        },
        "constantdb.us_secumain": {
            "InnerCode",
            "CompanyCode",
            "SecuCode",
            "ChiName",
            "EngName",
            "SecuAbbr",
        },
        "constantdb.ct_systemconst": {"LB", "LBMC", "MS", "DM"},
        "constantdb.lc_areacode": {
            "AreaInnerCode",
            "ParentNode",
            "IfEffected",
            "AreaChiName",
            "ParentName",
            "AreaEngName",
            "AreaEngNameAbbr",
            "FirstLevelCode",
            "SecondLevelCode",
        },
        "astockindustrydb.lc_conceptlist": {
            "ClassCode",
            "ClassName",
            "SubclassCode",
            "SubclassName",
            "ConceptCode",
            "ConceptName",
            "ConceptEngName",
        },
    }
    return [
        {
            "表名": full_name,
            "表字段": [
                col for col in table_column[full_name.split(".")[1]] if col["column"] in wanted
            ],
        }
        for full_name, wanted in constant_tables.items()
    ]
164 |
165 |
def ajust_org_question(question: str) -> str:
    """Normalize the original question: rewrite "合并报表调整后" as "合并报表"."""
    # str.replace is a no-op when the phrase is absent, so no guard is needed.
    return question.replace("合并报表调整后", "合并报表")
170 |
171 |
def query_company(name: str) -> str:
    """Look up a company/security by name or code across the three secumain tables.

    Matches exact SecuCode or fuzzy (LIKE) name/abbreviation/spelling columns
    in the A-share, HK and US security master tables.

    Args:
        name: company name fragment, abbreviation, pinyin spelling or code.

    Returns:
        str: JSON array string of matching rows ("[]" when name is empty).
    """
    if name == "":
        return "[]"
    # SECURITY: the query service offers no parameter binding, so at minimum
    # escape single quotes to keep user-supplied names from breaking the SQL.
    name = name.replace("'", "''")
    sql = f"""SELECT 'constantdb.secumain' AS TableName, InnerCode, CompanyCode,
ChiName, EngName, SecuCode, ChiNameAbbr, EngNameAbbr, SecuAbbr, ChiSpelling
FROM constantdb.secumain
WHERE SecuCode = '{name}'
OR ChiName LIKE '%{name}%'
OR ChiNameAbbr LIKE '%{name}%'
OR EngName LIKE '%{name}%'
OR EngNameAbbr LIKE '%{name}%'
OR SecuAbbr LIKE '%{name}%'
OR ChiSpelling LIKE '%{name}%'
UNION ALL
SELECT 'constantdb.hk_secumain' AS TableName, InnerCode, CompanyCode,
ChiName, EngName, SecuCode, ChiNameAbbr, EngNameAbbr, SecuAbbr, ChiSpelling
FROM constantdb.hk_secumain
WHERE SecuCode = '{name}'
OR ChiName LIKE '%{name}%'
OR ChiNameAbbr LIKE '%{name}%'
OR EngName LIKE '%{name}%'
OR EngNameAbbr LIKE '%{name}%'
OR SecuAbbr LIKE '%{name}%'
OR FormerName LIKE '%{name}%'
OR ChiSpelling LIKE '%{name}%'
UNION ALL
SELECT 'constantdb.us_secumain' AS TableName, InnerCode, CompanyCode,
ChiName, EngName, SecuCode, null as ChiNameAbbr, null as EngNameAbbr, SecuAbbr, ChiSpelling
FROM constantdb.us_secumain
WHERE SecuCode = '{name}'
OR ChiName LIKE '%{name}%'
OR EngName LIKE '%{name}%'
OR SecuAbbr LIKE '%{name}%'
OR ChiSpelling LIKE '%{name}%';"""
    return execute_sql_query(sql)
208 |
209 |
def seg_entities(entity: str) -> list[str]:
    """Tokenize ``entity`` with jieba and drop generic company/fund stopwords."""
    drop = {"公司", "基金", "管理", "有限", "有限公司"}
    return [token for token in jieba.cut(entity, cut_all=False) if token not in drop]
215 |
216 |
def extract_company_code(llm_answer: str) -> str:
    """Extracts company codes from the given LLM answer.

    Args:
        llm_answer (str): The answer from the LLM containing company information.

    Returns:
        str: A formatted string with extracted company codes, one line per
            entity name; empty string when nothing could be extracted.
    """
    debug_mode = os.getenv("DEBUG", "0") == "1"
    logger = get_logger()
    results = []
    try:
        # The LLM answer is expected to end with a JSON list of entity names.
        names_json = extract_last_json(llm_answer)
        if names_json is not None:
            names = json.loads(names_json)
            if not isinstance(names, list):
                raise ValueError("names should be a list")
            for name in names:
                # query_company returns a JSON array string of matching rows.
                rows = json.loads(query_company(name))
                if len(rows) > 0:
                    # Header differs depending on whether one or many rows matched.
                    info = f"{name}的关联信息有:[" if len(rows) == 1 else f"{name}关联信息有多组:["
                    for idx, row in enumerate(rows):
                        col_chi = {}
                        if "TableName" in row:
                            # Per-table mapping of column name -> Chinese description.
                            col_chi = config.column_mapping[row["TableName"]]
                        for k, v in dict(row).items():
                            if k == "TableName":
                                info += f"所在数据表是{v};"
                                continue
                            if k in col_chi:
                                info += f"{k}({col_chi[k]})是{v};"
                            else:
                                info += f"{k}是{v};"
                        # Close the bracket after the last row, otherwise separate groups.
                        info += "]" if idx == len(rows) - 1 else "],"
                    results.append(info)

    except Exception as e:
        # Best-effort extraction: on any failure, return whatever was collected.
        if debug_mode:
            print(f"extract_company_code::Exception:{str(e)}")
        logger.debug("extract_company_code::Exception:%s", str(e))
    return "\n".join(results)
259 |
260 |
def foreign_key_hub() -> dict:
    """Tables whose key columns must always stay visible so joins remain possible."""
    shared = {"InnerCode", "CompanyCode", "SecuCode", "SecuAbbr"}
    return {
        "constantdb.secumain": shared | {"ChiNameAbbr"},
        "constantdb.hk_secumain": shared | {"ChiNameAbbr"},
        "constantdb.us_secumain": set(shared),
    }
267 |
268 |
def db_select_post_process(dbs: list[str]) -> list[str]:
    """Keep the three market basic-info databases together.

    When at least one (but not all) of the A-share/HK/US basic-info databases
    was selected, the missing ones are appended, since related questions often
    span markets.
    """
    debug_mode = os.getenv("DEBUG", "0") == "1"
    logger = get_logger()
    required_dbs = {"astockbasicinfodb", "hkstockdb", "usstockdb"}
    missing_dbs = required_dbs.difference(dbs)
    # Only top up when the selection partially overlaps the required group.
    if missing_dbs and missing_dbs != required_dbs:
        if debug_mode:
            print("补充选择db: " + json.dumps(list(missing_dbs), ensure_ascii=False))
        logger.debug("补充选择db: %s", json.dumps(list(missing_dbs), ensure_ascii=False))
        dbs.extend(missing_dbs)

    return list(dbs)
283 |
284 |
def table_select_post_process(tables: list[str]) -> list[str]:
    """Complete partially-selected groups of related tables.

    Each set below lists tables that typically answer the same kind of
    question; when the model picked some but not all of a group, the rest of
    that group is appended.
    """
    debug_mode = os.getenv("DEBUG", "0") == "1"
    logger = get_logger()

    required_tables_list = [
        {
            "astockbasicinfodb.lc_stockarchives",
            "hkstockdb.hk_stockarchives",
            "usstockdb.us_companyinfo",
            "constantdb.lc_areacode",
        },
        {"astockmarketquotesdb.qt_dailyquote", "hkstockdb.cs_hkstockperformance", "usstockdb.us_dailyquote"},
        {"astockmarketquotesdb.qt_stockperformance", "hkstockdb.cs_hkstockperformance"},
        {"publicfunddb.mf_fundprodname", "publicfunddb.mf_fundarchives"},
        {"astockmarketquotesdb.lc_suspendresumption", "constantdb.hk_secumain", "constantdb.us_secumain"},
        {"astockmarketquotesdb.qt_dailyquote", "astockmarketquotesdb.cs_stockpatterns"},
        {"astockshareholderdb.lc_sharestru", "astockshareholderdb.lc_mainshlistnew"},
    ]

    for required_tables in required_tables_list:
        # Recomputed each pass because `tables` grows as groups are completed.
        missing_tables = required_tables.difference(tables)
        if missing_tables and missing_tables != required_tables:
            if debug_mode:
                print("\n补充选择table: " + json.dumps(list(missing_tables), ensure_ascii=False))
            logger.debug("\n补充选择table: %s", json.dumps(list(missing_tables), ensure_ascii=False))
            tables.extend(missing_tables)
    return tables
314 |
--------------------------------------------------------------------------------
/baseline/howard_baseline/workflows.py:
--------------------------------------------------------------------------------
"""This module initializes Workflows."""

import config
from src.workflow import SqlQuery, CheckDbStructure
from utils import execute_sql_query, db_select_post_process, table_select_post_process, foreign_key_hub

# Module-level SqlQuery workflow instance shared by the application.
# Uses the "plus" LLM from config and caches facts established in earlier
# iterations (cache_history_facts=True).
sql_query = SqlQuery(
    execute_sql_query=execute_sql_query,  # callable that runs generated SQL against the query API
    llm=config.llm_plus,
    max_iterate_num=config.MAX_ITERATE_NUM,  # iteration cap (per name; exact semantics defined in SqlQuery)
    cache_history_facts=True,
    specific_column_desc=config.enum_columns,  # extra descriptions for enum-valued columns
    default_sql_limit=config.MAX_SQL_RESULT_ROWS,  # default row limit applied to queries
)
15 | sql_query.agent_master.add_system_prompt_kv(
16 | {
17 | "EXTEND INSTRUCTION": (
18 | """- 如果Company和InnerCode都搜不到,那么要考虑股票代码\n"""
19 | """- CompanyCode跟InnerCode不对应,不能写`CompanyCode`=`InnerCode`,可以通过constantdb.secumain、constantdb.hk_secumain或constantdb.us_secumain换取对方\n"""
20 | """- 涉及股票价格时:\n"""
21 | """ - 筛选是否新高,要选择`最高价`字段(HighPrice),而非收盘价(ClosePrice),比如月度新高要看月最高价(HighPriceRM),年度新高要看年最高价(HighPriceRY),周新高要看周最高价(HighPriceRW)\n"""
22 | """- ConceptCode是数字,不是字符串\n"""
23 | """- 在lc_actualcontroller中只有1条记录也代表实控人发生了变更\n"""
24 | """- 如果用户的前一条提问里提及某实体,那么后续追问虽未明说,但也应该是跟该实体相关\n"""
25 | """- 注意观察同一个表中的类型字段,结合用户的问题,判断是否要进行类型筛选\n"""
26 | """- 如果用户提问是希望知道名字,那么要把名字查出来\n"""
27 | """- 中国的城市的AreaInnerCode是constantdb.lc_areacode里ParentName为'中国'的,你不应该也并不能获取到所有中国的城市代码,所以你需要用联表查询\n"""
28 | """- 我们的数据库查询是有一个默认的LIMIT的,这是个重要的信息,当你的SQL没有明确LIMIT的时候,你要知道获取到的数据可能不是全部。\n"""
29 | """- 如果用户提问涉及某个年度的“年度报告”,默认该报告是在次年发布。例如,“2019年年度报告”是在2020年发布的。\n"""
30 | """- 季度报告通常在下一个季度发布,例如,第一季度的报告会在第二季度发布。\n"""
31 | """- 如果用户想知道子类概念的名称,你应该去获取astockindustrydb.lc_conceptlist的ConceptName和ConceptCode\n"""
32 | """- A股公司的基本信息在astockbasicinfodb.lc_stockarchives, 港股的在hkstockdb.hk_stockarchives, 美股的在usstockdb.us_companyinfo\n"""
33 | """- A股公司的上市基本信息在constantdb.secumain, 港股的在constantdb.hk_secumain, 美股的在constantdb.us_secumain\n"""
34 | """- 作为筛选条件的名称,请务必分清楚它是公司名、人名还是其他什么名称,避免用错字段\n"""
35 | """- 但凡筛选条件涉及到字符串匹配的,都采取模糊匹配,增加匹配成功概率\n"""
36 | """- 比例之间的加减乘除,要务必保证算子是统一单位的,比如3%其实是0.03,0.02其实是2%\n"""
37 | """- 时间日期字段都需要先做`DATE()`或`YEAR()`格式化再参与SQL的筛选条件,否则就扣你20美元罚款\n"""
38 | """- 关于概念,可以同时把ConceptName、SubclassName、ClassName查询出来,你就对概念有全面的了解,要记住概念有三个级别,据此理解用户提及的概念分别属于哪个级别\n"""
39 | """- IndustryCode跟CompanyCode不对应,不能写`IndustryCode`=`CompanyCode`\n"""
40 | """- 指数内部编码(IndexInnerCode):与“证券主表(constantdb.secumain)”中的“证券内部编码(InnerCode)”关联\n"""
41 | """- 证券内部编码(SecuInnerCode):关联不同主表,查询证券代码、证券简称等基本信息。当0=0.42.1
2 | zhipuai>=2.1.5.20241204
3 | transformers>=4.47.0
4 | pandas>=2.2.3
5 | openpyxl>=3.1.5
6 | tqdm>=4.67.1
--------------------------------------------------------------------------------
/baseline/sample/utils.py:
--------------------------------------------------------------------------------
1 | from zhipuai import ZhipuAI
2 |
3 |
def call_large_model(messages, api_key, model="glm-4-plus", max_tokens=1024, temperature=0.0):
    """Send a chat request to ZhipuAI and return the stripped reply text.

    Args:
        messages: chat history as role/content dicts.
        api_key: ZhipuAI API key used to build the client.
        model: model identifier (default "glm-4-plus").
        max_tokens: response length cap.
        temperature: sampling temperature; 0.0 favors deterministic output.
    """
    client = ZhipuAI(api_key=api_key)
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    reply = completion.choices[0].message.content
    return reply.strip()
13 |
--------------------------------------------------------------------------------
/baseline/soldier_baseline/README.md:
--------------------------------------------------------------------------------
1 | # 逐个突破·精准召回——GLM子问题优化与RAG高效检索
2 |
3 | ***
4 | 成绩验证分数: 55.2分
5 |
6 | 成绩验证时间: 2025年2月9日 9:00 AM
7 |
8 | 完整运行时长: 10小时2分钟
9 |
10 | 完整运行Token消耗: GLM-4-Plus 2400万 Tokens
11 |
12 | 其中: `MODEL_sql = "glm-4-plus"`, `MODEL_rag = "glm-4-plus"`
13 |
14 | ***
15 |
16 |
17 | 参赛方自测分数: MODEL_sql和MODEL_rag具体含义请看[这里](#1)
18 |
19 | * 55.67分 (`MODEL_sql = "glm-4-plus"`, `MODEL_rag = "glm-4-flashx"`)
20 | * 30.73分 (`MODEL_sql = "glm-4-flashx"`, `MODEL_rag = "glm-4-flashx"`)
21 |
22 | 完整运行时长: 6.5小时左右(数据预处理3.5小时,推理3小时)
23 |
24 | 完整运行Token消耗: 约2000万 Tokens
25 |
26 | ## 项目介绍
27 |
28 | soldier 像士兵一样绝对听从命令,提供准确的信息查询。部分工具函数参考了bus_baseline,感谢作者。
29 |
30 | * 作者: soldier
31 |
32 | ### 思路介绍
33 |
34 | 1. GLM直接解决原始问题比较困难,可以通过不断的解决子问题,来解决原始问题
35 | 2. RAG召回的越好,效果越好
36 |
37 | ### 创新点
38 |
39 | #### 针对上述1
40 |
41 | * 使用prompt分步写出子问题,解决子问题,最后将所有的对话总结,得到最终答案,逻辑如下,由函数`run_conversation_until_complete`
42 | 实现:
43 |
44 | ```text
45 | 1. 输入问题,生成第一个子问题及其回复
46 | 2. 通过不断对话得到n个子问题的答案,直到GLM不再提出子问题
47 | 3. 将对话总结,得到最终答案
48 | ```
49 |
50 | #### 针对上述2
51 |
52 | * 对原始问题进行表召回时
53 | (1)通过命名实体识别、LSH(局部敏感哈希 Locality Sensitive
54 | Hashing)计算表中的值与实体的相似度、GLM判断问题类型,初步确定问题涉及的股票市场(A股、港股、美股),用于缩小召回范围
55 | (2)再通过GLM召回和问题相关的表
56 | (3)如果问题和某行业相关,则后续提示词会附带这个行业的信息
57 | (4)对table schema进行了重新生成,以方便GLM理解,一是加入了数据示例,二是删除了值单一的字段
58 | * 加入了根据数据表结构生成的表之间关系,让GLM对表之间的join有更准确的判断
59 | * 每次回答子问题时,使用GLM从关键信息中筛选出与问题相关的信息作为参考;回答下一个子问题时,与一个子问题相关的信息要删除。所有关键信息如下,后续可以继续增加:
60 |
61 | ```python
62 | # 下面是对模型来说较难的查询关键词,参考了bus_baseline
63 | examples = {
64 | "查询美股公司信息": "查询美股公司,要同时查询USStockDB.US_CompanyInfo 和 ConstantDB.US_SecuMain中的信息,不要遗漏",
65 | "查询港股公司信息": "查询港股公司,要同时查询HKStockDB.HK_StockArchives 和 ConstantDB.HK_SecuMain中的信息,不要遗漏",
66 | "查询的表名中有DailyQuote": "查询AStockMarketQuotesDB.QT_DailyQuote, USStockDB.US_DailyQuote中的特定公司股票时,不要使用其他字段筛选特定公司,示例sql语句:SELECT * FROM AStockDailyQuote WHERE InnerCode = 1234;",
67 | "查询某行业市值": "查询某行业市值,示例sql语句:SELECT TotalMV, NegotiableMV, FreeFloatMV FROM AStockIndustryDB.LC_IndustryValuation WHERE date(TradingDay) = '2020-07-02' AND IndustryName = '风电零部件';",
68 | "进行加减乘除数学计算": "使用sql进行加减乘除数学计算。",
69 | "比例/百分比是多少": "查询比例/百分比是多少,要考虑加上百分号后再进行四舍五入。",
70 | "近一个月最高价": "查询近一个月最高价,你写的sql语句可以优先考虑表中已有字段HighPriceRM 近一月最高价(元)",
71 | "近一个月最低价": "查询近一月最低价(元),你写的sql语句直接调用已有字段LowPriceRM",
72 | "查询某行业数量": "查询某行业某年数量 示例sql语句:SELECT count(*) as 风电零部件_2021 FROM AStockIndustryDB.LC_ExgIndustry where ThirdIndustryName like '%风电零部件%' and year(InfoPublDate)=2021 and IfPerformed = 1;",
73 | "某股票/公司属于哪些行业/概念板块?": "查询某股票/公司属于哪些概念板块? 示例sql语句:SELECT ConceptCode, ConceptName from AStockIndustryDB.LC_ConceptList WHERE ConceptCode IN (SELECT DISTINCT ConceptCode FROM AStockIndustryDB.LC_COConcept WHERE InnerCode = 1167);",
74 | "某行业/概念板块有哪些股票/公司?": "查询某概念板块有哪些股票/公司? 示例sql语句:SELECT InnerCode FROM AStockIndustryDB.LC_COConcept WHERE ConceptCode = 11100021;",
75 | """持有无限售流通A股数量""": """特别重要一定注意,查询最新更新XXXX年年度报告,机构持有无限售流通A股数量合计InstitutionsHoldProp最多公司代码,优先使用查询sql语句,SELECT *
76 | FROM AStockShareholderDB.LC_StockHoldingSt
77 | WHERE date(EndDate) = 'XXXX-12-31'
78 | AND UpdateTime = (
79 | SELECT MAX(UpdateTime)
80 | FROM AStockShareholderDB.LC_StockHoldingSt
81 | WHERE date(EndDate) = 'XXXX-12-31'
82 | ) order by InstitutionsHoldings desc limit 1 ,XXXX代表问题查询年度,sql语句禁止出现group by InnerCode;
83 |
84 | 查询最新更新XXXX年年度报告,公司机构持有无限售流通A股比例合计InstitutionsHoldProp是多少,优先使用查询sql语句,SELECT InstitutionsHoldProp
85 | FROM AStockShareholderDB.LC_StockHoldingSt
86 | WHERE date(EndDate) = 'XXXX-12-31'
87 | AND UpdateTime = (
88 | SELECT MAX(UpdateTime)
89 | FROM AStockShareholderDB.LC_StockHoldingSt
90 | WHERE date(EndDate) = 'XXXX-12-31'
91 | ) order by InstitutionsHoldings desc limit 1 ,XXXX代表问题查询年度,sql语句禁止出现group by InnerCode;""",
92 | "xxx指标 新高 最多的交易日": """
93 | xxx指标 新高 最多的交易日 要用AStockMarketQuotesDB.CS_StockPatterns现有字段,例子中IfHighestTVRMThree字段可以根据情况灵活调整
94 | 查询成交量创近一季度新高的证券数量和交易日,示例sql语句:
95 | SELECT count(*) as num, TradingDay FROM AStockMarketQuotesDB.CS_StockPatterns where IfHighestTVRMThree=1 group by TradingDay ORDER BY num DESC limit 1;
96 | 查询某日成交量创近一季度新高的证券,示例sql语句:
97 | SELECT InnerCode, TradingDay FROM AStockMarketQuotesDB.CS_StockPatterns where IfHighestTVRMThree=1 and date(TradingDay) = '2021-12-23';
98 | """,
99 | "新高": """新高 要用AStockMarketQuotesDB.CS_StockPatterns现有字段
100 | 查询今天是2021年01月01日,创近半年新高的股票有几只。示例sql语句:SELECT count(*) FROM AStockMarketQuotesDB.CS_StockPatterns
101 | where IfHighestHPriceRMSix=1 and date(TradingDay)='2021-01-01';
102 | 判断某日 YY-MM-DD InnerCode XXXXXX 是否创近一周的新高,查询结果1代表是,IfHighestHPriceRW字段可以根据情况灵活调整 SELECT InnerCode,TradingDay,IfHighestHPriceRW FROM AStockMarketQuotesDB.CS_StockPatterns
103 | where date(TradingDay)='2021-12-20' and InnerCode = '311490'""",
104 | "成交额": """查询这家公司一周内成交额是多少。示例sql语句:SELECT TurnoverValueRW AS TurnoverValueWan
105 | FROM AStockMarketQuotesDB.QT_StockPerformance
106 | WHERE InnerCode = 1289 AND date(TradingDay) = '2021-06-17';""",
107 | "半年度报告": """查询XXXX年半年度报告的条件为:year(EndDate) = XXXX and InfoSource='半年度报告'""",
108 |
109 | }
110 | ```
111 |
112 | * 每次回答子问题时,使用问题中的关键词匹配,直接从表中匹配出可能和子问题相关的列描述;回答下一个子问题时,与一个子问题相关的列描述要删除。
113 |
114 | ## 快速开始
115 |
116 | 1. 运行以下命令以安装所需依赖:
117 |
118 | ```shell
119 | pip install -r requirements.txt
120 | ```
121 |
122 | 2. 打开soldier_baseline.ipynb文件。确保所有依赖库已正确安装。依次运行文件中的各个单元格,系统将自动完成问答流程。运行完成后,程序会将结果保存到
123 |    result.json 文件中。如果你只想运行部分问题,可以调整start_idx和end_idx的值。具体位置详见 jupyter notebook最后一个代码块。
124 | 3. 本方案均使用官方提供数据
125 |
126 | ## 关键参数: {#1}
127 |
128 | * `MODEL_sql = "glm-4-plus"`: 用于生成SQL语句的模型名称,默认plus
129 | * `MODEL_rag = "glm-4-plus"`:
130 | 用于除生成SQL外,进行召回、改写、修复json等功能的模型名称,默认glm-4-plus,如果token数不够,可以换成glm-4-flashx或glm-4-flash,效果相差不大
131 |
132 | ## 核心功能详解
133 |
134 | ### 1. 工具函数
135 |
136 | * LSH(局部敏感哈希 Locality Sensitive Hashing)计算相似度
137 | * LLM生成数据:`async_llm_chain_call`异步多线程调用GLM等函数
138 | * 找到并修复GLM回复中的json数据:`find_json`, `fix_json`
139 | * 文件读取和表结构整理函数
140 |
141 | ### 2. 预处理数据
142 |
143 | * 改进题目提供的数据表结构:加入数据样例,删除没用的列
144 | * 根据题目提供的数据表结构生成表之间的关系
145 | * 生成关键词的别名,防止关键词匹配时无法找到相关的关键词,比如董事会秘书,别名董秘
146 | * 用LSH算法将人名和公司名的特征向量存储到本地,方便后续查找
147 |
148 | ### 3. 推理问答流程
149 |
150 | * `run_conversation`:根据问题,召回相关表,并调用`run_conversation_until_complete`生成问题答案
151 | * `run_conversation_until_complete`:分步写出子问题,解决子问题,最后将所有的对话总结,得到最终答案,每个原始问题的对话将保存在
152 | `dialog`中,示例:
153 |
154 | ```json
155 | [
156 | {
157 | "role": "system",
158 | "content": "\n请写sql帮我查询问题。\n问题:<山东国瓷功能材料股份有限公司2021年9月23日开盘价是多少?>\n已查询获得的事实:<('预处理程序通过表格:ConstantDB.SecuMain 查询到以下内容:\\n [\\n {\\n \"InnerCode\": 14953,\\n \"CompanyCode\": 165647,\\n \"SecuCode\": \"300285\",\\n \"ChiName\": \"山东国瓷功能材料股份有限公司\",\\n \"ChiNameAbbr\": \"国瓷材料\",\\n \"EngName\": \"Shandong Sinocera Functional Material Co.,Ltd.\",\\n \"EngNameAbbr\": \"SINOCERA MATERIAL\",\\n \"SecuAbbr\": \"国瓷材料\",\\n \"ChiSpelling\": \"GCCL\"\\n }\\n] \\n', ['ConstantDB.SecuMain'])>\n表结构:<[{'数据表名': 'AStockMarketQuotesDB.QT_DailyQuote', '数据表结构': [{'列名': 'InnerCode', '中文描述': '证券内部编码', '数据示例': '28'}, {'列名': 'TradingDay', '中文描述': '交易日', '数据示例': '2019-01-02 12:00:00.000'}, {'列名': 'PrevClosePrice', '中文描述': '昨收盘(元)', '数据示例': '4.31'}, {'列名': 'OpenPrice', '中文描述': '今开盘(元)', '数据示例': '4.35'}, {'列名': 'HighPrice', '中文描述': '最高价(元)', '数据示例': '4.41'}, {'列名': 'LowPrice', '中文描述': '最低价(元)', '数据示例': '4.26'}, {'列名': 'ClosePrice', '中文描述': '收盘价(元)', '数据示例': '4.3'}, {'列名': 'TurnoverVolume', '中文描述': '成交量(股)', '数据示例': '10189201.0', '注释': '当证券类别为指数时,股票指数成交量单位是股,基金指数成交量单位是份,债券指数成交量单位是元。'}, {'列名': 'TurnoverValue', '中文描述': '成交金额(元)', '数据示例': '43979784.24'}, {'列名': 'TurnoverDeals', '中文描述': '成交笔数(笔)', '数据示例': '5451'}, {'列名': 'XGRQ', '中文描述': '修改日期', '数据示例': '2019-01-02 03:18:09.923'}, {'列名': 'JSID', '中文描述': 'JSID', '数据示例': '599757489948'}]}, {'数据表名': 'ConstantDB.SecuMain', '数据表结构': [{'列名': 'InnerCode', '中文描述': '证券内部编码', '数据示例': '4427'}, {'列名': 'CompanyCode', '中文描述': '公司代码', '数据示例': '3710'}, {'列名': 'SecuCode', '中文描述': '证券代码', '数据示例': '002056'}, {'列名': 'ChiName', '中文描述': '中文名称', '数据示例': '横店集团东磁股份有限公司'}, {'列名': 'ChiNameAbbr', '中文描述': '中文名称缩写', '数据示例': '横店东磁'}, {'列名': 'EngName', '中文描述': '英文名称', '数据示例': 'Hengdian Group DMEGC Magnetics Co.,Ltd.'}, {'列名': 'EngNameAbbr', '中文描述': '英文名称缩写', '数据示例': 'DMEGC'}, {'列名': 'SecuAbbr', '中文描述': '证券简称', '数据示例': '横店东磁'}, {'列名': 'ChiSpelling', '中文描述': '拼音证券简称', '数据示例': 'HDDC'}, {'列名': 'ExtendedAbbr', '中文描述': '扩位简称', '数据示例': 'None'}, {'列名': 'ExtendedSpelling', '中文描述': 
'拼音扩位简称', '数据示例': 'None'}, {'列名': 'SecuMarket', '中文描述': '证券市场', '数据示例': '90', '注释': '证券市场(SecuMarket)与(CT_SystemConst)表中的DM字段关联,令LB = 201 AND DM IN (10,12,13,14,15,16,18,40,49,50,52,54,55,56,65,66,67,68,69,70,71,72,73,75,76,77,78,79,80,81,83,84,85,86,87,88,89,90,93,94,95,96,99,100,101,102,103,104,105,106,107,110,161,162,180,200,202,210,230,240,260,280,310,320,390,400,620,630,631,640,641,650,653,654,655,657,658,659,660,661,662,663,664,666,667,66302,66303,66305),得到证券市场的具体描述:10-上海期货交易所,12-中国银行间外汇市场,13-大连商品交易所,14-上海黄金交易所,15-郑州商品交易所,16-上海票据交易所,18-北京证券交易所,40-芝加哥商业交易所,49-澳大利亚证券交易所,50-新西兰证券交易所,52-埃及开罗及亚历山大证券交易所,54-阿根廷布宜诺斯艾利斯证券交易所,55-巴西圣保罗证券交易所,56-墨西哥证券交易所,65-印度尼西亚证券交易所,66-泰国证券交易所,67-韩国首尔证券交易所,68-东京证券交易所,69-新加坡证券交易所,70-台湾证券交易所,71-柜台交易市场,72-香港联交所,73-一级市场,75-亚洲其他交易所,76-美国证券交易所,77-美国纳斯达克证券交易所,78-纽约证券交易所,79-美国其他交易市场,80-加拿大多伦多证券交易所,81-三板市场,83-上海证券交易所,84-其他市场,85-伦敦证券交易所,86-法国巴黎证券交易所,87-德国法兰克福证券交易所,88-欧洲其他交易所,89-银行间债券市场,90-深圳证券交易所,93-上海银行间同业拆借市场,94-瑞士证券交易所,95-荷兰阿姆斯特丹证券交易所,96-约翰内斯堡证券交易所,99-东京同业拆借市场,100-美国国债回购市场,101-伦敦银行同业拆借市场,102-香港银行同业拆借市场,103-新加坡银行同业拆借市场,104-中国银行同业拆借市场,105-欧元银行同业拆借市场,106-布鲁塞尔证券交易所,107-雅加达证券交易所,110-以色列特拉维夫证券交易所,161-意大利证券交易所,162-哥本哈根证券交易所,180-挪威奥斯陆证券交易所,200-斯德哥尔摩证券交易所,202-伊斯坦布尔证券交易所,210-印度国家证券交易所,230-奥地利维也纳证券交易所,240-西班牙马德里证券交易所,260-爱尔兰证券交易所,280-菲律宾证券交易所,310-机构间私募产品报价与服务系统,320-俄罗斯莫斯科证券交易所,390-里斯本证券交易所,400-芝加哥期权交易所,620-胡志明市证券交易所,630-沪市代理深市市场,631-沪市代理港交所市场,640-深市代理沪市市场,641-深市代理港交所市场,650-国际外汇市场(晨星),653-上海环境能源交易所,654-北京绿色交易所,655-天津碳排放权交易中心,657-湖北碳排放权交易中心,658-重庆碳排放权交易中心,659-四川联合环境交易所,660-广州碳排放权交易所,661-海峡股权交易中心,662-深圳排放权交易所,663-欧洲能源交易所,664-全国碳排放权交易,666-布达佩斯证券交易所,667-全国温室气体自愿减排交易市场,66302-韩国ETS,66303-加拿大魁北克Cap-and-Trade(CaT),66305-美国区域温室气体倡议(RGGI)。'}, {'列名': 'SecuCategory', '中文描述': '证券类别', '数据示例': '1', '注释': '证券类别(SecuCategory)与(CT_SystemConst)表中的DM字段关联,令LB = 1177 AND DM IN 
(1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,23,26,27,28,29,30,31,32,33,35,36,37,38,39,40,41,42,43,44,45,46,47,55,79,80,211),得到证券类别的具体描述:1-A股,2-B股,4-大盘,5-国债回购,6-国债现货,7-金融债券,8-开放式基金,9-可转换债券,10-其他,11-企业债券,12-企业债券回购,13-投资基金,14-央行票据,15-深市代理沪市股票,16-沪市代理深市股票,17-资产支持证券,18-资产证券化产品,19-买断式回购,20-衍生权证,21-股本权证,23-商业银行定期存款,26-收益增长线,27-新质押式回购,28-地方政府债,29-可交换公司债,30-拆借,31-信用风险缓释工具,32-浮息债计息基准利率,33-定期存款凭证,35-大额存款凭证,36-债券借贷,37-存款类机构质押式回购,38-存款类机构信用拆借,39-现货,40-货币对,41-中国存托凭证,42-协议回购,43-三方回购,44-利率互换品种,45-标准利率互换合约,46-报价回购,47-标准化票据,55-优先股,79-深市代理港交所股票,80-沪市代理港交所股票,211-自贸区债。'}, {'列名': 'ListedDate', '中文描述': '上市日期', '数据示例': '2006-08-02 12:00:00.000'}, {'列名': 'ListedSector', '中文描述': '上市板块', '数据示例': '1'}, {'列名': 'ISIN', '中文描述': 'ISIN代码', '数据示例': 'CNE000001N70'}, {'列名': 'XGRQ', '中文描述': '更新时间', '数据示例': '2021-04-03 09:24:16.313'}, {'列名': 'JSID', '中文描述': 'JSID', '数据示例': '670757056809'}]}]>\n表之间的关联信息如下:<['ConstantDB.SecuMain.InnerCode=AStockMarketQuotesDB.QT_DailyQuote.InnerCode']>\n表结构中列名可以引用使用,表结构中数据示例只是参考不能引用。\n我们现在开始查询当前问题,请你分步写出查询sql语句,我把查询结果告诉你,你再告诉我下一步,\n注意如果我返回的结果为空或者错误影响下一步调用,请重新告诉我sql语句。\n写sql时,请告诉我<这是第几步,这步做了什么事情>\n等你全部回答完成,不需要进行下一步调用时,记得提示我:<全部完成,答案如下>,将答案总结以json格式给我,只需要总结当前问题。\n查询技巧:sql查询年度时优先使用year()函数。sql查询语句不需要注释,不然会报错。sql中日期条件格式应参考这样date(TradingDay) = 'YYYY-MM-DD'。尽量利用表格中已有的字段。\n"
159 | },
160 | {
161 | "role": "user",
162 | "content": "下面开始解决问题:山东国瓷功能材料股份有限公司2021年9月23日开盘价是多少?"
163 | },
164 | {
165 | "role": "assistant",
166 | "content": "<这是第1步,这步做了什么事情> 根据提供的信息,首先需要确定山东国瓷功能材料股份有限公司的证券内部编码(InnerCode),然后使用这个编码来查询2021年9月23日的开盘价。\n\n```sql\nSELECT InnerCode\nFROM ConstantDB.SecuMain\nWHERE ChiName = '山东国瓷功能材料股份有限公司';\n```\n\n请运行上述SQL语句并返回结果。"
167 | },
168 | {
169 | "role": "user",
170 | "content": "{\n \"success\": true,\n \"data\": [\n {\n \"InnerCode\": 14953\n }\n ],\n \"count\": 1\n}"
171 | },
172 | {
173 | "role": "assistant",
174 | "content": "<这是第2步,这步做了什么事情> 使用第一步查询得到的InnerCode,接下来查询2021年9月23日的开盘价。\n\n```sql\nSELECT OpenPrice\nFROM AStockMarketQuotesDB.QT_DailyQuote\nWHERE InnerCode = 14953 AND TradingDay = '2021-09-23';\n```\n\n请运行上述SQL语句并返回结果。"
175 | },
176 | {
177 | "role": "user",
178 | "content": "{\n \"success\": true,\n \"data\": [\n {\n \"OpenPrice\": 41.12\n }\n ],\n \"count\": 1\n}"
179 | },
180 | {
181 | "role": "assistant",
182 | "content": "<这是第3步,这步做了什么事情> 由于已经成功获取了开盘价,现在不需要进一步查询其他列名。以下是查询结果的总结。\n\n```json\n{\n \"问题\": \"山东国瓷功能材料股份有限公司2021年9月23日开盘价是多少?\",\n \"答案\": \"41.12元\"\n}\n```"
183 | },
184 | {
185 | "role": "user",
186 | "content": "未找到SQL语句,请重新生成sql。或者在提示我:<全部完成,答案如下>后,直接回答问题。"
187 | },
188 | {
189 | "role": "assistant",
190 | "content": "<这是第1步,这步做了什么事情> 由于需要查询山东国瓷功能材料股份有限公司2021年9月23日的开盘价,首先需要找到该公司的证券内部编码。\n\n```sql\nSELECT InnerCode\nFROM ConstantDB.SecuMain\nWHERE ChiName = '山东国瓷功能材料股份有限公司';\n```\n\n请运行上述SQL语句并返回结果。"
191 | },
192 | {
193 | "role": "user",
194 | "content": "{\n \"success\": true,\n \"data\": [\n {\n \"InnerCode\": 14953\n }\n ],\n \"count\": 1\n}"
195 | },
196 | {
197 | "role": "assistant",
198 | "content": "<这是第2步,这步做了什么事情> 根据第一步查询到的证券内部编码(InnerCode),现在查询2021年9月23日的开盘价。\n\n```sql\nSELECT OpenPrice\nFROM AStockMarketQuotesDB.QT_DailyQuote\nWHERE InnerCode = 14953 AND TradingDay = '2021-09-23';\n```\n\n请运行上述SQL语句并返回结果。"
199 | },
200 | {
201 | "role": "user",
202 | "content": "{\n \"success\": true,\n \"data\": [\n {\n \"OpenPrice\": 41.12\n }\n ],\n \"count\": 1\n}"
203 | },
204 | {
205 | "role": "assistant",
206 | "content": "<这是第2步,这步做了什么事情> 已知山东国瓷功能材料股份有限公司的证券内部编码为14953,并且需要查询2021年9月23日的开盘价,可以直接使用以下SQL语句进行查询。\n\n```sql\nSELECT OpenPrice\nFROM AStockMarketQuotesDB.QT_DailyQuote\nWHERE InnerCode = 14953 AND TradingDay = '2021-09-23';\n```\n\n请运行上述SQL语句并返回结果。"
207 | }
208 | ]
209 | ```
210 |
211 | # 展望
212 |
213 | - [ ] 将分步规划和生成sql语句用2个agent分别实现,一个负责分步规划,一个负责生成sql语句
214 | - [ ] 加入bad case
215 |
--------------------------------------------------------------------------------
/baseline/soldier_baseline/requirements.txt:
--------------------------------------------------------------------------------
1 | jieba>=0.42.1
2 | zhipuai>=2.1.5.20241204
3 | transformers>=4.47.0
4 | pandas>=2.2.3
5 | openpyxl>=3.1.5
6 | tqdm>=4.67.1
7 | datasketch>=1.6.5
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.ruff]
2 | line-length = 119
3 |
4 | [tool.ruff.lint]
5 | # Never enforce `E501` (line length violations).
6 | ignore = ["C901", "E501", "E741", "F402", "F823"]
7 | select = ["C", "E", "F", "I", "W"]
8 |
9 | # Ignore import violations in all `__init__.py` files.
10 | [tool.ruff.lint.per-file-ignores]
11 | "__init__.py" = ["E402", "F401", "F403", "F811"]
12 |
13 | [tool.ruff.lint.isort]
14 | lines-after-imports = 2
15 |
16 | [tool.ruff.format]
17 | # Like Black, use double quotes for strings.
18 | quote-style = "double"
19 |
20 | # Like Black, indent with spaces, rather than tabs.
21 | indent-style = "space"
22 |
23 | # Like Black, respect magic trailing commas.
24 | skip-magic-trailing-comma = false
25 |
26 | # Like Black, automatically detect the appropriate line ending.
27 | line-ending = "auto"
28 |
--------------------------------------------------------------------------------