├── .gitignore ├── LICENSE ├── README.md ├── data_full └── dataset │ └── Astronomical_dataset.json ├── faiss_index_langchain_full_ernie └── bm25retriever.pkl ├── rag_full ├── __init__.py └── rag_langchain.py ├── requirements.txt ├── resources ├── audio.wav ├── bg.jpg ├── bot.jpeg ├── custom_components │ └── custom_select.js ├── demo.py ├── dog.mp4 ├── image-bot.jpeg ├── list.json ├── main.png ├── modelscope.svg ├── music-bot.jpeg ├── screen.jpeg └── user.jpeg └── web_demo.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .vscode 3 | __pycache__ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Public QiDiHui 2 | 3 | 4 | 5 | 6 | [![Contributors][contributors-shield]][contributors-url] 7 | [![Forks][forks-shield]][forks-url] 8 | [![Issues][issues-shield]][issues-url] 9 | [![Apache-2.0 License][license-shield]][license-url] 10 | [![Stargazers][stars-shield]][stars-url] 11 |
12 | 13 | ## 项目概述 14 | 15 | 智慧启迪绘不仅仅是一个应用程序,它是一位能引领孩子们踏上奇妙求知之旅的伙伴,带领孩子们进入神奇知识世界的导航员,帮助父母陪伴自己孩子一起阅读《十万个为什么》系列丛书和解答阅读过程中更多疑问的知识助手。 16 | 17 | 本项目为我们公开的项目开发初期的迭代版本,包含了基本的问答功能,文生图和文生视频做了简化处理, 还请理解。 18 | 19 | ## 技术亮点 20 | 21 | - 借由RAG技术与多模态生成技术的力量,我们倾心打造了一款既具娱乐性又富含教育意义,充满趣味性和互动性的视频生成平台,在孩子们阅读《十万个为什么》系列丛书时,进一步激发他们的好奇心,培养更好的观察能力、思考能力和表达能力,成为开启孩子智慧大门的一把钥匙。 22 | - 两种LLM{appbuilder+ERNIEBot}接口: 充分运用百度文心一言的AIGC能力 23 | - 两种RAG框架: Langchain & LlamaIndex 24 | - RAG创新点: 新型向量数据库存储策略 25 | - 多模态生成: 文本、语音和视频,也支持语音输入 26 | - 流式输出交互UI: 在线生成,快速响应 27 | - 预生成图片, tts和视频, 提高体验 28 | 29 | > 以上亮点为我们项目的**完整功能**,并没有完全开源,**请为我们的项目Star以便关注最新更新** 30 | 31 | ## 主要文件功能介绍 32 | 33 | - 开源版本demo:[https://openxlab.org.cn/apps/detail/chg0901/Public_QiDiHui ](https://openxlab.org.cn/apps/detail/chg0901/Public_QiDiHui ) 34 | - AI Studio一键跑通教程: [https://aistudio.baidu.com/projectdetail/8185249](https://aistudio.baidu.com/projectdetail/8185249) 35 | 36 | - `requirements.txt`: 相关实验环境所需依赖包 37 | > GPU环境只需把faiss-cpu改为faiss-gpu即可, 可以加快建库 38 | - `web_demo.py`: Gradio Demo 39 | - `data_full/dataset/Astronomical_dataset.json`: 展示了示例数据, 我们只放置了10个QA数据,由于十万数据有数据隐私协议,书中包含的图片并没有开源,该数据只是展示我们的数据结构 40 | - `faiss_index_langchain_full_ernie/bm25retriever.pkl`: 采用 BM 25 Retriever方案的数据库,这里我们只开源了使用Langchain建的库 41 | - `rag_full/rag_langchain.py`: Langchain建库代码 42 | 43 | ## 有关宣传资料 44 | 45 | ### 更新 46 | 47 | #### 08/07/2024 项目在Paddle AI Studio上被加精置顶 48 | 49 | ![image](https://github.com/user-attachments/assets/5e33898a-baaf-49f1-a188-fd7f1219ebee) 50 | 51 | **《智慧启迪绘》项目介绍与【LIC2024 RAG赛道第一名方案】大揭秘!**:[https://aistudio.baidu.com/projectdetail/8185249](https://aistudio.baidu.com/projectdetail/8185249?channel=0&channelType=0&sUid=785756&shared=1&ts=1723035546414) 52 | 53 | ### B站视频: 【 LIC2024 RAG赛道智慧启迪绘】"十万个所以"团队 有关介绍视频 54 | 55 | - 【最新版本进展】[https://www.bilibili.com/video/BV1yT8SejEQ8/](https://www.bilibili.com/video/BV1yT8SejEQ8/) 56 | - 【产品说明】[https://www.bilibili.com/video/BV1kn4y1o7VY/](https://www.bilibili.com/video/BV1kn4y1o7VY/) 57 | - 
【中期设计demo和进展】[https://www.bilibili.com/video/BV1rb421q7xe/](https://www.bilibili.com/video/BV1rb421q7xe/) 58 | - 【智慧启迪绘】流式输出 超快响应 [https://www.bilibili.com/video/BV1ss8qejEQ5/](https://www.bilibili.com/video/BV1ss8qejEQ5/ ) 59 | 60 | 61 | ## 启动部署方法 62 | 63 | ### 环境搭建 64 | 65 | 这里建议使用conda重新建立一个测试环境 66 | 67 | ```Bash 68 | # 搭建环境 69 | conda create -n QiDiHui python=3.10 70 | conda activate QiDiHui 71 | 72 | # 克隆本项目 73 | git clone https://github.com/chg0901/Public_QiDiHui.git 74 | 75 | # 安装软件库 76 | cd Public_QiDiHui 77 | pip install -r requirements.txt 78 | 79 | # 启动WebDemo 80 | python web_demo.py 81 | ``` 82 | 83 | ### 部署成功后截图 84 | 85 | ![](https://ai-studio-static-online.cdn.bcebos.com/279f504a36df433c863bd8d2db921fe87bbc2e05761c4e0b8d9b2e534c4c1fcf) 86 | 87 | ## QiDiHui整体功能逻辑 88 | 89 | ### 1. 总体图示 90 | 91 | QiDiHui的整体功能逻辑可以用下图表示: 92 | 93 | ![](https://ai-studio-static-online.cdn.bcebos.com/44ca4894cb6f4400a3ab2b3482cc68a1341b638b226441d3bbedcfa454b483d5) 94 | 95 | ### 2. 通过问题生成有声绘本 96 | 97 | QiDiHui支持直接通过一个问题生成有声绘本。 98 | 99 | - 若您体验的是OpenXLab上部署的版本: 100 | 101 | 您可以直接在输入框中输入问题(下图1.a)或者在示例问题中选择问题(下图1.b)之后直接点击“输入问题,生成有声绘本”即可,在线文生图可能需要等待较长时间。 102 | 103 | ![](https://ai-studio-static-online.cdn.bcebos.com/8a4f6aec13a747d2ae2feccadfb5010fc750870668a84577872e17ea21b0703f) 104 | 105 | - 若您体验的是星河社区上部署的版本: 106 | 107 | 我们为了适配星河社区的部署环境,解耦了有声绘本生成的各个步骤。问题的输入方式不变,但您需要按照下图中的数字顺序依次点击按钮,才可以看到生成的有声绘本。 108 | 109 | 110 | 111 | ### 3. 通过故事生成有声绘本 112 | 113 | **注意:您的故事长度需要在50字以上才能触发剧本生成操作。** 114 | 115 | QiDiHui还支持通过现有的故事生成绘本。 116 | 117 | - 若您体验的是OpenXLab上部署的版本: 118 | 119 | 您需要在输入框中输入您的故事,之后按照下图中数字顺序依次点击按钮,就可以看到生成的有声绘本了。 120 | 121 | ![](https://ai-studio-static-online.cdn.bcebos.com/4d73a363b6fd4456b31f110f696255e8ad93e4bf444f40d09916118350e57b9b) 122 | 123 | - 若您体验的是星河社区上部署的版本: 124 | 125 | 同样地,您需要按照下图中的数字顺序依次点击按钮,就可以看到生成结果了。 126 | ![](https://ai-studio-static-online.cdn.bcebos.com/4fa5611c172a4c31b1c60ab2e8dcdc5f08652158f3bf4bc99fe076d3e9c4d8c5) 127 | 128 | ### 4. 
视频演示 129 | 130 | 请点击[此处](https://www.bilibili.com/video/BV1yT8SejEQ8/?share_source=copy_web&vd_source=fb12a11d11545b5c1139ee0654f2f1c5)跳转到B站观看高清视频和其他合集视频, 欢迎给我们点赞收藏投币一键三连! 131 | 132 | 133 | ### 体验链接 134 | 135 | - 【智慧启迪绘 基于文心erniebot 和千帆appbuilder 最新体验链接】 136 | 137 | 1. AIStudio主体验链接[https://aistudio.baidu.com/application/detail/40487](https://aistudio.baidu.com/application/detail/40487) 138 | 2. OpenXLab 全功能版本[https://openxlab.org.cn/apps/detail/chg0901/QiDiHui_appbuilder_V2](https://openxlab.org.cn/apps/detail/chg0901/QiDiHui_appbuilder_V2) 139 | 140 | - 【开发版本1:智慧启迪绘 基于文心erniebot 体验链接】[https://openxlab.org.cn/apps/detail/chg0901/QiDiHui](https://openxlab.org.cn/apps/detail/chg0901/QiDiHui) 141 | - 【开发版本2:智慧启迪绘 基于千帆appbuilder 体验链接】[https://openxlab.org.cn/apps/detail/chg0901/QiDiHui_appbuilder](https://openxlab.org.cn/apps/detail/chg0901/QiDiHui_appbuilder) 142 | 143 | ## Star History 144 | 145 | [![Star History Chart](https://api.star-history.com/svg?repos=chg0901/Public_QiDiHui&type=Date)](https://star-history.com/#chg0901/Public_QiDiHui&Date) 146 | 147 | ## Contributors: 十万个所以团队 148 | 149 | 150 | 151 | 152 | 153 | 158 | ### 团队成员来自RAG兴趣小组,分别是 159 | 160 | - 1. 来自韩国光云大学的 计算机工程博士生 程宏 161 | - 2. 来自复旦大学的 NLP准研究生 高杨帆 162 | - 3. 来自上海海洋大学的 NLP本科毕业生 彭文博 163 | - 4. 毕业于南京大学的 算法工程师 房宇亮 164 | - 5. 来自昌吉学院 计算机科学与技术专业大三的 郭志航 165 | 166 | ### 团队过往开源项目 167 | 168 | - 1. EmoLLM [https://github.com/SmartFlowAI/EmoLLM](https://github.com/SmartFlowAI/EmoLLM) 169 | - 2. 食神 [https://github.com/SmartFlowAI/TheGodOfCookery](https://github.com/SmartFlowAI/TheGodOfCookery) 170 | - 3. 峡谷小狐仙 [https://github.com/chg0901/Honor_of_Kings_Multi-modal_Dataset](https://github.com/chg0901/Honor_of_Kings_Multi-modal_Dataset) 171 | - 4.
程宏和郭志航是Datawhale鲸英助教团成员 172 | 173 | 174 | [your-project-path]: chg0901/Public_QiDiHui 175 | [contributors-shield]: https://img.shields.io/github/contributors/chg0901/Public_QiDiHui.svg?style=flat-square 176 | [contributors-url]: https://github.com/chg0901/Public_QiDiHui/graphs/contributors 177 | [forks-shield]: https://img.shields.io/github/forks/chg0901/Public_QiDiHui.svg?style=flat-square 178 | [forks-url]: https://github.com/chg0901/Public_QiDiHui/network/members 179 | [stars-shield]: https://img.shields.io/github/stars/chg0901/Public_QiDiHui.svg?style=flat-square 180 | [stars-url]: https://github.com/chg0901/Public_QiDiHui/stargazers 181 | [issues-shield]: https://img.shields.io/github/issues/chg0901/Public_QiDiHui.svg?style=flat-square 182 | [issues-url]: https://github.com/chg0901/Public_QiDiHui/issues 183 | [license-shield]: https://img.shields.io/github/license/chg0901/Public_QiDiHui.svg?style=flat-square 184 | [license-url]: https://github.com/chg0901/Public_QiDiHui/blob/main/LICENSE 185 | -------------------------------------------------------------------------------- /data_full/dataset/Astronomical_dataset.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "question": "为什么要研究天文学", 4 | "answer":
"从古至今,天文学都是人类文明中不可或缺的重要学科,因为它与人类息息相关。昼夜交替,四季循环,人类自从诞生以来,无时无刻不在接触天文现象。明亮的太阳、皎洁的月光、灿烂的群星、壮观的日食……让我们产生了无数疑问:我们生活的地球在宇宙中是什么样的?太阳为什么会发出光和热?夜空中闪烁的星星是什么?除了地球之外,其他星球上还有没有生命?有没有外星人?彗星和小行星真会与地球相撞吗?宇宙到底有多大?宇宙是怎么产生的?……这些问题引起了人们的极大兴趣。古代人们在从事农牧业生产时,很早就懂得了利用天象来确定季节。古代的渔民和水手在茫茫大海上利用星星确定自己航行的方向,利用月亮的圆缺变化来判断潮水的涨落……现代科学技术的发展对天文学有了更多新的需求,天文学得到了飞速的发展。天文台编制的各种历表,不仅满足了人们日常生活的应用,而且更是航海、航空、大地测量、科学研究等部门迫切需要的。生活中离不开时间,近代科学更需要测定和记录精确的时间,天文台就承担了测定标准时间和提供时间服务的工作。各种天体和广袤的宇宙是理想的实验室,宇宙间存在着地面实验室无法达到的超大尺度、超大质量、超高速、超高(低)密度、超高(低)温、超高压、超真空和超强磁场等极端物理条件。例如质量比太阳大几十倍的星球,几十亿度的高温,几十亿大气压的高压,每立方厘米几十亿吨的超密物质,以及每立方厘米仅有一两个原子的超真空状态,等等。人们经常从天文学的新发现得到启发,然后再加以利用。这在科学史上有着大量生动的事例:从总结行星运动的规律得出了万有引力定律;观测到太阳上氦的光谱线后,才在地球上找到了氦元素;从计算太阳和新星爆发的能量,发现了人们原本不了解的核能源……天文学曾对数学和力学的发展起了奠基性的作用。天文学和物理学的结合产生了天体物理学,成为当代天文学的主流。宇宙中星际分子和有机分子的发现,以及地外生命的探索开创了天体化学和天体生物学的研究,并成为生命起源研究的重要领域。天文学同地球物理学和地学的密切结合,开辟了空间天气学和天文地球动力学等新的交叉学科。天文学是空间科学的先驱,又是它不可缺少的内容和依托。因此,天文学和自然科学的几乎所有学科互相渗透、互相促进,成为整个自然科学中不可缺少的重要组成部分。现代天文观测和研究追求极微弱信号的探测、极高的空间和时间分辨率、极精确的空间导向和定位以及极精密的计时等,因而在天文学研究中发展起来的天文技术、方法和新概念对人类的技术进步有着巨大的推动作用。当代地球与空间环境的保护和利用这一重大问题同人类生存和社会发展密切相关。它涉及全球气候变化研究,大气臭氧层保护,地震和旱涝的预测,甚至小行星撞击地球的监测等。太阳活动的剧烈变化还会造成无线电通信中断、电力系统故障、人造卫星损坏和变轨,以及威胁宇航员安全等重大灾害。卫星的监测、空间碎片的研究,以及自主的时间服务系统可以为国家安全和航天器的安全提供保障。所有这些,无一不同天文学的研究息息相关。天文学向我们揭示了自然界的真面目。几千年来人们对于地球的性质、地球在宇宙中的位置以及宇宙的结构等方面都曾有过错误的认识。假如没有天文学,这些错误的认识将会继续下去。波兰天文学家哥白尼冲破长达千余年的宗教束缚,提出了日心说,把自然科学从神学中解放出来,开创了人类思想史上第一次伟大的革命,是人类认识宇宙的第一次飞跃,就是最好的例证。在人类进入航天飞行的时代,天文学集中了人类对于自然认识的精华。天文学不仅可以培养人们强烈的求知欲望、勇于创新的精神和科学的思维方法,而且更有助于认识人类在自然界和宇宙中的地位,树立起正确的认识论和世界观。如果一个人对现代天文学的伟大成就一无所知,他就不能算是一个受过良好教育的人。正因为如此,世界上很多国家都把天文学列入了中学课程。上面仅仅从几方面简单介绍了天文学的发展和应用。随着激动人心的新发现不断涌现,新认识、新理论层出不穷,天文学空前地活跃起来,成为自然科学中最活跃的前沿学科之一,在人类认识宇宙的又一次飞跃中成为无可争辩的主角和带头学科,对现代科学的发展起了无可替代的推动作用。(方成)", 5 | "image": [ 6 | { 7 | "tushuo": "美丽的草帽星系M104", 8 | "path": "figure_0014_0001.jpg" 9 | }, 10 | { 11 | "tushuo": "麒麟座V838也许是已知最大的恒星之一,2002年曾发生剧烈的爆发", 12 | "path": "figure_0014_0002.jpg" 13 | }, 14 | { 15 | "tushuo": "太阳系(左)和一颗红矮星的行星系统比较图", 16 | "path": "figure_0015_0004.jpg" 17 | }, 18 | { 19 | "tushuo": 
"太阳风暴对各种空间探测器和卫星都可能造成影响", 20 | "path": "figure_0015_0005.jpg" 21 | } 22 | ], 23 | "source_book": "Astronomical", 24 | "source_file": "Section0002_0002.xhtml" 25 | }, 26 | { 27 | "question": "为什么天空看起来像个球", 28 | "answer": "“天似穹庐,笼盖四野”,“天如鸡子,地如卵中黄”,这是古代中国人对天空的描述。他们认为天空如同一个锅盖,或者一个球壳,将大地包裹在其中。古希腊天文学家托勒玫的“地心说”认为天空是由一层又一层的“天球”组成的,恒星则固定在最外层天球上。天空真是球形的吗?为何人们都有如此相似的感觉?其实这是一种错觉,产生这种错觉的原因,是因为星星太遥远了,我们完全无法判断哪颗星离得近,哪颗星离得远,以至于我们认知的距离感丧失了,错认为每颗星到我们的距离都是一样的,因此我们看到的天空就如同一个球形。虽然这种“天球”的感觉是因错觉造成的,但在天文观测中,我们却可以借助“天球”的概念来标定天体在天空的位置。在天文观测中的天球,是以观测者为中心,以无穷远为半径假想出的一个巨大球面。既然这个球面是无穷远的,那么我们也不用顾及哪颗星星距离我们远,哪颗星星距离我们近,而是把它们统一当作无穷远的天体来看待。这样一来,我们只需要在天球上画出网格坐标,就可以通过经度和纬度来记录天体的位置及变化。当天文学家通过望远镜观测时,用这些坐标数字就可以准确找寻天体。为了形象表示天球,人们还制作出了天球仪。中国汉代的浑象和西方古罗马时期的法尔内塞天球都是早期天球仪的代表。天球仪和地球仪样子差不多,也有南北极和赤道。不过地球是真实存在的,而天球是假想球。天球上画的不再是地面上的河流山川,而是天上的日月星辰。相对于地球仪而言,天球仪上的坐标要复杂得多,除了常用的赤道坐标系,还有根据太阳运行的轨道平面建立的黄道坐标系,根据银河系的盘面建立的银道坐标系,根据观测地的地平圈建立的地平坐标系等,它们都可以用来记录各种天体的位置。三维的天球仪是以天球之外观察者的角度来设计的,因而同地球上看到的方位刚好相反,使用并不方便。更常用的是投影后的二维星图。位于苏州的南宋石刻天文图就是这样绘制的。(张超)", 29 | "image": [ 30 | { 31 | "tushuo": "天球示意图", 32 | "path": "figure_0016_0006.jpg" 33 | } 34 | ], 35 | "source_book": "Astronomical", 36 | "source_file": "Section0002_0003.xhtml" 37 | }, 38 | { 39 | "question": "为什么天文学家不管天气预报", 40 | "answer": 
"如果你问一位天文学家“明天天气如何”,估计他会一愣。这是因为作为一名天文学家,他既不研究大气,也不从事气象预报工作。他感兴趣的,主要是地球大气层以外的事情。有些天文学家研究的是太阳系中的天体,包括火星、木星这样的行星,也关注彗星、小行星这些小质量天体,他们研究的是“行星科学”。有些天文学家喜欢研究太阳以及更加遥远的恒星,他们研究的是“恒星天文学”。有一些天文学家研究恒星等天体组成的银河系以及更远的星系,他们研究的是“星系天文学”。还有些天文学家喜欢探索整个宇宙的故事,他们关注宇宙在膨胀还是收缩,宇宙何时形成,以后又将怎样,这个领域称为“宇宙学”。可以说,地球大气层之外的所有天体,都是天文学家研究的对象。不过也有例外。比如,天文学中有一个门类称为“流星天文学”。流星是宇宙空间的流星体闯入大气层后,和大气摩擦而产生的发光现象。虽然这种现象发生在大气层内,但其本体来源于宇宙,因此同样也被天文学家所关注。和流星天文学类似,从外太空掉落到地面上的陨石,从太阳来的带电粒子产生的极光,宇宙中高能粒子在大气中产生的簇射,这些都是天文学研究的对象。那么天气预报应该归谁呢?原来,从事天气和气候方面研究的是气象学家。不过天文和气象在过去经常被人们放在一起来谈。因为中国早期的香港天文台、徐家汇天文台、青岛观象台等台站,最重要的工作是做气象数据记录,此外才是从事天文现象观测。这也难怪让人产生“天文气象是一家”的误会。随着学科划分越来越精细,天文学和气象学逐渐成为两个独立的学科,如今的天文台不再负责气象观测,天文学家可以专心研究自己钟爱的天体了。不过,由于天文是基于观测的一门学科,需要用望远镜去观测遥远的天体,了解观测时的天气如何就很重要。因此,观测天文学家才会密切关注天气变化,从而制定更好的观测方案。(张超)", 41 | "image": [ 42 | { 43 | "tushuo": "鱼眼镜头拍摄的帕拉纳尔天文台,天文学家关心的是大气层外的天体", 44 | "path": "figure_0016_0007.jpg" 45 | }, 46 | { 47 | "tushuo": "如今香港天文台(上图)仍在预报气象,热带风暴来临时会挂“风球”(下图)", 48 | "path": "figure_0017_0009.jpg" 49 | } 50 | ], 51 | "source_book": "Astronomical", 52 | "source_file": "Section0002_0003.xhtml" 53 | }, 54 | { 55 | "question": "为什么恒星会眨眼", 56 | "answer": "在晴朗的夜晚,我们会看到繁星闪烁,好像在“眨眼睛”,这是怎么回事儿呢?这是由于大气抖动引起的。如果在夏天远望灼热的沥青路面,就会发现路面上方的空气好像流水一样上下翻动。当我们透过空气看远方的景物,发现景物也变得模模糊糊、抖动不停。中国古人给这种现象起了一个好玩的名字叫“野马”。空气在冷热不均匀的时候就会出现密度的变化,它们使大气的折射性质发生变化,就像一个个小透镜,于是导致了这种抖动现象。星光是点光源,光线在经过地球大气层的过程中会遇到很多类似的“小透镜”,当这些“小透镜”抖动时,星光便有时分散,有时汇聚,在我们看来就忽明忽暗,如同“眨眼”。像金星等行星由于不是点光源而是面光源,张角往往大于密度不均的空气“透镜”的尺度,所以不易受到大气抖动的影响,一般看上去不会“眨眼”。天文学家用“视宁度”来表征大气抖动造成星星“眨眼”的程度,空气抖动会影响到望远镜观测天体的清晰度,因此专业天文台要建在空气相对“安静”,也就是“视宁度”好的地方。(张超)", 57 | "image": [ 58 | { 59 | "tushuo": "恒星会“眨眼”而行星较宁静的原理示意图", 60 | "path": "figure_0017_0010.jpg" 61 | } 62 | ], 63 | "source_book": "Astronomical", 64 | "source_file": "Section0002_0003.xhtml" 65 | }, 66 | { 67 | "question": "肉眼可以看到多少颗星星", 68 | "answer": 
"普通人用肉眼能看到多少星星?一起来数数吧。不要以为这是一件不可能完成的任务,只要方法得当,还是可以数清的。古希腊天文学家依巴谷把星星按亮度分为不同等级,很亮的星星定为1等,其次的定为2等,而把人眼勉强能看到的星星定为6等,更暗弱的星星,仅仅凭人眼就无法看到了。这样一来,我们只需要记录每个星等有多少颗星,便可以知道肉眼能看到多少星了。天文学家已经把这个工作做完了:全天有1等星20颗,2等星46颗,3等星134颗,4等星458颗,5等星1476颗,6等星4840颗,一共是6974颗。这将近7000颗是整个天球上肉眼可见的所有星星。不过,当我们在地面上看时,只有约一半的星星在地平线之上,另一半则沉于地下。而在地平线附近,星光由于要穿过浓密的大气层,减弱得更加厉害,因此在地平线附近低空的星星也很难被看到。这样算来,只有约3000颗星“幸存”。在实际观测中,由于受到很多观测条件的影响,比如月光干扰、大气透明度不佳、个人视力差别等因素,一般人们看到的星星数量还要少于3000颗。古希腊天文学家依巴谷据说视力极好,而他根据观测编制的星表中,星星的数量也不过千颗。因此当我们看到繁星满天的时候,仔细数一数,并没有我们想象的那么多。什么?你在晚上只见过十多颗星星?这也难怪,如今的城市发展迅速,夜空受到灯光的影响,其背景亮度在逐渐提高。如今在大城市中,暗于3等的星都淹没在明亮的夜空背景中,再除掉沉没于地平线下的那些星,即使天气再好,我们也只能看见二三十颗星了。(张超)", 69 | "image": [ 70 | { 71 | "tushuo": "", 72 | "path": "figure_0018_0011.jpg" 73 | } 74 | ], 75 | "source_book": "Astronomical", 76 | "source_file": "Section0002_0004.xhtml" 77 | }, 78 | { 79 | "question": "为什么天上有88个星座", 80 | "answer": "晴朗的夜空,繁星密布。远古人类经过长期的观察,发现群星组成的图案是恒定不变的,只有少数几颗亮星(行星)在众星之间游走。为了区分并称呼这些恒定不变的星星,人们将它们分组,取以专门的名称。不同的民族,组合方法不同,名称也各异。现代国际通行的星座划分,可溯源至古巴比伦。古巴比伦远在5000年前就有了最早的星座名称。公元前13世纪,已划分出黄道带上的12个星座,称为“黄道十二宫”,意为太阳周年运行过程中的12座行宫。以后又逐渐扩充,命名了更多星座。公元2世纪,古希腊天文学家托勒玫在总结前人认识的基础上,编制出含有48个星座的表。16~17世纪欧洲地理大发现,又补充了南天的一些星座。这时的星座概念,还只是一些肉眼可见的亮星之间的组合,星座与星座之间并没有明确的界限。随着天文望远镜技术的发展,越来越多的暗星被发现和深入研究,但它们属于哪一个星座,怎样标记和称呼它们,难以明确。1928年,为了天文学研究的需要,国际天文学联合会在荷兰莱顿举行的大会明确地将全天空划分为88个星座区域,沿天球赤道坐标系的赤经、赤纬线曲折分界,保留住传统的星座名字,用拉丁文规定其学术名称和由三个明确大小写的字母组成其缩写符号,全世界统一使用。其后,中国天文学会又确定了星座的中文译名,成为正式的学术名称。其他文明也有自己的星空划分和命名方法。比如中国古代把星空划分为三垣、四象、二十八宿等,它们在历史和文化上发挥过很大的作用,在功能方面与西方流行的星座是类似的。(苏宜)", 81 | "image": [ 82 | { 83 | "tushuo": "以北极点为中心的北天星座及其神话形象", 84 | "path": "figure_0018_0012.jpg" 85 | }, 86 | { 87 | "tushuo": "以南极点为中心的南天星座及其神话形象", 88 | "path": "figure_0018_0013.jpg" 89 | } 90 | ], 91 | "source_book": "Astronomical", 92 | "source_file": "Section0002_0004.xhtml" 93 | }, 94 | { 95 | "question": "为什么四季的星空不一样", 96 | "answer": 
"人在地球上任何位置,只能看到半个星空,另一半星空在地平线下。在南北两极,地球自转轴通过头顶,星空怎么转也还是那些星,地底下的不会升起来。而在一般纬度的地方,人们可以看到星空随季节不同而有规律地变换。四季星空的变换原因和地球公转有关。太阳之外的恒星距离地球都非常遥远,而地球距太阳较近,所以在地球绕太阳公转时,从地球上看起来太阳便在相对不动的众星之间运转,一年一个周期,这称为“太阳周年视运动”。因为地球的大气分子散射阳光,所以我们在白天看不到被太阳光辉淹没的半个星空,只能在夜晚看到反方向的另半个星空。这样,随着太阳的周年视运动,我们就轮流看到了不同的星空。春季里,太阳离飞马座不远,半夜时,反方向的狮子座高悬头顶;夏季时,太阳移至猎户座附近,半夜可见到天蝎座;秋季里,太阳在狮子座中,飞马座成为半夜星空的主角;到冬季,太阳移至天蝎座,所以半夜当空的是猎户座。这就是四季星空的变换。当然,星空的变换并不是突然换季,而是每晚都在连续地进行。因为太阳周年视运动一年转360°,折合每天在黄道上移动约1°,而由地球自转造成的星空东升西落,转过这1°约需4分钟。所以同一颗恒星每天升起的时间提前4分钟,一个月提前2小时,一年提前24小时,又回到原来的时间了。星移斗转,年复一年,星空总是这样有规律地运转不息。(苏宜)", 97 | "image": [ 98 | { 99 | "tushuo": "黄道十二宫", 100 | "path": "figure_0019_0014.jpg" 101 | }, 102 | { 103 | "tushuo": "四季星空的主要星座", 104 | "path": "figure_0019_0015.jpg" 105 | } 106 | ], 107 | "source_book": "Astronomical", 108 | "source_file": "Section0002_0004.xhtml" 109 | }, 110 | { 111 | "question": "为什么天文学家要使用星表", 112 | "answer": "天文学家解开宇宙的奥秘是从记录星星的位置开始的。那些为星星编了号的表格索引就是星表。中国已知最早的星表是由战国时期的甘德、石申分别编写的,他们的原作早已失传。后人编撰的合集——《甘石星经》,时至今日也只剩下他人引用的只言片语。古希腊天文学家依巴谷编制的西方第一份星表也是因为在其后的另一位古希腊天文学家托勒玫的著作中被述及才为世人所知,其中包括1000多颗恒星的资料。后来天文学家不断编制更精确的星表。丹麦天文学家第谷在1576年建立了汶岛天文台,完成了望远镜时代之前最为精确的天文观测。后来德国天文学家开普勒根据第谷的观测资料,编制发表了《鲁道夫星表》,为他发现行星运动定律打下了可靠的基础。但是这时欧洲的观测者们从未见过南极附近的星空。1676年,年仅20岁的英国天文学家哈雷乘坐东印度公司的航船到达南大西洋上的圣赫勒拿岛,在那里建立了第一个南天观测站。3年后他发表了第一份《南天星表》,因而被誉为“南天的第谷”,获得牛津大学的学位并被选入英国皇家学会。天空中不只有明亮耀眼的恒星,还有众多美丽的星云,它们也有自己的星表。18世纪的法国天文学家梅西叶在搜索彗星的过程中发现天空中有许多模糊的天体很容易和彗星混淆。他于是将110个此类天体编成表,以“梅西叶星表”为名发表。我们今天知道,这些相对黯淡的天体既包括弥漫的星云,也包括密集的星团,还包括遥远的星系。此表今称“梅西叶星云星团表”,表中所列的那些美丽的天体,直到今天仍是天文学家和天文爱好者们频繁观测的目标。经过400多年的观测积累,今天的星表类型已经非常丰富。从不同的天体类型到不同的观测波段,每一类天体都各有自己的星表。而且星表的内容也得到了极大的扩充,除了天体的位置之外,还包括距离、颜色、温度、光谱型、红移等许多信息。这些卷帙浩繁的数据曾经耗费了许多天文学家毕生的精力,现在我们可以通过互联网方便地使用这些成果。目前最大的天文星表数据库是由法国斯特拉斯堡天文数据中心开发维护的CDS数据库系统(http://cdsweb.u-strasbg.fr),所有历史上的重要星表以及最新的研究结果都能够在这个系统中统一查询。(余恒)", 113 | "image": [ 114 | { 115 | "tushuo": "敦煌出土的唐代星图显示了北极附近的“三垣”天区", 116 | "path": "figure_0020_0016.jpg" 117 | }, 118 | { 119 | "tushuo": "中国古人把接近赤道的天区分为“四象”,共二十八宿", 120 | "path": "figure_0020_0017.jpg" 121 | 
} 122 | ], 123 | "source_book": "Astronomical", 124 | "source_file": "Section0002_0005.xhtml" 125 | }, 126 | { 127 | "question": "为什么杜牧在诗中说“卧看牵牛织女星”", 128 | "answer": "唐代诗人杜牧在著名诗篇《秋夕》中写道:“银烛秋光冷画屏,轻罗小扇扑流萤。天阶夜色凉如水,卧看牵牛织女星。”夏秋之交,晴朗的夜空中,银河像一袭轻纱斜挂天际,牛郎织女隔河相望。牛郎星(天鹰座α星)两侧各有一颗小星,那是肩挑着的一双儿女,古称“河鼓三星”。比牛郎更亮的织女星(天琴座α星)孤寂地伫立对岸。十字形的天鹅座展翅翱翔在银河中央,尾羽上有一颗亮星,中文名天津四(天鹅座α星)。津一般指渡口,但组成中国古代星官“天津”的9颗恒星则像一条大船,横亘于银河中央,其中第4颗最亮。天津四与牛郎、织女形成的接近等腰的大三角形,称为夏季大三角,很容易找到。杜牧诗的最后一句,有的版本写作“坐看牵牛织女星”,但实际上“卧看”更具有合理性。因为“秋夕”指农历七月初七,大约在公历的8月上中旬,立秋节气前后,对于中国中原地区来说,夏季大三角位于头顶正上方,“坐看”或站着看会使人脖子发酸,难以坚持。这时暑热尚未退去,晚上人们在户外纳凉。刚刚嬉闹扑萤、玩累了的少男少女们,躺在临时搭起的凉床上“卧看牵牛织女星”,浮想着有关牛郎、织女七夕相会的浪漫而凄美的神话故事,应当是非常自然而且惬意的享受。(苏宜)", 129 | "image": [ 130 | { 131 | "tushuo": "夏季大三角", 132 | "path": "figure_0021_0018.jpg" 133 | } 134 | ], 135 | "source_book": "Astronomical", 136 | "source_file": "Section0002_0005.xhtml" 137 | }, 138 | { 139 | "question": "为什么杜甫有“人生不相见,动如参与商”的诗句", 140 | "answer": "“参”与“商”是天上两组著名的亮星。参即“参宿”,中国古代划分的二十八宿之一,大致相当于猎户座,是冬夜天空中最为壮丽的星座。中央排成一线的三颗亮星,是猎户腰带上的三颗宝石,中国古称参宿三星,正好位于天赤道上。在没有钟表的时代,它是漫长冬夜的天然的计时器。猎户左肩上的红色亮星是参宿四(猎户座α星),右膝下的蓝色亮星是参宿七(猎户座β星),都是超巨星,真实亮度比太阳大数万倍。“商”指夏夜星空中的红色亮星“心宿二”,又叫“大火”,其两侧各有一颗稍暗的星,合称心宿三星,是殷商时代判断季节指导农耕的重要星星。《诗经》“七月流火,九月授衣”,指的是农历七月“大火”星逐渐“流”向西方,盛夏将尽,农历九月之前要准备好过冬的衣服。心宿二属天蝎座,古巴比伦人将天蝎座想象为天上的一只大蝎子,硕大的躯体,弯弯的毒钩。杜甫名句“人生不相见,动如参与商”,意指盛夏出现的商星与隆冬出现的参宿此起彼落,永无见面的机会,以此抒发“安史之乱”中亲友离散、天各一方的感慨。(苏宜)", 141 | "image": [ 142 | { 143 | "tushuo": "猎户座", 144 | "path": "figure_0021_0019.jpg" 145 | }, 146 | { 147 | "tushuo": "天蝎座", 148 | "path": "figure_0021_0020.jpg" 149 | }, 150 | { 151 | "tushuo": "活动星图", 152 | "path": "figure_0021_0021.jpg" 153 | } 154 | ], 155 | "source_book": "Astronomical", 156 | "source_file": "Section0002_0005.xhtml" 157 | } 158 | ] -------------------------------------------------------------------------------- /faiss_index_langchain_full_ernie/bm25retriever.pkl: -------------------------------------------------------------------------------- 
"""BM25 retrieval over the question/answer dataset.

Every dataset question is indexed with a jieba-tokenised BM25 retriever; the
matching answer, image list and provenance are carried in the document
metadata so a hit can be turned straight into a search result.
"""
import json
import os
import pickle
import time

import jieba
from langchain_core.documents import Document


def resolve_assets(relative_path):
    """Return ``relative_path`` joined onto the directory of this module."""
    return os.path.join(os.path.dirname(__file__), relative_path)


# Directory that holds the ``<book>_dataset.json`` files.
data_path = resolve_assets("../data_full/dataset")

# Persistence directory of the index; this module stores the pickled BM25
# retriever inside it.
faiss_name = resolve_assets("../faiss_index_langchain_full_ernie")


def load_dataset():
    """Load and concatenate the JSON dataset of every configured book.

    Returns:
        list: the records of all books, in file order.

    Raises:
        OSError: if a dataset file cannot be opened.
        json.JSONDecodeError: if a dataset file is not valid JSON.
    """
    book_list = ["Astronomical"]
    json_data = []
    for book_name in book_list:
        dataset_file = os.path.join(data_path, f"{book_name}_dataset.json")
        with open(dataset_file, 'r', encoding='utf-8') as f:
            json_data.extend(json.load(f))
    return json_data


def generate_docs(json_data=None):
    """Convert raw records into langchain ``Document`` objects.

    The question becomes ``page_content`` (the text that is indexed and
    searched); the answer, images and provenance are stored in ``metadata``.

    Args:
        json_data (list, optional): records with the keys ``question``,
            ``answer``, ``image``, ``source_book`` and ``source_file``.
            Loaded with :func:`load_dataset` when omitted.

    Returns:
        list: one ``Document`` per record, in input order.

    Raises:
        KeyError: if a record lacks one of the required keys.
    """
    if json_data is None:
        json_data = load_dataset()

    return [
        Document(
            page_content=item['question'],
            metadata={
                "answer": item['answer'],
                "image": item['image'],
                "source_book": item['source_book'],
                "source_file": item['source_file'],
            },
        )
        for item in json_data
    ]


def _retrieve(retriever, query):
    """Return the documents ``retriever`` finds for ``query``.

    ``invoke`` is preferred because langchain-core deprecates
    ``get_relevant_documents``; the old method is kept as a fallback for
    retriever objects that do not offer ``invoke``.
    """
    invoke = getattr(retriever, "invoke", None)
    if callable(invoke):
        return invoke(query)
    return retriever.get_relevant_documents(query)


def _bm25_pickle_path():
    """Return the location of the pickled BM25 retriever."""
    return os.path.join(faiss_name, "bm25retriever.pkl")


class FaissSearch:
    """Search facade over the BM25 retriever (the class name is kept for callers)."""

    def __init__(self, path=faiss_name, top_k=5, threshold=10):
        """Create the searcher and load (or build) the BM25 retriever.

        Args:
            path (str, optional): accepted for interface compatibility and
                stored on the instance; the retriever itself is always
                loaded from / persisted under the module-level ``faiss_name``.
            top_k (int, optional): number of documents returned per query.
            threshold (float, optional): stored but not applied, because the
                BM25 retriever returns no scores to filter on.
        """
        self.path = path
        self.top_k = top_k
        self.threshold = threshold
        self.retriever = get_BM25_retriever(top_k)

    def bm25_search(self, real_query):
        """Query the BM25 retriever and return the hits as plain dicts.

        The retriever yields documents without scores, so the rank is turned
        into a pseudo score: the first hit gets ``len(docs)``, the last ``1``.

        Args:
            real_query (str): the text to search for.

        Returns:
            list: dicts with the keys ``content`` (the answer), ``score``,
            ``title`` (the matched question), ``image``, ``source_book`` and
            ``source_file``, best hit first.
        """
        print("开始检索")
        start_time = time.time()
        docs = _retrieve(self.retriever, real_query)
        print("检索结束,耗时:", time.time() - start_time)
        retrieval_results = []
        total = len(docs)
        print("初步检索结果")
        for rank, doc in enumerate(docs):
            score = total - rank
            print("question:", doc.page_content)
            print("score:", score)
            for key, value in doc.metadata.items():
                print(key+": "+str(value))
            print()
            retrieval_results.append(
                {"content": doc.metadata["answer"], "score": score, "title": doc.page_content,
                 "image": doc.metadata["image"], "source_book": doc.metadata["source_book"],
                 "source_file": doc.metadata["source_file"]}
            )
        return retrieval_results

    def search(self, query, **kwargs):
        """Run a search for ``query``; extra keyword arguments are ignored."""
        real_query = query
        return self.bm25_search(real_query)


def tokenize_chinese(text):
    """Split Chinese ``text`` into a list of words with jieba.

    This wrapper exists because handing ``jieba.lcut`` itself to the
    retriever made the retriever impossible to pickle (presumably because
    ``jieba.lcut`` is bound to jieba's tokenizer object -- not verified here).
    A module-level function is pickled by reference, which works, but it
    means this name must stay importable wherever the pickle is loaded.
    """
    return jieba.lcut(text)


def create_BM25_retriever(top_k=5):
    """Build a BM25 retriever over the dataset, tokenised with jieba.

    Args:
        top_k (int, optional): number of documents the retriever returns.
    """
    # Imported here, as in the original, so the dependency is only needed
    # when an index actually has to be built.
    from langchain_community.retrievers import BM25Retriever
    docs = generate_docs()
    bm25_retriever = BM25Retriever.from_documents(documents=docs, preprocess_func=tokenize_chinese)
    bm25_retriever.k = top_k
    return bm25_retriever


def get_BM25_retriever(top_k=5):
    """Return the BM25 retriever, loading the cached pickle when it exists.

    When no cache is present the retriever is built and then persisted.

    Args:
        top_k (int, optional): number of documents the retriever returns;
            applied to a cached retriever as well.

    Raises:
        OSError: if the cache cannot be read or written.
    """
    pickle_path = _bm25_pickle_path()
    if os.path.exists(pickle_path):
        # NOTE: unpickling executes code contained in the file; the cache
        # must only ever come from a trusted source.
        with open(pickle_path, 'rb') as f:
            bm25_retriever = pickle.load(f)
        bm25_retriever.k = top_k
    else:
        bm25_retriever = create_BM25_retriever(top_k)
        os.makedirs(faiss_name, exist_ok=True)
        # Write to a temporary file first so an interrupted dump can never
        # leave a truncated cache that breaks every later load.
        tmp_path = pickle_path + ".tmp"
        with open(tmp_path, 'wb') as f:
            pickle.dump(bm25_retriever, f)
        os.replace(tmp_path, pickle_path)
    return bm25_retriever


def faiss_search_test(top_k=5, threshold=10):
    """Smoke test: run one query through ``FaissSearch`` and print the hits."""
    faiss_search = FaissSearch(top_k=top_k, threshold=threshold)
    results = faiss_search.search("天上有多少颗星星")
    print("FaissSearch类的测试结果")
    for result in results:
        print("content:", result["content"])
        print("score:", result["score"])
        print("title:", result["title"])
        print("image:", result["image"])
        print("source_book:", result["source_book"])
        print("source_file:", result["source_file"])
        print()


def bm25_retriever_test(top_k=5):
    """Smoke test: query the BM25 retriever directly and print the hits."""
    bm25_retriever = get_BM25_retriever(top_k)
    docs = _retrieve(bm25_retriever, "天上有多少颗星星")
    print("BM25检索器的测试结果")
    for doc in docs:
        print("question:", doc.page_content)
        for key, value in doc.metadata.items():
            print(key + ": " + str(value))
        print()


if __name__ == "__main__":
    # Exercise the FaissSearch facade.
    faiss_search_test(top_k=3, threshold=10)
    # Exercise the bare BM25 retriever.
    bm25_retriever_test(3)
erniebot_agent 3 | erniebot 4 | faiss_cpu 5 | gradio==4.21.0 6 | modelscope_studio==0.3.0 7 | rank_bm25 8 | requests 9 | packaging 10 | tqdm 11 | packaging 12 | urllib3 13 | langchain_community 14 | langchain_core 15 | llama_index 16 | unstructured 17 | spacy 18 | lxml 19 | llama-index-embeddings-jinaai 20 | llama-index-llms-openai 21 | llama-index-vector-stores-faiss 22 | jieba 23 | llama-index-embeddings-langchain 24 | 25 | # Install appbuilder-sdk at last, some features still need erniebot 26 | appbuilder-sdk==0.7.0 27 | -------------------------------------------------------------------------------- /resources/audio.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chg0901/Public_QiDiHui/ca9b506a6cd104b489ce17619ea978b676196831/resources/audio.wav -------------------------------------------------------------------------------- /resources/bg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chg0901/Public_QiDiHui/ca9b506a6cd104b489ce17619ea978b676196831/resources/bg.jpg -------------------------------------------------------------------------------- /resources/bot.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chg0901/Public_QiDiHui/ca9b506a6cd104b489ce17619ea978b676196831/resources/bot.jpeg -------------------------------------------------------------------------------- /resources/custom_components/custom_select.js: -------------------------------------------------------------------------------- 1 | (props, cc, { el, onUpdate }) => { 2 | const options = JSON.parse(props.options); 3 | el.innerHTML = ` 4 | ${options 5 | .map((option) => { 6 | return `
7 | 8 |
`; 9 | }) 10 | .join('')} 11 | `; 12 | onUpdate( 13 | () => { 14 | const inputs = Array.from(el.getElementsByTagName('input')); 15 | Array.from(el.getElementsByTagName('label')).forEach((label, i) => { 16 | label.addEventListener('click', () => { 17 | inputs.forEach((input) => { 18 | input.checked = false; 19 | }); 20 | const input = label.getElementsByTagName('input')[0]; 21 | input.checked = true; 22 | // Use cc.dispatch to trigger events. 23 | cc.dispatch(options[i]); 24 | }); 25 | }); 26 | }, 27 | { callAfterMount: true } 28 | ); 29 | }; 30 | -------------------------------------------------------------------------------- /resources/demo.py: -------------------------------------------------------------------------------- 1 | # import json 2 | 3 | # # 读取JSON文件 4 | # file_path = 'all_output.json' 5 | 6 | # with open(file_path, 'r', encoding='utf-8') as file: 7 | # data = json.load(file) 8 | 9 | # # 假设queries在文件中的键是'queries' 10 | # queries = [item['query'] for item in data] 11 | 12 | # # 将 query 值每十个为一组存为 list 13 | # grouped_queries = [queries[i:i + 10] for i in range(0, len(queries), 10)] 14 | 15 | # # 打印结果 16 | # output_data = {'grouped_queries': grouped_queries} 17 | 18 | # with open('grouped_queries.json', 'w', encoding='utf-8') as f: 19 | # json.dump(output_data, f, ensure_ascii=False, indent=4) 20 | 21 | # print("Grouped queries have been written to 'grouped_queries.json'") 22 | 23 | import json 24 | 25 | 26 | def read_grouped_queries(file_path): 27 | # 读取 JSON 文件 28 | with open(file_path, 'r', encoding='utf-8') as f: 29 | data = json.load(f) 30 | 31 | # 提取并返回 "grouped_queries" 列表 32 | return data.get(f'grouped_queries[{0}]', []) 33 | 34 | 35 | print(read_grouped_queries('grouped_queries.json')) 36 | -------------------------------------------------------------------------------- /resources/dog.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chg0901/Public_QiDiHui/ca9b506a6cd104b489ce17619ea978b676196831/resources/dog.mp4 -------------------------------------------------------------------------------- /resources/image-bot.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chg0901/Public_QiDiHui/ca9b506a6cd104b489ce17619ea978b676196831/resources/image-bot.jpeg -------------------------------------------------------------------------------- /resources/list.json: -------------------------------------------------------------------------------- 1 | { 2 | "grouped_queries": [ 3 | "为什么要研究天文学", 4 | "为什么天空看起来像个球", 5 | "为什么天文学家不管天气预报", 6 | "为什么恒星会眨眼", 7 | "肉眼可以看到多少颗星星", 8 | "为什么天上有88个星座", 9 | "为什么四季的星空不一样", 10 | "为什么天文学家要使用星表", 11 | "为什么杜牧在诗中说“卧看牵牛织女星”", 12 | "为什么杜甫有“人生不相见,动如参与商”的诗句" 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /resources/main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chg0901/Public_QiDiHui/ca9b506a6cd104b489ce17619ea978b676196831/resources/main.png -------------------------------------------------------------------------------- /resources/modelscope.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /resources/music-bot.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chg0901/Public_QiDiHui/ca9b506a6cd104b489ce17619ea978b676196831/resources/music-bot.jpeg -------------------------------------------------------------------------------- /resources/screen.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chg0901/Public_QiDiHui/ca9b506a6cd104b489ce17619ea978b676196831/resources/screen.jpeg 
-------------------------------------------------------------------------------- /resources/user.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chg0901/Public_QiDiHui/ca9b506a6cd104b489ce17619ea978b676196831/resources/user.jpeg -------------------------------------------------------------------------------- /web_demo.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | import warnings 5 | 6 | import appbuilder 7 | import gradio as gr 8 | import modelscope_studio as mgr 9 | 10 | from rag_full.rag_langchain import FaissSearch, tokenize_chinese # tokenize_chinese需要在这里import,否则pickle会报错 11 | 12 | warnings.filterwarnings('ignore') 13 | 14 | # 配置密钥与应用ID 15 | # 设置环境变量APPBUILDER_TOKEN,目前填的这个是官方的受限试用TOKEN 16 | os.environ["APPBUILDER_TOKEN"] = "bce-v3/ALTAK-DyYuSA9DVtgt3uLCJvj09/66566c4c39901e34c20829d07bbe993654c03452" 17 | app_id = "b2c236c9-e544-4d16-8e74-016c602ef342" 18 | 19 | # 创建一个FaissSearch实例,用于检索向量数据库,每次检索返回top_k个相关文档 20 | search_engine = FaissSearch(path='./faiss_index_llama_full_ernie', top_k=5, threshold=10) 21 | 22 | # 使用appbuilder创建LLM对话接口 23 | llm = appbuilder.AppBuilderClient(app_id) 24 | 25 | prompt = """ 26 | 检索结果: 27 | 第1个段落: 28 | {doc} 29 | 检索语句: 你的名字是启迪绘,擅长把科学事实编成有趣易懂的故事,以激发儿童对相关领域的好奇心,同时这个故事应当作为成为亲子共读的桥梁,助力孩子们在轻松愉快的氛围中学习成长。 30 | 你需要根据我的问题和你所知道的知识为我创作符合以上要求的故事。 31 | 我的问题是: 32 | {query} 33 | 请根据以上检索结果回答检索语句的问题 34 | """ 35 | docs = [] 36 | 37 | 38 | def resolve_assets(relative_path): 39 | return os.path.join(os.path.dirname(__file__), "resources", 40 | relative_path) 41 | 42 | 43 | conversation = [ 44 | [None, { 45 | "text": "你好,欢迎来到😶‍🌫️智慧启迪绘!", 46 | "flushing": False 47 | }], 48 | ] 49 | 50 | 51 | def get_last_bot_message(chatbot): 52 | return chatbot[-1][1] 53 | 54 | 55 | def create_video_bot_message(text: str): 56 | return { 57 | "text": text, 58 | } 59 | 60 | 61 | def create_image_bot_message(text: 
str): 62 | return { 63 | "text": text, 64 | } 65 | 66 | 67 | async def chat_bot_with_llm(_input: mgr.MultimodalInput, _chatbot): 68 | global prompt, docs, search_engine, llm 69 | _chatbot.append([_input, [""]]) 70 | yield gr.update(interactive=False, value=None), _chatbot 71 | 72 | docs = search_engine.search(_input.text) 73 | 74 | if len(docs) > 0: 75 | # 取出检索到的相关性最高的文档 76 | doc = docs[0]['content'] 77 | else: 78 | # 如果没有检索到相关文档,则将doc置为空字符串 79 | doc = "" 80 | 81 | final_prompt = prompt.replace('{query}', _input.text).replace('{doc}', doc) 82 | 83 | conversation_id = llm.create_conversation() 84 | messages = llm.run(conversation_id, final_prompt, stream=True) 85 | 86 | for content in messages.content: 87 | if content.answer is not None: 88 | _chatbot[-1][1][0] += content.answer 89 | yield { 90 | chat_bot_1: _chatbot, 91 | } 92 | 93 | 94 | async def chat_bot_with_llm_image(_input: mgr.MultimodalInput, _chatbot): 95 | global prompt, docs, search_engine, llm 96 | _chatbot.append([_input, [""]]) 97 | yield gr.update(interactive=False, value=None), _chatbot 98 | 99 | docs = search_engine.search(_input.text) 100 | 101 | if len(docs) > 0: 102 | # 取出检索到的相关性最高的文档 103 | doc = docs[0]['content'] 104 | # 取出图片信息 105 | image_paths = docs[0]['image'] 106 | else: 107 | # 如果没有检索到相关文档,则将doc置为空字符串 108 | doc = "" 109 | # 如果没有检索到相关文档,则将image_paths置为空列表 110 | image_paths = [] 111 | 112 | final_prompt = prompt.replace('{query}', _input.text).replace('{doc}', doc) 113 | 114 | conversation_id = llm.create_conversation() 115 | messages = llm.run(conversation_id, final_prompt, stream=True) 116 | 117 | for content in messages.content: 118 | if content.answer is not None: 119 | _chatbot[-1][1][0] += content.answer 120 | yield { 121 | chat_bot_2: _chatbot, 122 | } 123 | 124 | _chatbot[-1][1][0] += '''\n 125 | Demo 版本因十万图书版权限制,本项目代码只展示基础对话功能,不展示tts和视频生成,请关注我们的**Github** [chg0901/Public_QiDiHui](https://github.com/chg0901/Public_QiDiHui.git) , 后续更新, 敬请期待''' 126 | yield { 127 | chat_bot_2: 
_chatbot, 128 | } 129 | 130 | 131 | def chat_video(_input, _chatbot): 132 | _chatbot.append([_input, None]) 133 | yield gr.update(interactive=False, value=None), _chatbot 134 | _chatbot[-1][1] = [ 135 | create_video_bot_message("") 136 | ] 137 | 138 | time.sleep(1) 139 | get_last_bot_message(_chatbot)[0][ 140 | "text"] = f"""你好,欢迎来到😶‍🌫️智慧启迪绘 \n 141 | \n 142 | 143 | 144 | 145 | 146 | 147 | 148 | \n 149 | Demo 版本因十万图书版权限制,本项目代码只展示基础对话功能,不展示tts和视频生成,请关注我们的**Github** [chg0901/Public_QiDiHui](https://github.com/chg0901/Public_QiDiHui.git) , 后续更新, 敬请期待""" 150 | yield { 151 | chat_bot_3: _chatbot, 152 | } 153 | 154 | 155 | def flushed(): 156 | return gr.update(interactive=True) 157 | 158 | 159 | def read_grouped_queries(file_path): 160 | # 读取 JSON 文件 161 | with open(file_path, 'r', encoding='utf-8') as f: 162 | data = json.load(f) 163 | 164 | # 提取并返回 "grouped_queries" 列表 165 | grouped_queries = data.get('grouped_queries', []) 166 | 167 | # 返回第一个子列表,如果没有则返回空列表 168 | if grouped_queries: 169 | return grouped_queries 170 | else: 171 | return [] 172 | 173 | 174 | css = """ 175 | h1 { 176 | text-align: center; 177 | display: block; 178 | } 179 | """ 180 | json_path = "resources/list.json" 181 | 182 | avatar_images = [ 183 | # resolve_assets('user.jpeg'), 184 | # default bot avatar and name 185 | [{ 186 | "name": "Curious baby", 187 | "avatar": "https://oss.lingkongstudy.com.cn/blog/202405271251981.png" 188 | }], 189 | 190 | [{ 191 | "name": "QiDiHui", 192 | "avatar": "https://oss.lingkongstudy.com.cn/blog/202405261707489.jpg" 193 | }], 194 | 195 | ] 196 | 197 | # 创建Gradio界面 198 | with gr.Blocks(gr.themes.Soft(), css=css) as demo: 199 | html_code = """ 200 |

😶‍🌫️ 智慧启迪绘

201 | 202 |

203 | Logo 204 |

205 |
206 |

我们致力于创造一个既能娱乐也能教育的视频生成应用,将《十万个为什么》系列丛书的丰富知识转化为易于消化和吸收理解的内容,使之成为亲子共读的桥梁,助力孩子们在轻松愉快的氛围中学习成长。在这里,学习将不再是枯燥的任务,而是充满乐趣和惊喜的旅程。让我们携手,为孩子们构建一个充满想象力和知识启迪的成长空间,一起见证《十万个为什么》系列丛书丰富有趣的知识魔力!

207 |

为了方便在AIStudio中部署,在AIStudio中的部署中,开发团队对整体功能进行了解耦拆分,增加更多的点击触发互动功能

208 |

209 | 【AIStudio 主体验地址】: 210 | https://aistudio.baidu.com/application/detail/40487 211 |

212 |

213 | 【AIStudio 备用体验地址】: 214 | https://aistudio.baidu.com/application/detail/42990 215 |

216 |

217 | 【全功能版本请查看OpenXLab部署版本】: 218 | https://openxlab.org.cn/apps/detail/chg0901/QiDiHui_appbuilder_V2 219 |

220 |

221 | 【B站介绍视频】: 222 | B站 LIC2024 RAG赛道智慧启迪绘 "十万个所以"团队 有关介绍视频 223 |

224 | 225 |
226 | 227 | """ 228 | gr.Markdown(html_code) 229 | with gr.Tabs(): 230 | with gr.TabItem("对话"): 231 | chat_bot_1 = mgr.Chatbot( 232 | value=conversation, 233 | avatar_image_width=60, 234 | avatar_images=avatar_images, 235 | height=500, 236 | flushing_speed=6, 237 | ) 238 | state = gr.State([]) 239 | 240 | input = mgr.MultimodalInput() 241 | input.submit(fn=chat_bot_with_llm, inputs=[input, chat_bot_1], outputs=[input, chat_bot_1]) 242 | chat_bot_1.flushed(fn=flushed, outputs=[input]) 243 | 244 | with gr.Column(): 245 | with gr.Accordion(open=True, label="😀输入示例:"): # 使用 Accordion 组件创建可以折叠的区域 246 | gr.Examples(examples=read_grouped_queries(json_path), inputs=input, outputs=chat_bot_1) 247 | 248 | with gr.TabItem("文生图"): 249 | chat_bot_2 = mgr.Chatbot( 250 | value=conversation, 251 | avatar_image_width=40, 252 | avatar_images=avatar_images, 253 | height=500, 254 | flushing_speed=6, 255 | ) 256 | 257 | input = mgr.MultimodalInput() 258 | input.submit(fn=chat_bot_with_llm_image, inputs=[input, chat_bot_2], outputs=[input, chat_bot_2]) 259 | chat_bot_2.flushed(fn=flushed, outputs=[input]) 260 | 261 | with gr.Column(): 262 | with gr.Accordion(open=True, label="😀输入示例:"): # 使用 Accordion 组件创建可以折叠的区域 263 | gr.Examples(examples=read_grouped_queries(json_path), inputs=input, outputs=chat_bot_2) 264 | 265 | with gr.TabItem("文生视频"): 266 | chat_bot_3 = mgr.Chatbot( 267 | value=conversation, 268 | avatar_image_width=40, 269 | avatar_images=avatar_images, 270 | height=500, 271 | flushing_speed=6, 272 | ) 273 | 274 | input = mgr.MultimodalInput() 275 | input.submit(fn=chat_video, inputs=[input, chat_bot_3], outputs=[input, chat_bot_3]) 276 | chat_bot_3.flushed(fn=flushed, outputs=[input]) 277 | 278 | with gr.Column(): 279 | with gr.Accordion(open=True, label="😀输入示例:"): # 使用 Accordion 组件创建可以折叠的区域 280 | gr.Examples(examples=read_grouped_queries(json_path), inputs=input, outputs=chat_bot_3) 281 | 282 | # 启动Gradio应用 283 | if __name__ == "__main__": 284 | demo.queue().launch() 285 | 
--------------------------------------------------------------------------------