├── .gitignore ├── LICENSE ├── README.md ├── bert_server └── sentence_bert_server.py ├── common ├── get_ip.py ├── kill_program.py └── response_add_head.py ├── config ├── associative_questions_config.ini ├── befaq_conf.ini ├── es.ini └── sheetname.conf ├── data └── 线上用户反馈回复.xls ├── docker ├── README.md └── docker-compose.yml ├── es ├── es_create_index.py ├── es_del_data.py ├── es_del_index.py ├── es_operate.py ├── es_search_cn.py ├── jieba_befaq.py ├── read_excel.py ├── search_engines_operate.py ├── search_model │ └── .gitkeep ├── stopwords4_process_question_dedup.txt ├── train_search_model.py ├── userdict.txt ├── write_data2es.py └── write_vecs2bin.py ├── faq ├── bert_vect │ └── .gitkeep ├── deduplicate_threshold_op.py ├── get_final_data.py ├── get_question_vecs.py ├── jieba4befaq.py ├── matching_operate.py ├── re_rank.py └── retrieval_es.py ├── image └── BEFAQ 框架.png ├── logs └── .gitkeep ├── model └── .gitkeep ├── requirements.txt └── src ├── associative_questions_server.py └── main_faq.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea* 2 | .vscode 3 | __pycache__ 4 | nohup.out 5 | *.m 6 | log*.* 7 | search_model 8 | bert_vect 9 | model -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BEFAQ 2 | 3 | **BEFAQ(BERT-based Embedding Frequently Asked Question)** 开源项目是好好住面向多领域FAQ集合的问答系统框架。
4 |
We apply the Sentence BERT model to FAQ question answering. Developers can use BEFAQ to quickly build and customize an FAQ system for their own business scenarios.<br>
5 | 6 | ## Advantages of BEFAQ 7 | 8 |<br>
(1) Uses Elasticsearch, Faiss, and Annoy as recall engines<br>
9 |
(2) Uses Sentence BERT semantic vectors (Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks); a minimal sketch of this semantic matching idea follows this list<br>
10 |
(3) Good support for synonymous questions<br>
11 |
(4) Supports multi-domain corpora (recall is restricted to the corresponding domain, so even the same question can get different answers in different domains)<br>
12 |
(5) Provides an interface that suggests candidate questions based on the user's current input (suggest)<br>
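The semantic matching behind advantages (1) and (2) can be sketched in a few lines. This is only a minimal illustration, not BEFAQ's actual pipeline (the real system recalls candidates via ES keywords plus Faiss/Annoy indexes and then re-ranks with a weighted combination of several similarity scores); it assumes sentence-transformers is installed and the distiluse-base-multilingual-cased model from step 2.3 below has been downloaded. The normalization mirrors the one in bert_server/sentence_bert_server.py:

    from sentence_transformers import SentenceTransformer
    import numpy as np

    model = SentenceTransformer("distiluse-base-multilingual-cased")

    def normalize(vec):
        # unit-length vectors make the dot product equal to cosine similarity
        norm = np.linalg.norm(vec)
        return vec if norm == 0 else vec / norm

    faq_questions = ["如何评价设计师", "怎样把个人设计师转成机构设计师"]
    faq_vecs = np.array([normalize(v) for v in model.encode(faq_questions)])

    query_vec = normalize(model.encode(["如何评价设计师"])[0])
    scores = faq_vecs @ query_vec  # cosine similarity against every FAQ question
    best = int(np.argmax(scores))
    print(faq_questions[best], float(scores[best]))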
13 |
14 |
15 | ## The BEFAQ framework is shown below
16 | ![image](https://github.com/hhzrd/BERT-Embedding-Frequently-Asked-Question/blob/xiao/docker/image/BEFAQ%20%E6%A1%86%E6%9E%B6.png)
17 |
18 |
19 | ## How to use
20 | ### 1. Using docker (the docker image already contains Es 7.6.1, kibana, the IK analyzer and synonym support; the BEFAQ code is also included in the docker.)
21 | We recommend docker as the quickest way to get started. For startup instructions, see the README.md in the docker folder under the project root.
22 |
23 | ### 2. Using BEFAQ without docker
24 |
25 |
26 | #### 2.1. Install Es 7.6.1 and the matching kibana locally, and configure the IK analyzer and synonym support for Es
27 | Please follow the blog post [ES (Elasticsearch) 7.6.1 installation guide](https://blog.csdn.net/weixin_37792714/article/details/108025200). If you have already configured Es, the IK analyzer, and synonyms, you can skip this step, but remember to sync the synonyms into your Es. For convenience, all related files can be downloaded from Baidu Netdisk: https://pan.baidu.com/s/1PxgINf6Q1UZBtcsYw6FU0w password: 4q9h
28 |
29 |
30 | For convenience, BEFAQ offers two ways of connecting to Elasticsearch: with username/password authentication or without it. See the comments in the es.ini configuration file in the config folder under the project root for how to switch. Our blog post describes how to configure a username and password for Elasticsearch.
31 |
32 |
33 |
34 | #### 2.2. Download the project code and create the BEFAQ virtual environment
35 |
36 | conda create -n befaq python=3.6 -y
37 | source activate befaq
38 | git clone https://github.com/hhzrd/BERT-Embedding-Frequently-Asked-Question.git
39 | Enter the BEFAQ root directory, then
40 | pip install -r requirements.txt
41 |
42 | #### 2.3. Download the sentence-transformers multilingual pretrained model
43 |
44 | First enter the project root directory, then
45 | cd model
46 | wget https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/distiluse-base-multilingual-cased.zip
47 | unzip distiluse-base-multilingual-cased.zip
48 | Put the model files directly under the model folder.
49 | If the latest model raises errors (with sentence_transformers==0.3.0), download the older model from Baidu Netdisk (compatible with sentence_transformers==0.3.0 and transformers==3.0.2). The sentence_transformers dependency used by BEFAQ has since been upgraded to version 1.2.0.
50 |
51 | #### 2.4. Excel data format
52 | If you just want to get the code running first, you can skip configuring your own data.
53 |
54 | Put the Excel file under data/ in the project root; the current sample file is named "线上用户反馈回复.xls". The Excel file is the source of the QA data, and its contents are written into Es. After downloading the source code, open this file to see a concrete example of the data.
55 |
56 | Each sheet name represents a domain; for example, the first domain is called "领域1". The first column is the name of the person who entered the data and may be empty. The second column is the answer and must not be empty. The third column is the original question and must not be empty. Columns after the third hold synonymous questions; there is no limit on their number, so a row may have many or none at all. One record per row.
57 |
58 | The sheet named "词典" holds the user dictionary. For example, if you do not want the word "好好住" to be split during tokenization, put it in this dictionary, one entry per row. The program reads it automatically into the expected location (for jieba tokenization), but the IK analyzer's custom dictionary in Es has to be maintained by yourself.
59 | The sheet named "停用词" holds the stopword dictionary, one entry per row. The program reads it automatically into the expected location.
60 | The sheet named "同义词" holds the synonyms. The first column is the original word; the second and later columns are its synonyms. For example, 番茄 and 西红柿 are synonyms: put 番茄 in the first column and 西红柿 in the second, one record per row. The synonym data has to be written into Es's synonym file by yourself; see the ES (Elasticsearch) 7.6.1 installation blog mentioned above. Because the machine you are on is not necessarily the Es server, BEFAQ does not write it for you.
61 |
62 | Synonyms, the user dictionary, and stopwords are shared across all domains. The dictionary and stopwords are used by BEFAQ's jieba tokenizer; the synonyms are used by Es.
63 |
64 | You can put data for many domains into the Excel file; which domains are actually read is configured in sheetname.conf in the config folder under the project root.
65 |
66 | #### 2.5. Edit the BEFAQ configuration files
67 |
68 | data/线上用户反馈回复.xls in the project root is the source of the QA data, which is written into Es. If you just want to get the code running first, you can leave the data as it is.
69 | sheetname.conf in the config folder is the configuration for reading the Excel data. If you just want to get the code running first, you can leave it unchanged.
70 | es.ini in the config folder is BEFAQ's ES configuration. This file must be edited even if you only want to get the code running: it holds the Es IP (or domain name) and port and the Es username and password, and BEFAQ can only connect to your Es if they match your setup.
71 | befaq_conf.ini in the config folder is BEFAQ's main configuration. If you just want to get the code running first, you can leave it unchanged.
72 |
73 |
74 | #### 2.6. How to start the BEFAQ service
75 |
76 | Enter the project root directory, then
77 | source activate befaq
78 | cd es
79 |
80 | Write the data from the Excel file into Es
81 | python write_data2es.py
82 |
83 | Turn the questions into Sentence BERT vectors and save them to binary files, so the question vectors can be loaded later.
84 | python write_vecs2bin.py
85 |
86 | Train the Faiss and Annoy models
87 | python train_search_model.py
88 |
89 | Start the BEFAQ service (if the data has not changed, later startups only need this step)
90 | Enter the project root directory (cd ..), then
91 | cd src
92 | Start the BEFAQ service
93 | python main_faq.py
94 | Or start it in the background
95 | nohup python -u main_faq.py > "../logs/log_$(date +"%Y-%m-%d-%H").txt" 2>&1 &
96 |
97 | Check whether it is running
98 | ps -ef|grep main_faq.py
99 |
100 | Test BEFAQ from a terminal. The BEFAQ service takes POST requests. (Replace 127.0.0.1 with your own IP.)
101 |
102 | curl -d "question=如何评价设计师&get_num=3&threshold=0.5&owner_name=领域1" http://127.0.0.1:8129/BEFAQ
103 |
104 | Endpoint URL:
105 | http://127.0.0.1:8129/BEFAQ
106 | Parameters:
107 | question: the user's question. Required
108 | get_num: the maximum number of results to return. Optional, default 3
109 | threshold: only results whose similarity is greater than or equal to this threshold are returned. Optional, default 0.5
110 | owner_name: the name of the data owner, i.e. the sheet name of the corresponding domain in the Excel file; used to separate multi-domain data. Required
111 |
112 | Response format:
113 | [
114 |     {
115 |         "q_id": 2,
116 |         "specific_q_id": 3,
117 |         "question": "如何评价设计师",
118 |         "answer": "你好。点击认证设计师头像,进入TA的个人主页,点击左下角「评价」即可进行评价。此外,设计师的荣耀值是根据设计师的站内数据综合计算,无法直接打分的哦。感谢你的支持。",
119 |         "confidence": 1.0
120 |     },
121 |     {
122 |         "q_id": 6,
123 |         "specific_q_id": 7,
124 |         "question": "怎样把个人设计师转成机构设计师",
125 |         "answer": "你好,可以登录好好住官网,再次点击提交设计师认证资料,即可重新修改哟;",
126 |         "confidence": 0.6
127 |     }
128 | ]
129 |
130 |
131 | #### 2.7. How to start BEFAQ's question-suggestion service
132 |
133 | To enable suggesting questions from the user's current input:
134 | Enter the project root directory, then
135 | cd src
136 | python associative_questions_server.py
137 | Or start it in the background
138 | nohup python -u associative_questions_server.py >/dev/null 2>&1 &
139 |
140 | Check whether it is running
141 | ps -ef|grep associative_questions_server.py
142 |
143 |
144 | Test the suggestion feature from a terminal. The service takes POST requests. (If it is not running locally, replace 127.0.0.1 with your own IP.)
145 | curl -d "current_question=设计师&limit_num=3&owner_name=领域1&if_middle=1" http://127.0.0.1:8128/associative_questions
146 |
147 | Endpoint URL:
148 | http://127.0.0.1:8128/associative_questions
149 | Parameters:
150 | current_question: the text the user has typed so far. Required
151 | limit_num: the maximum number of results to return. Required
152 | owner_name: the name of the data owner, used to separate multi-domain data. Required
153 | if_middle: whether the input may match in the middle of a question. Optional; default 1 (1 = allowed, 0 = left-anchored matching only).
154 |
155 | Response format:
156 | {
157 |     "code": "1",
158 |     "msg": "OK",
159 |     "data": {
160 |         "message": [
161 |             "按地区找设计师",
162 |             "设计师可以选择同城吗",
163 |             "怎样把个人设计师转成机构设计师"
164 |         ]
165 |     }
166 | }
167 |
168 | ## Authors
169 |
170 |<br>
The main contributors to this project are:<br>
171 | * [肖轶超](https://github.com/xiaoyichao)(好好住)
172 | * [徐忠杰](https://github.com/461025412)(好好住)
173 | * [王得祥](https://github.com/oksite)(好好住)
174 | * [向泳州](https://github.com/XiangYongzhou)(好好住)
175 | * [辛少普](https://github.com/hhzrd)(好好住)
176 |
177 | ## References
178 |
179 |<br>
[1] [Baidu AnyQ](https://github.com/baidu/AnyQ)<br>
180 |
[2] [sentence-transformers](https://github.com/UKPLab/sentence-transformers)
181 |
[3] [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084)
182 | 183 | ## Copyright and License 184 | 185 | BEFAQ is provided under the [Apache-2.0 license](https://github.com/baidu/AnyQ/blob/master/LICENSE). 186 | -------------------------------------------------------------------------------- /bert_server/sentence_bert_server.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-06-11 08:42:52 6 | LastEditTime: 2021-06-18 17:41:43 7 | @Description: 获取SentenceBERT的向量 8 | ''' 9 | 10 | import numpy as np 11 | import torch 12 | import os 13 | import configparser 14 | from sentence_transformers import SentenceTransformer 15 | 16 | dir_name = os.path.abspath(os.path.dirname(__file__)) 17 | 18 | faq_config = configparser.ConfigParser() 19 | faq_config.read(os.path.join(dir_name, "../config/befaq_conf.ini")) 20 | Sentence_BERT_path = os.path.join(dir_name, "../", str( 21 | faq_config["AlgorithmConfiguration"]["Sentence_BERT_path"])) 22 | 23 | 24 | class SentenceBERT(object): 25 | ''' 26 | Author: xiaoyichao 27 | param {type} 28 | Description: SentenceBERT 29 | ''' 30 | 31 | def __init__(self): 32 | self.model = SentenceTransformer(Sentence_BERT_path) 33 | if torch.cuda.is_available(): 34 | self.model = self.model.to(torch.device("cuda")) 35 | print("Sentenence BERT使用的设备为:%s" % self.model.device) 36 | 37 | def normalize(self, vec): 38 | ''' 39 | Author: xiaoyichao 40 | param {type} 41 | Description: 矢量在用于相似度计算之前被归一化为单位长度,使得余弦相似性和点积相当。参考文章https://www.thinbug.com/q/41387000 42 | ''' 43 | norm = np.linalg.norm(vec) 44 | if norm == 0: 45 | return vec 46 | return vec/norm 47 | 48 | def get_bert(self, sentence_list): 49 | ''' 50 | Author: xiaoyichao 51 | param {type} 52 | Description: 返回(512,)纬度的SentenceBERT向量 53 | ''' 54 | sentences_vec = [] 55 | sentences_vec = np.array(self.model.encode(sentence_list)) 56 | sentences_vec_mean = np.mean(sentences_vec, axis=0).reshape(-1, 512) 57 | # sentences_vec_max = np.max(sentences_vec, axis=0).reshape(-1, 512) 58 | return np.array([self.normalize(sentences_vec_mean[0])]) 59 | 60 | def get_object(self): 61 | ''' 62 | Author: xiaoyichao 63 | param {type} 64 | Description: 返回SentenceBERT的对象 65 | ''' 66 | return self.model 67 | 68 | 69 | # # # 测试demo 70 | if __name__ == '__main__': 71 | sentenceBERT = SentenceBERT() 72 | sentences_vec = sentenceBERT.get_bert(sentence_list=["如何评价设计师"]) 73 | print(sentences_vec.shape) 74 | print(sentences_vec) 75 | -------------------------------------------------------------------------------- /common/get_ip.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-02-05 14:35:28 6 | LastEditTime: 2020-08-13 21:37:43 7 | @Description: 查询本机ip地址 8 | ''' 9 | import socket 10 | 11 | 12 | def get_host_ip(): 13 | ''' 14 | Author: xiaoyichao 15 | param {type} 16 | Description: 查询本机ip地址 17 | ''' 18 | try: 19 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 20 | s.connect(('8.8.8.8', 80)) 21 | ip = s.getsockname()[0] 22 | finally: 23 | s.close() 24 | 25 | return ip 26 | -------------------------------------------------------------------------------- /common/kill_program.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | Date: 2020-08-20 11:09:45 6 | LastEditTime: 2021-07-08 11:50:42 7 | Description: kill 进程 8 | ''' 9 | import os 10 | 11 
| 12 | def kill_port(port):
13 |     '''
14 |     @Author: xiaoyichao
15 |     @param {*}
16 |     @Description: kill the process listening on the given port (relies on the lsof CLI and sends SIGKILL, so use with care)
17 |     '''
18 |     find_kill = "kill -9 $(lsof -i:%d -t)" % port
19 |     try:
20 |         result = os.popen(find_kill)
21 |         print("successfully killed the process on port %d" % port)
22 |         return result.read()
23 |     except Exception:
24 |         print("failed to kill the process on port %d" % port)
--------------------------------------------------------------------------------
/common/response_add_head.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-04-23 15:52:51
6 | LastEditTime: 2021-03-10 18:04:03
7 | @Description: add CORS headers to API responses
8 | '''
9 |
10 | from sanic.response import json
11 |
12 |
13 | def res_with_head(data_json):
14 |     '''
15 |     Author: xiaoyichao
16 |     param {type}
17 |     Description: add CORS headers to API responses
18 |     '''
19 |     return json(
20 |         data_json,
21 |         headers={
22 |             "Access-Control-Allow-Origin": "*",
23 |             "Access-Control-Allow-Methods": "OPTIONS,HEAD,GET,POST",
24 |             "Access-Control-Allow-Headers": "x-requested-with"},
25 |         status=200
26 |     )
27 |
--------------------------------------------------------------------------------
/config/associative_questions_config.ini:
--------------------------------------------------------------------------------
1 | [ServerAddress]
2 | port = 8128
3 | # port of the question-suggestion service
4 | [ServerInfo]
5 | work_number = 1
6 | # number of worker processes. Multiple processes are supported.
7 |
--------------------------------------------------------------------------------
/config/befaq_conf.ini:
--------------------------------------------------------------------------------
1 | [ServerAddress]
2 | port = 8129
3 | # port of the BEFAQ service
4 | [ServerInfo]
5 | work_number = 1
6 | # number of worker processes; currently only a single process is supported
7 | [ESConfiguration]
8 | ES_num = 10
9 | # number of candidates recalled from ES
10 | [Faiss_Annoy_Configuration]
11 | engine_num = 5
12 | # number of candidates recalled by Faiss and/or Annoy
13 | [AlgorithmConfiguration]
14 | Sentence_BERT_path = ./model/
15 | # relative path of the multilingual Sentence_BERT model; no need to change it unless you have special requirements.
16 | consine = 0.6
17 | # weight of cosine similarity (in the high-dimensional Sentence_BERT space) in the linear model
18 | jaccard = 0.2
19 | # weight of the Jaccard coefficient in the linear model
20 | BM25 = 0.1
21 | # weight of the BM25 score in the linear model
22 | edit_distance = 0.1
23 | # weight of edit distance in the linear model
24 | use_other_when_es_none = 0
25 | # 0: use faiss and/or annoy only when ES recalls nothing. 1: use Faiss and/or Annoy even when ES has results.
26 | # 0 is recommended: Faiss and/or Annoy always recall the requested number of items, which can pull in noisy candidates you never wanted and hurt the later similarity computation.
27 | # In BEFAQ's design, ES recalls on the keywords produced by jieba, which is more controllable; falling back to Faiss and/or Annoy only when ES recalls nothing works better.
28 | use_faiss = 1
29 | # whether to use Faiss: 1 = yes, 0 = no.
30 | use_annoy = 0
31 | # whether to use Annoy: 1 = yes, 0 = no.
32 | # Faiss and Annoy can both be enabled or both disabled. Using only Faiss is recommended.
33 | # When both are disabled, use_other_when_es_none has no effect, because only ES is left to recall data
34 |
35 | [ServerInfo4Association]
36 | work_number = 2
37 |
--------------------------------------------------------------------------------
/config/es.ini:
--------------------------------------------------------------------------------
1 | [ServerAddress]
2 | # if Es runs in a docker container
3 | # es_server_ip_port = http://elasticsearch4befaq:9200
4 | # if Es runs on this machine
5 | es_server_ip_port = http://127.0.0.1:9200
6 | # if Es runs on another server, replace this with your own ES IP or domain name and port
7 | #es_server_ip_port = http://xxx.xx.xx.xx:9200
8 | # the Es docker we provide has no password; if you use it, set if_es_use_passwd = 0
9 | # if_es_use_passwd = 1 means BEFAQ connects to Es with username + password; 0 means no authentication, in which case http_auth_user_name and http_auth_password are ignored.
10 | if_es_use_passwd = 0
11 | # Es login username
12 | http_auth_user_name = your Elasticsearch user_name
13 | # Es login password
14 | http_auth_password = your Elasticsearch password
15 |
16 | [ServerInfo]
17 | 
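# BEFAQ serves queries through the alias below while keeping two physical indexes: es/write_data2es.py rebuilds the idle index and then swaps the alias, so a data refresh does not interrupt the running service.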
index_name_1 = index_faq_1 18 | # ES的索引1 的name 19 | index_name_2 = index_faq_2 20 | # ES的索引2 的name 21 | alias_name = index_faq 22 | # ES的索引别名name 23 | -------------------------------------------------------------------------------- /config/sheetname.conf: -------------------------------------------------------------------------------- 1 | [excel_name] 2 | name = 线上用户反馈回复.xls 3 | # 数据所在的Excel的名称。 Excel的路径为项目根目录下的 data/线上用户反馈回复.xls 4 | [QA_sheets] 5 | # 想要读取的多领域语料的sheet名,程序会把这些数据写入到ES中。 6 | sheets = 领域1,领域2,领域3,领域4 7 | [Synonyms] 8 | sheet = 同义词 9 | # 同义词的数据需要自己写到ES的同义词表中,具体文件路径请参看我写的ES安装过程的博客 10 | [Stopwords] 11 | sheet = 停用词 12 | # BEFAQ的jieba停用词表,程序会自动读取到 es/stopwords4_process_question_dedup.txt中 13 | [Userdict] 14 | # BEFAQ的jieba字典,程序会自动读取到 es/userdict.txt中 15 | sheet = 词典 -------------------------------------------------------------------------------- /data/线上用户反馈回复.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hhzrd/BEFAQ/955d1780a2625b805f3ebe1649d96d16df820254/data/线上用户反馈回复.xls -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # docker 方式启动程序 2 | 3 | ## 1、启动docker集群 4 | 首先请根据自己的系统安装docker-compose,然后才能启动docker-compose。 5 | 交互方式启动 6 | docker-compose up 7 | 后台方式启动 8 | docker-compose up -d 9 | 如果想要停止docker-compose 10 | docker-compose stop 11 | ## 2、进入BEFAQ的doker 12 | Es相关的测试数据已经写到了Es的docker内。如果需要更新数据,请参考项目根目录下的README.md 13 | 进入befaq的docker 14 | docker exec -it befaq /bin/bash 15 | ## 3、启动BEFAQ服务 16 | 进入项目根目录 17 | cd /projects/BERT-Embedding-Frequently-Asked-Question/ 18 | cd es 19 | 将数据从excel中的数据写到Es 20 | python write_data2es.py 21 | 22 | 将问题处理成Sentence BERT 向量,保存到bin类型文件中,便于后期读取问题的向量。 23 | python write_vecs2bin.py 24 | 25 | 训练Faiss和Annoy模型 26 | python train_search_model.py 27 | 28 | 进入src文件夹,启动BEFAQ服务 29 | cd ../src 30 | python main_faq.py 31 | 或者在后台中启动 32 | nohup python -u main_faq.py > "../logs/log_$(date +"%Y-%m-%d-%H").txt" 2>&1 & 33 | 在终端中测试联想功能。服务是post请求。(如果不是本机,请将127.0.0.1替换成自己的ip) 34 | curl -d "question=忘记原始密码如何修改密码?&get_num=3&threshold=0.5&owner_name=领域1" http://127.0.0.1:8129/BEFAQ 35 | 如何手动kill BEFAQ服务 36 | kill -9 $(lsof -i:8129 -t) 37 | ## 4、启动BEFAQ的联想词接口服务 38 | cd /projects/BEFAQ 39 | cd src 40 | python associative_questions_server.py 41 | 或者在后台中启动 42 | nohup python -u associative_questions_server.py >/dev/null 2>&1 & 43 | 在终端中测试联想功能。服务是post请求。(如果不是本机,请将127.0.0.1替换成自己的ip) 44 | curl -d "current_question=设计师&limit_num=3&owner_name=领域1&if_middle=1" http://127.0.0.1:8128/associative_questions 45 | ## 5、测试接口 46 | 请参考项目根目录下的README.md 47 | 48 | -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.1' 2 | services: 3 | kibana: 4 | image: xiaoyichao1993/kibana-7.6.1:latest 5 | container_name: kibana4befaq 6 | links: 7 | - elasticsearch4befaq 8 | ports: 9 | - 5601:5601 10 | 11 | elasticsearch4befaq: 12 | image: xiaoyichao1993/es7-befaq:latest 13 | container_name: es4befaq 14 | cap_add: 15 | - IPC_LOCK 16 | volumes: 17 | - esdata1:/usr/share/elasticsearch/data 18 | ports: 19 | - 9200:9200 20 | environment: 21 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 22 | - cluster.name=befaq-es 23 | - bootstrap.memory_lock=true 24 | - discovery.type=single-node 25 | 26 | befaq: 27 | image: xiaoyichao1993/befaq:latest 28 | container_name: befaq 29 | 
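# the befaq container publishes both BEFAQ ports declared below: 8129 (FAQ API) and 8128 (question-suggestion API)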
links: 30 | - elasticsearch4befaq 31 | ports: 32 | - 8129:8129 33 | - 8128:8128 34 | stdin_open: true 35 | tty: true 36 | depends_on: 37 | - elasticsearch4befaq 38 | 39 | volumes: 40 | esdata1: 41 | driver: local 42 | -------------------------------------------------------------------------------- /es/es_create_index.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-01-02 16:55:23 6 | LastEditTime: 2021-06-25 14:12:52 7 | @Description: 创建一个索引,仅供测试。 8 | 9 | ''' 10 | from es_operate import ESCURD 11 | from elasticsearch import Elasticsearch 12 | import os 13 | import configparser 14 | 15 | 16 | dir_name = os.path.abspath(os.path.dirname(__file__)) 17 | es_config = configparser.ConfigParser() 18 | es_config.read(os.path.join(dir_name, "../config/es.ini")) 19 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"] 20 | 21 | 22 | # 使用配置文件中的index_name,也可以自己命名,创建其他名称的索引 23 | index_name_1 = es_config["ServerInfo"]["index_name_1"] 24 | index_name_2 = es_config["ServerInfo"]["index_name_2"] 25 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"] 26 | if if_es_use_passwd == "1": 27 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"] 28 | http_auth_password = es_config["ServerAddress"]["http_auth_password"] 29 | es_connect = Elasticsearch( 30 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password)) 31 | else: 32 | 33 | es_connect = Elasticsearch( 34 | es_server_ip_port) 35 | 36 | es_faq = ESCURD(es_connect) 37 | 38 | if __name__ == "__main__": 39 | es_faq.create_index(index_name=index_name_1) 40 | es_faq.create_index(index_name=index_name_2) 41 | -------------------------------------------------------------------------------- /es/es_del_data.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-06-19 19:01:17 6 | LastEditTime: 2021-06-25 14:13:35 7 | @Description: 删除索引,仅供测试。 8 | ''' 9 | 10 | from es_operate import ESCURD 11 | from elasticsearch import Elasticsearch 12 | import configparser 13 | import os 14 | import sys 15 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 16 | 17 | 18 | dir_name = os.path.abspath(os.path.dirname(__file__)) 19 | es_config = configparser.ConfigParser() 20 | es_config.read(os.path.join(dir_name, "../config/es.ini")) 21 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"] 22 | 23 | 24 | # 使用配置文件中的index_name,也可以自己命名,创建其他名称的索引 25 | index_name = es_config["ServerInfo"]["index_name_1"] 26 | 27 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"] 28 | if if_es_use_passwd == "1": 29 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"] 30 | http_auth_password = es_config["ServerAddress"]["http_auth_password"] 31 | es_connect = Elasticsearch( 32 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password)) 33 | else: 34 | 35 | es_connect = Elasticsearch( 36 | es_server_ip_port) 37 | 38 | 39 | es_faq = ESCURD(es_connect) 40 | 41 | if __name__ == "__main__": 42 | owner_names = ["领域1,领域2,领域3"] 43 | for owner_name in owner_names: 44 | es_faq.del_data(index_name, owner_name) 45 | -------------------------------------------------------------------------------- /es/es_del_index.py: -------------------------------------------------------------------------------- 1 | # 
coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-01-02 16:55:23 6 | LastEditTime: 2021-06-06 21:54:28 7 | @Description: 删除ES的索引, del_index_name 是要删除的索引的名字 8 | 9 | ''' 10 | 11 | from es_operate import ESCURD 12 | from elasticsearch import Elasticsearch 13 | import os 14 | import sys 15 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 16 | import configparser 17 | import os 18 | import sys 19 | 20 | 21 | dir_name = os.path.abspath(os.path.dirname(__file__)) 22 | es_config = configparser.ConfigParser() 23 | es_config.read(os.path.join(dir_name, "../config/es.ini")) 24 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"] 25 | 26 | 27 | index_name_1 = es_config["ServerInfo"]["index_name_1"] 28 | index_name_2 = es_config["ServerInfo"]["index_name_2"] 29 | 30 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"] 31 | if if_es_use_passwd == "1": 32 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"] 33 | http_auth_password = es_config["ServerAddress"]["http_auth_password"] 34 | es_connect = Elasticsearch( 35 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password)) 36 | else: 37 | 38 | es_connect = Elasticsearch( 39 | es_server_ip_port) 40 | 41 | 42 | es_faq = ESCURD(es_connect) 43 | 44 | if __name__ == "__main__": 45 | es_faq.del_index(index_name=index_name_1) 46 | es_faq.del_index(index_name=index_name_2) 47 | -------------------------------------------------------------------------------- /es/es_operate.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-05-21 15:31:50 6 | LastEditTime: 2021-06-18 15:52:23 7 | @Description: ES相关操作的类 8 | 9 | ''' 10 | from elasticsearch.helpers import bulk 11 | 12 | 13 | class ESCURD(object): 14 | def __init__(self, es): 15 | self.es = es 16 | 17 | def create_index(self, index_name): 18 | ''' 19 | @Author: xiaoyichao 20 | @param {type} 21 | @Description: 创建索引 22 | ''' 23 | mappings_cn = { 24 | "settings": { 25 | "index.max_ngram_diff": 10, 26 | "number_of_shards": 5, 27 | "number_of_replicas": 1, 28 | "analysis": { 29 | "filter": { 30 | "local_synonym": { 31 | "type": "synonym", 32 | "synonyms_path": "synonyms/synonym.txt" 33 | }, 34 | "edge_ngram_filter": { 35 | "type": "edge_ngram", 36 | "min_gram": 1, 37 | "max_gram": 50 38 | } 39 | }, 40 | "analyzer": { 41 | "text_ik": { 42 | "type": "custom", 43 | "tokenizer": "ik_smart", 44 | "filter": ["lowercase"] 45 | }, 46 | "text_ik_s": { 47 | "type": "custom", 48 | "tokenizer": "ik_smart", 49 | "filter": [ 50 | "lowercase", 51 | "local_synonym" 52 | ] 53 | }, 54 | 55 | "save_origin_split": { 56 | "type": "custom", 57 | "tokenizer": "standard", 58 | "filter": [ 59 | "lowercase" 60 | ] 61 | }, 62 | "keyword_cn": { 63 | "type": "custom", 64 | "tokenizer": "keyword", 65 | "filter": [ 66 | "lowercase", 67 | "edge_ngram_filter" 68 | ] 69 | }, 70 | "ngram_tokenizer_analyzer": { 71 | "type": "custom", 72 | "tokenizer": "ngram_tokenizer", 73 | "filter": [ 74 | "lowercase" 75 | ] 76 | } 77 | 78 | }, 79 | "tokenizer": { 80 | "ngram_tokenizer": { 81 | "type": "ngram", 82 | "min_gram": 1, 83 | "max_gram": 6, 84 | "token_chars": [ 85 | "letter", 86 | "digit"] 87 | } 88 | 89 | } 90 | } 91 | }, 92 | "mappings": { 93 | "properties": { 94 | "original_question": { 95 | "type": "text", 96 | "analyzer": "save_origin_split", 97 | "search_analyzer": "save_origin_split" 98 | }, 
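# The two *_cn fields below power the question-suggestion service: "original_question_cn_left" is indexed with keyword_cn (edge_ngram) for left-anchored prefix matching, while "original_question_cn_middle" uses ngram_tokenizer_analyzer so the input can match anywhere in the question.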
99 | "original_question_cn_left": { 100 | "type": "text", 101 | "analyzer": "keyword_cn", 102 | "search_analyzer": "keyword" 103 | }, 104 | "original_question_cn_middle": { 105 | "type": "text", 106 | "analyzer": "ngram_tokenizer_analyzer", 107 | "search_analyzer": "keyword" 108 | }, 109 | "process_question": { 110 | "type": "text", 111 | "analyzer": "text_ik", 112 | "search_analyzer": "text_ik_s" 113 | }, 114 | "answer": { 115 | "type": "text" 116 | }, 117 | "q_id": { 118 | "type": "integer" 119 | }, 120 | "specific_q_id": { 121 | "type": "integer" 122 | }, 123 | "id": { 124 | "type": "integer" 125 | }, 126 | "owner_name": { 127 | "type": "keyword" 128 | } 129 | } 130 | } 131 | } 132 | 133 | if self.es.indices.exists(index=index_name) is True: 134 | print("索引 %s 之前已经存在" % index_name) 135 | else: 136 | self.es.indices.create(index=index_name, body=mappings_cn) 137 | print("成功创建索引: %s" % index_name) 138 | 139 | def del_index(self, index_name): 140 | # 删除索引 141 | if self.es.indices.exists(index=index_name) is True: 142 | res = self.es.indices.delete(index_name) 143 | print("删除索引:", index_name) 144 | return res 145 | else: 146 | print("想要删除的索引 %s 不存在" % index_name) 147 | return 148 | 149 | def del_data(self, index_name, owner_name): 150 | # 删除owner_name对用的数据 151 | query = {'query': {'match': {'owner_name': owner_name}}} 152 | 153 | res = self.es.delete_by_query( 154 | index=index_name, body=query) 155 | print("删除数据:", res) 156 | 157 | def insert_more(self, index_name, actions, owner_name): 158 | ''' 159 | @Author: xiaoyichao 160 | @param {type}: 161 | @Description: 添加多条数据 162 | 163 | ''' 164 | res, _ = bulk(self.es, actions, index=index_name, 165 | raise_on_error=True) 166 | print("%s 向ES中添加了%d条数据" % (owner_name, res)) 167 | 168 | def search_data(self, index_name, owner_name, query_word_list, limit_num): 169 | ''' 170 | @Author: xiaoyichao 171 | @param {type} 172 | @Description: 查询ES数据 173 | ''' 174 | limit_num = int(limit_num) 175 | 176 | should_list = [] 177 | for word in query_word_list: 178 | match = { 179 | "match": { 180 | "process_question": word 181 | } 182 | } 183 | should_list.append(match) 184 | bool_inside_value = {"should": should_list} 185 | list_must_value_2 = {} 186 | list_must_value_2["bool"] = bool_inside_value 187 | 188 | list_must_value_1 = [ 189 | { 190 | "match_phrase": { 191 | "owner_name": owner_name 192 | } 193 | } 194 | ] 195 | 196 | must_list = [] 197 | must_list.append(list_must_value_1) 198 | must_list.append(list_must_value_2) 199 | 200 | dic_bool_value = {} 201 | dic_bool_value["must"] = must_list 202 | 203 | dic_bool = {} 204 | dic_bool["bool"] = dic_bool_value 205 | 206 | doc = {} 207 | doc["query"] = dic_bool 208 | doc["_source"] = ["q_id", "process_question", 209 | "original_question", "answer", "specific_q_id"] 210 | doc["size"] = limit_num 211 | 212 | print("ES查询语句:", doc) 213 | 214 | res = self.es.search( 215 | index=index_name, body=doc) 216 | return res 217 | 218 | def search_cn(self, index_name, owner_name, current_question, search_limit_num, if_middle=True): 219 | ''' 220 | @Author: xiaoyichao 221 | @param {type} 222 | @Description: 查询中文提示词 223 | ''' 224 | search_limit_num = int(search_limit_num) 225 | 226 | doc = {} 227 | if if_middle: # 从中间开始搜索 228 | 229 | doc["query"] = { 230 | "bool": { 231 | "must": [ 232 | [ 233 | { 234 | "match": { 235 | "owner_name": owner_name 236 | } 237 | }, 238 | { 239 | "match": {"original_question_cn_middle": current_question} 240 | } 241 | 242 | ]] 243 | } 244 | } 245 | 246 | else: 247 | 248 | doc["query"] = { 249 | 
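# left-anchored branch (if_middle is false): match against the edge_ngram field, so the input has to match from the first character of the stored question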
"bool": { 250 | "must": [ 251 | [ 252 | { 253 | "match": { 254 | "owner_name": owner_name 255 | } 256 | }, 257 | { 258 | "match": {"original_question_cn_left": current_question} 259 | } 260 | 261 | ]] 262 | } 263 | } 264 | doc["_source"] = ["original_question", "q_id"] 265 | doc["size"] = search_limit_num 266 | 267 | # print("ES查询语句:", doc) 268 | 269 | res = self.es.search( 270 | index=index_name, body=doc) 271 | return res 272 | 273 | def search4search_engine(self, index_name, owner_name, question): 274 | ''' 275 | @Author: xiaoyichao 276 | @param {type} 277 | @Description: 查询annoy或faiss检索出的question的对应信息,例如q_id等 278 | ''' 279 | doc = {} 280 | 281 | doc["query"] = { 282 | "bool": { 283 | "must": [ 284 | [ 285 | { 286 | "match": { 287 | "owner_name": owner_name 288 | } 289 | }, 290 | { 291 | "match_phrase": {"original_question": question} 292 | } 293 | 294 | ]] 295 | } 296 | } 297 | 298 | doc["_source"] = ["q_id", "specific_q_id", "process_question", 299 | "original_question", "answer"] 300 | 301 | print("ES查询语句:", doc) 302 | 303 | res = self.es.search( 304 | index=index_name, body=doc) 305 | return res 306 | 307 | def es_put_alias(self, index_name, alias_name): 308 | ''' 309 | Author: xiaoyichao 310 | param {type} 311 | Description: 添加别名和索引的连接 312 | ''' 313 | res = self.es.indices.put_alias(index=index_name, name=alias_name) 314 | print("添加别名%s和索引%s的连接" % (alias_name, index_name)) 315 | return res 316 | 317 | def es_get_alias(self, alias_name): 318 | ''' 319 | Author: xiaoyichao 320 | param {type} 321 | Description: 获取当前别名下的索引 322 | ''' 323 | try: 324 | res = self.es.indices.get_alias(name=alias_name) 325 | current_index = list(res.keys())[0] 326 | print("获取当前别名%s下的索引" % alias_name) 327 | return current_index 328 | except Exception: 329 | return 330 | 331 | def es_del_alias(self, index_name, alias_name): 332 | ''' 333 | Author: xiaoyichao 334 | param {type} 335 | Description: 删除别名和索引的连接 336 | ''' 337 | try: 338 | res = self.es.indices.delete_alias( 339 | index=index_name, name=alias_name) 340 | print("删除别名%s和索引%s的连接" % (alias_name, index_name)) 341 | return res 342 | except Exception: 343 | return 344 | -------------------------------------------------------------------------------- /es/es_search_cn.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-06-12 07:19:00 6 | LastEditTime: 2021-03-10 19:05:51 7 | @Description: 用于实现搜索框的中文提示词的类 8 | ''' 9 | from elasticsearch import Elasticsearch 10 | import configparser 11 | import os 12 | import sys 13 | os.chdir(sys.path[0]) 14 | sys.path.append("../") 15 | from es.es_operate import ESCURD 16 | 17 | 18 | dir_name = os.path.abspath(os.path.dirname(__file__)) 19 | es_config = configparser.ConfigParser() 20 | es_config.read(os.path.join(dir_name, "../config/es.ini")) 21 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"] 22 | 23 | index_name = es_config["ServerInfo"]["alias_name"] 24 | 25 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"] 26 | if if_es_use_passwd == "1": 27 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"] 28 | http_auth_password = es_config["ServerAddress"]["http_auth_password"] 29 | es_connect = Elasticsearch( 30 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password)) 31 | else: 32 | 33 | es_connect = Elasticsearch( 34 | es_server_ip_port) 35 | 36 | es_faq = ESCURD(es_connect) 37 | 38 | 39 | class SearchData4Association(object): 
40 | # 实现搜索框的中文提示词的类 41 | def search_question_cn(self, owner_name, current_question, limit_num, if_middle): 42 | current_question = current_question.lower() 43 | search_limit_num = 100 44 | 45 | retrieve_data = es_faq.search_cn( 46 | index_name, owner_name, current_question, search_limit_num, if_middle) 47 | 48 | retrieve_results = retrieve_data["hits"] 49 | max_result_len = retrieve_results["total"]["value"] 50 | hits = retrieve_results["hits"] 51 | maybe_original_questions = [] 52 | q_ids = [] 53 | if limit_num < max_result_len: 54 | result_len = limit_num 55 | else: 56 | result_len = max_result_len 57 | for i in range(result_len): 58 | qu_an_id = hits[i]["_source"] 59 | original_question = qu_an_id["original_question"] 60 | q_id = qu_an_id["q_id"] 61 | maybe_original_questions.append(original_question) 62 | q_ids.append(q_id) 63 | q_id_set = set() 64 | deduplication_maybe_questions = [] 65 | # q_id去重复并根据相关度排序 66 | for q_id, maybe_original_question in zip(q_ids, maybe_original_questions): 67 | if q_id not in q_id_set: 68 | deduplication_maybe_questions.append(maybe_original_question) 69 | 70 | return deduplication_maybe_questions 71 | -------------------------------------------------------------------------------- /es/jieba_befaq.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-03-24 13:25:41 6 | LastEditTime: 2021-06-06 21:13:52 7 | @Description: 用于写入ES的process_question字段时去掉同义词。比如,怎样,如何这些词。 8 | ''' 9 | import jieba 10 | import os 11 | dir_name = os.path.abspath(os.path.dirname(__file__)) 12 | 13 | 14 | class StopwordsBEFAQ(object): 15 | 16 | def stopwordslist(self, filepath): 17 | stopwords = [line.strip() for line in open( 18 | filepath, 'r', encoding='utf-8').readlines()] 19 | return set(stopwords) 20 | 21 | # 对句子进行分词 22 | def seg_sentence4faq(self, sentence): 23 | # 创建用户字典 24 | userdict = os.path.join(dir_name, 'userdict.txt') 25 | jieba.load_userdict(userdict) 26 | sentence_seged = jieba.cut(sentence.strip()) 27 | stopwords_file = os.path.join( 28 | dir_name, 'stopwords4_process_question_dedup.txt') 29 | stopwords = self.stopwordslist(stopwords_file) # 这里加载停用词的路径 30 | outstr = "" # 分隔符号 31 | for word in sentence_seged: 32 | if word not in stopwords: 33 | if word != '\t': 34 | outstr += word 35 | outstr += "" # 分隔符号 36 | return outstr 37 | 38 | def seg_sentence4customer_service(self, sentence): 39 | # 创建用户字典 40 | userdict = os.path.join(dir_name, 'userdict.txt') 41 | jieba.load_userdict(userdict) 42 | sentence_seged = jieba.cut(sentence.strip()) 43 | # stopwords_file = os.path.join( 44 | # dir_name, 'stopwords4_process_question_dedup.txt') 45 | # stopwords = self.stopwordslist(stopwords_file) # 这里加载停用词的路径 46 | outstr = "" # 分隔符号 47 | for word in sentence_seged: 48 | # if word not in stopwords: 49 | if word != '\t': 50 | outstr += word 51 | outstr += "" # 分隔符号 52 | return outstr 53 | 54 | -------------------------------------------------------------------------------- /es/read_excel.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | Date: 2020-08-13 11:34:47 6 | LastEditTime: 2021-06-18 16:31:16 7 | Description: 用于读取excel表格的类 8 | ''' 9 | import os 10 | import sys 11 | import xlrd 12 | import configparser 13 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 14 | 15 | dir_name = 
os.path.abspath(os.path.dirname(__file__)) 16 | 17 | 18 | class ExcelData(object): 19 | 20 | def __init__(self): 21 | self.excel_config = configparser.ConfigParser() 22 | self.excel_config.read(os.path.join(dir_name, "../config/sheetname.conf")) 23 | self.sheet_names = self.excel_config["QA_sheets"]["sheets"].split(",") 24 | self.excel_name = self.excel_config["excel_name"]["name"] 25 | self.synonyms_sheet = self.excel_config["Synonyms"]["sheet"] 26 | self.stopwords_sheet = self.excel_config["Stopwords"]["sheet"] 27 | self.excel_file = os.path.join(dir_name, "../data/", self.excel_name) 28 | self.id = 0 29 | 30 | def get_sheet_names(self): 31 | ''' 32 | Author: xiaoyichao 33 | param {type} 34 | Description: 返回要读取的sheet的名称组成的list 35 | ''' 36 | return self.sheet_names 37 | 38 | def read_sheet(self, sheet_name): 39 | ''' 40 | Author: xiaoyichao 41 | param {type} 42 | Description: 读取excel中某个sheet的数据 43 | ''' 44 | try: 45 | book = xlrd.open_workbook(filename=self.excel_file) 46 | table = book.sheet_by_name(sheet_name) 47 | nrows = table.nrows 48 | ncols = table.ncols 49 | sheet_list = [] 50 | for row in range(1, nrows): 51 | for col in range(2, ncols): 52 | cell_value = table.cell(row, col).value 53 | if cell_value != "": 54 | q_id = row 55 | original_question = cell_value 56 | answer = table.cell(row, 1).value 57 | self.id += 1 58 | owner_name = sheet_name 59 | sheet_list.append( 60 | [q_id, original_question, answer, self.id, owner_name]) 61 | return sheet_list 62 | except Exception: 63 | print("Exception") 64 | return [] 65 | 66 | def read_QA_data(self): 67 | ''' 68 | Author: xiaoyichao 69 | param {type} 70 | Description: 读取excel中的问答数据 71 | ''' 72 | excel_list = [] 73 | for sheet_name in self.sheet_names: 74 | sheet_list = self.read_sheet(sheet_name) 75 | excel_list.append(sheet_list) 76 | return excel_list 77 | 78 | 79 | # exceldata = ExcelData() 80 | # excel_list = exceldata.read_QA_data() 81 | # print(excel_list) 82 | -------------------------------------------------------------------------------- /es/search_engines_operate.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-06-19 17:14:35 6 | LastEditTime: 2020-08-25 17:50:47 7 | @Description: 训练annoy文件,不用faiss 是因为faiss不支持float64,最大精度floa32. 
8 | 也有利用annoy 检索的功能 9 | ''' 10 | 11 | from annoy import AnnoyIndex 12 | import faiss 13 | from faiss import normalize_L2 14 | import os 15 | import sys 16 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 17 | from faq.get_question_vecs import ReadVec2bin 18 | 19 | dir_name = os.path.abspath(os.path.dirname(__file__)) 20 | read_vec2bin = ReadVec2bin() 21 | 22 | 23 | class SearchEngine(object): 24 | def train_annoy(self, owner_name): 25 | bert_vecs = read_vec2bin.read_bert_vecs(owner_name=owner_name) 26 | annoy_index_path = os.path.join( 27 | dir_name, './search_model/%s_annoy.index' % owner_name) 28 | tc_index = AnnoyIndex(f=512, metric='angular') 29 | 30 | if os.path.exists(os.path.join(dir_name, './search_model')) is False: 31 | os.mkdir(os.path.join(dir_name, './search_model')) 32 | 33 | if os.path.exists(annoy_index_path): 34 | os.remove(annoy_index_path) 35 | print("删除旧的 %s_annoy.index文件" % owner_name) 36 | 37 | for i, vec in enumerate(bert_vecs): 38 | tc_index.add_item(i, vec) 39 | tc_index.build(100) 40 | tc_index.save(annoy_index_path) 41 | print("写入 %s_annoy.index文件" % owner_name) 42 | 43 | def train_faiss(self, owner_name): 44 | bert_vecs = read_vec2bin.read_bert_vecs(owner_name=owner_name) 45 | d = 512 # dimension 46 | nb = len(bert_vecs) # database size 47 | faiss_index_path = os.path.join( 48 | dir_name, './search_model/%s_faiss.index' % owner_name) 49 | training_vectors = bert_vecs.astype('float32') 50 | normalize_L2(training_vectors) 51 | index = faiss.IndexFlatIP(d) 52 | index.train(training_vectors) 53 | index.add(training_vectors) 54 | if os.path.exists(os.path.join(dir_name, './search_model')) is False: 55 | os.mkdir(os.path.join(dir_name, './search_model')) 56 | 57 | if os.path.exists(faiss_index_path): 58 | os.remove(faiss_index_path) 59 | print("删除旧的 %s_faiss.index文件" % owner_name) 60 | 61 | faiss.write_index(index, faiss_index_path) 62 | print("写入 %s_faiss.index文件" % owner_name) 63 | -------------------------------------------------------------------------------- /es/search_model/.gitkeep: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file !.gitkeep -------------------------------------------------------------------------------- /es/stopwords4_process_question_dedup.txt: -------------------------------------------------------------------------------- 1 | ? 2 | hello 3 | hi 4 | 一下 5 | 一个 6 | 上 7 | 不 8 | 为什么 9 | 么 10 | 么么哒 11 | 了 12 | 什么 13 | 你好 14 | 再见 15 | 可以 16 | 吗 17 | 吧 18 | 呢 19 | 哈 20 | 哈哈 21 | 哈哈哈 22 | 哈喽 23 | 哪个 24 | 哪里 25 | 啊 26 | 啦 27 | 嗨 28 | 在 29 | 在不在 30 | 在吗 31 | 在哪 32 | 在哪里 33 | 好 34 | 好哒 35 | 好滴 36 | 好的 37 | 如何 38 | 希望 39 | 怎么 40 | 怎么样 41 | 怎样 42 | 怎样才能 43 | 您好 44 | 想 45 | 想要 46 | 感谢 47 | 我 48 | 我想 49 | 我的 50 | 找不到 51 | 拜拜 52 | 时 53 | 时候 54 | 有人吗 55 | 有没有 56 | 的 57 | 真 58 | 真希望 59 | 要 60 | 请问 61 | 谢谢 62 | 谢谢啦 63 | 这个 64 | 那个 65 | 问 66 | 问一下 67 | 问题 68 | 非常 69 | ? 
70 | -------------------------------------------------------------------------------- /es/train_search_model.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-06-19 17:14:35 6 | LastEditTime: 2020-08-25 18:05:41 7 | @Description: 8 | ''' 9 | from read_excel import ExcelData 10 | import os 11 | import sys 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | from search_engines_operate import SearchEngine 14 | 15 | exceldata = ExcelData() 16 | sheet_names = exceldata.get_sheet_names() 17 | search_engine = SearchEngine() 18 | 19 | for sheet_name in sheet_names: 20 | search_engine.train_annoy(owner_name=sheet_name) 21 | search_engine.train_faiss(owner_name=sheet_name) 22 | -------------------------------------------------------------------------------- /es/userdict.txt: -------------------------------------------------------------------------------- 1 | 好好住 2 | ipad 3 | ipad pro 4 | 平板 5 | 平板电脑 6 | 夜间模式 7 | 暗黑模式 8 | 同城 9 | 当地 10 | 投诉 11 | 维权 12 | 盗用 13 | 盗图 14 | 入驻 15 | 申请 16 | 入住 17 | 认证 18 | 更换 19 | 更改 20 | ppt 21 | 课件 22 | pdf 23 | 表格 24 | 在哪 25 | 找不到 26 | 日常 27 | 常见 28 | 推送 29 | 推荐 30 | 闪退 31 | bug 32 | 异常 33 | 历史推送 34 | 往期推送 35 | 装修日记 36 | 装修记录 37 | 装修待办 38 | 装修记账 39 | 账号 40 | 账户 41 | -------------------------------------------------------------------------------- /es/write_data2es.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-01-02 16:55:23 6 | LastEditTime: 2021-03-01 19:13:26 7 | @Description: 将数据写到ES中 8 | 9 | ''' 10 | from es_operate import ESCURD 11 | from elasticsearch import Elasticsearch 12 | from jieba_befaq import StopwordsBEFAQ 13 | from read_excel import ExcelData 14 | import os 15 | # import sys 16 | # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 17 | import configparser 18 | 19 | dir_name = os.path.abspath(os.path.dirname(__file__)) 20 | es_config = configparser.ConfigParser() 21 | es_config.read(os.path.join(dir_name, "../config/es.ini")) 22 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"] 23 | 24 | 25 | alias_name = es_config["ServerInfo"]["alias_name"] 26 | index_name_1 = es_config["ServerInfo"]["index_name_1"] 27 | index_name_2 = es_config["ServerInfo"]["index_name_2"] 28 | index_name_set = set([index_name_1, index_name_2]) 29 | 30 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"] 31 | if if_es_use_passwd == "1": 32 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"] 33 | http_auth_password = es_config["ServerAddress"]["http_auth_password"] 34 | es_connect = Elasticsearch( 35 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password)) 36 | else: 37 | 38 | es_connect = Elasticsearch( 39 | es_server_ip_port) 40 | 41 | es_faq = ESCURD(es_connect) 42 | stopwords4BEFAQ = StopwordsBEFAQ() 43 | 44 | 45 | class ReadsSqlData2ES(object): 46 | def __init__(self): 47 | self.exceldata = ExcelData() 48 | self.excel_list = self.exceldata.read_QA_data() 49 | 50 | def write_data2es(self, index_name): 51 | ''' 52 | @Author: xiaoyichao 53 | @param {type} 54 | @Description: 将数据写到ES中 55 | ''' 56 | 57 | for sheet_data in self.excel_list: 58 | actions = [] 59 | num = 0 60 | owner_name = "未命名领域" 61 | for info in sheet_data: 62 | num += 1 63 | q_id, original_question, answer, id, owner_name = info[ 
64 | 0], info[1], info[2], info[3], info[4] 65 | process_question = original_question.lower() 66 | process_question = stopwords4BEFAQ.seg_sentence4faq( 67 | sentence=process_question) 68 | action_name = "action"+str(num) 69 | action_name = {} 70 | action_name["_index"] = index_name 71 | action_name["_source"] = { 72 | "q_id": q_id, 73 | "specific_q_id": id, 74 | "original_question": original_question, 75 | "process_question": process_question, 76 | "original_question_cn_middle": original_question.lower(), 77 | "original_question_cn_left": original_question.lower(), 78 | "answer": answer, 79 | "owner_name": owner_name 80 | } 81 | actions.append(action_name) 82 | es_faq.insert_more(index_name=index_name, actions=actions, owner_name=owner_name) 83 | 84 | 85 | if __name__ == "__main__": 86 | read_sql_data = ReadsSqlData2ES() 87 | current_index = es_faq.es_get_alias(alias_name=alias_name) 88 | new_index_set = index_name_set-set([current_index]) 89 | new_index = new_index_set.pop() 90 | es_faq.del_index(index_name=new_index) 91 | es_faq.create_index(index_name=new_index) 92 | read_sql_data.write_data2es(index_name=new_index) 93 | es_faq.es_put_alias(index_name=new_index, alias_name=alias_name) 94 | es_faq.es_del_alias(index_name=current_index, alias_name=alias_name) 95 | -------------------------------------------------------------------------------- /es/write_vecs2bin.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-01-02 16:55:23 6 | LastEditTime: 2021-06-25 15:27:08 7 | @Description: 将问题的集合的向量写入bin文件 8 | 9 | ''' 10 | 11 | 12 | import numpy as np 13 | from read_excel import ExcelData 14 | import os 15 | import sys 16 | os.chdir(sys.path[0]) 17 | sys.path.append("../") 18 | from bert_server.sentence_bert_server import SentenceBERT 19 | 20 | 21 | dir_name = os.path.abspath(os.path.dirname(__file__)) 22 | 23 | 24 | class WriteVec2bin(object): 25 | def __init__(self): 26 | self.exceldata = ExcelData() 27 | self.excel_list = self.exceldata.read_QA_data() 28 | self.sheet_names = self.exceldata.get_sheet_names() 29 | self.sentenceBERT = SentenceBERT() 30 | 31 | def write_bert_vecs(self, owner_name, num): 32 | ''' 33 | @Author: xiaoyichao 34 | @param {type} 35 | @Description: 句向量都进行写入bin文件 36 | ''' 37 | if os.path.exists(os.path.join(dir_name, '../faq/bert_vect')) is False: 38 | os.mkdir(os.path.join(dir_name, '../faq/bert_vect')) 39 | bert_vecs_path = os.path.join( 40 | dir_name, '../faq/bert_vect/%s_bert_vecs.npy' % (owner_name)) 41 | bert_sentences_path = os.path.join( 42 | dir_name, '../faq/bert_vect/%s_bert_sentences.txt' % (owner_name)) 43 | orgin_query_vecs = np.zeros(shape=(1, 512)) 44 | with open(bert_sentences_path, "w") as f: 45 | f.write("数据库中的问题"+"\n") 46 | for info in self.excel_list[num]: 47 | original_question = info[1] 48 | f.write(original_question+"\n") 49 | orgin_query = original_question.replace(",", " ") 50 | orgin_query_list = orgin_query.split(' ') 51 | orgin_query_vec = self.sentenceBERT.get_bert(orgin_query_list) 52 | orgin_query_vecs = np.concatenate( 53 | (orgin_query_vecs, orgin_query_vec), axis=0) 54 | if os.path.exists(bert_vecs_path): 55 | os.remove(bert_vecs_path) 56 | print("删除旧的BERT向量文件") 57 | # 将铺平的向量reshape 58 | orgin_query_vecs = np.reshape(orgin_query_vecs, (-1, 512)) 59 | np.save(bert_vecs_path, orgin_query_vecs) 60 | 61 | print("BERT向量文件写入", bert_vecs_path) 62 | 63 | def write_bert_vecs4sheets(self): 64 | ''' 65 | Author: xiaoyichao 
/faq/bert_vect/.gitkeep:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitkeep
--------------------------------------------------------------------------------
/faq/deduplicate_threshold_op.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-22 12:24:06
6 | LastEditTime: 2021-06-06 22:00:53
7 | @Description: deduplicate the re-ranked results by q_id and apply the confidence threshold; results below the threshold are dropped
8 | '''
9 | 
10 | 
11 | class DeduplicateThreshold(object):
12 |     def dedu_thr(self, q_ids, re_rank_sim_list, threshold):
13 |         high_confidence_q_id_pos = []
14 |         if len(q_ids) > 0:
15 |             q_id_dict = {}
16 |             # map each q_id to the list of its positions
17 |             for position, id in enumerate(q_ids):
18 |                 if id not in q_id_dict:
19 |                     q_id_dict[id] = [position]
20 |                 else:
21 |                     q_id_dict[id].append(position)
22 |             # print("recalled q_id_dict:", q_id_dict)
23 |             # deduplicate by q_id: if a q_id has several hits, keep the one with the highest similarity; if it has a single hit, keep that one (position 0)
24 |             unique_q_ids_pos = []
25 |             for poss in q_id_dict.values():
26 |                 max_sim_pos = poss[0]
27 |                 if len(poss) > 1:
28 |                     for qid_pos in poss:
29 |                         if re_rank_sim_list[qid_pos] > re_rank_sim_list[max_sim_pos]:
30 |                             max_sim_pos = qid_pos
31 |                 unique_q_ids_pos.append(max_sim_pos)
32 |             # apply the threshold to the deduplicated q_ids and keep only the high-confidence ones
33 |             for q_id_pos in unique_q_ids_pos:
34 |                 if re_rank_sim_list[q_id_pos] >= threshold:
35 |                     high_confidence_q_id_pos.append(q_id_pos)
36 |             return high_confidence_q_id_pos
37 |         else:
38 |             return high_confidence_q_id_pos
39 | 
40 | 
--------------------------------------------------------------------------------
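# A minimal sketch of the dedup + threshold step with toy values:
#
#     from faq.deduplicate_threshold_op import DeduplicateThreshold
#     dt = DeduplicateThreshold()
#     # q_id 7 appears at positions 0 and 1; only its higher-scoring position
#     # survives, and position 2 is dropped because 0.41 < 0.5
#     dt.dedu_thr(q_ids=[7, 7, 9], re_rank_sim_list=[0.62, 0.88, 0.41], threshold=0.5)
#     # -> [1]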
/faq/get_final_data.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | @LastEditors: xiaoyichao
5 | @Date: 2020-05-23 16:21:51
6 | @LastEditTime: 2020-07-23 14:45:05
7 | @Description: FAQ module: for the q_ids that survive dedup and thresholding, fetch the corresponding question, answer, and similarity
8 | '''
9 | 
10 | 
11 | class FinalData(object):
12 |     def get_json_confidence(self, json_data):
13 |         return json_data["confidence"]
14 | 
15 |     def get_qa(self, high_confidence_q_id_pos, maybe_questions, maybe_answers, re_rank_sim, get_num, retrieval_q_ids, specific_q_ids):
16 |         return_data = []
17 |         for q_id_pos in high_confidence_q_id_pos:
18 |             single_json = {}
19 |             single_json["q_id"] = retrieval_q_ids[q_id_pos]
20 |             single_json["specific_q_id"] = specific_q_ids[q_id_pos]
21 |             single_json["question"] = maybe_questions[q_id_pos]
22 |             single_json["answer"] = maybe_answers[q_id_pos]
23 |             single_json["confidence"] = round(re_rank_sim[q_id_pos], 2)
24 |             return_data.append(single_json)
25 |         return_data.sort(reverse=True, key=self.get_json_confidence)
26 |         # cap the number of returned results
27 |         if len(high_confidence_q_id_pos) > get_num:
28 |             return return_data[:get_num]
29 |         else:
30 |             return return_data
31 | 
--------------------------------------------------------------------------------
/faq/get_question_vecs.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-06-09 14:45:34
6 | LastEditTime: 2021-06-25 15:04:53
7 | @Description: load the precomputed BERT vectors of the question set
8 | '''
9 | 
10 | 
11 | import numpy as np
12 | 
13 | import os
14 | import sys
15 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16 | from es.read_excel import ExcelData
17 | 
18 | exceldata = ExcelData()
19 | sheet_names = exceldata.get_sheet_names()
20 | dir_name = os.path.abspath(os.path.dirname(__file__))
21 | 
22 | 
23 | class ReadVec2bin(object):
24 |     def __init__(self):
25 |         self.owner_name_sentence = {}
26 |         self.owner_name_bert_vecs = {}
27 |         for sheet_name in sheet_names:
28 |             bert_vecs_path = os.path.join(
29 |                 dir_name, './bert_vect/%s_bert_vecs.npy' % (sheet_name))
30 |             bert_sentences_path = os.path.join(
31 |                 dir_name, './bert_vect/%s_bert_sentences.txt' % (sheet_name))
32 | 
33 |             with open(bert_sentences_path, "r", encoding="utf8") as sent:
34 |                 sentences = sent.read()
35 |             sentences = sentences.strip("\n")
36 |             sentences = sentences.split("\n")
37 |             self.owner_name_sentence[sheet_name] = sentences[1:]  # drop the header line
38 |             bert_vecs = np.load(bert_vecs_path)
39 |             self.owner_name_bert_vecs[sheet_name] = bert_vecs[1:]  # drop the placeholder row
40 | 
41 |     def read_bert_sents(self, owner_name):
42 |         return self.owner_name_sentence[owner_name]
43 | 
44 |     def read_bert_vecs(self, owner_name):
45 |         return self.owner_name_bert_vecs[owner_name]
46 | 
47 | 
--------------------------------------------------------------------------------
/faq/jieba4befaq.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-03-24 13:25:41
6 | LastEditTime: 2021-02-23 17:45:49
7 | @Description: preprocess the user's FAQ question: strip stopwords such as 怎样 and 如何 ("how"-type words), then send it into the ES search
8 | '''
9 | import jieba
10 | import os
11 | dir_name = os.path.abspath(os.path.dirname(__file__))
12 | 
13 | 
14 | class JiebaBEFAQ(object):
15 | 
16 |     def stopwordslist(self, filepath):
17 |         stopwords = [line.strip() for line in open(
18 |             filepath, 'r', encoding='utf-8').readlines()]
19 |         return set(stopwords)
20 | 
21 |     # segment a sentence
22 |     def seg_sentence(self, sentence):
23 |         # load the user dictionary
24 |         userdict = os.path.join(dir_name, '../es/userdict.txt')
25 |         jieba.load_userdict(userdict)
26 |         sentence_seged = jieba.cut(sentence.strip())
27 |         stopwords_file = os.path.join(
28 |             dir_name, '../es/stopwords4_process_question_dedup.txt')
29 |         stopwords = self.stopwordslist(stopwords_file)  # path of the stopword list
30 |         outstr = ""  # separator (empty: kept words are joined with no gap)
31 |         for word in sentence_seged:
32 |             if word not in stopwords:
33 |                 if word != '\t':
34 |                     outstr += word
35 |                     outstr += ""  # separator
36 |         return outstr
37 | 
38 |     def get_list(self, sentence):
39 |         '''
40 |         Author: xiaoyichao
41 |         param {type}
42 |         Description: turn a sentence into the list of its segmented terms
43 |         '''
44 |         sentence_terms = list(jieba.cut(sentence))
45 |         return sentence_terms
46 | 
--------------------------------------------------------------------------------
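# A minimal sketch of the preprocessing step (the output depends on the shipped
# userdict.txt and stopword list, so the exact segmentation may differ):
#
#     from faq.jieba4befaq import JiebaBEFAQ
#     jb = JiebaBEFAQ()
#     jb.seg_sentence("如何评价设计师")   # stopword 如何 ("how") is dropped -> "评价设计师"
#     jb.get_list("评价设计师")           # e.g. ["评价", "设计师"]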
/faq/matching_operate.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-12 20:46:56
6 | LastEditTime: 2021-06-25 16:08:05
7 | @Description: similarity features used for re-ranking (BERT cosine, Jaccard, BM25, edit distance)
8 | '''
9 | import numpy as np
10 | import jieba
11 | import Levenshtein
12 | import time
13 | import configparser
14 | from sklearn.metrics.pairwise import cosine_similarity
15 | from gensim.summarization import bm25
16 | import os
17 | import sys
18 | os.chdir(sys.path[0])
19 | sys.path.append("../")
20 | from faq.get_question_vecs import ReadVec2bin
21 | from faq.jieba4befaq import JiebaBEFAQ
22 | from bert_server.sentence_bert_server import SentenceBERT
23 | 
24 | 
25 | dir_name = os.path.abspath(os.path.dirname(__file__))
26 | faq_config = configparser.ConfigParser()
27 | faq_config.read(os.path.join(dir_name, "../config/befaq_conf.ini"))
28 | 
29 | 
30 | class Matching(object):
31 |     def __init__(self):
32 |         self.read_vec2bin = ReadVec2bin()
33 |         self.jiebaBEFAQ = JiebaBEFAQ()
34 |         self.sentenceBERT = SentenceBERT()
35 | 
36 |     def cosine_sim(self, orgin_query, retrieval_questions, owner_name):
37 |         '''
38 |         @Author: xiaoyichao
39 |         @param {type}
40 |         @Description: cosine similarity in the BERT embedding space
41 |         '''
42 |         sentences = self.read_vec2bin.read_bert_sents(owner_name=owner_name)
43 |         bert_vecs = self.read_vec2bin.read_bert_vecs(owner_name=owner_name)
44 |         orgin_query = orgin_query.replace(",", " ")
45 |         orgin_query_list = orgin_query.split(' ')
46 |         print("orgin_query_list", orgin_query_list)
47 | 
48 |         orgin_query_vec = self.sentenceBERT.get_bert(
49 |             sentence_list=orgin_query_list)
50 |         if orgin_query_vec.size != 0:  # the BERT service responded normally
51 |             retrieval_questions_vec = []
52 |             for retrieval_question in retrieval_questions:
53 |                 # look up the precomputed BERT vector of the question
54 |                 index_pos = sentences.index(retrieval_question)
55 |                 retrieval_question_vec = bert_vecs[index_pos]
56 |                 retrieval_question_vec = retrieval_question_vec.reshape(-1, 512)
57 |                 retrieval_questions_vec.append(retrieval_question_vec)
58 | 
59 |             retrieval_questions_vec = np.array(
60 |                 retrieval_questions_vec).reshape(-1, 512)
61 | 
62 |             # floating-point storage can push the computed cosine slightly past its theoretical range, so clip to keep bad values out of the final result
63 |             sim_list = cosine_similarity(
64 |                 orgin_query_vec, retrieval_questions_vec)[0].tolist()
65 | 
66 |             # print('SKlearn:', end_time-begin_time)
67 |             normalized_sim_list = []
68 |             for sim in sim_list:
69 |                 if sim > 1.0:
70 |                     sim = 1.0
71 |                 normalized_sim_list.append(sim)
72 | 
73 |             return normalized_sim_list
74 |         else:  # the BERT service timed out
75 |             normalized_sim_list = []
76 |             return normalized_sim_list
77 | 
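    # Worked example of the Jaccard measure implemented in jaccrad() below,
    # with toy token sets: reference tokens {A, B}, candidate tokens {A, C}
    # -> intersection = 1, union = 2 + 2 - 1 = 3, similarity = 1/3 ≈ 0.33.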
78 |     def jaccrad(self, question, reference):  # reference is the source sentence, question is the candidate
79 |         '''
80 |         @Author: xiaoyichao
81 |         @param {type}
82 |         @Description: Jaccard similarity of two sentences
83 |         '''
84 |         terms_reference = jieba.cut(reference)  # default accurate mode
85 |         question = question.replace("\n", "")
86 |         terms_model = jieba.cut(question)
87 |         grams_reference = list(terms_reference)
88 |         grams_model = list(terms_model)
89 |         temp = 0
90 |         for i in grams_reference:
91 |             if i in grams_model:
92 |                 temp = temp+1
93 |         fenmu = len(grams_model)+len(grams_reference)-temp  # size of the union
94 |         jaccard_coefficient = float(temp/fenmu)  # intersection over union
95 |         return jaccard_coefficient
96 | 
97 |     def jaccard_sim(self, orgin_query, retrieval_questions):
98 |         '''
99 |         @Author: xiaoyichao
100 |         @param {type}
101 |         @Description: Jaccard similarity between the query and each candidate question
102 |         '''
103 |         sim_list = []
104 |         for retrieval_question in retrieval_questions:
105 |             jaccard_coefficient = self.jaccrad(
106 |                 question=orgin_query, reference=retrieval_question)
107 |             sim_list.append(jaccard_coefficient)
108 |         return sim_list
109 | 
110 |     def bm25_sim(self, orgin_query, retrieval_questions):
111 |         '''
112 |         @Author: xiaoyichao
113 |         @param {type}
114 |         @Description: BM25 similarity between the query and each candidate question
115 |         '''
116 |         jieba_corpus = []
117 |         for corpu in retrieval_questions:
118 |             line_seg = self.jiebaBEFAQ.get_list(corpu)
119 |             jieba_corpus.append(line_seg)
120 |         jieba_question = self.jiebaBEFAQ.get_list(orgin_query)
121 |         bm25Model = bm25.BM25(jieba_corpus)
122 |         sim_list = bm25Model.get_scores(jieba_question)
123 |         normalized_sim_list = []
124 |         max_sim = max(sim_list)
125 |         for sim in sim_list:
126 |             if sim == 0:
127 |                 normalized_sim = 0
128 |             else:
129 |                 normalized_sim = sim/max_sim  # scale scores into [0, 1] by the maximum
130 |             normalized_sim_list.append(normalized_sim)
131 | 
132 |         return normalized_sim_list
133 | 
134 |     def edit_distance_sim(self, orgin_query, retrieval_questions):
135 |         '''
136 |         @Author: xiaoyichao
137 |         @param {type}
138 |         @Description: edit-distance similarity between the query and each candidate question
139 |         '''
140 |         sim_list = []
141 |         max_len = max(len(orgin_query), max([len(x) for x in retrieval_questions]))
142 |         for corpu in retrieval_questions:
143 |             edit_distance = Levenshtein.distance(orgin_query, corpu)
144 |             sim = 1 - edit_distance * 1.0 / max_len
145 |             sim_list.append(sim)
146 |         return sim_list
147 | 
148 | 
149 | if __name__ == "__main__":
150 |     matching = Matching()
151 |     question = "如何评价设计师"
152 |     normalized_sim_list = matching.cosine_sim(
153 |         question, ["如何评价设计师"], "领域1")
154 |     print(normalized_sim_list)
155 | 
--------------------------------------------------------------------------------
/faq/re_rank.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-22 13:54:44
6 | LastEditTime: 2021-06-06 21:42:47
7 | @Description: linear re-ranking model: each similarity feature gets its own weight
8 | '''
9 | 
10 | 
11 | class ReRank(object):
12 |     def linear_model(self, consin_sim, jaccard_sim, bm25_sim, edit_distance_sim, consine_weight, jaccard_weight, BM25_weight, edit_distance_weight):
13 |         if consin_sim != []:
14 |             tmp_multiple_sims = [i * consine_weight + j*jaccard_weight + k*BM25_weight + l*edit_distance_weight
15 |                                  for i, j, k, l in zip(consin_sim, jaccard_sim, bm25_sim, edit_distance_sim)]
16 |             multiple_sims = []
17 |             if abs(consine_weight + jaccard_weight + BM25_weight + edit_distance_weight - 1.0) < 1e-9:  # float-safe check that the weights sum to 1
18 |                 for multiple_sim in tmp_multiple_sims:
19 |                     if multiple_sim > 1.0:
20 |                         multiple_sim = 1.0
21 |                     multiple_sims.append(multiple_sim)
22 |             else:
23 |                 multiple_sims = tmp_multiple_sims
24 |             return multiple_sims
25 |         else:
26 |             multiple_sims = jaccard_sim  # BERT cosine was unavailable; fall back to the Jaccard scores
27 |             return multiple_sims
28 | 
29 | 
--------------------------------------------------------------------------------
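# A minimal sketch of the linear re-rank with toy feature scores (in the real
# pipeline the weights come from config/befaq_conf.ini):
#
#     from faq.re_rank import ReRank
#     ReRank().linear_model(
#         consin_sim=[0.9, 0.4], jaccard_sim=[0.5, 0.2], bm25_sim=[1.0, 0.3],
#         edit_distance_sim=[0.8, 0.1], consine_weight=0.4, jaccard_weight=0.2,
#         BM25_weight=0.2, edit_distance_weight=0.2)
#     # -> approximately [0.82, 0.28]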
/faq/retrieval_es.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-01-02 16:55:23
6 | LastEditTime: 2021-06-18 16:00:32
7 | @Description: recall candidate data with ES and with Faiss (or Annoy)
8 | 
9 | '''
10 | 
11 | from elasticsearch import Elasticsearch
12 | from annoy import AnnoyIndex
13 | import numpy as np
14 | import faiss
15 | import os
16 | import sys
17 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
18 | from es.es_operate import ESCURD
19 | import configparser
20 | from bert_server.sentence_bert_server import SentenceBERT
21 | from faq.get_question_vecs import ReadVec2bin
22 | 
23 | 
24 | dir_name = os.path.abspath(os.path.dirname(__file__))
25 | es_config = configparser.ConfigParser()
26 | es_config.read(os.path.join(dir_name, "../config/es.ini"))
27 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"]
28 | 
29 | 
30 | index_name = es_config["ServerInfo"]["alias_name"]
31 | 
32 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"]
33 | if if_es_use_passwd == "1":
34 |     http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"]
35 |     http_auth_password = es_config["ServerAddress"]["http_auth_password"]
36 |     es_connect = Elasticsearch(
37 |         es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password))
38 | else:
39 | 
40 |     es_connect = Elasticsearch(
41 |         es_server_ip_port)
42 | 
43 | es_faq = ESCURD(es_connect)
44 | sentenceBERT = SentenceBERT()
45 | read_vec2bin = ReadVec2bin()
46 | 
47 | 
48 | class SearchData(object):
49 |     '''
50 |     Author: xiaoyichao
51 |     param {type}
52 |     Description: recalls candidate data; ES, Annoy and Faiss are available, and which of them are used is configurable
53 |     '''
54 |     def search_es(self, owner_name, query_word_list, ES_limit_num):
55 |         '''
56 |         Author: xiaoyichao
57 |         param {type}
58 |         Description: recall with ES
59 |         '''
60 |         retrieve_data = es_faq.search_data(
61 |             index_name=index_name, owner_name=owner_name, query_word_list=query_word_list, limit_num=ES_limit_num)
62 |         retrieve_results = retrieve_data["hits"]
63 |         max_result_len = retrieve_results["total"]["value"]
64 |         # max_score = retrieve_results["max_score"]
65 |         hits = retrieve_results["hits"]
66 |         maybe_original_questions = []
67 |         maybe_process_questions = []
68 |         maybe_answers = []
69 |         specific_q_ids = []
70 |         q_ids = []
71 |         if ES_limit_num < max_result_len:
72 |             result_len = ES_limit_num
73 |         else:
74 |             result_len = max_result_len
75 |         for i in range(result_len):
76 |             qu_an_id = hits[i]["_source"]
77 |             original_question = qu_an_id["original_question"]
78 |             process_question = qu_an_id["process_question"]
79 |             answer = qu_an_id["answer"]
80 |             q_id = qu_an_id["q_id"]
81 |             specific_q_id = qu_an_id["specific_q_id"]
82 |             maybe_original_questions.append(original_question)
83 |             maybe_process_questions.append(process_question)
84 |             maybe_answers.append(answer)
85 |             q_ids.append(q_id)
86 |             specific_q_ids.append(specific_q_id)
87 |         return maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids
88 | 
89 |     def search_annoy(self, owner_name, question, num=5):
90 |         '''
91 |         Author: xiaoyichao
92 |         param {type}
93 |         Description: recall with Annoy
94 |         '''
95 |         sentences = read_vec2bin.read_bert_sents(owner_name=owner_name)
96 |         annoy_index_path = os.path.join(
97 |             dir_name, '../es/search_model/%s_annoy.index' % owner_name)
98 |         encodearrary = sentenceBERT.get_bert([question])
99 |         tc_index = AnnoyIndex(f=512, metric='angular')
100 |         tc_index.load(annoy_index_path)
101 |         # nearest neighbours by vector; include_distances=True also returns
102 |         # the angular distances
103 |         items = tc_index.get_nns_by_vector(
104 |             encodearrary[0], num, include_distances=True)
105 |         sim_questions = [sentences[num_annoy] for num_annoy in items[0]]
106 |         # sims = items[1]
107 |         # index_nums = items[0]
108 |         return sim_questions
109 | 
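    # Both ANN engines answer the same question ("which stored sentence vectors
    # are closest to this query vector?"). A hedged usage sketch, assuming the
    # index files under es/search_model/ have already been built (the repo ships
    # es/train_search_model.py for this):
    #
    #     sd = SearchData()
    #     sd.search_annoy("领域1", "如何评价设计师", num=5)
    #     sd.search_faiss("领域1", "如何评价设计师", num=5)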
110 |     def search_faiss(self, owner_name, question, num=5):
111 |         '''
112 |         Author: xiaoyichao
113 |         param {type}
114 |         Description: recall with Faiss
115 |         '''
116 |         sentences = read_vec2bin.read_bert_sents(owner_name=owner_name)
117 |         faiss_index_path = os.path.join(
118 |             dir_name, '../es/search_model/%s_faiss.index' % owner_name)
119 |         index = faiss.read_index(faiss_index_path)
120 |         question_vec = sentenceBERT.get_bert([question]).astype('float32')
121 |         index.nprobe = 1
122 |         sims, index_nums = index.search(question_vec, num)
123 |         sim_questions = [sentences[num_faiss] for num_faiss in index_nums[0]]
124 |         # index_nums = index_nums[0].tolist()
125 |         # sims = sims[0].tolist()
126 |         return sim_questions
127 | 
128 |     def merge_op(self, question, owner_name, maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids, use_faiss, use_annoy, engine_limit_num):
129 |         '''
130 |         Author: xiaoyichao
131 |         param {type}
132 |         Description: merge the ES results with the Faiss and/or Annoy results
133 |         '''
134 |         if use_faiss == 1 and use_annoy == 0:
135 |             print("use_faiss")
136 |             maybe_search_questions = self.search_faiss(
137 |                 owner_name, question, num=engine_limit_num)
138 |         elif use_faiss == 0 and use_annoy == 1:
139 |             print("use_annoy")
140 |             maybe_search_questions = self.search_annoy(
141 |                 owner_name, question, num=engine_limit_num)
142 |         elif use_faiss == 1 and use_annoy == 1:
143 |             print("use_annoy and use_faiss")
144 |             maybe_search_questions_faiss = self.search_faiss(
145 |                 owner_name, question, num=engine_limit_num)
146 |             maybe_search_questions_annoy = self.search_annoy(
147 |                 owner_name, question, num=engine_limit_num)
148 |             maybe_search_questions = list(
149 |                 set(maybe_search_questions_faiss+maybe_search_questions_annoy))
150 |         else:
151 |             return maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids
152 |         print("data recalled by ES", maybe_original_questions)
153 |         # query ES again for each engine hit and merge with the ES results, deduplicating
154 |         for sim_question in maybe_search_questions:
155 |             if sim_question not in set(maybe_original_questions):
156 |                 print("new data recalled by faiss/annoy", sim_question)
157 |                 retrieve_data = es_faq.search4search_engine(
158 |                     index_name, owner_name, question=sim_question)
159 |                 retrieve_results = retrieve_data["hits"]
160 |                 max_result_len = retrieve_results["total"]["value"]
161 |                 # max_score = retrieve_results["max_score"]
162 |                 hits = retrieve_results["hits"]
163 | 
164 |                 if max_result_len >= 1:
165 |                     for i in range(1):
166 |                         qu_an_id = hits[i]["_source"]
167 |                         original_question = qu_an_id["original_question"]
168 |                         process_question = qu_an_id["process_question"]
169 |                         answer = qu_an_id["answer"]
170 |                         q_id = qu_an_id["q_id"]
171 |                         specific_q_id = qu_an_id["specific_q_id"]
172 |                         maybe_original_questions.append(original_question)
173 |                         maybe_process_questions.append(process_question)
174 |                         maybe_answers.append(answer)
175 |                         q_ids.append(q_id)
176 |                         specific_q_ids.append(specific_q_id)
177 |         # merged data
178 |         return maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids
179 | 
180 |     def search_merge(self, owner_name, question, query_word_list, use_other_when_es_none, use_faiss=0, use_annoy=0, engine_limit_num=5, ES_limit_num=10):
181 |         # retrieve with ES first
182 |         maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids = self.search_es(
183 |             owner_name=owner_name, query_word_list=query_word_list, ES_limit_num=ES_limit_num)
184 |         if use_other_when_es_none is False:
185 |             if len(maybe_original_questions) == 0:  # use faiss and/or annoy only when ES returned nothing
186 |                 # recommended: faiss and annoy always recall the requested number of neighbours, which may include results you do not want, so only fall back to them when ES recalls nothing
187 |                 maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids = self.merge_op(
188 |                     question, owner_name, maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids, use_faiss, use_annoy, engine_limit_num)
189 |         else:  # use faiss and/or annoy even when ES returned data
190 |             maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids = self.merge_op(
191 |                 question, owner_name, maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids, use_faiss, use_annoy, engine_limit_num)
192 | 
193 |         return maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids
194 | 
--------------------------------------------------------------------------------
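# A hedged end-to-end recall sketch (assumes ES has been populated via
# es/write_data2es.py and, if use_faiss/use_annoy are enabled, that the
# vector indexes exist):
#
#     sd = SearchData()
#     questions, processed, answers, q_ids, specific_q_ids = sd.search_merge(
#         owner_name="领域1", question="如何评价设计师",
#         query_word_list=["评价", "设计师"], use_other_when_es_none=False,
#         use_faiss=1, use_annoy=0, engine_limit_num=5, ES_limit_num=10)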
/image/BEFAQ 框架.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hhzrd/BEFAQ/955d1780a2625b805f3ebe1649d96d16df820254/image/BEFAQ 框架.png
--------------------------------------------------------------------------------
/logs/.gitkeep:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitkeep
--------------------------------------------------------------------------------
/model/.gitkeep:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitkeep
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | sentence_transformers==1.2.0
2 | jieba==0.39
3 | elasticsearch==7.7.0
4 | annoy==1.16.3
5 | xlrd==1.2.0
6 | numpy==1.18.2
7 | faiss_cpu==1.6.3
8 | sanic==20.6.3
9 | scikit_learn==0.23.2
10 | transformers==4.6.1
11 | python-Levenshtein==0.12.2
12 | gensim==3.8.3
13 | uvloop==0.14.0
--------------------------------------------------------------------------------
/src/associative_questions_server.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-06-12 08:15:51
6 | LastEditTime: 2021-06-18 16:28:41
7 | @Description: HTTP service for associative (suggested) questions
8 | '''
9 | from sanic import Sanic
10 | import sanic
11 | import configparser
12 | import os
13 | import sys
14 | os.chdir(sys.path[0])
15 | sys.path.append("../")
16 | from es.es_search_cn import SearchData4Association
17 | from common.response_add_head import res_with_head
18 | from common.kill_program import kill_port
19 | 
20 | 
21 | dir_name = os.path.abspath(os.path.dirname(__file__))
22 | search_data = SearchData4Association()
23 | 
24 | 
25 | # the endpoint returns JSON data
26 | # create the Sanic app once, with an explicit name
27 | app = Sanic("associative questions")
28 | 
29 | 
30 | @app.route("/associative_questions", methods=["POST", "HEAD"])
31 | async def associative_questions(request):
32 | 
33 |     # parameters received from the request
34 |     current_question = str(request.form.get("current_question"))
35 |     limit_num = int(request.form.get("limit_num"))
36 |     owner_name = str(request.form.get("owner_name"))
37 |     if_middle = int(request.form.get("if_middle", default=1))
38 |     if if_middle == 1:
39 |         if_middle = True
40 |     elif if_middle == 0:
41 |         if_middle = False
42 |     else:
43 |         if_middle = True
44 | 
45 |     maybe_original_questions = search_data.search_question_cn(
46 |         owner_name=owner_name, current_question=current_question, limit_num=limit_num, if_middle=if_middle)
47 | 
48 |     answer_json = {}
49 |     answer_json["code"] = "1"
50 |     answer_json["msg"] = "OK"
51 |     answer_json["data"] = {
52 |         "message": maybe_original_questions}
53 |     return res_with_head(answer_json)
54 | 
55 | 
56 | @app.route("/", methods=["GET", "HEAD"])
57 | async def alibaba_operator_check(request):
58 |     print("alibaba SLB checking server status")
59 |     return sanic.response.text("200")
60 | 
61 | 
62 | if __name__ == "__main__":
63 |     root_config = configparser.ConfigParser()
64 |     root_config.read(os.path.join(
65 |         dir_name, "../config/associative_questions_config.ini"))
66 |     port = int(root_config["ServerAddress"]["port"])
67 | 
68 |     kill_port(port)
69 | 
70 |     app.run(host="0.0.0.0",
71 |             port=port,
72 |             workers=int(root_config["ServerInfo"]["work_number"]),
73 |             debug=False, access_log=False)
74 | 
--------------------------------------------------------------------------------
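# A hedged request sketch for the service above (the port comes from
# config/associative_questions_config.ini, so substitute your own):
#
#     curl -X POST http://127.0.0.1:<port>/associative_questions \
#          -d "current_question=如何" -d "limit_num=5" \
#          -d "owner_name=领域1" -d "if_middle=1"
#
# The response has the shape {"code": "1", "msg": "OK", "data": {"message": [...]}}.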
/src/main_faq.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-12 20:46:56
6 | @Description: main entry point of the FAQ service
7 | '''
8 | import time
9 | import jieba
10 | import configparser
11 | from sanic import Sanic
12 | from sanic.response import json
13 | from sanic import response
14 | import os
15 | import sys
16 | os.chdir(sys.path[0])
17 | sys.path.append("../")
18 | from common.kill_program import kill_port
19 | from es.es_search_cn import SearchData4Association
20 | from common.response_add_head import res_with_head
21 | from faq.jieba4befaq import JiebaBEFAQ
22 | from faq.retrieval_es import SearchData
23 | from faq.matching_operate import Matching
24 | from faq.deduplicate_threshold_op import DeduplicateThreshold
25 | from faq.re_rank import ReRank
26 | from faq.get_final_data import FinalData
27 | 
28 | 
29 | dir_name = os.path.abspath(os.path.dirname(__file__))
30 | 
31 | faq_config = configparser.ConfigParser()
32 | faq_config.read(os.path.join(dir_name, "../config/befaq_conf.ini"))
33 | consine_weight = float(faq_config["AlgorithmConfiguration"]["consine"])
34 | jaccard_weight = float(faq_config["AlgorithmConfiguration"]["jaccard"])
35 | BM25_weight = float(faq_config["AlgorithmConfiguration"]["BM25"])
36 | edit_distance_weight = float(faq_config["AlgorithmConfiguration"]["edit_distance"])
37 | use_faiss = int(faq_config["AlgorithmConfiguration"]["use_faiss"])
38 | use_annoy = int(faq_config["AlgorithmConfiguration"]["use_annoy"])
39 | engine_num = int(faq_config["Faiss_Annoy_Configuration"]["engine_num"])
40 | ES_num = int(faq_config["ESConfiguration"]["ES_num"])
41 | use_other_when_es_none = int(faq_config["AlgorithmConfiguration"]["use_other_when_es_none"])
42 | if use_other_when_es_none == 1:
43 |     use_other_when_es_none = True
44 | else:
45 |     use_other_when_es_none = False
46 | 
47 | 
48 | jiebaBEFAQ = JiebaBEFAQ()
49 | search_data = SearchData()
50 | match_ing = Matching()
51 | rerank = ReRank()
52 | final_data = FinalData()
53 | deduplicate_threshold = DeduplicateThreshold()
54 | search_data4association = SearchData4Association()
55 | 
56 | # create the Sanic app once, with an explicit name
57 | app = Sanic("Feedback BEFAQ")
58 | 
59 | 
60 | @app.route("/BEFAQ", methods=["POST", "HEAD"])
61 | async def myfaq(request):
62 |     orgin_query = str(request.form.get("question"))
63 |     owner_name = str(request.form.get("owner_name"))
64 |     get_num = int(request.form.get("get_num", default=3))
65 |     threshold = float(request.form.get("threshold", default=0.5))
66 | 
67 |     # jieba segmentation of the query for ES
68 |     process_query = jiebaBEFAQ.seg_sentence(
69 |         sentence=orgin_query)
70 |     query_terms = jieba.cut(process_query)
71 |     query_word_list = list(query_terms)
72 | 
73 |     maybe_original_questions, maybe_process_questions, maybe_answers, retrieval_q_ids, specific_q_ids = search_data.search_merge(
74 |         owner_name=owner_name, question=orgin_query, query_word_list=query_word_list, use_faiss=use_faiss, use_annoy=use_annoy, engine_limit_num=engine_num, ES_limit_num=ES_num, use_other_when_es_none=use_other_when_es_none)
75 | 
76 |     if len(retrieval_q_ids) > 0:  # ES (or faiss/annoy) recalled some data
77 |         # cosine_sim takes maybe_original_questions as retrieval_questions and the raw, unprocessed query as orgin_query
78 |         consin_sim = match_ing.cosine_sim(
79 |             orgin_query=orgin_query, retrieval_questions=maybe_original_questions, owner_name=owner_name)
80 |         print("consin_sim:", consin_sim)
81 | 
82 |         # jaccard_sim takes maybe_process_questions as retrieval_questions and the stopword-stripped query as orgin_query
83 |         jaccard_sim = match_ing.jaccard_sim(
84 |             orgin_query=process_query, retrieval_questions=maybe_process_questions)
85 |         print("jaccard_sim:", jaccard_sim)
86 | 
87 |         bm25_sim = match_ing.bm25_sim(
88 |             orgin_query=process_query, retrieval_questions=maybe_process_questions)
89 |         print("bm25_sim:", bm25_sim)
90 | 
91 |         edit_distance_sim = match_ing.edit_distance_sim(
92 |             orgin_query=process_query, retrieval_questions=maybe_process_questions)
93 |         print("edit_distance_sim:", edit_distance_sim)
94 | 
95 |         re_rank_sim = rerank.linear_model(
96 |             consin_sim=consin_sim, jaccard_sim=jaccard_sim, bm25_sim=bm25_sim, edit_distance_sim=edit_distance_sim,
97 |             consine_weight=consine_weight, jaccard_weight=jaccard_weight, BM25_weight=BM25_weight, edit_distance_weight=edit_distance_weight)
98 | 
99 |         print("retrieval_q_ids:", retrieval_q_ids)
100 |         print("maybe_original_questions:", maybe_original_questions)
101 |         print("maybe_process_questions:", maybe_process_questions)
102 |         print("re_rank_sim:", re_rank_sim)
103 | 
104 |         high_confidence_q_id_pos = deduplicate_threshold.dedu_thr(
105 |             q_ids=retrieval_q_ids, re_rank_sim_list=re_rank_sim, threshold=threshold)
106 |         print("high_confidence_q_id_pos:", high_confidence_q_id_pos)
107 | 
108 |         return_data = final_data.get_qa(
109 |             high_confidence_q_id_pos, maybe_original_questions, maybe_answers, re_rank_sim=re_rank_sim, get_num=get_num, retrieval_q_ids=retrieval_q_ids, specific_q_ids=specific_q_ids)
110 | 
111 |         print("return_data", return_data)
112 |         return json(return_data)
113 |     else:  # ES recalled nothing
114 |         return_data = []
115 |         return json(return_data)
116 | 
117 | 
118 | @app.route("/associative_questions", methods=["POST", "HEAD"])
119 | async def associative_questions(request):
120 |     # parameters received from the request
121 |     current_question = str(request.form.get("current_question"))
122 |     limit_num = int(request.form.get("limit_num"))
123 |     owner_name = str(request.form.get("owner_name"))
124 |     if_middle = int(request.form.get("if_middle", default=1))
125 |     if if_middle == 1:
126 |         if_middle = True
127 |     elif if_middle == 0:
128 |         if_middle = False
129 |     else:
130 |         if_middle = True
131 | 
132 |     maybe_original_questions = search_data4association.search_question_cn(
133 |         owner_name, current_question, limit_num, if_middle)
134 | 
135 |     answer_json = {}
136 |     answer_json["code"] = "1"
137 |     answer_json["msg"] = "OK"
138 |     answer_json["data"] = {
139 |         "message": maybe_original_questions}
140 |     return res_with_head(answer_json)
141 | 
142 | 
143 | @app.route("/", methods=["GET", "HEAD"])
144 | async def alibaba_operator_check(request):
145 |     print("alibaba SLB checking server status")
146 |     return response.text("200")
147 | 
148 | 
149 | if __name__ == "__main__":
150 | 
151 |     port = int(faq_config["ServerAddress"]["port"])
152 |     kill_port(port)
153 |     # start the HTTP service
154 |     app.run(host="0.0.0.0",
155 |             port=port,
156 |             workers=int(faq_config["ServerInfo"]["work_number"]),
157 |             debug=False, access_log=False)
158 | 
--------------------------------------------------------------------------------
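# A hedged request sketch for the /BEFAQ endpoint above (the port comes from
# config/befaq_conf.ini, so substitute your own):
#
#     curl -X POST http://127.0.0.1:<port>/BEFAQ \
#          -d "question=如何评价设计师" -d "owner_name=领域1" \
#          -d "get_num=3" -d "threshold=0.5"
#
# On success it returns a JSON list shaped like
# [{"q_id": ..., "specific_q_id": ..., "question": ..., "answer": ..., "confidence": 0.87}, ...],
# sorted by confidence and capped at get_num entries.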