├── .gitignore
├── LICENSE
├── README.md
├── bert_server
└── sentence_bert_server.py
├── common
├── get_ip.py
├── kill_program.py
└── response_add_head.py
├── config
├── associative_questions_config.ini
├── befaq_conf.ini
├── es.ini
└── sheetname.conf
├── data
└── 线上用户反馈回复.xls
├── docker
├── README.md
└── docker-compose.yml
├── es
├── es_create_index.py
├── es_del_data.py
├── es_del_index.py
├── es_operate.py
├── es_search_cn.py
├── jieba_befaq.py
├── read_excel.py
├── search_engines_operate.py
├── search_model
│ └── .gitkeep
├── stopwords4_process_question_dedup.txt
├── train_search_model.py
├── userdict.txt
├── write_data2es.py
└── write_vecs2bin.py
├── faq
├── bert_vect
│ └── .gitkeep
├── deduplicate_threshold_op.py
├── get_final_data.py
├── get_question_vecs.py
├── jieba4befaq.py
├── matching_operate.py
├── re_rank.py
└── retrieval_es.py
├── image
└── BEFAQ 框架.png
├── logs
└── .gitkeep
├── model
└── .gitkeep
├── requirements.txt
└── src
├── associative_questions_server.py
└── main_faq.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea*
2 | .vscode
3 | __pycache__
4 | nohup.out
5 | *.m
6 | log*.*
7 | search_model
8 | bert_vect
9 | model
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BEFAQ
2 |
3 | **BEFAQ (BERT-based Embedding Frequently Asked Question)** is Haohaozhu's open-source question-answering framework for multi-domain FAQ collections.
4 | We apply the Sentence BERT model to FAQ question answering. Developers can use BEFAQ to quickly build and customize an FAQ system for their own business scenario.
5 |
6 | ## Advantages of BEFAQ:
7 |
8 | (1) Uses Elasticsearch, Faiss, and Annoy as recall engines
9 | (2) Uses Sentence BERT semantic vectors (Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks)
10 | (3) Good support for synonymous (paraphrased) questions
11 | (4) Supports multi-domain corpora (recalled data always belongs to the requested domain, so the same question can get a different answer in each domain)
12 | (5) Provides a suggest-style interface that proposes candidate questions from the user's current input
13 |
14 |
15 | ## The BEFAQ framework architecture is shown below
16 | 
17 |
18 |
19 | ## How to use
20 | ### 1. Using Docker (the Docker image already contains ES 7.6.1, Kibana, the IK tokenizer and the synonym feature; the BEFAQ code is included as well.)
21 | We recommend Docker for a quick start; for how to launch it, see the README.md in the docker folder under the project root.
22 |
23 | ### 2. Using BEFAQ without Docker
24 |
25 |
26 | #### 2.1. Install ES 7.6.1 and the matching Kibana locally, and configure the ES IK tokenizer and synonym feature
27 | Follow the blog post [ES (Elasticsearch) 7.6.1 installation guide](https://blog.csdn.net/weixin_37792714/article/details/108025200). If you have already configured ES, the IK tokenizer and synonyms, you can skip this step, but remember to sync the synonyms into your ES. For convenience, all relevant downloads are on Baidu Netdisk: https://pan.baidu.com/s/1PxgINf6Q1UZBtcsYw6FU0w (password: 4q9h)
28 |
29 |
30 | For ease of use, BEFAQ offers two ways to connect to Elasticsearch: with a username and password, or without. See the comments in the es.ini file under the config folder in the project root for how to switch between them. Our blog post above also explains how to configure a username and password for Elasticsearch.
31 |
32 |
33 |
34 | #### 2.2. Download the project code and create the BEFAQ virtual environment
35 |
36 | conda create -n befaq python=3.6 -y
37 | source activate befaq
38 | git clone https://github.com/hhzrd/BERT-Embedding-Frequently-Asked-Question.git
39 | Enter the BEFAQ root directory, then
40 | pip install -r requirements.txt
41 |
42 | #### 2.3. Download the sentence-transformers multilingual pretrained model
43 |
44 | First enter the project root, then
45 | cd model
46 | wget https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/distiluse-base-multilingual-cased.zip
47 | unzip distiluse-base-multilingual-cased.zip
48 | Put the model files directly under the model folder.
49 | If the latest model raises errors (with sentence_transformers==0.3.0), download the older model from Baidu Netdisk (compatible with sentence_transformers==0.3.0 and transformers==3.0.2). The sentence_transformers used by BEFAQ has since been upgraded to version 1.2.0.
50 |
51 | #### 2.4. Excel data format
52 | If you just want to get the code running first, you can skip configuring your own data for now.
53 |
54 | Place the Excel file under data/ in the project root; the bundled example is named "线上用户反馈回复.xls". The Excel file is the source of the QA data, and its contents are written into ES. After downloading the source code, open this file to see the sample data in detail.
55 |
56 | Each sheet name denotes a domain; for example, the first domain is called "领域1". Within a sheet, the first column is the name of whoever entered the data and may be empty. The second column is the answer and must not be empty. The third column is the original question and must not be empty. Columns after the third hold synonymous questions; their number is unlimited, and a row may have many or none. One record per row.
57 |
58 | The sheet named "词典" holds the user dictionary. For example, if you do not want the word "好好住" split apart during tokenization, put it in this dictionary, one entry per row. The program reads it into the expected location automatically (for jieba tokenization), but custom entries for the ES IK tokenizer's dictionary must be added by yourself.
59 | The sheet named "停用词" holds the stopword list, one entry per row. The program reads it into the expected location automatically.
60 | The sheet named "同义词" holds the synonyms. The first column is the base word; the second and later columns are its synonyms. For example, 番茄 and 西红柿 are synonyms: put 番茄 in the first column and 西红柿 in the second. One record per row. You must write the synonym data into the ES synonym file yourself; see the ES (Elasticsearch) 7.6.1 installation blog post mentioned above. Because the machine you run on may not be the ES server, the program does not write it for you.
61 |
62 | Synonyms, the user dictionary and stopwords are shared by all domains. The dictionary and stopwords are used by BEFAQ's jieba tokenizer; the synonyms are used by ES.
63 |
64 | You can put data for many domains into the Excel file; which domains are actually read is configured in sheetname.conf under the config folder in the project root.
65 |
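For illustration only, a hypothetical row of a QA sheet (column 1: author name, column 2: answer, column 3: original question, columns 4 and later: synonymous questions; the name and the synonym below are made up, the bundled sample file is authoritative):

张三 | 你好。点击认证设计师头像,进入TA的个人主页,点击左下角「评价」即可进行评价。 | 如何评价设计师 | 怎么给设计师打分
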
66 | #### 2.5. Edit BEFAQ's configuration files
67 |
68 | data/线上用户反馈回复.xls in the project root is the source of the QA data; its contents are written into ES. If you just want to get the code running first, you can leave your own data unconfigured.
69 | sheetname.conf under the config folder in the project root configures how the Excel data is read. To just get the code running, you can leave it unchanged.
70 | es.ini under the config folder is BEFAQ's ES configuration file. Even for a first trial run this file must be edited: it holds the ES IP (or domain) and port, and the ES username and password. BEFAQ can only connect to your ES if these match your own setup.
71 | befaq_conf.ini under the config folder is BEFAQ's main configuration file. To just get the code running, you can leave it unchanged.
72 |
73 |
74 | #### 2.6. How to start the BEFAQ service
75 |
76 | Enter the project root, then
77 | source activate befaq
78 | cd es
79 |
80 | Write the data from the Excel file into ES
81 | python write_data2es.py
82 |
83 | Turn the questions into Sentence BERT vectors and save them to a binary file, so the question vectors can be loaded later.
84 | python write_vecs2bin.py
85 |
86 | Train the Faiss and Annoy models
87 | python train_search_model.py
88 |
89 | Start the BEFAQ service (if the data has not changed, later restarts only need this step)
90 | Enter the project root (cd ..), then
91 | cd src
92 | Start the BEFAQ service
93 | python main_faq.py
94 | Or start it in the background
95 | nohup python -u main_faq.py > "../logs/log_$(date +"%Y-%m-%d-%H").txt" 2>&1 &
96 |
97 | Check whether the service is running
98 | ps -ef|grep main_faq.py
99 |
100 | Test BEFAQ from the terminal. The BEFAQ service accepts POST requests. (Replace 127.0.0.1 with your own IP if needed.)
101 |
102 | curl -d "question=如何评价设计师&get_num=3&threshold=0.5&owner_name=领域1" http://127.0.0.1:8129/BEFAQ
103 |
104 | Endpoint URL:
105 | http://127.0.0.1:8129/BEFAQ
106 | Parameters
107 | question: the user's question. Required
108 | get_num: maximum number of results to return. Optional, default 3
109 | threshold: only results whose similarity is greater than or equal to this threshold are returned. Optional, default 0.5
110 | owner_name: name of the data owner, i.e. the sheet name of each domain in the Excel file; used to separate multi-domain data. Required
111 |
112 | Response format:
113 | [
114 | {
115 | "q_id": 2,
116 | "specific_q_id": 3,
117 | "question": "如何评价设计师",
118 | "answer": "你好。点击认证设计师头像,进入TA的个人主页,点击左下角「评价」即可进行评价。此外,设计师的荣耀值是根据设计师的站内数据综合计算,无法直接打分的哦。感谢你的支持。",
119 | "confidence": 1.0
120 | },
121 | {
122 | "q_id": 6,
123 | "specific_q_id": 7,
124 | "question": "怎样把个人设计师转成机构设计师",
125 | "answer": "你好,可以登录好好住官网,再次点击提交设计师认证资料,即可重新修改哟;",
126 | "confidence": 0.6
127 | }
128 | ]
129 |
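For reference, a minimal Python client sketch equivalent to the curl call above (a hypothetical snippet; it assumes the requests package is installed and that the service returns the JSON list shown above):

import requests

resp = requests.post(
    "http://127.0.0.1:8129/BEFAQ",
    data={"question": "如何评价设计师", "get_num": 3, "threshold": 0.5, "owner_name": "领域1"},
)
# Each hit carries q_id, specific_q_id, question, answer, and confidence.
for hit in resp.json():
    print(hit["confidence"], hit["question"], hit["answer"])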
130 |
131 | #### 2.7. How to start BEFAQ's question-suggestion service
132 |
133 | Use this if you want to suggest questions based on the user's current input.
134 | Enter the project root, then
135 | cd src
136 | python associative_questions_server.py
137 | Or start it in the background
138 | nohup python -u associative_questions_server.py >/dev/null 2>&1 &
139 |
140 | Check whether the service is running
141 | ps -ef|grep associative_questions_server.py
142 |
143 |
144 | Test the suggestion feature from the terminal. The service accepts POST requests. (Replace 127.0.0.1 with your own IP if the service is not local.)
145 | curl -d "current_question=设计师&limit_num=3&owner_name=领域1&if_middle=1" http://127.0.0.1:8128/associative_questions
146 |
147 | Endpoint URL:
148 | http://127.0.0.1:8128/associative_questions
149 | Parameters
150 | current_question: the text the user has typed so far. Required
151 | limit_num: maximum number of results to return. Required
152 | owner_name: name of the data owner; used to separate multi-domain data. Required
153 | if_middle: whether the current input may match in the middle of a question. Optional; default 1 (1 = allowed, 0 = prefix match only).
154 |
155 | Response format:
156 | {
157 | "code": "1",
158 | "msg": "OK",
159 | "data": {
160 | "message": [
161 | "按地区找设计师",
162 | "设计师可以选择同城吗",
163 | "怎样把个人设计师转成机构设计师"
164 | ]
165 | }
166 | }
167 |
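Likewise, a minimal Python sketch for the suggestion endpoint (again a hypothetical snippet assuming the requests package and the response shape shown above):

import requests

resp = requests.post(
    "http://127.0.0.1:8128/associative_questions",
    data={"current_question": "设计师", "limit_num": 3, "owner_name": "领域1", "if_middle": 1},
)
# The suggested questions live under data.message.
print(resp.json()["data"]["message"])
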
168 | ## Authors
169 |
170 | The main contributors to this project are:
171 | * [肖轶超](https://github.com/xiaoyichao)(好好住)
172 | * [徐忠杰](https://github.com/461025412)(好好住)
173 | * [王得祥](https://github.com/oksite)(好好住)
174 | * [向泳州](https://github.com/XiangYongzhou)(好好住)
175 | * [辛少普](https://github.com/hhzrd)(好好住)
176 |
177 | ## References
178 |
179 | [1] [百度AnyQ](https://github.com/baidu/AnyQ)
180 | [2] [sentence-transformers](https://github.com/UKPLab/sentence-transformers)
181 | [3] [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084)
182 |
183 | ## Copyright and License
184 |
185 | BEFAQ is provided under the [Apache-2.0 license](./LICENSE).
186 |
--------------------------------------------------------------------------------
/bert_server/sentence_bert_server.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-06-11 08:42:52
6 | LastEditTime: 2021-06-18 17:41:43
7 | @Description: Get Sentence-BERT sentence vectors
8 | '''
9 |
10 | import numpy as np
11 | import torch
12 | import os
13 | import configparser
14 | from sentence_transformers import SentenceTransformer
15 |
16 | dir_name = os.path.abspath(os.path.dirname(__file__))
17 |
18 | faq_config = configparser.ConfigParser()
19 | faq_config.read(os.path.join(dir_name, "../config/befaq_conf.ini"))
20 | Sentence_BERT_path = os.path.join(dir_name, "../", str(
21 | faq_config["AlgorithmConfiguration"]["Sentence_BERT_path"]))
22 |
23 |
24 | class SentenceBERT(object):
25 | '''
26 | Author: xiaoyichao
27 | param {type}
28 | Description: SentenceBERT
29 | '''
30 |
31 | def __init__(self):
32 | self.model = SentenceTransformer(Sentence_BERT_path)
33 | if torch.cuda.is_available():
34 | self.model = self.model.to(torch.device("cuda"))
35 |         print("Device used by Sentence BERT: %s" % self.model.device)
36 |
37 | def normalize(self, vec):
38 | '''
39 | Author: xiaoyichao
40 | param {type}
41 |         Description: Normalize a vector to unit length before it is used for similarity computation, making cosine similarity and dot product equivalent. Reference: https://www.thinbug.com/q/41387000
42 | '''
43 | norm = np.linalg.norm(vec)
44 | if norm == 0:
45 | return vec
46 | return vec/norm
47 |
48 | def get_bert(self, sentence_list):
49 | '''
50 | Author: xiaoyichao
51 | param {type}
52 |         Description: Return a (1, 512) Sentence-BERT vector: encode each sentence in sentence_list, mean-pool across them, then L2-normalize
53 | '''
54 | sentences_vec = []
55 | sentences_vec = np.array(self.model.encode(sentence_list))
56 | sentences_vec_mean = np.mean(sentences_vec, axis=0).reshape(-1, 512)
57 | # sentences_vec_max = np.max(sentences_vec, axis=0).reshape(-1, 512)
58 | return np.array([self.normalize(sentences_vec_mean[0])])
59 |
60 | def get_object(self):
61 | '''
62 | Author: xiaoyichao
63 | param {type}
64 |         Description: Return the underlying SentenceTransformer model object
65 | '''
66 | return self.model
67 |
68 |
69 | # Test demo
70 | if __name__ == '__main__':
71 | sentenceBERT = SentenceBERT()
72 | sentences_vec = sentenceBERT.get_bert(sentence_list=["如何评价设计师"])
73 | print(sentences_vec.shape)
74 | print(sentences_vec)
75 |
--------------------------------------------------------------------------------
/common/get_ip.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-02-05 14:35:28
6 | LastEditTime: 2020-08-13 21:37:43
7 | @Description: Look up the local machine's IP address
8 | '''
9 | import socket
10 |
11 |
12 | def get_host_ip():
13 | '''
14 | Author: xiaoyichao
15 | param {type}
16 |     Description: Look up the local machine's IP address
17 | '''
18 |     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
19 |     try:
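        # Connecting a UDP socket sends no packets; it only selects the outbound interface, whose address getsockname() then reports.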
20 | s.connect(('8.8.8.8', 80))
21 | ip = s.getsockname()[0]
22 | finally:
23 | s.close()
24 |
25 | return ip
26 |
--------------------------------------------------------------------------------
/common/kill_program.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | Date: 2020-08-20 11:09:45
6 | LastEditTime: 2021-07-08 11:50:42
7 | Description: Kill a process
8 | '''
9 | import os
10 |
11 |
12 | def kill_port(port):
13 | '''
14 | @Author: xiaoyichao
15 | @param {*}
16 |     @Description: Kill the program occupying the given port
17 | '''
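    # lsof -t prints only the PIDs bound to the port; kill -9 force-kills them.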
18 | find_kill = "kill -9 $(lsof -i:%d -t)" % port
19 | try:
20 | result = os.popen(find_kill)
21 |         print("Killed the program on port %d" % port)
22 | return result.read()
23 | except Exception:
24 |         print("Failed to kill the program on port %d" % port)
--------------------------------------------------------------------------------
/common/response_add_head.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-04-23 15:52:51
6 | LastEditTime: 2021-03-10 18:04:03
7 | @Description: Attach CORS headers to data returned by the API
8 | '''
9 |
10 | from sanic.response import json
11 |
12 |
13 | def res_with_head(data_json):
14 | '''
15 | Author: xiaoyichao
16 | param {type}
17 |     Description: Attach CORS headers to data returned by the API
18 | '''
19 | return json(
20 | data_json,
21 | headers={
22 | "Access-Control-Allow-Origin": "*",
23 | "Access-Control-Allow-Methods": "OPTIONS,HEAD,GET,POST",
24 | "Access-Control-Allow-Headers": "x-requested-with"},
25 | status=200
26 | )
27 |
--------------------------------------------------------------------------------
/config/associative_questions_config.ini:
--------------------------------------------------------------------------------
1 | [ServerAddress]
2 | port = 8128
3 | # Port of the suggestion service
4 | [ServerInfo]
5 | work_number = 1
6 | # Number of processes; multi-process is supported.
7 |
--------------------------------------------------------------------------------
/config/befaq_conf.ini:
--------------------------------------------------------------------------------
1 | [ServerAddress]
2 | port = 8129
3 | # BEFAQ's port number
4 | [ServerInfo]
5 | work_number = 1
6 | # Number of workers; currently only a single worker is supported
7 | [ESConfiguration]
8 | ES_num = 10
9 | # Number of candidates recalled from ES
10 | [Faiss_Annoy_Configuration]
11 | engine_num = 5
12 | # Number of candidates recalled by Faiss and/or Annoy
13 | [AlgorithmConfiguration]
14 | Sentence_BERT_path = ./model/
15 | # Relative path of the multilingual Sentence_BERT model; no need to change it unless you have special requirements.
16 | consine = 0.6
17 | # Weight of Sentence_BERT cosine similarity (in the high-dimensional embedding space) in the linear model
18 | jaccard = 0.2
19 | # Weight of the Jaccard coefficient in the linear model
20 | BM25 = 0.1
21 | # Weight of BM25 in the linear model
22 | edit_distance = 0.1
23 | # Weight of edit distance in the linear model
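# Illustration only (not a line the program reads): the final score is the linear blend
#   score = consine * cosine_sim + jaccard * jaccard_sim + BM25 * bm25_score + edit_distance * edit_sim
# and the four weights above sum to 1.0.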
24 | use_other_when_es_none = 0
25 | # 0: use Faiss and/or Annoy only when ES returns no data. 1: use Faiss and/or Annoy even when ES has data.
26 | # 0 is recommended: Faiss and/or Annoy always return the requested number of neighbors, which can pull in noisy data you never wanted recalled, and that hurts the later similarity computation.
27 | # In BEFAQ's design, ES recalls on the keywords produced by jieba tokenization, which is more controllable; using Faiss and/or Annoy only when ES recalls nothing works better.
28 | use_faiss = 1
29 | # Whether to use Faiss: 1 = yes, 0 = no.
30 | use_annoy = 0
31 | # Whether to use Annoy: 1 = yes, 0 = no.
32 | # You can enable both, either, or neither; using only Faiss is recommended.
33 | # When both are disabled, use_other_when_es_none has no effect, since ES is then the only recall engine.
34 |
35 | [ServerInfo4Association]
36 | work_number = 2
37 |
38 |
--------------------------------------------------------------------------------
/config/es.ini:
--------------------------------------------------------------------------------
1 | [ServerAddress]
2 | # If the ES service runs in a Docker container
3 | # es_server_ip_port = http://elasticsearch4befaq:9200
4 | # If the ES service runs on this machine
5 | es_server_ip_port = http://127.0.0.1:9200
6 | # If the ES service runs on another server, replace this with your own ES IP (or domain) and port
7 | #es_server_ip_port = http://xxx.xx.xx.xx:9200
8 | # The ES Docker image we provide has no password; if you use it, just set if_es_use_passwd = 0
9 | # if_es_use_passwd = 1 means BEFAQ connects to ES with username + password; 0 means without credentials. When 0, http_auth_user_name and http_auth_password are ignored.
10 | if_es_use_passwd = 0
11 | # ES login username
12 | http_auth_user_name = your Elasticsearch user_name
13 | # ES login password
14 | http_auth_password = your Elasticsearch password
15 |
16 | [ServerInfo]
17 | index_name_1 = index_faq_1
18 | # Name of ES index 1
19 | index_name_2 = index_faq_2
20 | # Name of ES index 2
21 | alias_name = index_faq
22 | # Alias name for the ES indexes
23 |
--------------------------------------------------------------------------------
/config/sheetname.conf:
--------------------------------------------------------------------------------
1 | [excel_name]
2 | name = 线上用户反馈回复.xls
3 | # Name of the Excel file holding the data; its path is data/线上用户反馈回复.xls under the project root
4 | [QA_sheets]
5 | # Sheet names of the multi-domain corpora to read; the program writes this data into ES.
6 | sheets = 领域1,领域2,领域3,领域4
7 | [Synonyms]
8 | sheet = 同义词
9 | # You must write the synonym data into the ES synonym file yourself; see the ES installation blog post for the exact file path
10 | [Stopwords]
11 | sheet = 停用词
12 | # BEFAQ's jieba stopword list; the program automatically writes it to es/stopwords4_process_question_dedup.txt
13 | [Userdict]
14 | # BEFAQ's jieba user dictionary; the program automatically writes it to es/userdict.txt
15 | sheet = 词典
--------------------------------------------------------------------------------
/data/线上用户反馈回复.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hhzrd/BEFAQ/955d1780a2625b805f3ebe1649d96d16df820254/data/线上用户反馈回复.xls
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
1 | # Starting the program with Docker
2 |
3 | ## 1. Start the Docker cluster
4 | First install docker-compose for your own system; only then can you start docker-compose.
5 | Start interactively
6 | docker-compose up
7 | Start in the background
8 | docker-compose up -d
9 | To stop docker-compose
10 | docker-compose stop
11 | ## 2. Enter the BEFAQ docker
12 | The ES test data has already been written into the ES docker. If you need to refresh the data, see the README.md in the project root.
13 | Enter the befaq docker
14 | docker exec -it befaq /bin/bash
15 | ## 3. Start the BEFAQ service
16 | Enter the project root
17 | cd /projects/BERT-Embedding-Frequently-Asked-Question/
18 | cd es
19 | Write the data from the Excel file into ES
20 | python write_data2es.py
21 |
22 | Turn the questions into Sentence BERT vectors and save them to a binary file, so the question vectors can be loaded later.
23 | python write_vecs2bin.py
24 |
25 | Train the Faiss and Annoy models
26 | python train_search_model.py
27 |
28 | Enter the src folder and start the BEFAQ service
29 | cd ../src
30 | python main_faq.py
31 | Or start it in the background
32 | nohup python -u main_faq.py > "../logs/log_$(date +"%Y-%m-%d-%H").txt" 2>&1 &
33 | Test the BEFAQ service from the terminal. The service accepts POST requests. (Replace 127.0.0.1 with your own IP if the service is not local.)
34 | curl -d "question=忘记原始密码如何修改密码?&get_num=3&threshold=0.5&owner_name=领域1" http://127.0.0.1:8129/BEFAQ
35 | To kill the BEFAQ service manually
36 | kill -9 $(lsof -i:8129 -t)
37 | ## 4. Start BEFAQ's question-suggestion service
38 | cd /projects/BEFAQ
39 | cd src
40 | python associative_questions_server.py
41 | Or start it in the background
42 | nohup python -u associative_questions_server.py >/dev/null 2>&1 &
43 | Test the suggestion feature from the terminal. The service accepts POST requests. (Replace 127.0.0.1 with your own IP if the service is not local.)
44 | curl -d "current_question=设计师&limit_num=3&owner_name=领域1&if_middle=1" http://127.0.0.1:8128/associative_questions
45 | ## 5. Test the endpoints
46 | See the README.md in the project root
47 |
48 |
--------------------------------------------------------------------------------
/docker/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.1'
2 | services:
3 | kibana:
4 | image: xiaoyichao1993/kibana-7.6.1:latest
5 | container_name: kibana4befaq
6 | links:
7 | - elasticsearch4befaq
8 | ports:
9 | - 5601:5601
10 |
11 | elasticsearch4befaq:
12 | image: xiaoyichao1993/es7-befaq:latest
13 | container_name: es4befaq
14 | cap_add:
15 | - IPC_LOCK
16 | volumes:
17 | - esdata1:/usr/share/elasticsearch/data
18 | ports:
19 | - 9200:9200
20 | environment:
21 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
22 | - cluster.name=befaq-es
23 | - bootstrap.memory_lock=true
24 | - discovery.type=single-node
25 |
26 | befaq:
27 | image: xiaoyichao1993/befaq:latest
28 | container_name: befaq
29 | links:
30 | - elasticsearch4befaq
31 | ports:
32 | - 8129:8129
33 | - 8128:8128
34 | stdin_open: true
35 | tty: true
36 | depends_on:
37 | - elasticsearch4befaq
38 |
39 | volumes:
40 | esdata1:
41 | driver: local
42 |
--------------------------------------------------------------------------------
/es/es_create_index.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-01-02 16:55:23
6 | LastEditTime: 2021-06-25 14:12:52
7 | @Description: Create the indexes; for testing only.
8 |
9 | '''
10 | from es_operate import ESCURD
11 | from elasticsearch import Elasticsearch
12 | import os
13 | import configparser
14 |
15 |
16 | dir_name = os.path.abspath(os.path.dirname(__file__))
17 | es_config = configparser.ConfigParser()
18 | es_config.read(os.path.join(dir_name, "../config/es.ini"))
19 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"]
20 |
21 |
22 | # Use the index_name values from the config file, or choose your own names to create other indexes
23 | index_name_1 = es_config["ServerInfo"]["index_name_1"]
24 | index_name_2 = es_config["ServerInfo"]["index_name_2"]
25 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"]
26 | if if_es_use_passwd == "1":
27 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"]
28 | http_auth_password = es_config["ServerAddress"]["http_auth_password"]
29 | es_connect = Elasticsearch(
30 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password))
31 | else:
32 |
33 | es_connect = Elasticsearch(
34 | es_server_ip_port)
35 |
36 | es_faq = ESCURD(es_connect)
37 |
38 | if __name__ == "__main__":
39 | es_faq.create_index(index_name=index_name_1)
40 | es_faq.create_index(index_name=index_name_2)
41 |
--------------------------------------------------------------------------------
/es/es_del_data.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-06-19 19:01:17
6 | LastEditTime: 2021-06-25 14:13:35
7 | @Description: Delete the data of the given owner_names; for testing only.
8 | '''
9 |
10 | from es_operate import ESCURD
11 | from elasticsearch import Elasticsearch
12 | import configparser
13 | import os
14 | import sys
15 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16 |
17 |
18 | dir_name = os.path.abspath(os.path.dirname(__file__))
19 | es_config = configparser.ConfigParser()
20 | es_config.read(os.path.join(dir_name, "../config/es.ini"))
21 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"]
22 |
23 |
24 | # Use the index_name from the config file, or choose your own name to target another index
25 | index_name = es_config["ServerInfo"]["index_name_1"]
26 |
27 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"]
28 | if if_es_use_passwd == "1":
29 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"]
30 | http_auth_password = es_config["ServerAddress"]["http_auth_password"]
31 | es_connect = Elasticsearch(
32 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password))
33 | else:
34 |
35 | es_connect = Elasticsearch(
36 | es_server_ip_port)
37 |
38 |
39 | es_faq = ESCURD(es_connect)
40 |
41 | if __name__ == "__main__":
42 |     owner_names = ["领域1", "领域2", "领域3"]
43 | for owner_name in owner_names:
44 | es_faq.del_data(index_name, owner_name)
45 |
--------------------------------------------------------------------------------
/es/es_del_index.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-01-02 16:55:23
6 | LastEditTime: 2021-06-06 21:54:28
7 | @Description: Delete ES indexes; del_index_name is the name of the index to delete
8 |
9 | '''
10 |
11 | from es_operate import ESCURD
12 | from elasticsearch import Elasticsearch
13 | import os
14 | import sys
15 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16 | import configparser
19 |
20 |
21 | dir_name = os.path.abspath(os.path.dirname(__file__))
22 | es_config = configparser.ConfigParser()
23 | es_config.read(os.path.join(dir_name, "../config/es.ini"))
24 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"]
25 |
26 |
27 | index_name_1 = es_config["ServerInfo"]["index_name_1"]
28 | index_name_2 = es_config["ServerInfo"]["index_name_2"]
29 |
30 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"]
31 | if if_es_use_passwd == "1":
32 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"]
33 | http_auth_password = es_config["ServerAddress"]["http_auth_password"]
34 | es_connect = Elasticsearch(
35 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password))
36 | else:
37 |
38 | es_connect = Elasticsearch(
39 | es_server_ip_port)
40 |
41 |
42 | es_faq = ESCURD(es_connect)
43 |
44 | if __name__ == "__main__":
45 | es_faq.del_index(index_name=index_name_1)
46 | es_faq.del_index(index_name=index_name_2)
47 |
--------------------------------------------------------------------------------
/es/es_operate.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-21 15:31:50
6 | LastEditTime: 2021-06-18 15:52:23
7 | @Description: Class wrapping the ES CRUD operations
8 |
9 | '''
10 | from elasticsearch.helpers import bulk
11 |
12 |
13 | class ESCURD(object):
14 | def __init__(self, es):
15 | self.es = es
16 |
17 | def create_index(self, index_name):
18 | '''
19 | @Author: xiaoyichao
20 | @param {type}
21 |         @Description: Create an index
22 | '''
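        # Analyzer roles, for orientation: text_ik / text_ik_s = IK Chinese segmentation (the search-side
        # text_ik_s adds the local_synonym filter); keyword_cn = edge_ngram for prefix ("left") suggestions;
        # ngram_tokenizer_analyzer = ngram for match-anywhere ("middle") suggestions.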
23 | mappings_cn = {
24 | "settings": {
25 | "index.max_ngram_diff": 10,
26 | "number_of_shards": 5,
27 | "number_of_replicas": 1,
28 | "analysis": {
29 | "filter": {
30 | "local_synonym": {
31 | "type": "synonym",
32 | "synonyms_path": "synonyms/synonym.txt"
33 | },
34 | "edge_ngram_filter": {
35 | "type": "edge_ngram",
36 | "min_gram": 1,
37 | "max_gram": 50
38 | }
39 | },
40 | "analyzer": {
41 | "text_ik": {
42 | "type": "custom",
43 | "tokenizer": "ik_smart",
44 | "filter": ["lowercase"]
45 | },
46 | "text_ik_s": {
47 | "type": "custom",
48 | "tokenizer": "ik_smart",
49 | "filter": [
50 | "lowercase",
51 | "local_synonym"
52 | ]
53 | },
54 |
55 | "save_origin_split": {
56 | "type": "custom",
57 | "tokenizer": "standard",
58 | "filter": [
59 | "lowercase"
60 | ]
61 | },
62 | "keyword_cn": {
63 | "type": "custom",
64 | "tokenizer": "keyword",
65 | "filter": [
66 | "lowercase",
67 | "edge_ngram_filter"
68 | ]
69 | },
70 | "ngram_tokenizer_analyzer": {
71 | "type": "custom",
72 | "tokenizer": "ngram_tokenizer",
73 | "filter": [
74 | "lowercase"
75 | ]
76 | }
77 |
78 | },
79 | "tokenizer": {
80 | "ngram_tokenizer": {
81 | "type": "ngram",
82 | "min_gram": 1,
83 | "max_gram": 6,
84 | "token_chars": [
85 | "letter",
86 | "digit"]
87 | }
88 |
89 | }
90 | }
91 | },
92 | "mappings": {
93 | "properties": {
94 | "original_question": {
95 | "type": "text",
96 | "analyzer": "save_origin_split",
97 | "search_analyzer": "save_origin_split"
98 | },
99 | "original_question_cn_left": {
100 | "type": "text",
101 | "analyzer": "keyword_cn",
102 | "search_analyzer": "keyword"
103 | },
104 | "original_question_cn_middle": {
105 | "type": "text",
106 | "analyzer": "ngram_tokenizer_analyzer",
107 | "search_analyzer": "keyword"
108 | },
109 | "process_question": {
110 | "type": "text",
111 | "analyzer": "text_ik",
112 | "search_analyzer": "text_ik_s"
113 | },
114 | "answer": {
115 | "type": "text"
116 | },
117 | "q_id": {
118 | "type": "integer"
119 | },
120 | "specific_q_id": {
121 | "type": "integer"
122 | },
123 | "id": {
124 | "type": "integer"
125 | },
126 | "owner_name": {
127 | "type": "keyword"
128 | }
129 | }
130 | }
131 | }
132 |
133 | if self.es.indices.exists(index=index_name) is True:
134 |             print("Index %s already exists" % index_name)
135 | else:
136 | self.es.indices.create(index=index_name, body=mappings_cn)
137 |             print("Created index: %s" % index_name)
138 |
139 | def del_index(self, index_name):
140 |         # Delete an index
141 | if self.es.indices.exists(index=index_name) is True:
142 | res = self.es.indices.delete(index_name)
143 |             print("Deleted index:", index_name)
144 | return res
145 | else:
146 |             print("The index %s to be deleted does not exist" % index_name)
147 | return
148 |
149 | def del_data(self, index_name, owner_name):
150 |         # Delete the data belonging to owner_name
151 | query = {'query': {'match': {'owner_name': owner_name}}}
152 |
153 | res = self.es.delete_by_query(
154 | index=index_name, body=query)
155 |         print("Deleted data:", res)
156 |
157 | def insert_more(self, index_name, actions, owner_name):
158 | '''
159 | @Author: xiaoyichao
160 | @param {type}:
161 |         @Description: Insert multiple documents at once
162 |
163 | '''
164 | res, _ = bulk(self.es, actions, index=index_name,
165 | raise_on_error=True)
166 |         print("%s: added %d documents to ES" % (owner_name, res))
167 |
168 | def search_data(self, index_name, owner_name, query_word_list, limit_num):
169 | '''
170 | @Author: xiaoyichao
171 | @param {type}
172 |         @Description: Query ES data
173 | '''
174 | limit_num = int(limit_num)
175 |
176 | should_list = []
177 | for word in query_word_list:
178 | match = {
179 | "match": {
180 | "process_question": word
181 | }
182 | }
183 | should_list.append(match)
184 | bool_inside_value = {"should": should_list}
185 | list_must_value_2 = {}
186 | list_must_value_2["bool"] = bool_inside_value
187 |
188 | list_must_value_1 = [
189 | {
190 | "match_phrase": {
191 | "owner_name": owner_name
192 | }
193 | }
194 | ]
195 |
196 | must_list = []
197 | must_list.append(list_must_value_1)
198 | must_list.append(list_must_value_2)
199 |
200 | dic_bool_value = {}
201 | dic_bool_value["must"] = must_list
202 |
203 | dic_bool = {}
204 | dic_bool["bool"] = dic_bool_value
205 |
206 | doc = {}
207 | doc["query"] = dic_bool
208 | doc["_source"] = ["q_id", "process_question",
209 | "original_question", "answer", "specific_q_id"]
210 | doc["size"] = limit_num
211 |
212 |         print("ES query:", doc)
213 |
214 | res = self.es.search(
215 | index=index_name, body=doc)
216 | return res
217 |
218 | def search_cn(self, index_name, owner_name, current_question, search_limit_num, if_middle=True):
219 | '''
220 | @Author: xiaoyichao
221 | @param {type}
222 |         @Description: Query Chinese suggestion candidates
223 | '''
224 | search_limit_num = int(search_limit_num)
225 |
226 | doc = {}
227 |         if if_middle:  # allow the input to match in the middle of a question
228 |
229 | doc["query"] = {
230 | "bool": {
231 | "must": [
232 | [
233 | {
234 | "match": {
235 | "owner_name": owner_name
236 | }
237 | },
238 | {
239 | "match": {"original_question_cn_middle": current_question}
240 | }
241 |
242 | ]]
243 | }
244 | }
245 |
246 | else:
247 |
248 | doc["query"] = {
249 | "bool": {
250 | "must": [
251 | [
252 | {
253 | "match": {
254 | "owner_name": owner_name
255 | }
256 | },
257 | {
258 | "match": {"original_question_cn_left": current_question}
259 | }
260 |
261 | ]]
262 | }
263 | }
264 | doc["_source"] = ["original_question", "q_id"]
265 | doc["size"] = search_limit_num
266 |
267 |         # print("ES query:", doc)
268 |
269 | res = self.es.search(
270 | index=index_name, body=doc)
271 | return res
272 |
273 | def search4search_engine(self, index_name, owner_name, question):
274 | '''
275 | @Author: xiaoyichao
276 | @param {type}
277 |         @Description: Look up the info (q_id etc.) of a question retrieved by annoy or faiss
278 | '''
279 | doc = {}
280 |
281 | doc["query"] = {
282 | "bool": {
283 | "must": [
284 | [
285 | {
286 | "match": {
287 | "owner_name": owner_name
288 | }
289 | },
290 | {
291 | "match_phrase": {"original_question": question}
292 | }
293 |
294 | ]]
295 | }
296 | }
297 |
298 | doc["_source"] = ["q_id", "specific_q_id", "process_question",
299 | "original_question", "answer"]
300 |
301 |         print("ES query:", doc)
302 |
303 | res = self.es.search(
304 | index=index_name, body=doc)
305 | return res
306 |
307 | def es_put_alias(self, index_name, alias_name):
308 | '''
309 | Author: xiaoyichao
310 | param {type}
311 |         Description: Bind an alias to an index
312 | '''
313 | res = self.es.indices.put_alias(index=index_name, name=alias_name)
314 |         print("Bound alias %s to index %s" % (alias_name, index_name))
315 | return res
316 |
317 | def es_get_alias(self, alias_name):
318 | '''
319 | Author: xiaoyichao
320 | param {type}
321 |         Description: Get the index currently behind an alias
322 | '''
323 | try:
324 | res = self.es.indices.get_alias(name=alias_name)
325 | current_index = list(res.keys())[0]
326 |             print("Fetched the index behind alias %s" % alias_name)
327 | return current_index
328 | except Exception:
329 | return
330 |
331 | def es_del_alias(self, index_name, alias_name):
332 | '''
333 | Author: xiaoyichao
334 | param {type}
335 |         Description: Unbind an alias from an index
336 | '''
337 | try:
338 | res = self.es.indices.delete_alias(
339 | index=index_name, name=alias_name)
340 |             print("Unbound alias %s from index %s" % (alias_name, index_name))
341 | return res
342 | except Exception:
343 | return
344 |
--------------------------------------------------------------------------------
/es/es_search_cn.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-06-12 07:19:00
6 | LastEditTime: 2021-03-10 19:05:51
7 | @Description: Class implementing Chinese type-ahead suggestions for the search box
8 | '''
9 | from elasticsearch import Elasticsearch
10 | import configparser
11 | import os
12 | import sys
13 | os.chdir(sys.path[0])
14 | sys.path.append("../")
15 | from es.es_operate import ESCURD
16 |
17 |
18 | dir_name = os.path.abspath(os.path.dirname(__file__))
19 | es_config = configparser.ConfigParser()
20 | es_config.read(os.path.join(dir_name, "../config/es.ini"))
21 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"]
22 |
23 | index_name = es_config["ServerInfo"]["alias_name"]
24 |
25 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"]
26 | if if_es_use_passwd == "1":
27 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"]
28 | http_auth_password = es_config["ServerAddress"]["http_auth_password"]
29 | es_connect = Elasticsearch(
30 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password))
31 | else:
32 |
33 | es_connect = Elasticsearch(
34 | es_server_ip_port)
35 |
36 | es_faq = ESCURD(es_connect)
37 |
38 |
39 | class SearchData4Association(object):
40 |     # Implements Chinese type-ahead suggestions for the search box
41 | def search_question_cn(self, owner_name, current_question, limit_num, if_middle):
42 | current_question = current_question.lower()
43 | search_limit_num = 100
44 |
45 | retrieve_data = es_faq.search_cn(
46 | index_name, owner_name, current_question, search_limit_num, if_middle)
47 |
48 | retrieve_results = retrieve_data["hits"]
49 | max_result_len = retrieve_results["total"]["value"]
50 | hits = retrieve_results["hits"]
51 | maybe_original_questions = []
52 | q_ids = []
53 | if limit_num < max_result_len:
54 | result_len = limit_num
55 | else:
56 | result_len = max_result_len
57 | for i in range(result_len):
58 | qu_an_id = hits[i]["_source"]
59 | original_question = qu_an_id["original_question"]
60 | q_id = qu_an_id["q_id"]
61 | maybe_original_questions.append(original_question)
62 | q_ids.append(q_id)
63 | q_id_set = set()
64 | deduplication_maybe_questions = []
65 |         # Deduplicate by q_id while keeping ES relevance order
66 |         for q_id, maybe_original_question in zip(q_ids, maybe_original_questions):
67 |             if q_id not in q_id_set:
68 |                 deduplication_maybe_questions.append(maybe_original_question)
                q_id_set.add(q_id)  # record the q_id so later duplicates are skipped
69 |
70 | return deduplication_maybe_questions
71 |
--------------------------------------------------------------------------------
/es/jieba_befaq.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-03-24 13:25:41
6 | LastEditTime: 2021-06-06 21:13:52
7 | @Description: Removes stopwords (for example 怎样, 如何) when building the process_question field written to ES.
8 | '''
9 | import jieba
10 | import os
11 | dir_name = os.path.abspath(os.path.dirname(__file__))
12 |
13 |
14 | class StopwordsBEFAQ(object):
15 |
16 | def stopwordslist(self, filepath):
17 | stopwords = [line.strip() for line in open(
18 | filepath, 'r', encoding='utf-8').readlines()]
19 | return set(stopwords)
20 |
21 |     # Tokenize a sentence
22 | def seg_sentence4faq(self, sentence):
23 |         # Load the user dictionary
24 | userdict = os.path.join(dir_name, 'userdict.txt')
25 | jieba.load_userdict(userdict)
26 | sentence_seged = jieba.cut(sentence.strip())
27 | stopwords_file = os.path.join(
28 | dir_name, 'stopwords4_process_question_dedup.txt')
29 |         stopwords = self.stopwordslist(stopwords_file)  # load the stopword list from this path
30 |         outstr = ""  # output; kept tokens are re-joined with no separator
31 | for word in sentence_seged:
32 | if word not in stopwords:
33 | if word != '\t':
34 | outstr += word
35 |                     outstr += ""  # separator (empty string: tokens are concatenated directly)
36 | return outstr
37 |
38 | def seg_sentence4customer_service(self, sentence):
39 |         # Load the user dictionary
40 | userdict = os.path.join(dir_name, 'userdict.txt')
41 | jieba.load_userdict(userdict)
42 | sentence_seged = jieba.cut(sentence.strip())
43 | # stopwords_file = os.path.join(
44 | # dir_name, 'stopwords4_process_question_dedup.txt')
45 |         # stopwords = self.stopwordslist(stopwords_file)  # load the stopword list from this path
46 |         outstr = ""  # output; tokens are re-joined with no separator
47 | for word in sentence_seged:
48 | # if word not in stopwords:
49 | if word != '\t':
50 | outstr += word
51 |                 outstr += ""  # separator (empty string: tokens are concatenated directly)
52 | return outstr
53 |
54 |
--------------------------------------------------------------------------------
/es/read_excel.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | Date: 2020-08-13 11:34:47
6 | LastEditTime: 2021-06-18 16:31:16
7 | Description: Class for reading the Excel workbook
8 | '''
9 | import os
10 | import sys
11 | import xlrd
12 | import configparser
13 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
14 |
15 | dir_name = os.path.abspath(os.path.dirname(__file__))
16 |
17 |
18 | class ExcelData(object):
19 |
20 | def __init__(self):
21 | self.excel_config = configparser.ConfigParser()
22 | self.excel_config.read(os.path.join(dir_name, "../config/sheetname.conf"))
23 | self.sheet_names = self.excel_config["QA_sheets"]["sheets"].split(",")
24 | self.excel_name = self.excel_config["excel_name"]["name"]
25 | self.synonyms_sheet = self.excel_config["Synonyms"]["sheet"]
26 | self.stopwords_sheet = self.excel_config["Stopwords"]["sheet"]
27 | self.excel_file = os.path.join(dir_name, "../data/", self.excel_name)
28 | self.id = 0
29 |
30 | def get_sheet_names(self):
31 | '''
32 | Author: xiaoyichao
33 | param {type}
34 |         Description: Return the list of sheet names to be read
35 | '''
36 | return self.sheet_names
37 |
38 | def read_sheet(self, sheet_name):
39 | '''
40 | Author: xiaoyichao
41 | param {type}
42 |         Description: Read the data of one sheet of the Excel file
43 | '''
44 | try:
45 | book = xlrd.open_workbook(filename=self.excel_file)
46 | table = book.sheet_by_name(sheet_name)
47 | nrows = table.nrows
48 | ncols = table.ncols
49 | sheet_list = []
50 | for row in range(1, nrows):
51 | for col in range(2, ncols):
52 | cell_value = table.cell(row, col).value
53 | if cell_value != "":
54 | q_id = row
55 | original_question = cell_value
56 | answer = table.cell(row, 1).value
57 | self.id += 1
58 | owner_name = sheet_name
59 | sheet_list.append(
60 | [q_id, original_question, answer, self.id, owner_name])
61 | return sheet_list
62 | except Exception:
63 |             print("Failed to read sheet: %s" % sheet_name)
64 | return []
65 |
66 | def read_QA_data(self):
67 | '''
68 | Author: xiaoyichao
69 | param {type}
70 |         Description: Read the QA data from the Excel file
71 | '''
72 | excel_list = []
73 | for sheet_name in self.sheet_names:
74 | sheet_list = self.read_sheet(sheet_name)
75 | excel_list.append(sheet_list)
76 | return excel_list
77 |
78 |
79 | # exceldata = ExcelData()
80 | # excel_list = exceldata.read_QA_data()
81 | # print(excel_list)
82 |
--------------------------------------------------------------------------------
/es/search_engines_operate.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-06-19 17:14:35
6 | LastEditTime: 2020-08-25 17:50:47
7 | @Description: Train the Annoy and Faiss index files. Note that faiss does not support float64 (float32 at most), so vectors are cast to float32 first.
8 | Annoy-based retrieval functionality is also available here.
9 | '''
10 |
11 | from annoy import AnnoyIndex
12 | import faiss
13 | from faiss import normalize_L2
14 | import os
15 | import sys
16 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
17 | from faq.get_question_vecs import ReadVec2bin
18 |
19 | dir_name = os.path.abspath(os.path.dirname(__file__))
20 | read_vec2bin = ReadVec2bin()
21 |
22 |
23 | class SearchEngine(object):
24 | def train_annoy(self, owner_name):
25 | bert_vecs = read_vec2bin.read_bert_vecs(owner_name=owner_name)
26 | annoy_index_path = os.path.join(
27 | dir_name, './search_model/%s_annoy.index' % owner_name)
28 | tc_index = AnnoyIndex(f=512, metric='angular')
29 |
30 | if os.path.exists(os.path.join(dir_name, './search_model')) is False:
31 | os.mkdir(os.path.join(dir_name, './search_model'))
32 |
33 | if os.path.exists(annoy_index_path):
34 | os.remove(annoy_index_path)
35 |             print("Removed the old %s_annoy.index file" % owner_name)
36 |
37 | for i, vec in enumerate(bert_vecs):
38 | tc_index.add_item(i, vec)
39 | tc_index.build(100)
40 | tc_index.save(annoy_index_path)
41 |         print("Wrote the %s_annoy.index file" % owner_name)
42 |
43 | def train_faiss(self, owner_name):
44 | bert_vecs = read_vec2bin.read_bert_vecs(owner_name=owner_name)
45 | d = 512 # dimension
46 | nb = len(bert_vecs) # database size
47 | faiss_index_path = os.path.join(
48 | dir_name, './search_model/%s_faiss.index' % owner_name)
49 | training_vectors = bert_vecs.astype('float32')
50 | normalize_L2(training_vectors)
51 | index = faiss.IndexFlatIP(d)
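        # IndexFlatIP needs no training (train() below is a no-op for flat indexes);
        # after normalize_L2, maximum inner product equals maximum cosine similarity.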
52 | index.train(training_vectors)
53 | index.add(training_vectors)
54 | if os.path.exists(os.path.join(dir_name, './search_model')) is False:
55 | os.mkdir(os.path.join(dir_name, './search_model'))
56 |
57 | if os.path.exists(faiss_index_path):
58 | os.remove(faiss_index_path)
59 |             print("Removed the old %s_faiss.index file" % owner_name)
60 |
61 | faiss.write_index(index, faiss_index_path)
62 |         print("Wrote the %s_faiss.index file" % owner_name)
63 |
--------------------------------------------------------------------------------
/es/search_model/.gitkeep:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file !.gitkeep
--------------------------------------------------------------------------------
/es/stopwords4_process_question_dedup.txt:
--------------------------------------------------------------------------------
1 | ?
2 | hello
3 | hi
4 | 一下
5 | 一个
6 | 上
7 | 不
8 | 为什么
9 | 么
10 | 么么哒
11 | 了
12 | 什么
13 | 你好
14 | 再见
15 | 可以
16 | 吗
17 | 吧
18 | 呢
19 | 哈
20 | 哈哈
21 | 哈哈哈
22 | 哈喽
23 | 哪个
24 | 哪里
25 | 啊
26 | 啦
27 | 嗨
28 | 在
29 | 在不在
30 | 在吗
31 | 在哪
32 | 在哪里
33 | 好
34 | 好哒
35 | 好滴
36 | 好的
37 | 如何
38 | 希望
39 | 怎么
40 | 怎么样
41 | 怎样
42 | 怎样才能
43 | 您好
44 | 想
45 | 想要
46 | 感谢
47 | 我
48 | 我想
49 | 我的
50 | 找不到
51 | 拜拜
52 | 时
53 | 时候
54 | 有人吗
55 | 有没有
56 | 的
57 | 真
58 | 真希望
59 | 要
60 | 请问
61 | 谢谢
62 | 谢谢啦
63 | 这个
64 | 那个
65 | 问
66 | 问一下
67 | 问题
68 | 非常
69 | ?
70 |
--------------------------------------------------------------------------------
/es/train_search_model.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-06-19 17:14:35
6 | LastEditTime: 2020-08-25 18:05:41
7 | @Description: Train the Annoy and Faiss indexes for every configured domain sheet.
8 | '''
9 | from read_excel import ExcelData
10 | import os
11 | import sys
12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13 | from search_engines_operate import SearchEngine
14 |
15 | exceldata = ExcelData()
16 | sheet_names = exceldata.get_sheet_names()
17 | search_engine = SearchEngine()
18 |
19 | for sheet_name in sheet_names:
20 | search_engine.train_annoy(owner_name=sheet_name)
21 | search_engine.train_faiss(owner_name=sheet_name)
22 |
--------------------------------------------------------------------------------
/es/userdict.txt:
--------------------------------------------------------------------------------
1 | 好好住
2 | ipad
3 | ipad pro
4 | 平板
5 | 平板电脑
6 | 夜间模式
7 | 暗黑模式
8 | 同城
9 | 当地
10 | 投诉
11 | 维权
12 | 盗用
13 | 盗图
14 | 入驻
15 | 申请
16 | 入住
17 | 认证
18 | 更换
19 | 更改
20 | ppt
21 | 课件
22 | pdf
23 | 表格
24 | 在哪
25 | 找不到
26 | 日常
27 | 常见
28 | 推送
29 | 推荐
30 | 闪退
31 | bug
32 | 异常
33 | 历史推送
34 | 往期推送
35 | 装修日记
36 | 装修记录
37 | 装修待办
38 | 装修记账
39 | 账号
40 | 账户
41 |
--------------------------------------------------------------------------------
/es/write_data2es.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-01-02 16:55:23
6 | LastEditTime: 2021-03-01 19:13:26
7 | @Description: Write the data into ES
8 |
9 | '''
10 | from es_operate import ESCURD
11 | from elasticsearch import Elasticsearch
12 | from jieba_befaq import StopwordsBEFAQ
13 | from read_excel import ExcelData
14 | import os
15 | # import sys
16 | # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
17 | import configparser
18 |
19 | dir_name = os.path.abspath(os.path.dirname(__file__))
20 | es_config = configparser.ConfigParser()
21 | es_config.read(os.path.join(dir_name, "../config/es.ini"))
22 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"]
23 |
24 |
25 | alias_name = es_config["ServerInfo"]["alias_name"]
26 | index_name_1 = es_config["ServerInfo"]["index_name_1"]
27 | index_name_2 = es_config["ServerInfo"]["index_name_2"]
28 | index_name_set = set([index_name_1, index_name_2])
29 |
30 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"]
31 | if if_es_use_passwd == "1":
32 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"]
33 | http_auth_password = es_config["ServerAddress"]["http_auth_password"]
34 | es_connect = Elasticsearch(
35 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password))
36 | else:
37 |
38 | es_connect = Elasticsearch(
39 | es_server_ip_port)
40 |
41 | es_faq = ESCURD(es_connect)
42 | stopwords4BEFAQ = StopwordsBEFAQ()
43 |
44 |
45 | class ReadsSqlData2ES(object):
46 | def __init__(self):
47 | self.exceldata = ExcelData()
48 | self.excel_list = self.exceldata.read_QA_data()
49 |
50 | def write_data2es(self, index_name):
51 | '''
52 | @Author: xiaoyichao
53 | @param {type}
54 |         @Description: Write the data into ES
55 | '''
56 |
57 | for sheet_data in self.excel_list:
58 | actions = []
59 | num = 0
60 | owner_name = "未命名领域"
61 | for info in sheet_data:
62 | num += 1
63 | q_id, original_question, answer, id, owner_name = info[
64 | 0], info[1], info[2], info[3], info[4]
65 | process_question = original_question.lower()
66 | process_question = stopwords4BEFAQ.seg_sentence4faq(
67 | sentence=process_question)
68 |                 action_name = {}
70 | action_name["_index"] = index_name
71 | action_name["_source"] = {
72 | "q_id": q_id,
73 | "specific_q_id": id,
74 | "original_question": original_question,
75 | "process_question": process_question,
76 | "original_question_cn_middle": original_question.lower(),
77 | "original_question_cn_left": original_question.lower(),
78 | "answer": answer,
79 | "owner_name": owner_name
80 | }
81 | actions.append(action_name)
82 | es_faq.insert_more(index_name=index_name, actions=actions, owner_name=owner_name)
83 |
84 |
85 | if __name__ == "__main__":
86 | read_sql_data = ReadsSqlData2ES()
87 | current_index = es_faq.es_get_alias(alias_name=alias_name)  # index currently serving the alias
88 | new_index_set = index_name_set-set([current_index])
89 | new_index = new_index_set.pop()  # the inactive index of the pair
90 | es_faq.del_index(index_name=new_index)
91 | es_faq.create_index(index_name=new_index)
92 | read_sql_data.write_data2es(index_name=new_index)
93 | es_faq.es_put_alias(index_name=new_index, alias_name=alias_name)  # repoint the alias to the fresh index
94 | es_faq.es_del_alias(index_name=current_index, alias_name=alias_name)
95 |
--------------------------------------------------------------------------------
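The `__main__` block above implements a blue-green reindex: data is written into whichever of the two indexes is not currently behind the alias, and the alias is then repointed, so readers never query a half-built index. A minimal sketch of the same swap done atomically in a single aliases request (the address, index names, and alias name are placeholders; the real values live in config/es.ini):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch("127.0.0.1:9200")  # placeholder address; see config/es.ini
# add the new index and remove the old one in one atomic update,
# instead of separate put-alias / delete-alias calls
es.indices.update_aliases(body={
    "actions": [
        {"add": {"index": "befaq_index_2", "alias": "befaq_alias"}},
        {"remove": {"index": "befaq_index_1", "alias": "befaq_alias"}},
    ]
})
```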
/es/write_vecs2bin.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-01-02 16:55:23
6 | LastEditTime: 2021-06-25 15:27:08
7 | @Description: Write the sentence vectors of the question set to an .npy file
8 |
9 | '''
10 |
11 |
12 | import numpy as np
13 | from read_excel import ExcelData
14 | import os
15 | import sys
16 | os.chdir(sys.path[0])
17 | sys.path.append("../")
18 | from bert_server.sentence_bert_server import SentenceBERT
19 |
20 |
21 | dir_name = os.path.abspath(os.path.dirname(__file__))
22 |
23 |
24 | class WriteVec2bin(object):
25 | def __init__(self):
26 | self.exceldata = ExcelData()
27 | self.excel_list = self.exceldata.read_QA_data()
28 | self.sheet_names = self.exceldata.get_sheet_names()
29 | self.sentenceBERT = SentenceBERT()
30 |
31 | def write_bert_vecs(self, owner_name, num):
32 | '''
33 | @Author: xiaoyichao
34 | @param {type}
35 | @Description: Write the sentence vectors of one domain to an .npy file
36 | '''
37 | if os.path.exists(os.path.join(dir_name, '../faq/bert_vect')) is False:
38 | os.mkdir(os.path.join(dir_name, '../faq/bert_vect'))
39 | bert_vecs_path = os.path.join(
40 | dir_name, '../faq/bert_vect/%s_bert_vecs.npy' % (owner_name))
41 | bert_sentences_path = os.path.join(
42 | dir_name, '../faq/bert_vect/%s_bert_sentences.txt' % (owner_name))
43 | orgin_query_vecs = np.zeros(shape=(1, 512))  # all-zero placeholder first row; readers skip it via [1:]
44 | with open(bert_sentences_path, "w") as f:
45 | f.write("数据库中的问题"+"\n")  # header line ("questions in the database"); readers skip it too
46 | for info in self.excel_list[num]:
47 | original_question = info[1]
48 | f.write(original_question+"\n")
49 | orgin_query = original_question.replace(",", " ")
50 | orgin_query_list = orgin_query.split(' ')
51 | orgin_query_vec = self.sentenceBERT.get_bert(orgin_query_list)
52 | orgin_query_vecs = np.concatenate(
53 | (orgin_query_vecs, orgin_query_vec), axis=0)
54 | if os.path.exists(bert_vecs_path):
55 | os.remove(bert_vecs_path)
56 | print("Removed the old BERT vector file")
57 | # reshape the flattened vectors back to (-1, 512)
58 | orgin_query_vecs = np.reshape(orgin_query_vecs, (-1, 512))
59 | np.save(bert_vecs_path, orgin_query_vecs)
60 |
61 | print("BERT vectors written to", bert_vecs_path)
62 |
63 | def write_bert_vecs4sheets(self):
64 | '''
65 | Author: xiaoyichao
66 | param {type}
67 | Description: Write the sentence vectors of every domain's corpus to its own .npy file
68 | '''
69 | for i, sheet_name in enumerate(self.sheet_names):
70 | self.write_bert_vecs(owner_name=sheet_name, num=i)
71 |
72 |
73 | if __name__ == "__main__":
74 | write_vec2bin = WriteVec2bin()
75 | write_vec2bin.write_bert_vecs4sheets()
76 |
--------------------------------------------------------------------------------
/faq/bert_vect/.gitkeep:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file !.gitkeep
--------------------------------------------------------------------------------
/faq/deduplicate_threshold_op.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-22 12:24:06
6 | LastEditTime: 2021-06-06 22:00:53
7 | @Description: Deduplicate the re-ranked results by q_id and apply a confidence threshold; results below the threshold are dropped
8 | '''
9 |
10 |
11 | class DeduplicateThreshold(object):
12 | def dedu_thr(self, q_ids, re_rank_sim_list, threshold):
13 | high_confidence_q_id_pos = []
14 | if len(q_ids) > 0:
15 | q_id_dict = {}
16 | # map each q_id to the list of positions where it occurs
17 | for position, q_id in enumerate(q_ids):
18 | if q_id not in q_id_dict:
19 | q_id_dict[q_id] = [position]
20 | else:
21 | q_id_dict[q_id].append(position)
22 | # print("recalled q_id_dict:", q_id_dict)
23 | # Deduplicate by q_id: if a q_id has several hits, keep the one with the highest similarity; if it has only one hit, keep that one (position 0 of its list)
24 | unique_q_ids_pos = []
25 | for poss in q_id_dict.values():
26 | max_sim_pos = poss[0]
27 | if len(poss) > 1:
28 | for qid_pos in poss:
29 | if re_rank_sim_list[qid_pos] > re_rank_sim_list[max_sim_pos]:
30 | max_sim_pos = qid_pos
31 | unique_q_ids_pos.append(max_sim_pos)
32 | # Apply the threshold to the deduplicated positions; keep only those at or above the confidence threshold
33 | for q_id_pos in unique_q_ids_pos:
34 | if re_rank_sim_list[q_id_pos] >= threshold:
35 | high_confidence_q_id_pos.append(q_id_pos)
36 | return high_confidence_q_id_pos
37 | else:
38 | return high_confidence_q_id_pos
39 |
40 |
--------------------------------------------------------------------------------
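A minimal usage sketch of `dedu_thr` with made-up values: q_id 7 is recalled twice, so only its higher-scoring position survives, and q_id 9 falls below the threshold.

```python
from faq.deduplicate_threshold_op import DeduplicateThreshold

dedup = DeduplicateThreshold()
q_ids = [7, 7, 9]          # hypothetical recalled question ids, one per position
sims = [0.60, 0.90, 0.40]  # re-ranked similarity for each position
print(dedup.dedu_thr(q_ids, sims, threshold=0.5))  # -> [1]
```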
/faq/get_final_data.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | @LastEditors: xiaoyichao
5 | @Date: 2020-05-23 16:21:51
6 | @LastEditTime: 2020-07-23 14:45:05
7 | @Description: FAQ module. For the q_ids that survive deduplication and thresholding, fetch the matching question, answer, and similarity
8 | '''
9 |
10 |
11 | class FinalData(object):
12 | def get_json_confidence(self, json_data):
13 | return json_data["confidence"]
14 |
15 | def get_qa(self, high_confidence_q_id_pos, maybe_questions, maybe_answers, re_rank_sim, get_num, retrieval_q_ids, specific_q_ids):
16 | return_data = []
17 | for q_id_pos in high_confidence_q_id_pos:
18 | single_json = {}
19 | single_json["q_id"] = retrieval_q_ids[q_id_pos]
20 | single_json["specific_q_id"] = specific_q_ids[q_id_pos]
21 | single_json["question"] = maybe_questions[q_id_pos]
22 | single_json["answer"] = maybe_answers[q_id_pos]
23 | single_json["confidence"] = round(re_rank_sim[q_id_pos], 2)
24 | return_data.append(single_json)
25 | return_data.sort(reverse=True, key=self.get_json_confidence)
26 | # Cap the number of returned results.
27 | if len(high_confidence_q_id_pos) > get_num:
28 | return return_data[:get_num]
29 | else:
30 | return return_data
31 |
--------------------------------------------------------------------------------
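For reference, `get_qa` returns at most `get_num` dicts sorted by descending confidence; a sketch of the shape with made-up values:

```python
# hypothetical return value of FinalData.get_qa with get_num=2
[
    {"q_id": 7, "specific_q_id": 12, "question": "如何评价设计师",
     "answer": "...", "confidence": 0.92},
    {"q_id": 3, "specific_q_id": 5, "question": "如何更换账号",
     "answer": "...", "confidence": 0.71},
]
```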
/faq/get_question_vecs.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-06-09 14:45:34
6 | LastEditTime: 2021-06-25 15:04:53
7 | @Description: Load the precomputed BERT vectors of the question set
8 | '''
9 |
10 |
11 | import numpy as np
12 |
13 | import os
14 | import sys
15 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16 | from es.read_excel import ExcelData
17 |
18 | exceldata = ExcelData()
19 | sheet_names = exceldata.get_sheet_names()
20 | dir_name = os.path.abspath(os.path.dirname(__file__))
21 |
22 |
23 | class ReadVec2bin(object):
24 | def __init__(self):
25 | self.owner_name_sentence = {}
26 | self.owner_name_bert_vecs = {}
27 | for sheet_name in sheet_names:
28 | bert_vecs_path = os.path.join(
29 | dir_name, './bert_vect/%s_bert_vecs.npy' % (sheet_name))
30 | bert_sentences_path = os.path.join(
31 | dir_name, './bert_vect/%s_bert_sentences.txt' % (sheet_name))
32 |
33 | with open(bert_sentences_path, "r", encoding="utf8") as sent:
34 | sentences = sent.read()
35 | sentences = sentences.strip("\n")
36 | sentences = sentences.split("\n")
37 | self.owner_name_sentence[sheet_name] = sentences[1:]  # skip the header line written by write_vecs2bin.py
38 | bert_vecs = np.load(bert_vecs_path)
39 | self.owner_name_bert_vecs[sheet_name] = bert_vecs[1:]  # skip the all-zero placeholder row
40 |
41 | def read_bert_sents(self, owner_name):
42 | return self.owner_name_sentence[owner_name]
43 |
44 | def read_bert_vecs(self, owner_name):
45 | return self.owner_name_bert_vecs[owner_name]
46 |
47 |
--------------------------------------------------------------------------------
/faq/jieba4befaq.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-03-24 13:25:41
6 | LastEditTime: 2021-02-23 17:45:49
7 | @Description: Strip stopwords (e.g. 怎样, 如何 — "how") from the user's FAQ question before it goes into the ES search
8 | '''
9 | import jieba
10 | import os
11 | dir_name = os.path.abspath(os.path.dirname(__file__))
12 |
13 |
14 | class JiebaBEFAQ(object):
15 |
16 | def stopwordslist(self, filepath):
17 | stopwords = [line.strip() for line in open(
18 | filepath, 'r', encoding='utf-8').readlines()]
19 | return set(stopwords)
20 |
21 | # segment a sentence into terms
22 | def seg_sentence(self, sentence):
23 | # load the user dictionary
24 | userdict = os.path.join(dir_name, '../es/userdict.txt')
25 | jieba.load_userdict(userdict)
26 | sentence_seged = jieba.cut(sentence.strip())
27 | stopwords_file = os.path.join(
28 | dir_name, '../es/stopwords4_process_question_dedup.txt')
29 | stopwords = self.stopwordslist(stopwords_file)  # load the stopword list
30 | outstr = ""  # kept terms are concatenated without a separator
31 | for word in sentence_seged:
32 | if word not in stopwords:
33 | if word != '\t':
34 | outstr += word
35 | outstr += ""  # separator (empty by design, so this is a no-op)
36 | return outstr
37 |
38 | def get_list(self, sentence):
39 | '''
40 | Author: xiaoyichao
41 | param {type}
42 | Description: Turn a sentence into a list of segmented terms
43 | '''
44 | sentence_terms = list(jieba.cut(sentence))
45 | return sentence_terms
46 |
--------------------------------------------------------------------------------
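A quick sketch of how the two entry points behave. The exact output depends on jieba's segmentation and on the contents of stopwords4_process_question_dedup.txt, so the results shown are only illustrative (this assumes 如何 is in the stopword file):

```python
from faq.jieba4befaq import JiebaBEFAQ

jb = JiebaBEFAQ()
# stopword-stripped string that goes into the ES query
print(jb.seg_sentence("如何评价设计师"))  # e.g. "评价设计师"
# plain term list, used by the BM25 similarity
print(jb.get_list("评价设计师"))          # e.g. ["评价", "设计师"]
```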
/faq/matching_operate.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-12 20:46:56
6 | LastEditTime: 2021-06-25 16:08:05
7 | @Description:
8 | '''
9 | import numpy as np
10 | import jieba
11 | import Levenshtein
12 | import time
13 | import configparser
14 | from sklearn.metrics.pairwise import cosine_similarity
15 | from gensim.summarization import bm25
16 | import os
17 | import sys
18 | os.chdir(sys.path[0])
19 | sys.path.append("../")
20 | from faq.get_question_vecs import ReadVec2bin
21 | from faq.jieba4befaq import JiebaBEFAQ
22 | from bert_server.sentence_bert_server import SentenceBERT
23 |
24 |
25 | dir_name = os.path.abspath(os.path.dirname(__file__))
26 | faq_config = configparser.ConfigParser()
27 | faq_config.read(os.path.join(dir_name, "../config/befaq_conf.ini"))
28 |
29 |
30 | class Matching(object):
31 | def __init__(self):
32 | self.read_vec2bin = ReadVec2bin()
33 | self.jiebaBEFAQ = JiebaBEFAQ()
34 | self.sentenceBERT = SentenceBERT()
35 |
36 | def cosine_sim(self, orgin_query, retrieval_questions, owner_name):
37 | '''
38 | @Author: xiaoyichao
39 | @param {type}
40 | @Description: Cosine similarity in BERT embedding space
41 | '''
42 | sentences = self.read_vec2bin.read_bert_sents(owner_name=owner_name)
43 | bert_vecs = self.read_vec2bin.read_bert_vecs(owner_name=owner_name)
44 | orgin_query = orgin_query.replace(",", " ")
45 | orgin_query_list = orgin_query.split(' ')
46 | print("orgin_query_list", orgin_query_list)
47 |
48 | orgin_query_vec = self.sentenceBERT.get_bert(
49 | sentence_list=orgin_query_list)
50 | if orgin_query_vec.size != 0:  # the BERT service responded normally (comparing arrays with != is unreliable)
51 | retrieval_questions_vec = []
52 | for retrieval_question in retrieval_questions:
53 | # look up the precomputed BERT vector for this question
54 | index_pos = sentences.index(retrieval_question)
55 | retrieval_question_vec = bert_vecs[index_pos]
56 | retrieval_question_vec = retrieval_question_vec.reshape(-1, 512)
57 | retrieval_questions_vec.append(retrieval_question_vec)
58 |
59 | retrieval_questions_vec = np.array(
60 | retrieval_questions_vec).reshape(-1, 512)
61 |
62 | # Floating-point representation can push the computed cosine similarity slightly past its theoretical bounds; clamping below keeps such out-of-range values out of the final results.
63 | sim_list = cosine_similarity(
64 | orgin_query_vec, retrieval_questions_vec)[0].tolist()
65 |
66 | # clamp similarities that exceed 1.0 due to rounding error
67 | normalized_sim_list = []
68 | for sim in sim_list:
69 | if sim > 1.0:
70 | sim = 1.0
71 | normalized_sim_list.append(sim)
72 |
73 | return normalized_sim_list
74 | else: # the BERT service timed out
75 | normalized_sim_list = []
76 | return normalized_sim_list
77 |
78 | def jaccard(self, question, reference): # reference is the source sentence, question the candidate
79 | '''
80 | @Author: xiaoyichao
81 | @param {type}
82 | @Description: Compute the Jaccard similarity of two sentences
83 | '''
84 | terms_reference = jieba.cut(reference) # jieba accurate mode by default
85 | question = question.replace("\n", "")
86 | terms_model = jieba.cut(question)
87 | grams_reference = list(terms_reference)
88 | grams_model = list(terms_model)
89 | temp = 0 # size of the term overlap (intersection, counted with duplicates)
90 | for i in grams_reference:
91 | if i in grams_model:
92 | temp = temp+1
93 | fenmu = len(grams_model)+len(grams_reference)-temp # union size (denominator)
94 | jaccard_coefficient = float(temp/fenmu) # intersection over union
95 | return jaccard_coefficient
96 |
97 | def jaccard_sim(self, orgin_query, retrieval_questions):
98 | '''
99 | @Author: xiaoyichao
100 | @param {type}
101 | @Description: Jaccard similarity between the query and the candidate questions
102 | '''
103 | sim_list = []
104 | for retrieval_question in retrieval_questions:
105 | jaccard_coefficient = self.jaccard(
106 | question=orgin_query, reference=retrieval_question)
107 | sim_list.append(jaccard_coefficient)
108 | return sim_list
109 |
110 | def bm25_sim(self, orgin_query, retrieval_questions):
111 | '''
112 | @Author: xiaoyichao
113 | @param {type}
114 | @Description: BM25 similarity between the query and the candidate questions
115 | '''
116 | jieba_corpus = []
117 | for corpu in retrieval_questions:
118 | line_seg = self.jiebaBEFAQ.get_list(corpu)
119 | jieba_corpus.append(line_seg)
120 | jieba_question = self.jiebaBEFAQ.get_list(orgin_query)
121 | bm25Model = bm25.BM25(jieba_corpus)
122 | sim_list = bm25Model.get_scores(jieba_question)
123 | normalized_sim_list = []
124 | max_sim = max(sim_list)
125 | for sim in sim_list:
126 | if sim == 0:
127 | normalized_sim = 0
128 | else:
129 | normalized_sim = sim/max_sim
130 | normalized_sim_list.append(normalized_sim)
131 |
132 | return normalized_sim_list
133 |
134 | def edit_distance_sim(self, orgin_query, retrieval_questions):
135 | '''
136 | @Author: xiaoyichao
137 | @param {type}
138 | @Description: Edit-distance similarity between the query and the candidate questions
139 | '''
140 | sim_list = []
141 | max_len = max(len(orgin_query), max([len(x) for x in retrieval_questions]))
142 | for corpu in retrieval_questions:
143 | edit_distance = Levenshtein.distance(orgin_query, corpu)
144 | sim = 1 - edit_distance * 1.0 / max_len
145 | sim_list.append(sim)
146 | return sim_list
147 |
148 |
149 | if __name__ == "__main__":
150 | matching = Matching()
151 | question = "如何评价设计师"
152 | normalized_sim_list = matching.cosine_sim(
153 | question, ["如何评价设计师"], "领域1")
154 | print(normalized_sim_list)
155 |
--------------------------------------------------------------------------------
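To make the term-overlap arithmetic in `jaccard` concrete, a small worked example (the segmentations are illustrative):

```python
# illustrative segmentations:
#   question  -> ["怎么", "更换", "账号"]
#   reference -> ["如何", "更换", "账号"]
overlap = 2               # "更换" and "账号" appear in both term lists
union = 3 + 3 - overlap   # len(question terms) + len(reference terms) - overlap
print(overlap / union)    # 0.5
```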
/faq/re_rank.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-22 13:54:44
6 | LastEditTime: 2021-06-06 21:42:47
7 | @Description: Linear-model re-ranking: each similarity algorithm gets its own weight
8 | '''
9 |
10 |
11 | class ReRank(object):
12 | def linear_model(self, consin_sim, jaccard_sim, bm25_sim, edit_distance_sim, consine_weight, jaccard_weight, BM25_weight, edit_distance_weight):
13 | if consin_sim != []:
14 | tmp_multiple_sims = [i * consine_weight + j*jaccard_weight + k*BM25_weight + l*edit_distance_weight
15 | for i, j, k, l in zip(consin_sim, jaccard_sim, bm25_sim, edit_distance_sim)]
16 | multiple_sims = []
17 | if consine_weight + jaccard_weight + BM25_weight + edit_distance_weight == 1: # weights sum to 1: clamp float overshoot
18 | for multiple_sim in tmp_multiple_sims:
19 | if multiple_sim > 1.0:
20 | multiple_sim = 1.0
21 | multiple_sims.append(multiple_sim)
22 | else:
23 | multiple_sims = tmp_multiple_sims
24 | return multiple_sims
25 | else:
26 | multiple_sims = jaccard_sim # cosine scores unavailable (BERT down): fall back to Jaccard alone
27 | return multiple_sims
28 |
29 |
--------------------------------------------------------------------------------
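A worked example of the weighted combination. The weights here are hypothetical (the real ones are read from config/befaq_conf.ini) but sum to 1, so the clamping branch is exercised:

```python
from faq.re_rank import ReRank

rerank = ReRank()
sims = rerank.linear_model(
    consin_sim=[0.9, 0.4], jaccard_sim=[0.5, 0.5],
    bm25_sim=[1.0, 0.2], edit_distance_sim=[0.8, 0.3],
    consine_weight=0.4, jaccard_weight=0.3,
    BM25_weight=0.2, edit_distance_weight=0.1,
)
# first item: 0.4*0.9 + 0.3*0.5 + 0.2*1.0 + 0.1*0.8 = 0.79
print(sims)  # ~ [0.79, 0.38], up to float rounding
```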
/faq/retrieval_es.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-01-02 16:55:23
6 | LastEditTime: 2021-06-18 16:00:32
7 | @Description: Recall candidates with ES and with Faiss (or Annoy)
8 |
9 | '''
10 |
11 | from elasticsearch import Elasticsearch
12 | from annoy import AnnoyIndex
13 | import numpy as np
14 | import faiss
15 | import os
16 | import sys
17 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
18 | from es.es_operate import ESCURD
19 | import configparser
20 | from bert_server.sentence_bert_server import SentenceBERT
21 | from faq.get_question_vecs import ReadVec2bin
22 |
23 |
24 | dir_name = os.path.abspath(os.path.dirname(__file__))
25 | es_config = configparser.ConfigParser()
26 | es_config.read(os.path.join(dir_name, "../config/es.ini"))
27 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"]
28 |
29 |
30 | index_name = es_config["ServerInfo"]["alias_name"]
31 |
32 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"]
33 | if if_es_use_passwd == "1":
34 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"]
35 | http_auth_password = es_config["ServerAddress"]["http_auth_password"]
36 | es_connect = Elasticsearch(
37 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password))
38 | else:
39 |
40 | es_connect = Elasticsearch(
41 | es_server_ip_port)
42 |
43 | es_faq = ESCURD(es_connect)
44 | sentenceBERT = SentenceBERT()
45 | read_vec2bin = ReadVec2bin()
46 |
47 |
48 | class SearchData(object):
49 | '''
50 | Author: xiaoyichao
51 | param {type}
52 | Description: Recalls candidate data via ES, Annoy, and/or Faiss; which engines are used is configurable
53 | '''
54 | def search_es(self, owner_name, query_word_list, ES_limit_num):
55 | '''
56 | Author: xiaoyichao
57 | param {type}
58 | Description: Recall via ES
59 | '''
60 | retrieve_data = es_faq.search_data(
61 | index_name=index_name, owner_name=owner_name, query_word_list=query_word_list, limit_num=ES_limit_num)
62 | retrieve_results = retrieve_data["hits"]
63 | max_result_len = retrieve_results["total"]["value"]
64 | # max_score = retrieve_results["max_score"]
65 | hits = retrieve_results["hits"]
66 | maybe_original_questions = []
67 | maybe_process_questions = []
68 | maybe_answers = []
69 | specific_q_ids = []
70 | q_ids = []
71 | if ES_limit_num < max_result_len:
72 | result_len = ES_limit_num
73 | else:
74 | result_len = max_result_len
75 | for i in range(result_len):
76 | qu_an_id = hits[i]["_source"]
77 | original_question = qu_an_id["original_question"]
78 | process_question = qu_an_id["process_question"]
79 | answer = qu_an_id["answer"]
80 | q_id = qu_an_id["q_id"]
81 | specific_q_id = qu_an_id["specific_q_id"]
82 | maybe_original_questions.append(original_question)
83 | maybe_process_questions.append(process_question)
84 | maybe_answers.append(answer)
85 | q_ids.append(q_id)
86 | specific_q_ids.append(specific_q_id)
87 | return maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids
88 |
89 | def search_annoy(self, owner_name, question, num=5):
90 | '''
91 | Author: xiaoyichao
92 | param {type}
93 | Description: Recall via Annoy
94 | '''
95 | sentences = read_vec2bin.read_bert_sents(owner_name=owner_name)
96 | annoy_index_path = os.path.join(
97 | dir_name, '../es/search_model/%s_annoy.index' % owner_name)
98 | encodearrary = sentenceBERT.get_bert([question])
99 | tc_index = AnnoyIndex(f=512, metric='angular')
100 | tc_index.load(annoy_index_path)
101 | # get_nns_by_vector returns ([neighbor indices], [angular distances])
102 | # because include_distances=True
103 | items = tc_index.get_nns_by_vector(
104 | encodearrary[0], num, include_distances=True)
105 | sim_questions = [sentences[num_annoy] for num_annoy in items[0]]
106 | # sims = items[1]
107 | # index_nums = items[0]
108 | return sim_questions
109 |
110 | def search_faiss(self, owner_name, question, num=5):
111 | '''
112 | Author: xiaoyichao
113 | param {type}
114 | Description: Recall via Faiss
115 | '''
116 | sentences = read_vec2bin.read_bert_sents(owner_name=owner_name)
117 | faiss_index_path = os.path.join(
118 | dir_name, '../es/search_model/%s_faiss.index' % owner_name)
119 | index = faiss.read_index(faiss_index_path)
120 | question_vec = sentenceBERT.get_bert([question]).astype('float32')
121 | index.nprobe = 1
122 | sims, index_nums = index.search(question_vec, num)
123 | sim_questions = [sentences[num_faiss] for num_faiss in index_nums[0]]
124 | # index_nums = index_nums[0].tolist()
125 | # sims = sims[0].tolist()
126 | return sim_questions
127 |
128 | def merge_op(self, question, owner_name, maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids, use_faiss, use_annoy, engine_limit_num):
129 | '''
130 | Author: xiaoyichao
131 | param {type}
132 | Description: Merge the ES results with the Faiss and/or Annoy results
133 | '''
134 | if use_faiss == 1 and use_annoy == 0:
135 | print("use_faiss")
136 | maybe_search_questions = self.search_faiss(
137 | owner_name, question, num=engine_limit_num)
138 | elif use_faiss == 0 and use_annoy == 1:
139 | print("use_annoy")
140 | maybe_search_questions = self.search_annoy(
141 | owner_name, question, num=engine_limit_num)
142 | elif use_faiss == 1 and use_annoy == 1:
143 | print("use_annoy and use_faiss")
144 | maybe_search_questions_faiss = self.search_faiss(
145 | owner_name, question, num=engine_limit_num)
146 | maybe_search_questions_annoy = self.search_annoy(
147 | owner_name, question, num=engine_limit_num)
148 | maybe_search_questions = list(
149 | set(maybe_search_questions_faiss+maybe_search_questions_annoy))
150 | else:
151 | return maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids
152 | print("Questions recalled by ES:", maybe_original_questions)
153 | # Look each engine hit up in ES and merge it into the ES results, skipping duplicates.
154 | for sim_question in maybe_search_questions:
155 | if sim_question not in set(maybe_original_questions):
156 | print("New question recalled by Faiss/Annoy:", sim_question)
157 | retrieve_data = es_faq.search4search_engine(
158 | index_name, owner_name, question=sim_question)
159 | retrieve_results = retrieve_data["hits"]
160 | max_result_len = retrieve_results["total"]["value"]
161 | # max_score = retrieve_results["max_score"]
162 | hits = retrieve_results["hits"]
163 |
164 | if max_result_len >= 1:
165 | for i in range(1): # take only the top ES hit for this engine-recalled question
166 | qu_an_id = hits[i]["_source"]
167 | original_question = qu_an_id["original_question"]
168 | process_question = qu_an_id["process_question"]
169 | answer = qu_an_id["answer"]
170 | q_id = qu_an_id["q_id"]
171 | specific_q_id = qu_an_id["specific_q_id"]
172 | maybe_original_questions.append(original_question)
173 | maybe_process_questions.append(process_question)
174 | maybe_answers.append(answer)
175 | q_ids.append(q_id)
176 | specific_q_ids.append(specific_q_id)
177 | # return the merged results
178 | return maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids
179 |
180 | def search_merge(self, owner_name, question, query_word_list, use_other_when_es_none, use_faiss=0, use_annoy=0, engine_limit_num=5, ES_limit_num=10):
181 | # retrieve with ES first
182 | maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids = self.search_es(
183 | owner_name=owner_name, query_word_list=query_word_list, ES_limit_num=ES_limit_num)
184 | if use_other_when_es_none is False:
185 | if len(maybe_original_questions) == 0: # only use Faiss and/or Annoy when ES recalled nothing
186 | # Recommended mode: Faiss and Annoy always return the requested number of neighbors, which can easily include results you do not want, so only fall back to them when ES recalls zero items.
187 | maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids = self.merge_op(
188 | question, owner_name, maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids, use_faiss, use_annoy, engine_limit_num)
189 | else: # use Faiss and/or Annoy even when ES returned data
190 | maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids = self.merge_op(
191 | question, owner_name, maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids, use_faiss, use_annoy, engine_limit_num)
192 |
193 | return maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids
194 |
--------------------------------------------------------------------------------
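The `*_faiss.index` and `*_annoy.index` files loaded above are produced by es/train_search_model.py. As a rough sketch of what building a compatible Faiss index could look like, assuming 512-dim vectors in the .npy layout written by write_vecs2bin.py (the sheet name and `nlist` value are made up; `search_faiss` sets `index.nprobe`, which implies an IVF index):

```python
import faiss
import numpy as np

# drop the all-zero placeholder row, as the readers do
vecs = np.load("faq/bert_vect/领域1_bert_vecs.npy")[1:].astype("float32")
dim, nlist = 512, 10                # nlist (cluster count) is a made-up value
quantizer = faiss.IndexFlatL2(dim)  # coarse quantizer for the IVF index
index = faiss.IndexIVFFlat(quantizer, dim, nlist)
index.train(vecs)                   # k-means training on the question vectors
index.add(vecs)
faiss.write_index(index, "es/search_model/领域1_faiss.index")
```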
/image/BEFAQ 框架.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hhzrd/BEFAQ/955d1780a2625b805f3ebe1649d96d16df820254/image/BEFAQ 框架.png
--------------------------------------------------------------------------------
/logs/.gitkeep:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file !.gitkeep
--------------------------------------------------------------------------------
/model/.gitkeep:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file !.gitkeep
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | sentence_transformers==1.2.0
2 | jieba==0.39
3 | elasticsearch==7.7.0
4 | annoy==1.16.3
5 | xlrd==1.2.0
6 | numpy==1.18.2
7 | faiss_cpu==1.6.3
8 | sanic==20.6.3
9 | scikit_learn==0.23.2
10 | transformers==4.6.1
11 | python-Levenshtein==0.12.2
12 | gensim==3.8.3
13 | uvloop==0.14.0
--------------------------------------------------------------------------------
/src/associative_questions_server.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-06-12 08:15:51
6 | LastEditTime: 2021-06-18 16:28:41
7 | @Description:
8 | '''
9 | from sanic import Sanic
10 | import sanic
11 | import configparser
12 | import os
13 | import sys
14 | os.chdir(sys.path[0])
15 | sys.path.append("../")
16 | from es.es_search_cn import SearchData4Association
17 | from common.response_add_head import res_with_head
18 | from common.kill_program import kill_port
19 |
20 |
21 | dir_name = os.path.abspath(os.path.dirname(__file__))
22 | search_data = SearchData4Association()
23 |
24 |
25 | # the endpoint returns JSON data
26 | # Sanic app serving the associative-questions service
27 | app = Sanic("associative questions")
28 |
29 |
30 | @app.route("/associative_questions", methods=["POST", "HEAD"])
31 | async def associative_questions(request):
32 |
33 | # request parameters
34 | current_question = str(request.form.get("current_question"))
35 | limit_num = int(request.form.get("limit_num"))
36 | owner_name = str(request.form.get("owner_name"))
37 | if_middle = int(request.form.get("if_middle", default=1))
38 | if if_middle == 1:
39 | if_middle = True
40 | elif if_middle == 0:
41 | if_middle = False
42 | else:
43 | if_middle = True
44 |
45 | maybe_original_questions = search_data.search_question_cn(
46 | owner_name=owner_name, current_question=current_question, limit_num=limit_num, if_middle=if_middle)
47 |
48 | answer_json = {}
49 | answer_json["code"] = "1"
50 | answer_json["msg"] = "OK"
51 | answer_json["data"] = {
52 | "message": maybe_original_questions}
53 | return res_with_head(answer_json)
54 |
55 |
56 | @app.route("/", methods=["GET", "HEAD"])
57 | async def alibaba_operator_check(request):
58 | print("alibaba SLB checking server status")
59 | return sanic.response.text("OK")  # text() requires a str body
60 |
61 |
62 | if __name__ == "__main__":
63 | root_config = configparser.ConfigParser()
64 | root_config.read(os.path.join(
65 | dir_name, "../config/associative_questions_config.ini"))
66 | port = int(root_config["ServerAddress"]["port"])
67 |
68 | kill_port(port)
69 |
70 | app.run(host="0.0.0.0",
71 | port=port,
72 | workers=int(root_config["ServerInfo"]["work_number"]),
73 | debug=False, access_log=False)
74 |
--------------------------------------------------------------------------------
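A request sketch against the endpoint above, using only the standard library (the port is a placeholder for the value in config/associative_questions_config.ini):

```python
import urllib.parse
import urllib.request

data = urllib.parse.urlencode({
    "current_question": "如何更换",
    "limit_num": 5,
    "owner_name": "领域1",  # must match a sheet/domain name in the Excel data
    "if_middle": 1,
}).encode()
req = urllib.request.Request(
    "http://127.0.0.1:8090/associative_questions", data=data)  # placeholder port
print(urllib.request.urlopen(req).read().decode("utf8"))
```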
/src/main_faq.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-12 20:46:56
6 | @Description: Main program of the FAQ service
7 | '''
8 | import time
9 | import jieba
10 | import configparser
11 | from sanic import Sanic
12 | from sanic.response import json
13 | from sanic import response
14 | import os
15 | import sys
16 | os.chdir(sys.path[0])
17 | sys.path.append("../")
18 | from common.kill_program import kill_port
19 | from es.es_search_cn import SearchData4Association
20 | from common.response_add_head import res_with_head
21 | from faq.jieba4befaq import JiebaBEFAQ
22 | from faq.retrieval_es import SearchData
23 | from faq.matching_operate import Matching
24 | from faq.deduplicate_threshold_op import DeduplicateThreshold
25 | from faq.re_rank import ReRank
26 | from faq.get_final_data import FinalData
27 |
28 |
29 | dir_name = os.path.abspath(os.path.dirname(__file__))
30 |
31 | faq_config = configparser.ConfigParser()
32 | faq_config.read(os.path.join(dir_name, "../config/befaq_conf.ini"))
33 | consine_weight = float(faq_config["AlgorithmConfiguration"]["consine"])
34 | jaccard_weight = float(faq_config["AlgorithmConfiguration"]["jaccard"])
35 | BM25_weight = float(faq_config["AlgorithmConfiguration"]["BM25"])
36 | edit_distance_weight = float(faq_config["AlgorithmConfiguration"]["edit_distance"])
37 | use_faiss = int(faq_config["AlgorithmConfiguration"]["use_faiss"])
38 | use_annoy = int(faq_config["AlgorithmConfiguration"]["use_annoy"])
39 | engine_num = int(faq_config["Faiss_Annoy_Configuration"]["engine_num"])
40 | ES_num = int(faq_config["ESConfiguration"]["ES_num"])
41 | use_other_when_es_none = int(faq_config["AlgorithmConfiguration"]["use_other_when_es_none"])
42 | if use_other_when_es_none == 1:
43 | use_other_when_es_none = True
44 | else:
45 | use_other_when_es_none = False
46 |
47 |
48 | jiebaBEFAQ = JiebaBEFAQ()
49 | search_data = SearchData()
50 | match_ing = Matching()
51 | rerank = ReRank()
52 | final_data = FinalData()
53 | deduplicate_threshold = DeduplicateThreshold()
54 | search_data4association = SearchData4Association()
55 |
56 | # Sanic app serving the FAQ service
57 | app = Sanic("Feedback BEFAQ")
58 |
59 |
60 | @app.route("/BEFAQ", methods=["POST", "HEAD"])
61 | async def myfaq(request):
62 | orgin_query = str(request.form.get("question"))
63 | owner_name = str(request.form.get("owner_name"))
64 | get_num = int(request.form.get("get_num", default=3))
65 | threshold = float(request.form.get("threshold", default=0.5))
66 |
67 | # jieba segmentation for the ES query
68 | process_query = jiebaBEFAQ.seg_sentence(
69 | sentence=orgin_query)
70 | query_terms = jieba.cut(process_query)
71 | query_word_list = list(query_terms)
72 |
73 | maybe_original_questions, maybe_process_questions, maybe_answers, retrieval_q_ids, specific_q_ids = search_data.search_merge(
74 | owner_name=owner_name, question=orgin_query, query_word_list=query_word_list, use_faiss=use_faiss, use_annoy=use_annoy, engine_limit_num=engine_num, ES_limit_num=ES_num, use_other_when_es_none=use_other_when_es_none)
75 |
76 | if len(retrieval_q_ids) > 0: # ES (or Faiss or Annoy) recalled some data
77 | # cosine_sim takes maybe_original_questions as retrieval_questions and the raw, unprocessed query as orgin_query
78 | consin_sim = match_ing.cosine_sim(
79 | orgin_query=orgin_query, retrieval_questions=maybe_original_questions, owner_name=owner_name)
80 | print("consin_sim:", consin_sim)
81 |
82 | # jaccard_sim takes maybe_process_questions as retrieval_questions and the stopword-stripped query as orgin_query
83 | jaccard_sim = match_ing.jaccard_sim(
84 | orgin_query=process_query, retrieval_questions=maybe_process_questions)
85 | print("jaccard_sim:", jaccard_sim)
86 |
87 | bm25_sim = match_ing.bm25_sim(
88 | orgin_query=process_query, retrieval_questions=maybe_process_questions)
89 | print("bm25_sim:", bm25_sim)
90 |
91 | edit_distance_sim = match_ing.edit_distance_sim(
92 | orgin_query=process_query, retrieval_questions=maybe_process_questions)
93 | print("edit_distance_sim:", edit_distance_sim)
94 |
95 | re_rank_sim = rerank.linear_model(
96 | consin_sim=consin_sim, jaccard_sim=jaccard_sim, bm25_sim=bm25_sim, edit_distance_sim=edit_distance_sim,
97 | consine_weight=consine_weight, jaccard_weight=jaccard_weight, BM25_weight=BM25_weight, edit_distance_weight=edit_distance_weight)
98 |
99 | print("retrieval_q_ids:", retrieval_q_ids)
100 | print("maybe_original_questions:", maybe_original_questions)
101 | print("maybe_process_questions:", maybe_process_questions)
102 | print("re_rank_sim:", re_rank_sim)
103 |
104 | high_confidence_q_id_pos = deduplicate_threshold.dedu_thr(
105 | q_ids=retrieval_q_ids, re_rank_sim_list=re_rank_sim, threshold=threshold)
106 | print("high_confidence_q_id_pos:", high_confidence_q_id_pos)
107 |
108 | return_data = final_data.get_qa(
109 | high_confidence_q_id_pos, maybe_original_questions, maybe_answers, re_rank_sim=re_rank_sim, get_num=get_num, retrieval_q_ids=retrieval_q_ids, specific_q_ids=specific_q_ids)
110 |
111 | print("return_data", return_data)
112 | return json(return_data)
113 | else: # nothing was recalled from ES
114 | return_data = []
115 | return json(return_data)
116 |
117 |
118 | @app.route("/associative_questions", methods=["POST", "HEAD"])
119 | async def associative_questions(request):
120 | # request parameters
121 | current_question = str(request.form.get("current_question"))
122 | limit_num = int(request.form.get("limit_num"))
123 | owner_name = str(request.form.get("owner_name"))
124 | if_middle = int(request.form.get("if_middle", default=1))
125 | if if_middle == 1:
126 | if_middle = True
127 | elif if_middle == 0:
128 | if_middle = False
129 | else:
130 | if_middle = True
131 |
132 | maybe_original_questions = search_data4association.search_question_cn(
133 | owner_name, current_question, limit_num, if_middle)
134 |
135 | answer_json = {}
136 | answer_json["code"] = "1"
137 | answer_json["msg"] = "OK"
138 | answer_json["data"] = {
139 | "message": maybe_original_questions}
140 | return res_with_head(answer_json)
141 |
142 |
143 | @app.route("/", methods=["GET", "HEAD"])
144 | async def alibaba_operator_check(request):
145 | print("alibaba SLB checking server status")
146 | return response.text("OK")  # text() requires a str body
147 |
148 |
149 | if __name__ == "__main__":
150 |
151 | port = int(faq_config["ServerAddress"]["port"])
152 | kill_port(port)
153 | # start the HTTP service
154 | app.run(host="0.0.0.0",
155 | port=port,
156 | workers=int(faq_config["ServerInfo"]["work_number"]),
157 | debug=False, access_log=False)
158 |
--------------------------------------------------------------------------------
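Finally, a request sketch against the /BEFAQ endpoint above, again stdlib-only (the port is a placeholder for the value in config/befaq_conf.ini):

```python
import urllib.parse
import urllib.request

data = urllib.parse.urlencode({
    "question": "如何评价设计师",
    "owner_name": "领域1",  # an example sheet/domain name
    "get_num": 3,
    "threshold": 0.5,
}).encode()
req = urllib.request.Request(
    "http://127.0.0.1:8080/BEFAQ", data=data)  # placeholder port
print(urllib.request.urlopen(req).read().decode("utf8"))
# expected shape: [{"q_id": ..., "specific_q_id": ..., "question": ...,
#                   "answer": ..., "confidence": ...}, ...]
```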