├── .gitignore ├── LICENSE ├── README.md ├── bert_server └── sentence_bert_server.py ├── common ├── get_ip.py ├── kill_program.py └── response_add_head.py ├── config ├── associative_questions_config.ini ├── befaq_conf.ini ├── es.ini └── sheetname.conf ├── data └── 线上用户反馈回复.xls ├── docker ├── README.md └── docker-compose.yml ├── es ├── es_create_index.py ├── es_del_data.py ├── es_del_index.py ├── es_operate.py ├── es_search_cn.py ├── jieba_befaq.py ├── read_excel.py ├── search_engines_operate.py ├── search_model │ └── .gitkeep ├── stopwords4_process_question_dedup.txt ├── train_search_model.py ├── userdict.txt ├── write_data2es.py └── write_vecs2bin.py ├── faq ├── bert_vect │ └── .gitkeep ├── deduplicate_threshold_op.py ├── get_final_data.py ├── get_question_vecs.py ├── jieba4befaq.py ├── matching_operate.py ├── re_rank.py └── retrieval_es.py ├── image └── BEFAQ 框架.png ├── logs └── .gitkeep ├── model └── .gitkeep ├── requirements.txt └── src ├── associative_questions_server.py └── main_faq.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea* 2 | .vscode 3 | __pycache__ 4 | nohup.out 5 | *.m 6 | log*.* 7 | search_model 8 | bert_vect 9 | model -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BEFAQ 2 | 3 | **BEFAQ(BERT-based Embedding Frequently Asked Question)** 开源项目是好好住面向多领域FAQ集合的问答系统框架。
4 |
We apply the Sentence BERT model to FAQ question answering. Developers can use BEFAQ to quickly build and customize an FAQ system for their own business scenarios.<br>
5 | 6 | ## Advantages of BEFAQ 7 | 8 |<br>
(1) Uses Elasticsearch, Faiss, and Annoy as recall engines<br>
9 |
(2) Uses Sentence BERT semantic vectors (Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks); a minimal sketch of this semantic matching idea follows this list<br>
10 |
(3) Good support for synonymous questions<br>
11 |
(4) Supports multi-domain corpora (recall is restricted to the corresponding domain, so even the same question can get different answers in different domains)<br>
12 |
(5) Provides an interface that suggests candidate questions based on the user's current input (suggest)<br>
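The semantic matching behind advantages (1) and (2) can be sketched in a few lines. This is only a minimal illustration, not BEFAQ's actual pipeline (the real system recalls candidates via ES keywords plus Faiss/Annoy indexes and then re-ranks with a weighted combination of several similarity scores); it assumes sentence-transformers is installed and the distiluse-base-multilingual-cased model from step 2.3 below has been downloaded. The normalization mirrors the one in bert_server/sentence_bert_server.py:

    from sentence_transformers import SentenceTransformer
    import numpy as np

    model = SentenceTransformer("distiluse-base-multilingual-cased")

    def normalize(vec):
        # unit-length vectors make the dot product equal to cosine similarity
        norm = np.linalg.norm(vec)
        return vec if norm == 0 else vec / norm

    faq_questions = ["如何评价设计师", "怎样把个人设计师转成机构设计师"]
    faq_vecs = np.array([normalize(v) for v in model.encode(faq_questions)])

    query_vec = normalize(model.encode(["如何评价设计师"])[0])
    scores = faq_vecs @ query_vec  # cosine similarity against every FAQ question
    best = int(np.argmax(scores))
    print(faq_questions[best], float(scores[best]))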
13 |
14 |
15 | ## The BEFAQ framework is shown below
16 | ![image](https://github.com/hhzrd/BERT-Embedding-Frequently-Asked-Question/blob/xiao/docker/image/BEFAQ%20%E6%A1%86%E6%9E%B6.png)
17 |
18 |
19 | ## How to use
20 | ### 1. Using docker (the docker image already contains Es 7.6.1, kibana, the IK analyzer and synonym support; the BEFAQ code is also included in the docker.)
21 | We recommend docker as the quickest way to get started. For startup instructions, see the README.md in the docker folder under the project root.
22 |
23 | ### 2. Using BEFAQ without docker
24 |
25 |
26 | #### 2.1. Install Es 7.6.1 and the matching kibana locally, and configure the IK analyzer and synonym support for Es
27 | Please follow the blog post [ES (Elasticsearch) 7.6.1 installation guide](https://blog.csdn.net/weixin_37792714/article/details/108025200). If you have already configured Es, the IK analyzer, and synonyms, you can skip this step, but remember to sync the synonyms into your Es. For convenience, all related files can be downloaded from Baidu Netdisk: https://pan.baidu.com/s/1PxgINf6Q1UZBtcsYw6FU0w password: 4q9h
28 |
29 |
30 | For convenience, BEFAQ offers two ways of connecting to Elasticsearch: with username/password authentication or without it. See the comments in the es.ini configuration file in the config folder under the project root for how to switch. Our blog post describes how to configure a username and password for Elasticsearch.
31 |
32 |
33 |
34 | #### 2.2. Download the project code and create the BEFAQ virtual environment
35 |
36 | conda create -n befaq python=3.6 -y
37 | source activate befaq
38 | git clone https://github.com/hhzrd/BERT-Embedding-Frequently-Asked-Question.git
39 | Enter the BEFAQ root directory, then
40 | pip install -r requirements.txt
41 |
42 | #### 2.3. Download the sentence-transformers multilingual pretrained model
43 |
44 | First enter the project root directory, then
45 | cd model
46 | wget https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/distiluse-base-multilingual-cased.zip
47 | unzip distiluse-base-multilingual-cased.zip
48 | Put the model files directly under the model folder.
49 | If the latest model raises errors (with sentence_transformers==0.3.0), download the older model from Baidu Netdisk (compatible with sentence_transformers==0.3.0 and transformers==3.0.2). The sentence_transformers dependency used by BEFAQ has since been upgraded to version 1.2.0.
50 |
51 | #### 2.4. Excel data format
52 | If you just want to get the code running first, you can skip configuring your own data.
53 |
54 | Put the Excel file under data/ in the project root; the current sample file is named "线上用户反馈回复.xls". The Excel file is the source of the QA data, and its contents are written into Es. After downloading the source code, open this file to see a concrete example of the data.
55 |
56 | Each sheet name represents a domain; for example, the first domain is called "领域1". The first column is the name of the person who entered the data and may be empty. The second column is the answer and must not be empty. The third column is the original question and must not be empty. Columns after the third hold synonymous questions; there is no limit on their number, so a row may have many or none at all. One record per row.
57 |
58 | The sheet named "词典" holds the user dictionary. For example, if you do not want the word "好好住" to be split during tokenization, put it in this dictionary, one entry per row. The program reads it automatically into the expected location (for jieba tokenization), but the IK analyzer's custom dictionary in Es has to be maintained by yourself.
59 | The sheet named "停用词" holds the stopword dictionary, one entry per row. The program reads it automatically into the expected location.
60 | The sheet named "同义词" holds the synonyms. The first column is the original word; the second and later columns are its synonyms. For example, 番茄 and 西红柿 are synonyms: put 番茄 in the first column and 西红柿 in the second, one record per row. The synonym data has to be written into Es's synonym file by yourself; see the ES (Elasticsearch) 7.6.1 installation blog mentioned above. Because the machine you are on is not necessarily the Es server, BEFAQ does not write it for you.
61 |
62 | Synonyms, the user dictionary, and stopwords are shared across all domains. The dictionary and stopwords are used by BEFAQ's jieba tokenizer; the synonyms are used by Es.
63 |
64 | You can put data for many domains into the Excel file; which domains are actually read is configured in sheetname.conf in the config folder under the project root.
65 |
66 | #### 2.5. Edit the BEFAQ configuration files
67 |
68 | data/线上用户反馈回复.xls in the project root is the source of the QA data, which is written into Es. If you just want to get the code running first, you can leave the data as it is.
69 | sheetname.conf in the config folder is the configuration for reading the Excel data. If you just want to get the code running first, you can leave it unchanged.
70 | es.ini in the config folder is BEFAQ's ES configuration. This file must be edited even if you only want to get the code running: it holds the Es IP (or domain name) and port and the Es username and password, and BEFAQ can only connect to your Es if they match your setup.
71 | befaq_conf.ini in the config folder is BEFAQ's main configuration. If you just want to get the code running first, you can leave it unchanged.
72 |
73 |
74 | #### 2.6. How to start the BEFAQ service
75 |
76 | Enter the project root directory, then
77 | source activate befaq
78 | cd es
79 |
80 | Write the data from the Excel file into Es
81 | python write_data2es.py
82 |
83 | Turn the questions into Sentence BERT vectors and save them to binary files, so the question vectors can be loaded later.
84 | python write_vecs2bin.py
85 |
86 | Train the Faiss and Annoy models
87 | python train_search_model.py
88 |
89 | Start the BEFAQ service (if the data has not changed, later startups only need this step)
90 | Enter the project root directory (cd ..), then
91 | cd src
92 | Start the BEFAQ service
93 | python main_faq.py
94 | Or start it in the background
95 | nohup python -u main_faq.py > "../logs/log_$(date +"%Y-%m-%d-%H").txt" 2>&1 &
96 |
97 | Check whether it is running
98 | ps -ef|grep main_faq.py
99 |
100 | Test BEFAQ from a terminal. The BEFAQ service takes POST requests. (Replace 127.0.0.1 with your own IP.)
101 |
102 | curl -d "question=如何评价设计师&get_num=3&threshold=0.5&owner_name=领域1" http://127.0.0.1:8129/BEFAQ
103 |
104 | Endpoint URL:
105 | http://127.0.0.1:8129/BEFAQ
106 | Parameters:
107 | question: the user's question. Required
108 | get_num: the maximum number of results to return. Optional, default 3
109 | threshold: only results whose similarity is greater than or equal to this threshold are returned. Optional, default 0.5
110 | owner_name: the name of the data owner, i.e. the sheet name of the corresponding domain in the Excel file; used to separate multi-domain data. Required
111 |
112 | Response format:
113 | [
114 |     {
115 |         "q_id": 2,
116 |         "specific_q_id": 3,
117 |         "question": "如何评价设计师",
118 |         "answer": "你好。点击认证设计师头像,进入TA的个人主页,点击左下角「评价」即可进行评价。此外,设计师的荣耀值是根据设计师的站内数据综合计算,无法直接打分的哦。感谢你的支持。",
119 |         "confidence": 1.0
120 |     },
121 |     {
122 |         "q_id": 6,
123 |         "specific_q_id": 7,
124 |         "question": "怎样把个人设计师转成机构设计师",
125 |         "answer": "你好,可以登录好好住官网,再次点击提交设计师认证资料,即可重新修改哟;",
126 |         "confidence": 0.6
127 |     }
128 | ]
129 |
130 |
131 | #### 2.7. How to start BEFAQ's question-suggestion service
132 |
133 | To enable suggesting questions from the user's current input:
134 | Enter the project root directory, then
135 | cd src
136 | python associative_questions_server.py
137 | Or start it in the background
138 | nohup python -u associative_questions_server.py >/dev/null 2>&1 &
139 |
140 | Check whether it is running
141 | ps -ef|grep associative_questions_server.py
142 |
143 |
144 | Test the suggestion feature from a terminal. The service takes POST requests. (If it is not running locally, replace 127.0.0.1 with your own IP.)
145 | curl -d "current_question=设计师&limit_num=3&owner_name=领域1&if_middle=1" http://127.0.0.1:8128/associative_questions
146 |
147 | Endpoint URL:
148 | http://127.0.0.1:8128/associative_questions
149 | Parameters:
150 | current_question: the text the user has typed so far. Required
151 | limit_num: the maximum number of results to return. Required
152 | owner_name: the name of the data owner, used to separate multi-domain data. Required
153 | if_middle: whether the input may match in the middle of a question. Optional; default 1 (1 = allowed, 0 = left-anchored matching only).
154 |
155 | Response format:
156 | {
157 |     "code": "1",
158 |     "msg": "OK",
159 |     "data": {
160 |         "message": [
161 |             "按地区找设计师",
162 |             "设计师可以选择同城吗",
163 |             "怎样把个人设计师转成机构设计师"
164 |         ]
165 |     }
166 | }
167 |
168 | ## Authors
169 |
170 |<br>
The main contributors to this project are:<br>
171 | * [肖轶超](https://github.com/xiaoyichao)(好好住)
172 | * [徐忠杰](https://github.com/461025412)(好好住)
173 | * [王得祥](https://github.com/oksite)(好好住)
174 | * [向泳州](https://github.com/XiangYongzhou)(好好住)
175 | * [辛少普](https://github.com/hhzrd)(好好住)
176 |
177 | ## References
178 |
179 |<br>
[1] [Baidu AnyQ](https://github.com/baidu/AnyQ)<br>
180 |
[2] [sentence-transformers](https://github.com/UKPLab/sentence-transformers)
181 |
[3] [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084)
182 | 183 | ## Copyright and License 184 | 185 | BEFAQ is provided under the [Apache-2.0 license](https://github.com/baidu/AnyQ/blob/master/LICENSE). 186 | -------------------------------------------------------------------------------- /bert_server/sentence_bert_server.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-06-11 08:42:52 6 | LastEditTime: 2021-06-18 17:41:43 7 | @Description: 获取SentenceBERT的向量 8 | ''' 9 | 10 | import numpy as np 11 | import torch 12 | import os 13 | import configparser 14 | from sentence_transformers import SentenceTransformer 15 | 16 | dir_name = os.path.abspath(os.path.dirname(__file__)) 17 | 18 | faq_config = configparser.ConfigParser() 19 | faq_config.read(os.path.join(dir_name, "../config/befaq_conf.ini")) 20 | Sentence_BERT_path = os.path.join(dir_name, "../", str( 21 | faq_config["AlgorithmConfiguration"]["Sentence_BERT_path"])) 22 | 23 | 24 | class SentenceBERT(object): 25 | ''' 26 | Author: xiaoyichao 27 | param {type} 28 | Description: SentenceBERT 29 | ''' 30 | 31 | def __init__(self): 32 | self.model = SentenceTransformer(Sentence_BERT_path) 33 | if torch.cuda.is_available(): 34 | self.model = self.model.to(torch.device("cuda")) 35 | print("Sentenence BERT使用的设备为:%s" % self.model.device) 36 | 37 | def normalize(self, vec): 38 | ''' 39 | Author: xiaoyichao 40 | param {type} 41 | Description: 矢量在用于相似度计算之前被归一化为单位长度,使得余弦相似性和点积相当。参考文章https://www.thinbug.com/q/41387000 42 | ''' 43 | norm = np.linalg.norm(vec) 44 | if norm == 0: 45 | return vec 46 | return vec/norm 47 | 48 | def get_bert(self, sentence_list): 49 | ''' 50 | Author: xiaoyichao 51 | param {type} 52 | Description: 返回(512,)纬度的SentenceBERT向量 53 | ''' 54 | sentences_vec = [] 55 | sentences_vec = np.array(self.model.encode(sentence_list)) 56 | sentences_vec_mean = np.mean(sentences_vec, axis=0).reshape(-1, 512) 57 | # sentences_vec_max = np.max(sentences_vec, axis=0).reshape(-1, 512) 58 | return np.array([self.normalize(sentences_vec_mean[0])]) 59 | 60 | def get_object(self): 61 | ''' 62 | Author: xiaoyichao 63 | param {type} 64 | Description: 返回SentenceBERT的对象 65 | ''' 66 | return self.model 67 | 68 | 69 | # # # 测试demo 70 | if __name__ == '__main__': 71 | sentenceBERT = SentenceBERT() 72 | sentences_vec = sentenceBERT.get_bert(sentence_list=["如何评价设计师"]) 73 | print(sentences_vec.shape) 74 | print(sentences_vec) 75 | -------------------------------------------------------------------------------- /common/get_ip.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-02-05 14:35:28 6 | LastEditTime: 2020-08-13 21:37:43 7 | @Description: 查询本机ip地址 8 | ''' 9 | import socket 10 | 11 | 12 | def get_host_ip(): 13 | ''' 14 | Author: xiaoyichao 15 | param {type} 16 | Description: 查询本机ip地址 17 | ''' 18 | try: 19 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 20 | s.connect(('8.8.8.8', 80)) 21 | ip = s.getsockname()[0] 22 | finally: 23 | s.close() 24 | 25 | return ip 26 | -------------------------------------------------------------------------------- /common/kill_program.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | Date: 2020-08-20 11:09:45 6 | LastEditTime: 2021-07-08 11:50:42 7 | Description: kill 进程 8 | ''' 9 | import os 10 | 11 
| 12 | def kill_port(port):
13 |     '''
14 |     @Author: xiaoyichao
15 |     @param {*}
16 |     @Description: kill the process listening on the given port (relies on the lsof CLI and sends SIGKILL, so use with care)
17 |     '''
18 |     find_kill = "kill -9 $(lsof -i:%d -t)" % port
19 |     try:
20 |         result = os.popen(find_kill)
21 |         print("successfully killed the process on port %d" % port)
22 |         return result.read()
23 |     except Exception:
24 |         print("failed to kill the process on port %d" % port)
--------------------------------------------------------------------------------
/common/response_add_head.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-04-23 15:52:51
6 | LastEditTime: 2021-03-10 18:04:03
7 | @Description: add CORS headers to API responses
8 | '''
9 |
10 | from sanic.response import json
11 |
12 |
13 | def res_with_head(data_json):
14 |     '''
15 |     Author: xiaoyichao
16 |     param {type}
17 |     Description: add CORS headers to API responses
18 |     '''
19 |     return json(
20 |         data_json,
21 |         headers={
22 |             "Access-Control-Allow-Origin": "*",
23 |             "Access-Control-Allow-Methods": "OPTIONS,HEAD,GET,POST",
24 |             "Access-Control-Allow-Headers": "x-requested-with"},
25 |         status=200
26 |     )
27 |
--------------------------------------------------------------------------------
/config/associative_questions_config.ini:
--------------------------------------------------------------------------------
1 | [ServerAddress]
2 | port = 8128
3 | # port of the question-suggestion service
4 | [ServerInfo]
5 | work_number = 1
6 | # number of worker processes. Multiple processes are supported.
7 |
--------------------------------------------------------------------------------
/config/befaq_conf.ini:
--------------------------------------------------------------------------------
1 | [ServerAddress]
2 | port = 8129
3 | # port of the BEFAQ service
4 | [ServerInfo]
5 | work_number = 1
6 | # number of worker processes; currently only a single process is supported
7 | [ESConfiguration]
8 | ES_num = 10
9 | # number of candidates recalled from ES
10 | [Faiss_Annoy_Configuration]
11 | engine_num = 5
12 | # number of candidates recalled by Faiss and/or Annoy
13 | [AlgorithmConfiguration]
14 | Sentence_BERT_path = ./model/
15 | # relative path of the multilingual Sentence_BERT model; no need to change it unless you have special requirements.
16 | consine = 0.6
17 | # weight of cosine similarity (in the high-dimensional Sentence_BERT space) in the linear model
18 | jaccard = 0.2
19 | # weight of the Jaccard coefficient in the linear model
20 | BM25 = 0.1
21 | # weight of the BM25 score in the linear model
22 | edit_distance = 0.1
23 | # weight of edit distance in the linear model
24 | use_other_when_es_none = 0
25 | # 0: use faiss and/or annoy only when ES recalls nothing. 1: use Faiss and/or Annoy even when ES has results.
26 | # 0 is recommended: Faiss and/or Annoy always recall the requested number of items, which can pull in noisy candidates you never wanted and hurt the later similarity computation.
27 | # In BEFAQ's design, ES recalls on the keywords produced by jieba, which is more controllable; falling back to Faiss and/or Annoy only when ES recalls nothing works better.
28 | use_faiss = 1
29 | # whether to use Faiss: 1 = yes, 0 = no.
30 | use_annoy = 0
31 | # whether to use Annoy: 1 = yes, 0 = no.
32 | # Faiss and Annoy can both be enabled or both disabled. Using only Faiss is recommended.
33 | # When both are disabled, use_other_when_es_none has no effect, because only ES is left to recall data
34 |
35 | [ServerInfo4Association]
36 | work_number = 2
37 |
--------------------------------------------------------------------------------
/config/es.ini:
--------------------------------------------------------------------------------
1 | [ServerAddress]
2 | # if Es runs in a docker container
3 | # es_server_ip_port = http://elasticsearch4befaq:9200
4 | # if Es runs on this machine
5 | es_server_ip_port = http://127.0.0.1:9200
6 | # if Es runs on another server, replace this with your own ES IP or domain name and port
7 | #es_server_ip_port = http://xxx.xx.xx.xx:9200
8 | # the Es docker we provide has no password; if you use it, set if_es_use_passwd = 0
9 | # if_es_use_passwd = 1 means BEFAQ connects to Es with username + password; 0 means no authentication, in which case http_auth_user_name and http_auth_password are ignored.
10 | if_es_use_passwd = 0
11 | # Es login username
12 | http_auth_user_name = your Elasticsearch user_name
13 | # Es login password
14 | http_auth_password = your Elasticsearch password
15 |
16 | [ServerInfo]
17 | 
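# BEFAQ serves queries through the alias below while keeping two physical indexes: es/write_data2es.py rebuilds the idle index and then swaps the alias, so a data refresh does not interrupt the running service.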
index_name_1 = index_faq_1 18 | # ES的索引1 的name 19 | index_name_2 = index_faq_2 20 | # ES的索引2 的name 21 | alias_name = index_faq 22 | # ES的索引别名name 23 | -------------------------------------------------------------------------------- /config/sheetname.conf: -------------------------------------------------------------------------------- 1 | [excel_name] 2 | name = 线上用户反馈回复.xls 3 | # 数据所在的Excel的名称。 Excel的路径为项目根目录下的 data/线上用户反馈回复.xls 4 | [QA_sheets] 5 | # 想要读取的多领域语料的sheet名,程序会把这些数据写入到ES中。 6 | sheets = 领域1,领域2,领域3,领域4 7 | [Synonyms] 8 | sheet = 同义词 9 | # 同义词的数据需要自己写到ES的同义词表中,具体文件路径请参看我写的ES安装过程的博客 10 | [Stopwords] 11 | sheet = 停用词 12 | # BEFAQ的jieba停用词表,程序会自动读取到 es/stopwords4_process_question_dedup.txt中 13 | [Userdict] 14 | # BEFAQ的jieba字典,程序会自动读取到 es/userdict.txt中 15 | sheet = 词典 -------------------------------------------------------------------------------- /data/线上用户反馈回复.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hhzrd/BEFAQ/955d1780a2625b805f3ebe1649d96d16df820254/data/线上用户反馈回复.xls -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # docker 方式启动程序 2 | 3 | ## 1、启动docker集群 4 | 首先请根据自己的系统安装docker-compose,然后才能启动docker-compose。 5 | 交互方式启动 6 | docker-compose up 7 | 后台方式启动 8 | docker-compose up -d 9 | 如果想要停止docker-compose 10 | docker-compose stop 11 | ## 2、进入BEFAQ的doker 12 | Es相关的测试数据已经写到了Es的docker内。如果需要更新数据,请参考项目根目录下的README.md 13 | 进入befaq的docker 14 | docker exec -it befaq /bin/bash 15 | ## 3、启动BEFAQ服务 16 | 进入项目根目录 17 | cd /projects/BERT-Embedding-Frequently-Asked-Question/ 18 | cd es 19 | 将数据从excel中的数据写到Es 20 | python write_data2es.py 21 | 22 | 将问题处理成Sentence BERT 向量,保存到bin类型文件中,便于后期读取问题的向量。 23 | python write_vecs2bin.py 24 | 25 | 训练Faiss和Annoy模型 26 | python train_search_model.py 27 | 28 | 进入src文件夹,启动BEFAQ服务 29 | cd ../src 30 | python main_faq.py 31 | 或者在后台中启动 32 | nohup python -u main_faq.py > "../logs/log_$(date +"%Y-%m-%d-%H").txt" 2>&1 & 33 | 在终端中测试联想功能。服务是post请求。(如果不是本机,请将127.0.0.1替换成自己的ip) 34 | curl -d "question=忘记原始密码如何修改密码?&get_num=3&threshold=0.5&owner_name=领域1" http://127.0.0.1:8129/BEFAQ 35 | 如何手动kill BEFAQ服务 36 | kill -9 $(lsof -i:8129 -t) 37 | ## 4、启动BEFAQ的联想词接口服务 38 | cd /projects/BEFAQ 39 | cd src 40 | python associative_questions_server.py 41 | 或者在后台中启动 42 | nohup python -u associative_questions_server.py >/dev/null 2>&1 & 43 | 在终端中测试联想功能。服务是post请求。(如果不是本机,请将127.0.0.1替换成自己的ip) 44 | curl -d "current_question=设计师&limit_num=3&owner_name=领域1&if_middle=1" http://127.0.0.1:8128/associative_questions 45 | ## 5、测试接口 46 | 请参考项目根目录下的README.md 47 | 48 | -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.1' 2 | services: 3 | kibana: 4 | image: xiaoyichao1993/kibana-7.6.1:latest 5 | container_name: kibana4befaq 6 | links: 7 | - elasticsearch4befaq 8 | ports: 9 | - 5601:5601 10 | 11 | elasticsearch4befaq: 12 | image: xiaoyichao1993/es7-befaq:latest 13 | container_name: es4befaq 14 | cap_add: 15 | - IPC_LOCK 16 | volumes: 17 | - esdata1:/usr/share/elasticsearch/data 18 | ports: 19 | - 9200:9200 20 | environment: 21 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 22 | - cluster.name=befaq-es 23 | - bootstrap.memory_lock=true 24 | - discovery.type=single-node 25 | 26 | befaq: 27 | image: xiaoyichao1993/befaq:latest 28 | container_name: befaq 29 | 
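# the befaq container publishes both BEFAQ ports declared below: 8129 (FAQ API) and 8128 (question-suggestion API)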
links: 30 | - elasticsearch4befaq 31 | ports: 32 | - 8129:8129 33 | - 8128:8128 34 | stdin_open: true 35 | tty: true 36 | depends_on: 37 | - elasticsearch4befaq 38 | 39 | volumes: 40 | esdata1: 41 | driver: local 42 | -------------------------------------------------------------------------------- /es/es_create_index.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-01-02 16:55:23 6 | LastEditTime: 2021-06-25 14:12:52 7 | @Description: 创建一个索引,仅供测试。 8 | 9 | ''' 10 | from es_operate import ESCURD 11 | from elasticsearch import Elasticsearch 12 | import os 13 | import configparser 14 | 15 | 16 | dir_name = os.path.abspath(os.path.dirname(__file__)) 17 | es_config = configparser.ConfigParser() 18 | es_config.read(os.path.join(dir_name, "../config/es.ini")) 19 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"] 20 | 21 | 22 | # 使用配置文件中的index_name,也可以自己命名,创建其他名称的索引 23 | index_name_1 = es_config["ServerInfo"]["index_name_1"] 24 | index_name_2 = es_config["ServerInfo"]["index_name_2"] 25 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"] 26 | if if_es_use_passwd == "1": 27 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"] 28 | http_auth_password = es_config["ServerAddress"]["http_auth_password"] 29 | es_connect = Elasticsearch( 30 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password)) 31 | else: 32 | 33 | es_connect = Elasticsearch( 34 | es_server_ip_port) 35 | 36 | es_faq = ESCURD(es_connect) 37 | 38 | if __name__ == "__main__": 39 | es_faq.create_index(index_name=index_name_1) 40 | es_faq.create_index(index_name=index_name_2) 41 | -------------------------------------------------------------------------------- /es/es_del_data.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-06-19 19:01:17 6 | LastEditTime: 2021-06-25 14:13:35 7 | @Description: 删除索引,仅供测试。 8 | ''' 9 | 10 | from es_operate import ESCURD 11 | from elasticsearch import Elasticsearch 12 | import configparser 13 | import os 14 | import sys 15 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 16 | 17 | 18 | dir_name = os.path.abspath(os.path.dirname(__file__)) 19 | es_config = configparser.ConfigParser() 20 | es_config.read(os.path.join(dir_name, "../config/es.ini")) 21 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"] 22 | 23 | 24 | # 使用配置文件中的index_name,也可以自己命名,创建其他名称的索引 25 | index_name = es_config["ServerInfo"]["index_name_1"] 26 | 27 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"] 28 | if if_es_use_passwd == "1": 29 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"] 30 | http_auth_password = es_config["ServerAddress"]["http_auth_password"] 31 | es_connect = Elasticsearch( 32 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password)) 33 | else: 34 | 35 | es_connect = Elasticsearch( 36 | es_server_ip_port) 37 | 38 | 39 | es_faq = ESCURD(es_connect) 40 | 41 | if __name__ == "__main__": 42 | owner_names = ["领域1,领域2,领域3"] 43 | for owner_name in owner_names: 44 | es_faq.del_data(index_name, owner_name) 45 | -------------------------------------------------------------------------------- /es/es_del_index.py: -------------------------------------------------------------------------------- 1 | # 
coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-01-02 16:55:23 6 | LastEditTime: 2021-06-06 21:54:28 7 | @Description: 删除ES的索引, del_index_name 是要删除的索引的名字 8 | 9 | ''' 10 | 11 | from es_operate import ESCURD 12 | from elasticsearch import Elasticsearch 13 | import os 14 | import sys 15 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 16 | import configparser 17 | import os 18 | import sys 19 | 20 | 21 | dir_name = os.path.abspath(os.path.dirname(__file__)) 22 | es_config = configparser.ConfigParser() 23 | es_config.read(os.path.join(dir_name, "../config/es.ini")) 24 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"] 25 | 26 | 27 | index_name_1 = es_config["ServerInfo"]["index_name_1"] 28 | index_name_2 = es_config["ServerInfo"]["index_name_2"] 29 | 30 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"] 31 | if if_es_use_passwd == "1": 32 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"] 33 | http_auth_password = es_config["ServerAddress"]["http_auth_password"] 34 | es_connect = Elasticsearch( 35 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password)) 36 | else: 37 | 38 | es_connect = Elasticsearch( 39 | es_server_ip_port) 40 | 41 | 42 | es_faq = ESCURD(es_connect) 43 | 44 | if __name__ == "__main__": 45 | es_faq.del_index(index_name=index_name_1) 46 | es_faq.del_index(index_name=index_name_2) 47 | -------------------------------------------------------------------------------- /es/es_operate.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-05-21 15:31:50 6 | LastEditTime: 2021-06-18 15:52:23 7 | @Description: ES相关操作的类 8 | 9 | ''' 10 | from elasticsearch.helpers import bulk 11 | 12 | 13 | class ESCURD(object): 14 | def __init__(self, es): 15 | self.es = es 16 | 17 | def create_index(self, index_name): 18 | ''' 19 | @Author: xiaoyichao 20 | @param {type} 21 | @Description: 创建索引 22 | ''' 23 | mappings_cn = { 24 | "settings": { 25 | "index.max_ngram_diff": 10, 26 | "number_of_shards": 5, 27 | "number_of_replicas": 1, 28 | "analysis": { 29 | "filter": { 30 | "local_synonym": { 31 | "type": "synonym", 32 | "synonyms_path": "synonyms/synonym.txt" 33 | }, 34 | "edge_ngram_filter": { 35 | "type": "edge_ngram", 36 | "min_gram": 1, 37 | "max_gram": 50 38 | } 39 | }, 40 | "analyzer": { 41 | "text_ik": { 42 | "type": "custom", 43 | "tokenizer": "ik_smart", 44 | "filter": ["lowercase"] 45 | }, 46 | "text_ik_s": { 47 | "type": "custom", 48 | "tokenizer": "ik_smart", 49 | "filter": [ 50 | "lowercase", 51 | "local_synonym" 52 | ] 53 | }, 54 | 55 | "save_origin_split": { 56 | "type": "custom", 57 | "tokenizer": "standard", 58 | "filter": [ 59 | "lowercase" 60 | ] 61 | }, 62 | "keyword_cn": { 63 | "type": "custom", 64 | "tokenizer": "keyword", 65 | "filter": [ 66 | "lowercase", 67 | "edge_ngram_filter" 68 | ] 69 | }, 70 | "ngram_tokenizer_analyzer": { 71 | "type": "custom", 72 | "tokenizer": "ngram_tokenizer", 73 | "filter": [ 74 | "lowercase" 75 | ] 76 | } 77 | 78 | }, 79 | "tokenizer": { 80 | "ngram_tokenizer": { 81 | "type": "ngram", 82 | "min_gram": 1, 83 | "max_gram": 6, 84 | "token_chars": [ 85 | "letter", 86 | "digit"] 87 | } 88 | 89 | } 90 | } 91 | }, 92 | "mappings": { 93 | "properties": { 94 | "original_question": { 95 | "type": "text", 96 | "analyzer": "save_origin_split", 97 | "search_analyzer": "save_origin_split" 98 | }, 
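# The two *_cn fields below power the question-suggestion service: "original_question_cn_left" is indexed with keyword_cn (edge_ngram) for left-anchored prefix matching, while "original_question_cn_middle" uses ngram_tokenizer_analyzer so the input can match anywhere in the question.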
99 | "original_question_cn_left": { 100 | "type": "text", 101 | "analyzer": "keyword_cn", 102 | "search_analyzer": "keyword" 103 | }, 104 | "original_question_cn_middle": { 105 | "type": "text", 106 | "analyzer": "ngram_tokenizer_analyzer", 107 | "search_analyzer": "keyword" 108 | }, 109 | "process_question": { 110 | "type": "text", 111 | "analyzer": "text_ik", 112 | "search_analyzer": "text_ik_s" 113 | }, 114 | "answer": { 115 | "type": "text" 116 | }, 117 | "q_id": { 118 | "type": "integer" 119 | }, 120 | "specific_q_id": { 121 | "type": "integer" 122 | }, 123 | "id": { 124 | "type": "integer" 125 | }, 126 | "owner_name": { 127 | "type": "keyword" 128 | } 129 | } 130 | } 131 | } 132 | 133 | if self.es.indices.exists(index=index_name) is True: 134 | print("索引 %s 之前已经存在" % index_name) 135 | else: 136 | self.es.indices.create(index=index_name, body=mappings_cn) 137 | print("成功创建索引: %s" % index_name) 138 | 139 | def del_index(self, index_name): 140 | # 删除索引 141 | if self.es.indices.exists(index=index_name) is True: 142 | res = self.es.indices.delete(index_name) 143 | print("删除索引:", index_name) 144 | return res 145 | else: 146 | print("想要删除的索引 %s 不存在" % index_name) 147 | return 148 | 149 | def del_data(self, index_name, owner_name): 150 | # 删除owner_name对用的数据 151 | query = {'query': {'match': {'owner_name': owner_name}}} 152 | 153 | res = self.es.delete_by_query( 154 | index=index_name, body=query) 155 | print("删除数据:", res) 156 | 157 | def insert_more(self, index_name, actions, owner_name): 158 | ''' 159 | @Author: xiaoyichao 160 | @param {type}: 161 | @Description: 添加多条数据 162 | 163 | ''' 164 | res, _ = bulk(self.es, actions, index=index_name, 165 | raise_on_error=True) 166 | print("%s 向ES中添加了%d条数据" % (owner_name, res)) 167 | 168 | def search_data(self, index_name, owner_name, query_word_list, limit_num): 169 | ''' 170 | @Author: xiaoyichao 171 | @param {type} 172 | @Description: 查询ES数据 173 | ''' 174 | limit_num = int(limit_num) 175 | 176 | should_list = [] 177 | for word in query_word_list: 178 | match = { 179 | "match": { 180 | "process_question": word 181 | } 182 | } 183 | should_list.append(match) 184 | bool_inside_value = {"should": should_list} 185 | list_must_value_2 = {} 186 | list_must_value_2["bool"] = bool_inside_value 187 | 188 | list_must_value_1 = [ 189 | { 190 | "match_phrase": { 191 | "owner_name": owner_name 192 | } 193 | } 194 | ] 195 | 196 | must_list = [] 197 | must_list.append(list_must_value_1) 198 | must_list.append(list_must_value_2) 199 | 200 | dic_bool_value = {} 201 | dic_bool_value["must"] = must_list 202 | 203 | dic_bool = {} 204 | dic_bool["bool"] = dic_bool_value 205 | 206 | doc = {} 207 | doc["query"] = dic_bool 208 | doc["_source"] = ["q_id", "process_question", 209 | "original_question", "answer", "specific_q_id"] 210 | doc["size"] = limit_num 211 | 212 | print("ES查询语句:", doc) 213 | 214 | res = self.es.search( 215 | index=index_name, body=doc) 216 | return res 217 | 218 | def search_cn(self, index_name, owner_name, current_question, search_limit_num, if_middle=True): 219 | ''' 220 | @Author: xiaoyichao 221 | @param {type} 222 | @Description: 查询中文提示词 223 | ''' 224 | search_limit_num = int(search_limit_num) 225 | 226 | doc = {} 227 | if if_middle: # 从中间开始搜索 228 | 229 | doc["query"] = { 230 | "bool": { 231 | "must": [ 232 | [ 233 | { 234 | "match": { 235 | "owner_name": owner_name 236 | } 237 | }, 238 | { 239 | "match": {"original_question_cn_middle": current_question} 240 | } 241 | 242 | ]] 243 | } 244 | } 245 | 246 | else: 247 | 248 | doc["query"] = { 249 | 
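# left-anchored branch (if_middle is false): match against the edge_ngram field, so the input has to match from the first character of the stored question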
"bool": { 250 | "must": [ 251 | [ 252 | { 253 | "match": { 254 | "owner_name": owner_name 255 | } 256 | }, 257 | { 258 | "match": {"original_question_cn_left": current_question} 259 | } 260 | 261 | ]] 262 | } 263 | } 264 | doc["_source"] = ["original_question", "q_id"] 265 | doc["size"] = search_limit_num 266 | 267 | # print("ES查询语句:", doc) 268 | 269 | res = self.es.search( 270 | index=index_name, body=doc) 271 | return res 272 | 273 | def search4search_engine(self, index_name, owner_name, question): 274 | ''' 275 | @Author: xiaoyichao 276 | @param {type} 277 | @Description: 查询annoy或faiss检索出的question的对应信息,例如q_id等 278 | ''' 279 | doc = {} 280 | 281 | doc["query"] = { 282 | "bool": { 283 | "must": [ 284 | [ 285 | { 286 | "match": { 287 | "owner_name": owner_name 288 | } 289 | }, 290 | { 291 | "match_phrase": {"original_question": question} 292 | } 293 | 294 | ]] 295 | } 296 | } 297 | 298 | doc["_source"] = ["q_id", "specific_q_id", "process_question", 299 | "original_question", "answer"] 300 | 301 | print("ES查询语句:", doc) 302 | 303 | res = self.es.search( 304 | index=index_name, body=doc) 305 | return res 306 | 307 | def es_put_alias(self, index_name, alias_name): 308 | ''' 309 | Author: xiaoyichao 310 | param {type} 311 | Description: 添加别名和索引的连接 312 | ''' 313 | res = self.es.indices.put_alias(index=index_name, name=alias_name) 314 | print("添加别名%s和索引%s的连接" % (alias_name, index_name)) 315 | return res 316 | 317 | def es_get_alias(self, alias_name): 318 | ''' 319 | Author: xiaoyichao 320 | param {type} 321 | Description: 获取当前别名下的索引 322 | ''' 323 | try: 324 | res = self.es.indices.get_alias(name=alias_name) 325 | current_index = list(res.keys())[0] 326 | print("获取当前别名%s下的索引" % alias_name) 327 | return current_index 328 | except Exception: 329 | return 330 | 331 | def es_del_alias(self, index_name, alias_name): 332 | ''' 333 | Author: xiaoyichao 334 | param {type} 335 | Description: 删除别名和索引的连接 336 | ''' 337 | try: 338 | res = self.es.indices.delete_alias( 339 | index=index_name, name=alias_name) 340 | print("删除别名%s和索引%s的连接" % (alias_name, index_name)) 341 | return res 342 | except Exception: 343 | return 344 | -------------------------------------------------------------------------------- /es/es_search_cn.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-06-12 07:19:00 6 | LastEditTime: 2021-03-10 19:05:51 7 | @Description: 用于实现搜索框的中文提示词的类 8 | ''' 9 | from elasticsearch import Elasticsearch 10 | import configparser 11 | import os 12 | import sys 13 | os.chdir(sys.path[0]) 14 | sys.path.append("../") 15 | from es.es_operate import ESCURD 16 | 17 | 18 | dir_name = os.path.abspath(os.path.dirname(__file__)) 19 | es_config = configparser.ConfigParser() 20 | es_config.read(os.path.join(dir_name, "../config/es.ini")) 21 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"] 22 | 23 | index_name = es_config["ServerInfo"]["alias_name"] 24 | 25 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"] 26 | if if_es_use_passwd == "1": 27 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"] 28 | http_auth_password = es_config["ServerAddress"]["http_auth_password"] 29 | es_connect = Elasticsearch( 30 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password)) 31 | else: 32 | 33 | es_connect = Elasticsearch( 34 | es_server_ip_port) 35 | 36 | es_faq = ESCURD(es_connect) 37 | 38 | 39 | class SearchData4Association(object): 
40 | # 实现搜索框的中文提示词的类 41 | def search_question_cn(self, owner_name, current_question, limit_num, if_middle): 42 | current_question = current_question.lower() 43 | search_limit_num = 100 44 | 45 | retrieve_data = es_faq.search_cn( 46 | index_name, owner_name, current_question, search_limit_num, if_middle) 47 | 48 | retrieve_results = retrieve_data["hits"] 49 | max_result_len = retrieve_results["total"]["value"] 50 | hits = retrieve_results["hits"] 51 | maybe_original_questions = [] 52 | q_ids = [] 53 | if limit_num < max_result_len: 54 | result_len = limit_num 55 | else: 56 | result_len = max_result_len 57 | for i in range(result_len): 58 | qu_an_id = hits[i]["_source"] 59 | original_question = qu_an_id["original_question"] 60 | q_id = qu_an_id["q_id"] 61 | maybe_original_questions.append(original_question) 62 | q_ids.append(q_id) 63 | q_id_set = set() 64 | deduplication_maybe_questions = [] 65 | # q_id去重复并根据相关度排序 66 | for q_id, maybe_original_question in zip(q_ids, maybe_original_questions): 67 | if q_id not in q_id_set: 68 | deduplication_maybe_questions.append(maybe_original_question) 69 | 70 | return deduplication_maybe_questions 71 | -------------------------------------------------------------------------------- /es/jieba_befaq.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-03-24 13:25:41 6 | LastEditTime: 2021-06-06 21:13:52 7 | @Description: 用于写入ES的process_question字段时去掉同义词。比如,怎样,如何这些词。 8 | ''' 9 | import jieba 10 | import os 11 | dir_name = os.path.abspath(os.path.dirname(__file__)) 12 | 13 | 14 | class StopwordsBEFAQ(object): 15 | 16 | def stopwordslist(self, filepath): 17 | stopwords = [line.strip() for line in open( 18 | filepath, 'r', encoding='utf-8').readlines()] 19 | return set(stopwords) 20 | 21 | # 对句子进行分词 22 | def seg_sentence4faq(self, sentence): 23 | # 创建用户字典 24 | userdict = os.path.join(dir_name, 'userdict.txt') 25 | jieba.load_userdict(userdict) 26 | sentence_seged = jieba.cut(sentence.strip()) 27 | stopwords_file = os.path.join( 28 | dir_name, 'stopwords4_process_question_dedup.txt') 29 | stopwords = self.stopwordslist(stopwords_file) # 这里加载停用词的路径 30 | outstr = "" # 分隔符号 31 | for word in sentence_seged: 32 | if word not in stopwords: 33 | if word != '\t': 34 | outstr += word 35 | outstr += "" # 分隔符号 36 | return outstr 37 | 38 | def seg_sentence4customer_service(self, sentence): 39 | # 创建用户字典 40 | userdict = os.path.join(dir_name, 'userdict.txt') 41 | jieba.load_userdict(userdict) 42 | sentence_seged = jieba.cut(sentence.strip()) 43 | # stopwords_file = os.path.join( 44 | # dir_name, 'stopwords4_process_question_dedup.txt') 45 | # stopwords = self.stopwordslist(stopwords_file) # 这里加载停用词的路径 46 | outstr = "" # 分隔符号 47 | for word in sentence_seged: 48 | # if word not in stopwords: 49 | if word != '\t': 50 | outstr += word 51 | outstr += "" # 分隔符号 52 | return outstr 53 | 54 | -------------------------------------------------------------------------------- /es/read_excel.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | Date: 2020-08-13 11:34:47 6 | LastEditTime: 2021-06-18 16:31:16 7 | Description: 用于读取excel表格的类 8 | ''' 9 | import os 10 | import sys 11 | import xlrd 12 | import configparser 13 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 14 | 15 | dir_name = 
os.path.abspath(os.path.dirname(__file__)) 16 | 17 | 18 | class ExcelData(object): 19 | 20 | def __init__(self): 21 | self.excel_config = configparser.ConfigParser() 22 | self.excel_config.read(os.path.join(dir_name, "../config/sheetname.conf")) 23 | self.sheet_names = self.excel_config["QA_sheets"]["sheets"].split(",") 24 | self.excel_name = self.excel_config["excel_name"]["name"] 25 | self.synonyms_sheet = self.excel_config["Synonyms"]["sheet"] 26 | self.stopwords_sheet = self.excel_config["Stopwords"]["sheet"] 27 | self.excel_file = os.path.join(dir_name, "../data/", self.excel_name) 28 | self.id = 0 29 | 30 | def get_sheet_names(self): 31 | ''' 32 | Author: xiaoyichao 33 | param {type} 34 | Description: 返回要读取的sheet的名称组成的list 35 | ''' 36 | return self.sheet_names 37 | 38 | def read_sheet(self, sheet_name): 39 | ''' 40 | Author: xiaoyichao 41 | param {type} 42 | Description: 读取excel中某个sheet的数据 43 | ''' 44 | try: 45 | book = xlrd.open_workbook(filename=self.excel_file) 46 | table = book.sheet_by_name(sheet_name) 47 | nrows = table.nrows 48 | ncols = table.ncols 49 | sheet_list = [] 50 | for row in range(1, nrows): 51 | for col in range(2, ncols): 52 | cell_value = table.cell(row, col).value 53 | if cell_value != "": 54 | q_id = row 55 | original_question = cell_value 56 | answer = table.cell(row, 1).value 57 | self.id += 1 58 | owner_name = sheet_name 59 | sheet_list.append( 60 | [q_id, original_question, answer, self.id, owner_name]) 61 | return sheet_list 62 | except Exception: 63 | print("Exception") 64 | return [] 65 | 66 | def read_QA_data(self): 67 | ''' 68 | Author: xiaoyichao 69 | param {type} 70 | Description: 读取excel中的问答数据 71 | ''' 72 | excel_list = [] 73 | for sheet_name in self.sheet_names: 74 | sheet_list = self.read_sheet(sheet_name) 75 | excel_list.append(sheet_list) 76 | return excel_list 77 | 78 | 79 | # exceldata = ExcelData() 80 | # excel_list = exceldata.read_QA_data() 81 | # print(excel_list) 82 | -------------------------------------------------------------------------------- /es/search_engines_operate.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-06-19 17:14:35 6 | LastEditTime: 2020-08-25 17:50:47 7 | @Description: 训练annoy文件,不用faiss 是因为faiss不支持float64,最大精度floa32. 
8 | 也有利用annoy 检索的功能 9 | ''' 10 | 11 | from annoy import AnnoyIndex 12 | import faiss 13 | from faiss import normalize_L2 14 | import os 15 | import sys 16 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 17 | from faq.get_question_vecs import ReadVec2bin 18 | 19 | dir_name = os.path.abspath(os.path.dirname(__file__)) 20 | read_vec2bin = ReadVec2bin() 21 | 22 | 23 | class SearchEngine(object): 24 | def train_annoy(self, owner_name): 25 | bert_vecs = read_vec2bin.read_bert_vecs(owner_name=owner_name) 26 | annoy_index_path = os.path.join( 27 | dir_name, './search_model/%s_annoy.index' % owner_name) 28 | tc_index = AnnoyIndex(f=512, metric='angular') 29 | 30 | if os.path.exists(os.path.join(dir_name, './search_model')) is False: 31 | os.mkdir(os.path.join(dir_name, './search_model')) 32 | 33 | if os.path.exists(annoy_index_path): 34 | os.remove(annoy_index_path) 35 | print("删除旧的 %s_annoy.index文件" % owner_name) 36 | 37 | for i, vec in enumerate(bert_vecs): 38 | tc_index.add_item(i, vec) 39 | tc_index.build(100) 40 | tc_index.save(annoy_index_path) 41 | print("写入 %s_annoy.index文件" % owner_name) 42 | 43 | def train_faiss(self, owner_name): 44 | bert_vecs = read_vec2bin.read_bert_vecs(owner_name=owner_name) 45 | d = 512 # dimension 46 | nb = len(bert_vecs) # database size 47 | faiss_index_path = os.path.join( 48 | dir_name, './search_model/%s_faiss.index' % owner_name) 49 | training_vectors = bert_vecs.astype('float32') 50 | normalize_L2(training_vectors) 51 | index = faiss.IndexFlatIP(d) 52 | index.train(training_vectors) 53 | index.add(training_vectors) 54 | if os.path.exists(os.path.join(dir_name, './search_model')) is False: 55 | os.mkdir(os.path.join(dir_name, './search_model')) 56 | 57 | if os.path.exists(faiss_index_path): 58 | os.remove(faiss_index_path) 59 | print("删除旧的 %s_faiss.index文件" % owner_name) 60 | 61 | faiss.write_index(index, faiss_index_path) 62 | print("写入 %s_faiss.index文件" % owner_name) 63 | -------------------------------------------------------------------------------- /es/search_model/.gitkeep: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file !.gitkeep -------------------------------------------------------------------------------- /es/stopwords4_process_question_dedup.txt: -------------------------------------------------------------------------------- 1 | ? 2 | hello 3 | hi 4 | 一下 5 | 一个 6 | 上 7 | 不 8 | 为什么 9 | 么 10 | 么么哒 11 | 了 12 | 什么 13 | 你好 14 | 再见 15 | 可以 16 | 吗 17 | 吧 18 | 呢 19 | 哈 20 | 哈哈 21 | 哈哈哈 22 | 哈喽 23 | 哪个 24 | 哪里 25 | 啊 26 | 啦 27 | 嗨 28 | 在 29 | 在不在 30 | 在吗 31 | 在哪 32 | 在哪里 33 | 好 34 | 好哒 35 | 好滴 36 | 好的 37 | 如何 38 | 希望 39 | 怎么 40 | 怎么样 41 | 怎样 42 | 怎样才能 43 | 您好 44 | 想 45 | 想要 46 | 感谢 47 | 我 48 | 我想 49 | 我的 50 | 找不到 51 | 拜拜 52 | 时 53 | 时候 54 | 有人吗 55 | 有没有 56 | 的 57 | 真 58 | 真希望 59 | 要 60 | 请问 61 | 谢谢 62 | 谢谢啦 63 | 这个 64 | 那个 65 | 问 66 | 问一下 67 | 问题 68 | 非常 69 | ? 
70 | -------------------------------------------------------------------------------- /es/train_search_model.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-06-19 17:14:35 6 | LastEditTime: 2020-08-25 18:05:41 7 | @Description: 8 | ''' 9 | from read_excel import ExcelData 10 | import os 11 | import sys 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | from search_engines_operate import SearchEngine 14 | 15 | exceldata = ExcelData() 16 | sheet_names = exceldata.get_sheet_names() 17 | search_engine = SearchEngine() 18 | 19 | for sheet_name in sheet_names: 20 | search_engine.train_annoy(owner_name=sheet_name) 21 | search_engine.train_faiss(owner_name=sheet_name) 22 | -------------------------------------------------------------------------------- /es/userdict.txt: -------------------------------------------------------------------------------- 1 | 好好住 2 | ipad 3 | ipad pro 4 | 平板 5 | 平板电脑 6 | 夜间模式 7 | 暗黑模式 8 | 同城 9 | 当地 10 | 投诉 11 | 维权 12 | 盗用 13 | 盗图 14 | 入驻 15 | 申请 16 | 入住 17 | 认证 18 | 更换 19 | 更改 20 | ppt 21 | 课件 22 | pdf 23 | 表格 24 | 在哪 25 | 找不到 26 | 日常 27 | 常见 28 | 推送 29 | 推荐 30 | 闪退 31 | bug 32 | 异常 33 | 历史推送 34 | 往期推送 35 | 装修日记 36 | 装修记录 37 | 装修待办 38 | 装修记账 39 | 账号 40 | 账户 41 | -------------------------------------------------------------------------------- /es/write_data2es.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-01-02 16:55:23 6 | LastEditTime: 2021-03-01 19:13:26 7 | @Description: 将数据写到ES中 8 | 9 | ''' 10 | from es_operate import ESCURD 11 | from elasticsearch import Elasticsearch 12 | from jieba_befaq import StopwordsBEFAQ 13 | from read_excel import ExcelData 14 | import os 15 | # import sys 16 | # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 17 | import configparser 18 | 19 | dir_name = os.path.abspath(os.path.dirname(__file__)) 20 | es_config = configparser.ConfigParser() 21 | es_config.read(os.path.join(dir_name, "../config/es.ini")) 22 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"] 23 | 24 | 25 | alias_name = es_config["ServerInfo"]["alias_name"] 26 | index_name_1 = es_config["ServerInfo"]["index_name_1"] 27 | index_name_2 = es_config["ServerInfo"]["index_name_2"] 28 | index_name_set = set([index_name_1, index_name_2]) 29 | 30 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"] 31 | if if_es_use_passwd == "1": 32 | http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"] 33 | http_auth_password = es_config["ServerAddress"]["http_auth_password"] 34 | es_connect = Elasticsearch( 35 | es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password)) 36 | else: 37 | 38 | es_connect = Elasticsearch( 39 | es_server_ip_port) 40 | 41 | es_faq = ESCURD(es_connect) 42 | stopwords4BEFAQ = StopwordsBEFAQ() 43 | 44 | 45 | class ReadsSqlData2ES(object): 46 | def __init__(self): 47 | self.exceldata = ExcelData() 48 | self.excel_list = self.exceldata.read_QA_data() 49 | 50 | def write_data2es(self, index_name): 51 | ''' 52 | @Author: xiaoyichao 53 | @param {type} 54 | @Description: 将数据写到ES中 55 | ''' 56 | 57 | for sheet_data in self.excel_list: 58 | actions = [] 59 | num = 0 60 | owner_name = "未命名领域" 61 | for info in sheet_data: 62 | num += 1 63 | q_id, original_question, answer, id, owner_name = info[ 
64 | 0], info[1], info[2], info[3], info[4] 65 | process_question = original_question.lower() 66 | process_question = stopwords4BEFAQ.seg_sentence4faq( 67 | sentence=process_question) 68 | action_name = "action"+str(num) 69 | action_name = {} 70 | action_name["_index"] = index_name 71 | action_name["_source"] = { 72 | "q_id": q_id, 73 | "specific_q_id": id, 74 | "original_question": original_question, 75 | "process_question": process_question, 76 | "original_question_cn_middle": original_question.lower(), 77 | "original_question_cn_left": original_question.lower(), 78 | "answer": answer, 79 | "owner_name": owner_name 80 | } 81 | actions.append(action_name) 82 | es_faq.insert_more(index_name=index_name, actions=actions, owner_name=owner_name) 83 | 84 | 85 | if __name__ == "__main__": 86 | read_sql_data = ReadsSqlData2ES() 87 | current_index = es_faq.es_get_alias(alias_name=alias_name) 88 | new_index_set = index_name_set-set([current_index]) 89 | new_index = new_index_set.pop() 90 | es_faq.del_index(index_name=new_index) 91 | es_faq.create_index(index_name=new_index) 92 | read_sql_data.write_data2es(index_name=new_index) 93 | es_faq.es_put_alias(index_name=new_index, alias_name=alias_name) 94 | es_faq.es_del_alias(index_name=current_index, alias_name=alias_name) 95 | -------------------------------------------------------------------------------- /es/write_vecs2bin.py: -------------------------------------------------------------------------------- 1 | # coding=UTF-8 2 | ''' 3 | @Author: xiaoyichao 4 | LastEditors: xiaoyichao 5 | @Date: 2020-01-02 16:55:23 6 | LastEditTime: 2021-06-25 15:27:08 7 | @Description: 将问题的集合的向量写入bin文件 8 | 9 | ''' 10 | 11 | 12 | import numpy as np 13 | from read_excel import ExcelData 14 | import os 15 | import sys 16 | os.chdir(sys.path[0]) 17 | sys.path.append("../") 18 | from bert_server.sentence_bert_server import SentenceBERT 19 | 20 | 21 | dir_name = os.path.abspath(os.path.dirname(__file__)) 22 | 23 | 24 | class WriteVec2bin(object): 25 | def __init__(self): 26 | self.exceldata = ExcelData() 27 | self.excel_list = self.exceldata.read_QA_data() 28 | self.sheet_names = self.exceldata.get_sheet_names() 29 | self.sentenceBERT = SentenceBERT() 30 | 31 | def write_bert_vecs(self, owner_name, num): 32 | ''' 33 | @Author: xiaoyichao 34 | @param {type} 35 | @Description: 句向量都进行写入bin文件 36 | ''' 37 | if os.path.exists(os.path.join(dir_name, '../faq/bert_vect')) is False: 38 | os.mkdir(os.path.join(dir_name, '../faq/bert_vect')) 39 | bert_vecs_path = os.path.join( 40 | dir_name, '../faq/bert_vect/%s_bert_vecs.npy' % (owner_name)) 41 | bert_sentences_path = os.path.join( 42 | dir_name, '../faq/bert_vect/%s_bert_sentences.txt' % (owner_name)) 43 | orgin_query_vecs = np.zeros(shape=(1, 512)) 44 | with open(bert_sentences_path, "w") as f: 45 | f.write("数据库中的问题"+"\n") 46 | for info in self.excel_list[num]: 47 | original_question = info[1] 48 | f.write(original_question+"\n") 49 | orgin_query = original_question.replace(",", " ") 50 | orgin_query_list = orgin_query.split(' ') 51 | orgin_query_vec = self.sentenceBERT.get_bert(orgin_query_list) 52 | orgin_query_vecs = np.concatenate( 53 | (orgin_query_vecs, orgin_query_vec), axis=0) 54 | if os.path.exists(bert_vecs_path): 55 | os.remove(bert_vecs_path) 56 | print("删除旧的BERT向量文件") 57 | # 将铺平的向量reshape 58 | orgin_query_vecs = np.reshape(orgin_query_vecs, (-1, 512)) 59 | np.save(bert_vecs_path, orgin_query_vecs) 60 | 61 | print("BERT向量文件写入", bert_vecs_path) 62 | 63 | def write_bert_vecs4sheets(self): 64 | ''' 65 | Author: xiaoyichao 
/faq/bert_vect/.gitkeep:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitkeep
--------------------------------------------------------------------------------
/faq/deduplicate_threshold_op.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-22 12:24:06
6 | LastEditTime: 2021-06-06 22:00:53
7 | @Description: deduplicate the re-ranked results by q_id and apply the confidence threshold; results below the threshold are dropped
8 | '''
9 | 
10 | 
11 | class DeduplicateThreshold(object):
12 |     def dedu_thr(self, q_ids, re_rank_sim_list, threshold):
13 |         high_confidence_q_id_pos = []
14 |         if len(q_ids) > 0:
15 |             q_id_dict = {}
16 |             # map each q_id to the list of its positions
17 |             for position, id in enumerate(q_ids):
18 |                 if id not in q_id_dict:
19 |                     q_id_dict[id] = [position]
20 |                 else:
21 |                     q_id_dict[id].append(position)
22 |             # print("recalled q_id_dict:", q_id_dict)
23 |             # deduplicate by q_id: if a q_id has several hits, keep the one with the highest similarity; if it has a single hit, keep that one (position 0)
24 |             unique_q_ids_pos = []
25 |             for poss in q_id_dict.values():
26 |                 max_sim_pos = poss[0]
27 |                 if len(poss) > 1:
28 |                     for qid_pos in poss:
29 |                         if re_rank_sim_list[qid_pos] > re_rank_sim_list[max_sim_pos]:
30 |                             max_sim_pos = qid_pos
31 |                 unique_q_ids_pos.append(max_sim_pos)
32 |             # apply the threshold to the deduplicated q_ids and keep only the high-confidence ones
33 |             for q_id_pos in unique_q_ids_pos:
34 |                 if re_rank_sim_list[q_id_pos] >= threshold:
35 |                     high_confidence_q_id_pos.append(q_id_pos)
36 |             return high_confidence_q_id_pos
37 |         else:
38 |             return high_confidence_q_id_pos
39 | 
40 | 
--------------------------------------------------------------------------------
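# A minimal sketch of the dedup + threshold step with toy values:
#
#     from faq.deduplicate_threshold_op import DeduplicateThreshold
#     dt = DeduplicateThreshold()
#     # q_id 7 appears at positions 0 and 1; only its higher-scoring position
#     # survives, and position 2 is dropped because 0.41 < 0.5
#     dt.dedu_thr(q_ids=[7, 7, 9], re_rank_sim_list=[0.62, 0.88, 0.41], threshold=0.5)
#     # -> [1]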
/faq/get_final_data.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | @LastEditors: xiaoyichao
5 | @Date: 2020-05-23 16:21:51
6 | @LastEditTime: 2020-07-23 14:45:05
7 | @Description: FAQ module: for the q_ids that survive dedup and thresholding, fetch the corresponding question, answer, and similarity
8 | '''
9 | 
10 | 
11 | class FinalData(object):
12 |     def get_json_confidence(self, json_data):
13 |         return json_data["confidence"]
14 | 
15 |     def get_qa(self, high_confidence_q_id_pos, maybe_questions, maybe_answers, re_rank_sim, get_num, retrieval_q_ids, specific_q_ids):
16 |         return_data = []
17 |         for q_id_pos in high_confidence_q_id_pos:
18 |             single_json = {}
19 |             single_json["q_id"] = retrieval_q_ids[q_id_pos]
20 |             single_json["specific_q_id"] = specific_q_ids[q_id_pos]
21 |             single_json["question"] = maybe_questions[q_id_pos]
22 |             single_json["answer"] = maybe_answers[q_id_pos]
23 |             single_json["confidence"] = round(re_rank_sim[q_id_pos], 2)
24 |             return_data.append(single_json)
25 |         return_data.sort(reverse=True, key=self.get_json_confidence)
26 |         # cap the number of returned results
27 |         if len(high_confidence_q_id_pos) > get_num:
28 |             return return_data[:get_num]
29 |         else:
30 |             return return_data
31 | 
--------------------------------------------------------------------------------
/faq/get_question_vecs.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-06-09 14:45:34
6 | LastEditTime: 2021-06-25 15:04:53
7 | @Description: load the precomputed BERT vectors of the question set
8 | '''
9 | 
10 | 
11 | import numpy as np
12 | 
13 | import os
14 | import sys
15 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16 | from es.read_excel import ExcelData
17 | 
18 | exceldata = ExcelData()
19 | sheet_names = exceldata.get_sheet_names()
20 | dir_name = os.path.abspath(os.path.dirname(__file__))
21 | 
22 | 
23 | class ReadVec2bin(object):
24 |     def __init__(self):
25 |         self.owner_name_sentence = {}
26 |         self.owner_name_bert_vecs = {}
27 |         for sheet_name in sheet_names:
28 |             bert_vecs_path = os.path.join(
29 |                 dir_name, './bert_vect/%s_bert_vecs.npy' % (sheet_name))
30 |             bert_sentences_path = os.path.join(
31 |                 dir_name, './bert_vect/%s_bert_sentences.txt' % (sheet_name))
32 | 
33 |             with open(bert_sentences_path, "r", encoding="utf8") as sent:
34 |                 sentences = sent.read()
35 |             sentences = sentences.strip("\n")
36 |             sentences = sentences.split("\n")
37 |             self.owner_name_sentence[sheet_name] = sentences[1:]  # drop the header line
38 |             bert_vecs = np.load(bert_vecs_path)
39 |             self.owner_name_bert_vecs[sheet_name] = bert_vecs[1:]  # drop the placeholder row
40 | 
41 |     def read_bert_sents(self, owner_name):
42 |         return self.owner_name_sentence[owner_name]
43 | 
44 |     def read_bert_vecs(self, owner_name):
45 |         return self.owner_name_bert_vecs[owner_name]
46 | 
47 | 
--------------------------------------------------------------------------------
/faq/jieba4befaq.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-03-24 13:25:41
6 | LastEditTime: 2021-02-23 17:45:49
7 | @Description: preprocess the user's FAQ question: strip stopwords such as 怎样 and 如何 ("how"-type words), then send it into the ES search
8 | '''
9 | import jieba
10 | import os
11 | dir_name = os.path.abspath(os.path.dirname(__file__))
12 | 
13 | 
14 | class JiebaBEFAQ(object):
15 | 
16 |     def stopwordslist(self, filepath):
17 |         stopwords = [line.strip() for line in open(
18 |             filepath, 'r', encoding='utf-8').readlines()]
19 |         return set(stopwords)
20 | 
21 |     # segment a sentence
22 |     def seg_sentence(self, sentence):
23 |         # load the user dictionary
24 |         userdict = os.path.join(dir_name, '../es/userdict.txt')
25 |         jieba.load_userdict(userdict)
26 |         sentence_seged = jieba.cut(sentence.strip())
27 |         stopwords_file = os.path.join(
28 |             dir_name, '../es/stopwords4_process_question_dedup.txt')
29 |         stopwords = self.stopwordslist(stopwords_file)  # path of the stopword list
30 |         outstr = ""  # separator (empty: kept words are joined with no gap)
31 |         for word in sentence_seged:
32 |             if word not in stopwords:
33 |                 if word != '\t':
34 |                     outstr += word
35 |                     outstr += ""  # separator
36 |         return outstr
37 | 
38 |     def get_list(self, sentence):
39 |         '''
40 |         Author: xiaoyichao
41 |         param {type}
42 |         Description: turn a sentence into the list of its segmented terms
43 |         '''
44 |         sentence_terms = list(jieba.cut(sentence))
45 |         return sentence_terms
46 | 
--------------------------------------------------------------------------------
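# A minimal sketch of the preprocessing step (the output depends on the shipped
# userdict.txt and stopword list, so the exact segmentation may differ):
#
#     from faq.jieba4befaq import JiebaBEFAQ
#     jb = JiebaBEFAQ()
#     jb.seg_sentence("如何评价设计师")   # stopword 如何 ("how") is dropped -> "评价设计师"
#     jb.get_list("评价设计师")           # e.g. ["评价", "设计师"]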
/faq/matching_operate.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-12 20:46:56
6 | LastEditTime: 2021-06-25 16:08:05
7 | @Description: similarity features used for re-ranking (BERT cosine, Jaccard, BM25, edit distance)
8 | '''
9 | import numpy as np
10 | import jieba
11 | import Levenshtein
12 | import time
13 | import configparser
14 | from sklearn.metrics.pairwise import cosine_similarity
15 | from gensim.summarization import bm25
16 | import os
17 | import sys
18 | os.chdir(sys.path[0])
19 | sys.path.append("../")
20 | from faq.get_question_vecs import ReadVec2bin
21 | from faq.jieba4befaq import JiebaBEFAQ
22 | from bert_server.sentence_bert_server import SentenceBERT
23 | 
24 | 
25 | dir_name = os.path.abspath(os.path.dirname(__file__))
26 | faq_config = configparser.ConfigParser()
27 | faq_config.read(os.path.join(dir_name, "../config/befaq_conf.ini"))
28 | 
29 | 
30 | class Matching(object):
31 |     def __init__(self):
32 |         self.read_vec2bin = ReadVec2bin()
33 |         self.jiebaBEFAQ = JiebaBEFAQ()
34 |         self.sentenceBERT = SentenceBERT()
35 | 
36 |     def cosine_sim(self, orgin_query, retrieval_questions, owner_name):
37 |         '''
38 |         @Author: xiaoyichao
39 |         @param {type}
40 |         @Description: cosine similarity in the BERT embedding space
41 |         '''
42 |         sentences = self.read_vec2bin.read_bert_sents(owner_name=owner_name)
43 |         bert_vecs = self.read_vec2bin.read_bert_vecs(owner_name=owner_name)
44 |         orgin_query = orgin_query.replace(",", " ")
45 |         orgin_query_list = orgin_query.split(' ')
46 |         print("orgin_query_list", orgin_query_list)
47 | 
48 |         orgin_query_vec = self.sentenceBERT.get_bert(
49 |             sentence_list=orgin_query_list)
50 |         if orgin_query_vec.size != 0:  # the BERT service responded normally
51 |             retrieval_questions_vec = []
52 |             for retrieval_question in retrieval_questions:
53 |                 # look up the precomputed BERT vector of the question
54 |                 index_pos = sentences.index(retrieval_question)
55 |                 retrieval_question_vec = bert_vecs[index_pos]
56 |                 retrieval_question_vec = retrieval_question_vec.reshape(-1, 512)
57 |                 retrieval_questions_vec.append(retrieval_question_vec)
58 | 
59 |             retrieval_questions_vec = np.array(
60 |                 retrieval_questions_vec).reshape(-1, 512)
61 | 
62 |             # floating-point storage can push the computed cosine slightly past its theoretical range, so clip to keep bad values out of the final result
63 |             sim_list = cosine_similarity(
64 |                 orgin_query_vec, retrieval_questions_vec)[0].tolist()
65 | 
66 |             # print('SKlearn:', end_time-begin_time)
67 |             normalized_sim_list = []
68 |             for sim in sim_list:
69 |                 if sim > 1.0:
70 |                     sim = 1.0
71 |                 normalized_sim_list.append(sim)
72 | 
73 |             return normalized_sim_list
74 |         else:  # the BERT service timed out
75 |             normalized_sim_list = []
76 |             return normalized_sim_list
77 | 
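    # Worked example of the Jaccard measure implemented in jaccrad() below,
    # with toy token sets: reference tokens {A, B}, candidate tokens {A, C}
    # -> intersection = 1, union = 2 + 2 - 1 = 3, similarity = 1/3 ≈ 0.33.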
78 |     def jaccrad(self, question, reference):  # reference is the source sentence, question is the candidate
79 |         '''
80 |         @Author: xiaoyichao
81 |         @param {type}
82 |         @Description: Jaccard similarity of two sentences
83 |         '''
84 |         terms_reference = jieba.cut(reference)  # default accurate mode
85 |         question = question.replace("\n", "")
86 |         terms_model = jieba.cut(question)
87 |         grams_reference = list(terms_reference)
88 |         grams_model = list(terms_model)
89 |         temp = 0
90 |         for i in grams_reference:
91 |             if i in grams_model:
92 |                 temp = temp+1
93 |         fenmu = len(grams_model)+len(grams_reference)-temp  # size of the union
94 |         jaccard_coefficient = float(temp/fenmu)  # intersection over union
95 |         return jaccard_coefficient
96 | 
97 |     def jaccard_sim(self, orgin_query, retrieval_questions):
98 |         '''
99 |         @Author: xiaoyichao
100 |         @param {type}
101 |         @Description: Jaccard similarity between the query and each candidate question
102 |         '''
103 |         sim_list = []
104 |         for retrieval_question in retrieval_questions:
105 |             jaccard_coefficient = self.jaccrad(
106 |                 question=orgin_query, reference=retrieval_question)
107 |             sim_list.append(jaccard_coefficient)
108 |         return sim_list
109 | 
110 |     def bm25_sim(self, orgin_query, retrieval_questions):
111 |         '''
112 |         @Author: xiaoyichao
113 |         @param {type}
114 |         @Description: BM25 similarity between the query and each candidate question
115 |         '''
116 |         jieba_corpus = []
117 |         for corpu in retrieval_questions:
118 |             line_seg = self.jiebaBEFAQ.get_list(corpu)
119 |             jieba_corpus.append(line_seg)
120 |         jieba_question = self.jiebaBEFAQ.get_list(orgin_query)
121 |         bm25Model = bm25.BM25(jieba_corpus)
122 |         sim_list = bm25Model.get_scores(jieba_question)
123 |         normalized_sim_list = []
124 |         max_sim = max(sim_list)
125 |         for sim in sim_list:
126 |             if sim == 0:
127 |                 normalized_sim = 0
128 |             else:
129 |                 normalized_sim = sim/max_sim  # scale scores into [0, 1] by the maximum
130 |             normalized_sim_list.append(normalized_sim)
131 | 
132 |         return normalized_sim_list
133 | 
134 |     def edit_distance_sim(self, orgin_query, retrieval_questions):
135 |         '''
136 |         @Author: xiaoyichao
137 |         @param {type}
138 |         @Description: edit-distance similarity between the query and each candidate question
139 |         '''
140 |         sim_list = []
141 |         max_len = max(len(orgin_query), max([len(x) for x in retrieval_questions]))
142 |         for corpu in retrieval_questions:
143 |             edit_distance = Levenshtein.distance(orgin_query, corpu)
144 |             sim = 1 - edit_distance * 1.0 / max_len
145 |             sim_list.append(sim)
146 |         return sim_list
147 | 
148 | 
149 | if __name__ == "__main__":
150 |     matching = Matching()
151 |     question = "如何评价设计师"
152 |     normalized_sim_list = matching.cosine_sim(
153 |         question, ["如何评价设计师"], "领域1")
154 |     print(normalized_sim_list)
155 | 
--------------------------------------------------------------------------------
/faq/re_rank.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-22 13:54:44
6 | LastEditTime: 2021-06-06 21:42:47
7 | @Description: linear re-ranking model: each similarity feature gets its own weight
8 | '''
9 | 
10 | 
11 | class ReRank(object):
12 |     def linear_model(self, consin_sim, jaccard_sim, bm25_sim, edit_distance_sim, consine_weight, jaccard_weight, BM25_weight, edit_distance_weight):
13 |         if consin_sim != []:
14 |             tmp_multiple_sims = [i * consine_weight + j*jaccard_weight + k*BM25_weight + l*edit_distance_weight
15 |                                  for i, j, k, l in zip(consin_sim, jaccard_sim, bm25_sim, edit_distance_sim)]
16 |             multiple_sims = []
17 |             if abs(consine_weight + jaccard_weight + BM25_weight + edit_distance_weight - 1.0) < 1e-9:  # float-safe check that the weights sum to 1
18 |                 for multiple_sim in tmp_multiple_sims:
19 |                     if multiple_sim > 1.0:
20 |                         multiple_sim = 1.0
21 |                     multiple_sims.append(multiple_sim)
22 |             else:
23 |                 multiple_sims = tmp_multiple_sims
24 |             return multiple_sims
25 |         else:
26 |             multiple_sims = jaccard_sim  # BERT cosine was unavailable; fall back to the Jaccard scores
27 |             return multiple_sims
28 | 
29 | 
--------------------------------------------------------------------------------
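# A minimal sketch of the linear re-rank with toy feature scores (in the real
# pipeline the weights come from config/befaq_conf.ini):
#
#     from faq.re_rank import ReRank
#     ReRank().linear_model(
#         consin_sim=[0.9, 0.4], jaccard_sim=[0.5, 0.2], bm25_sim=[1.0, 0.3],
#         edit_distance_sim=[0.8, 0.1], consine_weight=0.4, jaccard_weight=0.2,
#         BM25_weight=0.2, edit_distance_weight=0.2)
#     # -> approximately [0.82, 0.28]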
/faq/retrieval_es.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-01-02 16:55:23
6 | LastEditTime: 2021-06-18 16:00:32
7 | @Description: recall candidate data with ES and with Faiss (or Annoy)
8 | 
9 | '''
10 | 
11 | from elasticsearch import Elasticsearch
12 | from annoy import AnnoyIndex
13 | import numpy as np
14 | import faiss
15 | import os
16 | import sys
17 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
18 | from es.es_operate import ESCURD
19 | import configparser
20 | from bert_server.sentence_bert_server import SentenceBERT
21 | from faq.get_question_vecs import ReadVec2bin
22 | 
23 | 
24 | dir_name = os.path.abspath(os.path.dirname(__file__))
25 | es_config = configparser.ConfigParser()
26 | es_config.read(os.path.join(dir_name, "../config/es.ini"))
27 | es_server_ip_port = es_config["ServerAddress"]["es_server_ip_port"]
28 | 
29 | 
30 | index_name = es_config["ServerInfo"]["alias_name"]
31 | 
32 | if_es_use_passwd = es_config["ServerAddress"]["if_es_use_passwd"]
33 | if if_es_use_passwd == "1":
34 |     http_auth_user_name = es_config["ServerAddress"]["http_auth_user_name"]
35 |     http_auth_password = es_config["ServerAddress"]["http_auth_password"]
36 |     es_connect = Elasticsearch(
37 |         es_server_ip_port, http_auth=(http_auth_user_name, http_auth_password))
38 | else:
39 | 
40 |     es_connect = Elasticsearch(
41 |         es_server_ip_port)
42 | 
43 | es_faq = ESCURD(es_connect)
44 | sentenceBERT = SentenceBERT()
45 | read_vec2bin = ReadVec2bin()
46 | 
47 | 
48 | class SearchData(object):
49 |     '''
50 |     Author: xiaoyichao
51 |     param {type}
52 |     Description: recalls candidate data; ES, Annoy and Faiss are available, and which of them are used is configurable
53 |     '''
54 |     def search_es(self, owner_name, query_word_list, ES_limit_num):
55 |         '''
56 |         Author: xiaoyichao
57 |         param {type}
58 |         Description: recall with ES
59 |         '''
60 |         retrieve_data = es_faq.search_data(
61 |             index_name=index_name, owner_name=owner_name, query_word_list=query_word_list, limit_num=ES_limit_num)
62 |         retrieve_results = retrieve_data["hits"]
63 |         max_result_len = retrieve_results["total"]["value"]
64 |         # max_score = retrieve_results["max_score"]
65 |         hits = retrieve_results["hits"]
66 |         maybe_original_questions = []
67 |         maybe_process_questions = []
68 |         maybe_answers = []
69 |         specific_q_ids = []
70 |         q_ids = []
71 |         if ES_limit_num < max_result_len:
72 |             result_len = ES_limit_num
73 |         else:
74 |             result_len = max_result_len
75 |         for i in range(result_len):
76 |             qu_an_id = hits[i]["_source"]
77 |             original_question = qu_an_id["original_question"]
78 |             process_question = qu_an_id["process_question"]
79 |             answer = qu_an_id["answer"]
80 |             q_id = qu_an_id["q_id"]
81 |             specific_q_id = qu_an_id["specific_q_id"]
82 |             maybe_original_questions.append(original_question)
83 |             maybe_process_questions.append(process_question)
84 |             maybe_answers.append(answer)
85 |             q_ids.append(q_id)
86 |             specific_q_ids.append(specific_q_id)
87 |         return maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids
88 | 
89 |     def search_annoy(self, owner_name, question, num=5):
90 |         '''
91 |         Author: xiaoyichao
92 |         param {type}
93 |         Description: recall with Annoy
94 |         '''
95 |         sentences = read_vec2bin.read_bert_sents(owner_name=owner_name)
96 |         annoy_index_path = os.path.join(
97 |             dir_name, '../es/search_model/%s_annoy.index' % owner_name)
98 |         encodearrary = sentenceBERT.get_bert([question])
99 |         tc_index = AnnoyIndex(f=512, metric='angular')
100 |         tc_index.load(annoy_index_path)
101 |         # nearest neighbours by vector; include_distances=True also returns
102 |         # the angular distances
103 |         items = tc_index.get_nns_by_vector(
104 |             encodearrary[0], num, include_distances=True)
105 |         sim_questions = [sentences[num_annoy] for num_annoy in items[0]]
106 |         # sims = items[1]
107 |         # index_nums = items[0]
108 |         return sim_questions
109 | 
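    # Both ANN engines answer the same question ("which stored sentence vectors
    # are closest to this query vector?"). A hedged usage sketch, assuming the
    # index files under es/search_model/ have already been built (the repo ships
    # es/train_search_model.py for this):
    #
    #     sd = SearchData()
    #     sd.search_annoy("领域1", "如何评价设计师", num=5)
    #     sd.search_faiss("领域1", "如何评价设计师", num=5)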
110 |     def search_faiss(self, owner_name, question, num=5):
111 |         '''
112 |         Author: xiaoyichao
113 |         param {type}
114 |         Description: recall with Faiss
115 |         '''
116 |         sentences = read_vec2bin.read_bert_sents(owner_name=owner_name)
117 |         faiss_index_path = os.path.join(
118 |             dir_name, '../es/search_model/%s_faiss.index' % owner_name)
119 |         index = faiss.read_index(faiss_index_path)
120 |         question_vec = sentenceBERT.get_bert([question]).astype('float32')
121 |         index.nprobe = 1
122 |         sims, index_nums = index.search(question_vec, num)
123 |         sim_questions = [sentences[num_faiss] for num_faiss in index_nums[0]]
124 |         # index_nums = index_nums[0].tolist()
125 |         # sims = sims[0].tolist()
126 |         return sim_questions
127 | 
128 |     def merge_op(self, question, owner_name, maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids, use_faiss, use_annoy, engine_limit_num):
129 |         '''
130 |         Author: xiaoyichao
131 |         param {type}
132 |         Description: merge the ES results with the Faiss and/or Annoy results
133 |         '''
134 |         if use_faiss == 1 and use_annoy == 0:
135 |             print("use_faiss")
136 |             maybe_search_questions = self.search_faiss(
137 |                 owner_name, question, num=engine_limit_num)
138 |         elif use_faiss == 0 and use_annoy == 1:
139 |             print("use_annoy")
140 |             maybe_search_questions = self.search_annoy(
141 |                 owner_name, question, num=engine_limit_num)
142 |         elif use_faiss == 1 and use_annoy == 1:
143 |             print("use_annoy and use_faiss")
144 |             maybe_search_questions_faiss = self.search_faiss(
145 |                 owner_name, question, num=engine_limit_num)
146 |             maybe_search_questions_annoy = self.search_annoy(
147 |                 owner_name, question, num=engine_limit_num)
148 |             maybe_search_questions = list(
149 |                 set(maybe_search_questions_faiss+maybe_search_questions_annoy))
150 |         else:
151 |             return maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids
152 |         print("data recalled by ES", maybe_original_questions)
153 |         # query ES again for each engine hit and merge with the ES results, deduplicating
154 |         for sim_question in maybe_search_questions:
155 |             if sim_question not in set(maybe_original_questions):
156 |                 print("new data recalled by faiss/annoy", sim_question)
157 |                 retrieve_data = es_faq.search4search_engine(
158 |                     index_name, owner_name, question=sim_question)
159 |                 retrieve_results = retrieve_data["hits"]
160 |                 max_result_len = retrieve_results["total"]["value"]
161 |                 # max_score = retrieve_results["max_score"]
162 |                 hits = retrieve_results["hits"]
163 | 
164 |                 if max_result_len >= 1:
165 |                     for i in range(1):
166 |                         qu_an_id = hits[i]["_source"]
167 |                         original_question = qu_an_id["original_question"]
168 |                         process_question = qu_an_id["process_question"]
169 |                         answer = qu_an_id["answer"]
170 |                         q_id = qu_an_id["q_id"]
171 |                         specific_q_id = qu_an_id["specific_q_id"]
172 |                         maybe_original_questions.append(original_question)
173 |                         maybe_process_questions.append(process_question)
174 |                         maybe_answers.append(answer)
175 |                         q_ids.append(q_id)
176 |                         specific_q_ids.append(specific_q_id)
177 |         # merged data
178 |         return maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids
179 | 
180 |     def search_merge(self, owner_name, question, query_word_list, use_other_when_es_none, use_faiss=0, use_annoy=0, engine_limit_num=5, ES_limit_num=10):
181 |         # retrieve with ES first
182 |         maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids = self.search_es(
183 |             owner_name=owner_name, query_word_list=query_word_list, ES_limit_num=ES_limit_num)
184 |         if use_other_when_es_none is False:
185 |             if len(maybe_original_questions) == 0:  # use faiss and/or annoy only when ES returned nothing
186 |                 # recommended: faiss and annoy always recall the requested number of neighbours, which may include results you do not want, so only fall back to them when ES recalls nothing
187 |                 maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids = self.merge_op(
188 |                     question, owner_name, maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids, use_faiss, use_annoy, engine_limit_num)
189 |         else:  # use faiss and/or annoy even when ES returned data
190 |             maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids = self.merge_op(
191 |                 question, owner_name, maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids, use_faiss, use_annoy, engine_limit_num)
192 | 
193 |         return maybe_original_questions, maybe_process_questions, maybe_answers, q_ids, specific_q_ids
194 | 
--------------------------------------------------------------------------------
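# A hedged end-to-end recall sketch (assumes ES has been populated via
# es/write_data2es.py and, if use_faiss/use_annoy are enabled, that the
# vector indexes exist):
#
#     sd = SearchData()
#     questions, processed, answers, q_ids, specific_q_ids = sd.search_merge(
#         owner_name="领域1", question="如何评价设计师",
#         query_word_list=["评价", "设计师"], use_other_when_es_none=False,
#         use_faiss=1, use_annoy=0, engine_limit_num=5, ES_limit_num=10)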
/image/BEFAQ 框架.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hhzrd/BEFAQ/955d1780a2625b805f3ebe1649d96d16df820254/image/BEFAQ 框架.png
--------------------------------------------------------------------------------
/logs/.gitkeep:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitkeep
--------------------------------------------------------------------------------
/model/.gitkeep:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitkeep
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | sentence_transformers==1.2.0
2 | jieba==0.39
3 | elasticsearch==7.7.0
4 | annoy==1.16.3
5 | xlrd==1.2.0
6 | numpy==1.18.2
7 | faiss_cpu==1.6.3
8 | sanic==20.6.3
9 | scikit_learn==0.23.2
10 | transformers==4.6.1
11 | python-Levenshtein==0.12.2
12 | gensim==3.8.3
13 | uvloop==0.14.0
--------------------------------------------------------------------------------
/src/associative_questions_server.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-06-12 08:15:51
6 | LastEditTime: 2021-06-18 16:28:41
7 | @Description: HTTP service for associative (suggested) questions
8 | '''
9 | from sanic import Sanic
10 | import sanic
11 | import configparser
12 | import os
13 | import sys
14 | os.chdir(sys.path[0])
15 | sys.path.append("../")
16 | from es.es_search_cn import SearchData4Association
17 | from common.response_add_head import res_with_head
18 | from common.kill_program import kill_port
19 | 
20 | 
21 | dir_name = os.path.abspath(os.path.dirname(__file__))
22 | search_data = SearchData4Association()
23 | 
24 | 
25 | # the endpoint returns JSON data
26 | # create the Sanic app once, with an explicit name
27 | app = Sanic("associative questions")
28 | 
29 | 
30 | @app.route("/associative_questions", methods=["POST", "HEAD"])
31 | async def associative_questions(request):
32 | 
33 |     # parameters received from the request
34 |     current_question = str(request.form.get("current_question"))
35 |     limit_num = int(request.form.get("limit_num"))
36 |     owner_name = str(request.form.get("owner_name"))
37 |     if_middle = int(request.form.get("if_middle", default=1))
38 |     if if_middle == 1:
39 |         if_middle = True
40 |     elif if_middle == 0:
41 |         if_middle = False
42 |     else:
43 |         if_middle = True
44 | 
45 |     maybe_original_questions = search_data.search_question_cn(
46 |         owner_name=owner_name, current_question=current_question, limit_num=limit_num, if_middle=if_middle)
47 | 
48 |     answer_json = {}
49 |     answer_json["code"] = "1"
50 |     answer_json["msg"] = "OK"
51 |     answer_json["data"] = {
52 |         "message": maybe_original_questions}
53 |     return res_with_head(answer_json)
54 | 
55 | 
56 | @app.route("/", methods=["GET", "HEAD"])
57 | async def alibaba_operator_check(request):
58 |     print("alibaba SLB checking server status")
59 |     return sanic.response.text("200")
60 | 
61 | 
62 | if __name__ == "__main__":
63 |     root_config = configparser.ConfigParser()
64 |     root_config.read(os.path.join(
65 |         dir_name, "../config/associative_questions_config.ini"))
66 |     port = int(root_config["ServerAddress"]["port"])
67 | 
68 |     kill_port(port)
69 | 
70 |     app.run(host="0.0.0.0",
71 |             port=port,
72 |             workers=int(root_config["ServerInfo"]["work_number"]),
73 |             debug=False, access_log=False)
74 | 
--------------------------------------------------------------------------------
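# A hedged request sketch for the service above (the port comes from
# config/associative_questions_config.ini, so substitute your own):
#
#     curl -X POST http://127.0.0.1:<port>/associative_questions \
#          -d "current_question=如何" -d "limit_num=5" \
#          -d "owner_name=领域1" -d "if_middle=1"
#
# The response has the shape {"code": "1", "msg": "OK", "data": {"message": [...]}}.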
/src/main_faq.py:
--------------------------------------------------------------------------------
1 | # coding=UTF-8
2 | '''
3 | @Author: xiaoyichao
4 | LastEditors: xiaoyichao
5 | @Date: 2020-05-12 20:46:56
6 | @Description: main entry point of the FAQ service
7 | '''
8 | import time
9 | import jieba
10 | import configparser
11 | from sanic import Sanic
12 | from sanic.response import json
13 | from sanic import response
14 | import os
15 | import sys
16 | os.chdir(sys.path[0])
17 | sys.path.append("../")
18 | from common.kill_program import kill_port
19 | from es.es_search_cn import SearchData4Association
20 | from common.response_add_head import res_with_head
21 | from faq.jieba4befaq import JiebaBEFAQ
22 | from faq.retrieval_es import SearchData
23 | from faq.matching_operate import Matching
24 | from faq.deduplicate_threshold_op import DeduplicateThreshold
25 | from faq.re_rank import ReRank
26 | from faq.get_final_data import FinalData
27 | 
28 | 
29 | dir_name = os.path.abspath(os.path.dirname(__file__))
30 | 
31 | faq_config = configparser.ConfigParser()
32 | faq_config.read(os.path.join(dir_name, "../config/befaq_conf.ini"))
33 | consine_weight = float(faq_config["AlgorithmConfiguration"]["consine"])
34 | jaccard_weight = float(faq_config["AlgorithmConfiguration"]["jaccard"])
35 | BM25_weight = float(faq_config["AlgorithmConfiguration"]["BM25"])
36 | edit_distance_weight = float(faq_config["AlgorithmConfiguration"]["edit_distance"])
37 | use_faiss = int(faq_config["AlgorithmConfiguration"]["use_faiss"])
38 | use_annoy = int(faq_config["AlgorithmConfiguration"]["use_annoy"])
39 | engine_num = int(faq_config["Faiss_Annoy_Configuration"]["engine_num"])
40 | ES_num = int(faq_config["ESConfiguration"]["ES_num"])
41 | use_other_when_es_none = int(faq_config["AlgorithmConfiguration"]["use_other_when_es_none"])
42 | if use_other_when_es_none == 1:
43 |     use_other_when_es_none = True
44 | else:
45 |     use_other_when_es_none = False
46 | 
47 | 
48 | jiebaBEFAQ = JiebaBEFAQ()
49 | search_data = SearchData()
50 | match_ing = Matching()
51 | rerank = ReRank()
52 | final_data = FinalData()
53 | deduplicate_threshold = DeduplicateThreshold()
54 | search_data4association = SearchData4Association()
55 | 
56 | # create the Sanic app once, with an explicit name
57 | app = Sanic("Feedback BEFAQ")
58 | 
59 | 
60 | @app.route("/BEFAQ", methods=["POST", "HEAD"])
61 | async def myfaq(request):
62 |     orgin_query = str(request.form.get("question"))
63 |     owner_name = str(request.form.get("owner_name"))
64 |     get_num = int(request.form.get("get_num", default=3))
65 |     threshold = float(request.form.get("threshold", default=0.5))
66 | 
67 |     # jieba segmentation of the query for ES
68 |     process_query = jiebaBEFAQ.seg_sentence(
69 |         sentence=orgin_query)
70 |     query_terms = jieba.cut(process_query)
71 |     query_word_list = list(query_terms)
72 | 
73 |     maybe_original_questions, maybe_process_questions, maybe_answers, retrieval_q_ids, specific_q_ids = search_data.search_merge(
74 |         owner_name=owner_name, question=orgin_query, query_word_list=query_word_list, use_faiss=use_faiss, use_annoy=use_annoy, engine_limit_num=engine_num, ES_limit_num=ES_num, use_other_when_es_none=use_other_when_es_none)
75 | 
76 |     if len(retrieval_q_ids) > 0:  # ES (or faiss/annoy) recalled some data
77 |         # cosine_sim takes maybe_original_questions as retrieval_questions and the raw, unprocessed query as orgin_query
78 |         consin_sim = match_ing.cosine_sim(
79 |             orgin_query=orgin_query, retrieval_questions=maybe_original_questions, owner_name=owner_name)
80 |         print("consin_sim:", consin_sim)
81 | 
82 |         # jaccard_sim takes maybe_process_questions as retrieval_questions and the stopword-stripped query as orgin_query
83 |         jaccard_sim = match_ing.jaccard_sim(
84 |             orgin_query=process_query, retrieval_questions=maybe_process_questions)
85 |         print("jaccard_sim:", jaccard_sim)
86 | 
87 |         bm25_sim = match_ing.bm25_sim(
88 |             orgin_query=process_query, retrieval_questions=maybe_process_questions)
89 |         print("bm25_sim:", bm25_sim)
90 | 
91 |         edit_distance_sim = match_ing.edit_distance_sim(
92 |             orgin_query=process_query, retrieval_questions=maybe_process_questions)
93 |         print("edit_distance_sim:", edit_distance_sim)
94 | 
95 |         re_rank_sim = rerank.linear_model(
96 |             consin_sim=consin_sim, jaccard_sim=jaccard_sim, bm25_sim=bm25_sim, edit_distance_sim=edit_distance_sim,
97 |             consine_weight=consine_weight, jaccard_weight=jaccard_weight, BM25_weight=BM25_weight, edit_distance_weight=edit_distance_weight)
98 | 
99 |         print("retrieval_q_ids:", retrieval_q_ids)
100 |         print("maybe_original_questions:", maybe_original_questions)
101 |         print("maybe_process_questions:", maybe_process_questions)
102 |         print("re_rank_sim:", re_rank_sim)
103 | 
104 |         high_confidence_q_id_pos = deduplicate_threshold.dedu_thr(
105 |             q_ids=retrieval_q_ids, re_rank_sim_list=re_rank_sim, threshold=threshold)
106 |         print("high_confidence_q_id_pos:", high_confidence_q_id_pos)
107 | 
108 |         return_data = final_data.get_qa(
109 |             high_confidence_q_id_pos, maybe_original_questions, maybe_answers, re_rank_sim=re_rank_sim, get_num=get_num, retrieval_q_ids=retrieval_q_ids, specific_q_ids=specific_q_ids)
110 | 
111 |         print("return_data", return_data)
112 |         return json(return_data)
113 |     else:  # ES recalled nothing
114 |         return_data = []
115 |         return json(return_data)
116 | 
117 | 
118 | @app.route("/associative_questions", methods=["POST", "HEAD"])
119 | async def associative_questions(request):
120 |     # parameters received from the request
121 |     current_question = str(request.form.get("current_question"))
122 |     limit_num = int(request.form.get("limit_num"))
123 |     owner_name = str(request.form.get("owner_name"))
124 |     if_middle = int(request.form.get("if_middle", default=1))
125 |     if if_middle == 1:
126 |         if_middle = True
127 |     elif if_middle == 0:
128 |         if_middle = False
129 |     else:
130 |         if_middle = True
131 | 
132 |     maybe_original_questions = search_data4association.search_question_cn(
133 |         owner_name, current_question, limit_num, if_middle)
134 | 
135 |     answer_json = {}
136 |     answer_json["code"] = "1"
137 |     answer_json["msg"] = "OK"
138 |     answer_json["data"] = {
139 |         "message": maybe_original_questions}
140 |     return res_with_head(answer_json)
141 | 
142 | 
143 | @app.route("/", methods=["GET", "HEAD"])
144 | async def alibaba_operator_check(request):
145 |     print("alibaba SLB checking server status")
146 |     return response.text("200")
147 | 
148 | 
149 | if __name__ == "__main__":
150 | 
151 |     port = int(faq_config["ServerAddress"]["port"])
152 |     kill_port(port)
153 |     # start the HTTP service
154 |     app.run(host="0.0.0.0",
155 |             port=port,
156 |             workers=int(faq_config["ServerInfo"]["work_number"]),
157 |             debug=False, access_log=False)
158 | 
--------------------------------------------------------------------------------
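# A hedged request sketch for the /BEFAQ endpoint above (the port comes from
# config/befaq_conf.ini, so substitute your own):
#
#     curl -X POST http://127.0.0.1:<port>/BEFAQ \
#          -d "question=如何评价设计师" -d "owner_name=领域1" \
#          -d "get_num=3" -d "threshold=0.5"
#
# On success it returns a JSON list shaped like
# [{"q_id": ..., "specific_q_id": ..., "question": ..., "answer": ..., "confidence": 0.87}, ...],
# sorted by confidence and capped at get_num entries.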