├── .gitignore
├── EasySearch.jpg
├── LICENSE
├── README.md
├── cluster.yml
├── cluster
│   ├── cluster.go
│   ├── dataserver.go
│   ├── dataserver_test.go
│   ├── managerserver.go
│   ├── managerserver_test.go
│   ├── searchclient.go
│   ├── searchclient_test.go
│   ├── searchserver.go
│   ├── searchserver_test.go
│   ├── server.go
│   └── shardingindexer.go
├── config.yml
├── config
│   └── Config.go
├── go.mod
├── go.sum
├── index
│   ├── btreeindex.go
│   ├── btreeindex_test.go
│   ├── document.go
│   ├── document_test.go
│   ├── hashmapindex.go
│   ├── hashmapindex_test.go
│   ├── index.go
│   ├── postinglist.go
│   ├── postinglist_test.go
│   ├── property.go
│   └── tfidf.go
├── main.go
├── paraphrase
│   ├── serving
│   │   ├── model.go
│   │   └── model_test.go
│   └── train
│       ├── wiki2txt.py
│       └── word2vec.py
├── score
│   ├── bm25.go
│   └── bm25_test.go
├── search
│   ├── indexer.go
│   ├── indexer_test.go
│   ├── merger.go
│   ├── searcher.go
│   └── searcher_test.go
├── start.sh
└── util
    ├── collection.go
    ├── condition.go
    ├── filter.go
    ├── filter_test.go
    ├── net.go
    ├── tokenizer.go
    └── tokenizer_test.go
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | easysearch
2 | err.log
3 | cpu.pprof
4 | /.idea
5 | /data
--------------------------------------------------------------------------------
/EasySearch.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awesomefly/easysearch/6f23c6b3f8dc4ef071ff7c4e7b7ac2bc35363d0e/EasySearch.jpg
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution.
You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE.
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Easy Full-Text Search Engine
2 |
3 | # Overview
4 | EasySearch is a distributed full-text search engine. It supports both in-memory and on-disk retrieval, with targeted performance optimizations for each.
5 |
6 | ## Features
7 |
8 | 1. Builds an inverted index from Wikipedia documents
9 | 2. Supports both hash-table and B-tree index structures
10 | 3. Combines a full index with an incremental index: the incremental index is built in memory on a hash table, supports real-time updates, and is periodically merged into the full index; double-buffered updates improve query performance (see the sketch after this list)
11 | 4. The full index is split into three tiers: SmallSegment, MiddleSegment and BigSegment. Once several SmallSegments reach a size threshold they are merged into a MiddleSegment, and so on; splitting by size or time also lowers the cost of rebuilding the full index
12 | 5. Retrieval acceleration: supports approximate top-k retrieval; when posting lists are merged, each list can be pre-truncated to its top r entries by a static score such as term frequency (champion lists) to speed up the merge, and the merged result can be truncated again
13 | 6. Relevance scoring: BM25 ranking
14 | 7. Semantic rewriting of search queries
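
To illustrate the double-buffer idea in item 3, here is a minimal, hypothetical Go sketch (the `DoubleBuffer` and `Index` names are illustrative, not the engine's actual types): readers always query a complete snapshot while a rebuilt index is published atomically.

```
package main

import (
	"fmt"
	"sync/atomic"
)

// Index is a stand-in for an in-memory inverted index: term -> doc IDs.
type Index map[string][]int

// DoubleBuffer keeps the active index snapshot behind an atomic.Value,
// so queries never block while a freshly built index is swapped in.
type DoubleBuffer struct {
	active atomic.Value // always holds an Index
}

func NewDoubleBuffer() *DoubleBuffer {
	db := &DoubleBuffer{}
	db.active.Store(Index{})
	return db
}

// Search reads whichever snapshot is active at call time.
func (db *DoubleBuffer) Search(term string) []int {
	return db.active.Load().(Index)[term]
}

// Swap publishes a new snapshot; in-flight readers finish on the old one.
func (db *DoubleBuffer) Swap(next Index) {
	db.active.Store(next)
}

func main() {
	db := NewDoubleBuffer()
	db.Swap(Index{"jordan": {10, 605}})
	fmt.Println(db.Search("jordan")) // [10 605]
}
```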
15 |
16 | ## Requirement
17 | - go 1.16.5 or later
18 |
19 |
20 | ## Quick Start
21 | ### Download
22 |
23 | - Clone the project into your working directory:
24 |
25 | ```
26 | git clone https://github.com/awesomefly/easysearch.git
27 | ```
28 |
29 | - Update dependencies with go mod:
30 |
31 | ```
32 | cd $PROJECT_DIR
33 | go mod tidy
34 | ```
35 |
36 | - Build the project:
37 | ```
38 | go build
39 | ```
40 |
41 | ### Local index
42 | - Download a Wikipedia dump to a local path. Here we use the abstract dataset and build an inverted index over the abstracts. [Download link](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract1.xml.gz)
43 | ```
44 | cd $PROJECT_DIR/data
45 | wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract1.xml.gz
46 | ```
47 | - To build the index files, create a config.yml in the project root and add the indexing options
48 | ```
49 | cd $PROJECT_DIR
50 | vim config.yml
51 | ```
52 | - Example configuration:
53 | ```
54 | Storage:
55 |   IndexFile: ./data/wiki_index # where the index files are stored
56 |   DumpFile: ./data/enwiki-latest-abstract1.xml.gz # path of the document dump
57 | BM25:
58 |   K1: 2
59 |   B: 0.75
60 | ```
61 | - Build the index
62 | ```
63 | cd $PROJECT_DIR
64 | ./easysearch -m indexer
65 | ```
66 | If the build succeeds, three files appear under $PROJECT_DIR/data: wiki_index.idx, wiki_index.kv and wiki_index.sum
67 | - Local search: look up documents by keyword
68 | ```
69 | ./easysearch -m searcher -q "Album Jordan" --source=local
70 | ```
71 |
72 | ### Semantic rewriting [reference](https://github.com/dwt0317/QueryRewritingService/tree/master/embedding)
73 | - requirement
74 |   - python 3.8+
75 | - Download the training corpus [download link](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2)
76 | ```
77 | cd $PROJECT_DIR/data
78 | wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
79 | ```
80 | - Text preprocessing: parse the wiki dump and extract the token set into data/wiki_texts.txt
81 | ```
82 | /usr/bin/python3 paraphrase/train/wiki2txt.py --cmd=parse --file=$WIKI_FILE
83 | ```
84 | - Model training: train with Python's gensim word2vec and save the model together with the word-vector set
85 | ```
86 | /usr/bin/python3 paraphrase/train/word2vec.py --cmd=train --corpus_file=./data/wiki_texts.txt
87 | ```
88 | A successful run produces:
89 |
90 | the model file ./data/med200_less.model.bin
91 |
92 | the word-vector file ./data/word2vec.format.bin
93 | - Using the model
94 |   - In Go, the code.sajari.com/word2vec library can load the trained word vectors and look up synonyms of a query term through its API (see the sketch below)
95 |   - See the unit test paraphrase/serving/model_test.go
96 |   - For local search, add the --model_file option
97 | ```
98 | ./easysearch -m searcher -q "Album Jordan" --source=local --model_file=./data/word2vec.format.bin
99 | ```
100 |   - For cluster search, add this configuration entry
101 | ```
102 | Storage:
103 |   IndexFile: ./data/wiki_index # where the index files are stored
104 |   DumpFile: ./data/enwiki-latest-abstract1.xml.gz # path of the document dump
105 |   ModelFile: ./data/word2vec.format.bin
106 | ```
107 |
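A minimal sketch of loading the vectors with code.sajari.com/word2vec and querying the nearest neighbours of a term (illustrative only; it assumes the vector file produced by the training step above, and the engine's own wrapper lives in paraphrase/serving/model.go):

```
package main

import (
	"fmt"
	"log"
	"os"

	"code.sajari.com/word2vec"
)

func main() {
	// Load the word-vector set produced by word2vec.py above.
	f, err := os.Open("./data/word2vec.format.bin")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	model, err := word2vec.FromReader(f)
	if err != nil {
		log.Fatal(err)
	}

	// Find the 5 words most similar to "album" by cosine similarity.
	expr := word2vec.Expr{}
	expr.Add(1, "album")
	matches, err := model.CosN(expr, 5)
	if err != nil {
		log.Fatal(err)
	}
	for _, m := range matches {
		fmt.Printf("%s %.4f\n", m.Word, m.Score)
	}
}
```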
108 | ### Distributed
109 |
110 | #### Architecture
111 | ![](EasySearch.jpg)
112 | - ManagerServer: manages service membership and metadata
113 | - DataServer: stores the index data; each node hosts several index shards
114 | - SearchServer: handles query requests only
115 |
116 |
117 | #### Build the sharded index
118 | - Edit the configuration
119 | ```
120 | cd $PROJECT_DIR
121 | vim config.yml
122 | ```
123 | - Add the sharding options:
124 | ```
125 | Storage:
126 |   IndexFile: ./data/wiki_index # where the index files are stored
127 |   DumpFile: ./data/enwiki-latest-abstract1.xml.gz # path of the document dump
128 | BM25:
129 |   K1: 2
130 |   B: 0.75
131 | Cluster:
132 |   ShardingNum: 10
133 |
134 | ```
135 | - Build the index shards
136 | ```
137 | cd $PROJECT_DIR
138 | ./easysearch -m indexer --sharding=true
139 |
140 | ```
141 | #### Create a cluster
142 |
143 | ###### Create a single-machine standalone cluster
144 |
145 | - Create the cluster config file (cluster.yml is only used for creating standalone clusters)
146 | ```
147 | cd $PROJECT_DIR
148 | vim cluster.yml
149 | ```
150 | - Configuration (1 manager node, 10 data nodes, 2 search nodes)
151 | ```
152 | ManageServer:
153 |   Host: 127.0.0.1
154 |   Port: 1234
155 | SearchServer:
156 |   - Host: 127.0.0.1
157 |     Port: 1235
158 |   - Host: 127.0.0.1
159 |     Port: 1236
160 | DataServer:
161 |   - Host: 127.0.0.1
162 |     Port: 1240
163 |   - Host: 127.0.0.1
164 |     Port: 1241
165 |   - Host: 127.0.0.1
166 |     Port: 1242
167 |   - Host: 127.0.0.1
168 |     Port: 1243
169 |   - Host: 127.0.0.1
170 |     Port: 1244
171 |   - Host: 127.0.0.1
172 |     Port: 1245
173 |   - Host: 127.0.0.1
174 |     Port: 1246
175 |   - Host: 127.0.0.1
176 |     Port: 1247
177 |   - Host: 127.0.0.1
178 |     Port: 1248
179 |   - Host: 127.0.0.1
180 |     Port: 1249
181 | ```
182 | - Update config.yml as follows
183 | ```
184 | Storage:
185 |   IndexFile: ./data/wiki_index # where the index files are stored
186 |   DumpFile: ./data/enwiki-latest-abstract1.xml.gz # path of the document dump
187 | BM25:
188 |   K1: 2
189 |   B: 0.75
190 | Cluster:
191 |   ShardingNum: 10
192 |   ManageServer: # keep host/port consistent with the cluster config
193 |     Host: 127.0.0.1
194 |     Port: 1234
195 | ```
196 | - Start the cluster
197 | ```
198 | bash start.sh standalone
199 | ```
200 | ###### Create a distributed cluster
201 | - To build your own cluster, provision machine instances and start each service on a separate node
202 | - Start order: ManagerServer -> DataServer -> SearchServer
203 | - Start the ManagerServer
204 |   - Configuration
205 | ```
206 | Server: # ManagerServer host and port
207 |   Host: 127.0.0.1
208 |   Port: 1234
209 | Cluster:
210 |   ShardingNum: 10
211 |   ReplicateNum: 3
212 | ```
213 |   - Run
214 | ```
215 | ./easysearch -m cluster --servername=managerserver
216 | ```
217 |
218 | - Start the DataServer
219 |   - Configuration
220 | ```
221 | Storage:
222 |   IndexFile: ./data/wiki_index # where the index files are stored
223 |   DumpFile: ./data/enwiki-latest-abstract1.xml.gz # path of the document dump
224 | BM25:
225 |   K1: 2
226 |   B: 0.75
227 | Server: # DataServer host and port
228 |   Host: 127.0.0.1
229 |   Port: 1240
230 | Cluster:
231 |   ShardingNum: 10
232 |   ReplicateNum: 3
233 |   ManageServer: # keep the ManagerServer host/port consistent with the cluster config
234 |     Host: 127.0.0.1
235 |     Port: 1234
236 | ```
237 |   - Run
238 | ```
239 | ./easysearch -m cluster --servername=dataserver
240 | ```
241 | - Start the SearchServer
242 |   - Configuration
243 | ```
244 | Server: # SearchServer host and port
245 |   Host: 127.0.0.1
246 |   Port: 1235
247 | Cluster:
248 |   ShardingNum: 10
249 |   ReplicateNum: 3
250 |   ManageServer: # keep the ManagerServer host/port consistent with the cluster config
251 |     Host: 127.0.0.1
252 |     Port: 1234
253 | ```
254 |   - Run
255 | ```
256 | ./easysearch -m cluster --servername=searchserver
257 | ```
258 | #### Distributed search
259 |
260 | - Query
261 | ```
262 | ./easysearch -m searcher -q "Album Jordan" --source=remote
263 | ```
264 | - OUTPUT:
265 | ```
266 | 2021/12/20 19:45:03 Starting remote search..
267 | 2021/12/20 19:45:04 Search found 5 documents in 611.645503ms
268 | 2021/12/20 19:45:04 10 The Great Session is an album led by pianist Duke Jordan recorded in 1978 and released on the Danish SteepleChase label in 1981.Duke Jordan discography, accessed March 24, 2015SteepleChase Records discography, accessed March 24, 2015
269 | 2021/12/20 19:45:04 605 Thinking of You is an album led by pianist Duke Jordan recorded in 1979 in Denmark (with one track from 1978) and released on the Danish SteepleChase label in 1982.Duke Jordan discography, accessed March 24, 2015SteepleChase Records discography, accessed March 24, 2015
270 | 2021/12/20 19:45:04 613 Change a Pace is an album led by pianist Duke Jordan recorded in 1979 in Denmark and released on the Danish SteepleChase label in 1980.Duke Jordan discography, accessed March 24, 2015SteepleChase Records discography, accessed March 24, 2015
271 | 2021/12/20 19:45:04 597 Flight to Japan is an album led by the pianist Duke Jordan, recorded in 1976 in Tokyo and released on the Danish SteepleChase label in 1978.Duke Jordan discography, accessed March 24, 2015- SteepleChase Records discography, accessed March 24, 2015
272 | 2021/12/20 19:45:04 564 Suburbs is an album by the American New wave band The Suburbs, released in 1986. It was their first and only release on A&M Records.
273 |
274 | ```
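
Besides the CLI, the cluster can be queried programmatically through the SearchClient RPC wrapper in cluster/searchclient.go, pointed at the ManagerServer address from the config above (a minimal sketch, mirroring cluster/searchclient_test.go):

```
package main

import (
	"fmt"

	"github.com/awesomefly/easysearch/cluster"
	"github.com/awesomefly/easysearch/config"
)

func main() {
	// The client pulls the cluster topology from the ManagerServer,
	// then routes the query to a random SearchServer node.
	cli := cluster.NewSearchClient(&config.Server{Host: "127.0.0.1", Port: 1234})
	docs, err := cli.Search("Album Jordan")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", docs)
}
```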
275 |
276 |
277 | ## TODO
278 | - Posting-list compression and faster merging
279 | - Dictionary/index compression to reduce storage
280 | - LR and DNN models for fine-grained ranking
281 | - Vector retrieval as an additional recall channel
282 |
283 | ## References
284 |
285 | - [skip-list vs btree](https://stackoverflow.com/questions/256511/skip-list-vs-binary-search-tree/28270537#28270537)
286 | - [simple fts](https://artem.krylysov.com/blog/2020/07/28/lets-build-a-full-text-search-engine/)
287 |
288 |
289 |
--------------------------------------------------------------------------------
/cluster.yml:
--------------------------------------------------------------------------------
1 | ManageServer:
2 |   Host: 127.0.0.1
3 |   Port: 1234
4 | SearchServer:
5 |   - Host: 127.0.0.1
6 |     Port: 1235
7 |   - Host: 127.0.0.1
8 |     Port: 1236
9 | DataServer:
10 |   - Host: 127.0.0.1
11 |     Port: 1240
12 |   - Host: 127.0.0.1
13 |     Port: 1241
14 |   - Host: 127.0.0.1
15 |     Port: 1242
16 |   - Host: 127.0.0.1
17 |     Port: 1243
18 |   - Host: 127.0.0.1
19 |     Port: 1244
20 |   - Host: 127.0.0.1
21 |     Port: 1245
22 |   - Host: 127.0.0.1
23 |     Port: 1246
24 |   - Host: 127.0.0.1
25 |     Port: 1247
26 |   - Host: 127.0.0.1
27 |     Port: 1248
28 |   - Host: 127.0.0.1
29 |     Port: 1249
--------------------------------------------------------------------------------
/cluster/cluster.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"errors"
5 | 	"math/rand"
6 | )
7 |
8 | const (
9 | 	ManagerNode = 1
10 | 	DataNode    = 2
11 | 	SearchNode  = 3
12 | )
13 |
14 | type Node struct {
15 | 	ID   int
16 | 	Type int
17 | 	Host string // ip:port
18 |
19 | 	LeaderSharding   []int // shards this node leads (primary copies)
20 | 	FollowerSharding []int // shards this node replicates (follower copies)
21 | }
22 |
23 | type Cluster struct {
24 | 	ShardingNum  int // number of shards
25 | 	ReplicateNum int // number of copies kept of each shard's data
26 |
27 | 	SearchNodeCorpus []Node
28 | 	DataNodeCorpus   map[string]Node
29 | }
30 |
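// NewCluster creates an empty topology with `shard` shards and `replicate` copies of each
// shard's data. The ManagerServer later assigns every shard one leader and up to
// ReplicateNum-1 followers across the registered data nodes (see ManagerServer.ReBalance);
// SearchServer reads prefer follower shards and fall back to leaders (see SearchServer.SearchAll).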
31 | func NewCluster(shard, replicate int) *Cluster {
32 | 	return &Cluster{
33 | 		ShardingNum:      shard,
34 | 		ReplicateNum:     replicate,
35 | 		SearchNodeCorpus: make([]Node, 0),
36 | 		DataNodeCorpus:   make(map[string]Node, 0),
37 | 	}
38 | }
39 |
40 | func (c *Cluster) Add(node Node) error {
41 | 	switch node.Type {
42 | 	case DataNode:
43 | 		c.DataNodeCorpus[node.Host] = node
44 | 	case SearchNode:
45 | 		c.SearchNodeCorpus = append(c.SearchNodeCorpus, node)
46 | 	default:
47 | 		return errors.New("invalid node type")
48 | 	}
49 | 	return nil
50 | }
51 |
52 | const (
53 | 	LeaderSharding   = 1
54 | 	FollowerSharding = 2
55 | )
56 |
57 | type Sharding2Node map[int][]Node
58 |
59 | func (c *Cluster) RouteShardingNode(flag int) (Sharding2Node, error) {
60 | 	result := make(Sharding2Node, 0)
61 | 	for _, node := range c.DataNodeCorpus {
62 | 		switch flag {
63 | 		case LeaderSharding:
64 | 			for _, shard := range node.LeaderSharding {
65 | 				result[shard] = append(result[shard], node)
66 | 			}
67 | 		case FollowerSharding:
68 | 			for _, shard := range node.FollowerSharding {
69 | 				result[shard] = append(result[shard], node)
70 | 			}
71 | 		}
72 | 	}
73 | 	return result, nil
74 | }
75 |
76 | func (c *Cluster) RouteSearchNode() Node {
77 | 	n := rand.Intn(len(c.SearchNodeCorpus))
78 | 	return c.SearchNodeCorpus[n]
79 | }
--------------------------------------------------------------------------------
/cluster/dataserver.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"fmt"
5 | 	"math/rand"
6 |
7 | 	"github.com/awesomefly/easysearch/config"
8 |
9 | 	"github.com/awesomefly/easysearch/index"
10 | 	"github.com/awesomefly/easysearch/search"
11 | )
12 |
13 | type DataServer struct {
14 | 	self    Node
15 | 	cluster Cluster
16 |
17 | 	sharding map[int]*search.Searcher
18 | 	server   *Server
19 | }
20 |
21 | func NewDataServer(config *config.Config) *DataServer {
22 | 	ds := DataServer{
23 | 		self: Node{
24 | 			ID:   rand.Intn(10000), //todo: support uuid
25 | 			Type: DataNode,
26 | 			Host: config.Server.Address(),
27 | 		},
28 | 		server:   &Server{name: "Data", network: "tcp", address: config.Server.Address()},
29 | 		sharding: make(map[int]*search.Searcher, 0),
30 | 	}
31 |
32 | 	n := Node{}
33 | 	err := RpcCall(config.Cluster.ManageServer.Address(), "ManagerServer.AddServer", ds.self, &n)
34 | 	if err != nil {
35 | 		panic(err)
36 | 	}
37 | 	ds.self = n
38 |
39 | 	c := Cluster{}
40 | 	err = RpcCall(config.Cluster.ManageServer.Address(), "ManagerServer.GetCluster", ds.self.Host, &c)
41 | 	if err != nil {
42 | 		panic(err)
43 | 	}
44 | 	ds.cluster = c
45 | 	//fmt.Printf("DataServer:%+v\n", ds)
46 |
47 | 	if len(config.Store.IndexFile) == 0 {
48 | 		panic("index file is empty.")
49 | 	}
50 |
51 | 	for _, shard := range ds.self.LeaderSharding {
52 | 		searcher := search.NewSearcher(fmt.Sprintf("%s.%d", config.Store.IndexFile, shard))
53 | 		if config.Store.ModelFile != "" {
54 | 			searcher.InitParaphrase(config.Store.ModelFile)
55 | 		}
56 | 		ds.sharding[shard] = searcher
57 | 	}
58 |
59 | 	for _, shard := range ds.self.FollowerSharding {
60 | 		searcher := search.NewSearcher(fmt.Sprintf("%s.%d", config.Store.IndexFile, shard))
61 | 		if config.Store.ModelFile != "" {
62 | 			searcher.InitParaphrase(config.Store.ModelFile)
63 | 		}
64 | 		ds.sharding[shard] = searcher
65 | 	}
66 | 	return &ds
67 | }
68 |
69 | func (s *DataServer) Run() {
70 | 	if err := s.server.RegisterName("DataServer", s); err != nil {
71 | 		panic(err)
72 | 	}
73 | 	if err := s.server.Run(); err != nil {
74 | 		panic(err)
75 | 	}
76 | }
77 |
78 | type SearchRequest struct {
79 | 	Query    string
80 | 	Sharding []int
81 | }
82 |
83 | // Search runs the query against every requested shard held by this node and concatenates the hits.
84 | func (s *DataServer) Search(request SearchRequest, response *[]index.Doc) error {
85 | 	result := make([]index.Doc, 0)
86 | 	for _, shard := range request.Sharding {
87 | 		srh := s.sharding[shard]
88 | 		if srh == nil {
89 | 			continue
90 | 		}
91 | 		x := srh.Search(request.Query)
92 | 		result = append(result, x...)
93 | 	}
94 | 	*response = result
95 | 	return nil
96 | }
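// Note: the offline sharding indexer (cluster/shardingindexer.go) and the real-time
// Add/Del below all route a document with doc.ID % ShardingNum, so online writes land
// on the shard that already owns the document.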
97 |
98 | // Add applies a real-time document update to the shard that owns it.
99 | func (s *DataServer) Add(doc index.Document) {
100 | 	shard := doc.ID % s.cluster.ShardingNum
101 | 	srh := s.sharding[shard]
102 | 	srh.Add(doc)
103 | }
104 |
105 | // Del removes a document in real time.
106 | func (s *DataServer) Del(doc index.Document) {
107 | 	shard := doc.ID % s.cluster.ShardingNum
108 | 	srh := s.sharding[shard]
109 | 	srh.Del(doc)
110 | }
111 |
112 | // KeepAlive todo: followers should heartbeat their shard's leader; once the leader is found dead, start an election or ask the ManagerServer to reassign the leader.
113 | /*
114 | func (s *DataServer) KeepAlive() {
115 | 	for _, shardId := range s.self.FollowerSharding {
116 | 		key, ok := s.cluster.consistentHash.GetNode(fmt.Sprintf("%d", shardId))
117 | 		if !ok {
118 | 			panic("")
119 | 		}
120 | 		ip := s.cluster.DataNodeCorpus[key].IP
121 | 		port := s.cluster.DataNodeCorpus[key].Port
122 |
123 | 		ok := KeepAlive(ip, port)
124 | 		if !ok {
125 | 			keys, ok := s.cluster.consistentHash.GetNodes(fmt.Sprintf("%d", shardId), s.cluster.ReplicateNum+1)
126 | 			if !ok {
127 | 				panic("")
128 | 			}
129 | 			followers := GetNodes(keys)
130 | 			StartElection(followers)
131 | 		}
132 | 	}
133 | }
134 | */
135 |
--------------------------------------------------------------------------------
/cluster/dataserver_test.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"fmt"
5 | 	"testing"
6 | 	"time"
7 |
8 | 	"github.com/awesomefly/easysearch/config"
9 | 	"github.com/awesomefly/easysearch/index"
10 | 	"github.com/stretchr/testify/assert"
11 | )
12 |
13 | func TestDataServer(t *testing.T) {
14 | 	managerConfig := config.Config{
15 | 		Server: config.Server{
16 | 			Host: "127.0.0.1",
17 | 			Port: 1234,
18 | 		},
19 | 		Cluster: config.Cluster{
20 | 			ShardingNum:  10,
21 | 			ReplicateNum: 3,
22 | 		},
23 | 	}
24 | 	server := NewManagerServer(&managerConfig)
25 | 	assert.NotNil(t, server)
26 | 	go server.Run()
27 | 	time.Sleep(1 * time.Second)
28 |
29 | 	dataSvrConfig := config.Config{
30 | 		Store: config.Storage{
31 | 			IndexFile: "../data/wiki_index",
32 | 		},
33 | 		Server: config.Server{
34 | 			Host: "127.0.0.1",
35 | 			Port: 1240,
36 | 		},
37 | 		Cluster: config.Cluster{
38 | 			ShardingNum:  10,
39 | 			ReplicateNum: 3,
40 | 			ManageServer: config.Server{
41 | 				Host: "127.0.0.1",
42 | 				Port: 1234,
43 | 			},
44 | 		},
45 | 	}
46 | 	ds := NewDataServer(&dataSvrConfig)
47 | 	assert.NotNil(t, ds)
48 |
49 | 	var response []index.Doc
50 | 	err := ds.Search(SearchRequest{Query: "Jordan", Sharding: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}}, &response)
51 | 	assert.Nil(t, err)
52 |
53 | 	fmt.Printf("%+v\n", response)
54 | }
--------------------------------------------------------------------------------
/cluster/managerserver.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"errors"
5 | 	"fmt"
6 | 	"log"
7 |
8 | 	"github.com/awesomefly/easysearch/util"
9 |
10 | 	"github.com/serialx/hashring"
11 |
12 | 	"github.com/awesomefly/easysearch/config"
13 | )
14 |
15 | type ManagerServer struct {
16 | 	cluster *Cluster
17 | 	hash    *hashring.HashRing
18 |
19 | 	server *Server
20 | }
21 |
22 | func NewManagerServer(config *config.Config) *ManagerServer {
23 | 	srv := &ManagerServer{
24 | 		cluster: NewCluster(config.Cluster.ShardingNum, config.Cluster.ReplicateNum),
25 | 		hash:    hashring.New(make([]string, 0)),
26 | 		server:  &Server{name: "Manage", network: "tcp", address: config.Server.Address()},
27 | 	}
28 | 	return srv
29 | }
30 |
31 | func (m *ManagerServer) Run() {
32 | 	if err := m.server.RegisterName("ManagerServer", m); err != nil {
33 | 		panic(err)
34 | 	}
35 | 	if err := m.server.Run(); err != nil {
36 | 		panic(err)
37 | 	}
38 | }
39 |
40 | // AddServer registers a node with the cluster; called by DataServer and SearchServer on startup.
41 | func (m *ManagerServer) AddServer(request Node, response *Node) error {
42 | 	log.Print("AddServer from ", request.Host)
43 |
44 | 	m.cluster.Add(request)
45 | 	if request.Type == DataNode {
46 | 		m.hash = m.hash.AddNode(request.Host)
47 | 		if err := m.ReBalance(); err != nil {
48 | 			return err
49 | 		}
50 |
51 | 		go func() {
52 | 			// todo: notify nodes whose shard assignment changed via a channel
53 | 		}()
54 |
55 | 		*response = m.cluster.DataNodeCorpus[request.Host]
56 | 	}
57 | 	return nil
58 | }
59 |
60 | // GetCluster returns the current cluster topology; called by data nodes, search nodes and clients.
61 | func (m *ManagerServer) GetCluster(request string, response *Cluster) error {
62 | 	log.Print("GetCluster from ", request)
63 | 	*response = *m.cluster
64 | 	return nil
65 | }
66 |
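// ReBalance (below) recomputes shard placement with consistent hashing: each shard id is
// hashed onto the ring of data-node hosts; the first node found becomes the shard's leader
// and the next ReplicateNum-1 distinct nodes become its followers. Because placement is a
// function of the ring, adding a node only moves a small fraction of the shards.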
67 | func (m *ManagerServer) ReBalance() error {
68 | 	for k, node := range m.cluster.DataNodeCorpus {
69 | 		node.LeaderSharding = make([]int, 0)
70 | 		node.FollowerSharding = make([]int, 0)
71 | 		m.cluster.DataNodeCorpus[k] = node
72 | 	}
73 |
74 | 	size := util.IfElseInt(len(m.cluster.DataNodeCorpus) < m.cluster.ReplicateNum, len(m.cluster.DataNodeCorpus), m.cluster.ReplicateNum)
75 | 	for i := 0; i < m.cluster.ShardingNum; i++ {
76 | 		nodes, ok := m.hash.GetNodes(fmt.Sprint(i), size)
77 | 		if !ok {
78 | 			return errors.New("get nodes err: invalid replicate num")
79 | 		}
80 | 		if len(nodes) < size {
81 | 			return errors.New("unexpected nodes size err")
82 | 		}
83 |
84 | 		n := m.cluster.DataNodeCorpus[nodes[0]]
85 | 		n.LeaderSharding = append(n.LeaderSharding, i)
86 | 		m.cluster.DataNodeCorpus[n.Host] = n
87 | 		for _, k := range nodes[1:] {
88 | 			n = m.cluster.DataNodeCorpus[k]
89 | 			n.FollowerSharding = append(n.FollowerSharding, i)
90 | 			m.cluster.DataNodeCorpus[n.Host] = n
91 | 		}
92 | 	}
93 | 	return nil
94 | }
--------------------------------------------------------------------------------
/cluster/managerserver_test.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"fmt"
5 | 	"log"
6 | 	"net/rpc"
7 | 	"runtime"
8 | 	"testing"
9 | 	"time"
10 |
11 | 	"github.com/awesomefly/easysearch/config"
12 | 	"github.com/stretchr/testify/assert"
13 | )
14 |
15 | func addServer(host string) error {
16 | 	client, err := rpc.Dial("tcp", ":1234")
17 | 	if err != nil {
18 | 		log.Println("dialing:", err)
19 | 		return err
20 | 	}
21 | 	var request = Node{
22 | 		Host: host,
23 | 	}
24 | 	var response Node
25 | 	err = client.Call("ManagerServer.AddServer", request, &response)
26 | 	if err != nil {
27 | 		log.Println(err)
28 | 		return err
29 | 	}
30 |
31 | 	fmt.Printf("resp:%+v\n", response)
32 | 	client.Close()
33 | 	return nil
34 | }
35 |
36 | func getCluster() error {
37 | 	client, err := rpc.Dial("tcp", ":1234")
38 | 	if err != nil {
39 | 		log.Println("dialing:", err)
40 | 		return err
41 | 	}
42 |
43 | 	var response Cluster
44 | 	err = client.Call("ManagerServer.GetCluster", "local", &response)
45 | 	if err != nil {
46 | 		log.Println(err)
47 | 		return err
48 | 	}
49 |
50 | 	fmt.Printf("resp:%+v\n", response)
51 | 	client.Close()
52 | 	return nil
53 | }
54 |
55 | func TestManageServer(t *testing.T) {
56 | 	conf := config.Config{
57 | 		Server: config.Server{
58 | 			Host: "127.0.0.1",
59 | 			Port: 1234,
60 | 		},
61 |
62 | 		Cluster: config.Cluster{
63 | 			ShardingNum:  10,
64 | 			ReplicateNum: 3,
65 | 		},
66 | 	}
67 | 	server := NewManagerServer(&conf)
68 | 	go server.Run()
69 | 	runtime.Gosched()
70 |
71 | 	time.Sleep(1 * time.Second)
72 | 	assert.NotNil(t, server)
73 |
addServer("127.0.0.1:8801")) 75 | assert.Equal(t, nil, getCluster()) 76 | assert.Equal(t, nil, addServer("127.0.0.1:8802")) 77 | assert.Equal(t, nil, getCluster()) 78 | assert.Equal(t, nil, addServer("127.0.0.1:8803")) 79 | assert.Equal(t, nil, getCluster()) 80 | assert.Equal(t, nil, addServer("127.0.0.1:8804")) 81 | assert.Equal(t, nil, getCluster()) 82 | } 83 | -------------------------------------------------------------------------------- /cluster/searchclient.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "log" 5 | "net/rpc" 6 | 7 | "github.com/awesomefly/easysearch/util" 8 | 9 | "github.com/awesomefly/easysearch/config" 10 | "github.com/awesomefly/easysearch/index" 11 | ) 12 | 13 | //RpcCall RPC方法必须满足Go语言的RPC规则:方法只能有两个可序列化的参数,其中第二个参数是指针类型,并且返回一个error类型,同时必须是公开的方法 14 | func RpcCall(host string, method string, request interface{}, response interface{}) error { 15 | client, err := rpc.Dial("tcp", host) 16 | if err != nil { 17 | log.Fatal("dialing:", err) 18 | return err 19 | } 20 | 21 | switch v := response.(type) { 22 | case *Node: 23 | err = client.Call(method, request, v) 24 | case *Cluster: 25 | err = client.Call(method, request, v) 26 | case *[]index.Doc: 27 | err = client.Call(method, request, v) 28 | } 29 | if err != nil { 30 | log.Fatal(err) 31 | return err 32 | } 33 | client.Close() 34 | 35 | log.Printf("RPC Response:%+v", response) 36 | return nil 37 | } 38 | 39 | type SearchClient struct { 40 | ServerConfig *config.Server //manager server config 41 | cluster *Cluster //todo: cached and refresh cluster info 42 | } 43 | 44 | func NewSearchClient(config *config.Server) *SearchClient { 45 | client := SearchClient{ 46 | ServerConfig: config, 47 | cluster: &Cluster{}, 48 | } 49 | 50 | err := RpcCall(client.ServerConfig.Address(), "ManagerServer.GetCluster", util.GetLocalIP(), client.cluster) 51 | if err != nil { 52 | panic(err) 53 | } 54 | return &client 55 | } 56 | 57 | func (c *SearchClient) Search(query string) ([]index.Doc, error) { 58 | response := make([]index.Doc, 0) 59 | if err := RpcCall(c.cluster.RouteSearchNode().Host, "SearchServer.SearchAll", query, &response); err != nil { 60 | return response, err 61 | } 62 | return response, nil 63 | } 64 | -------------------------------------------------------------------------------- /cluster/searchclient_test.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/awesomefly/easysearch/config" 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func TestSearchClient(t *testing.T) { 12 | config := &config.Server{ 13 | Host: "127.0.0.1", 14 | Port: 1234, 15 | } 16 | cli := NewSearchClient(config) 17 | result, err := cli.Search("Album Jordan") 18 | assert.Nil(t, err) 19 | assert.NotNil(t, result) 20 | fmt.Printf("result:%+v\n", result) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /cluster/searchserver.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "math/rand" 5 | "sort" 6 | 7 | "github.com/awesomefly/easysearch/config" 8 | 9 | "github.com/awesomefly/easysearch/index" 10 | ) 11 | 12 | type SearchServer struct { 13 | cluster Cluster 14 | server *Server 15 | } 16 | 17 | func NewSearchServer(config *config.Config) *SearchServer { 18 | self := Node{ 19 | ID: rand.Intn(10000), //todo: support uuid 20 | Type: 
20 | 		Type: SearchNode,
21 | 		Host: config.Server.Address(),
22 | 	}
23 | 	err := RpcCall(config.Cluster.ManageServer.Address(), "ManagerServer.AddServer", self, &Node{})
24 | 	if err != nil {
25 | 		panic(err)
26 | 	}
27 |
28 | 	c := Cluster{}
29 | 	err = RpcCall(config.Cluster.ManageServer.Address(), "ManagerServer.GetCluster", "", &c)
30 | 	if err != nil {
31 | 		panic(err)
32 | 	}
33 |
34 | 	return &SearchServer{
35 | 		cluster: c,
36 | 		server:  &Server{name: "Search", network: "tcp", address: config.Server.Address()},
37 | 	}
38 | }
39 |
40 | func (s *SearchServer) Run() {
41 | 	if err := s.server.RegisterName("SearchServer", s); err != nil {
42 | 		panic(err)
43 | 	}
44 | 	if err := s.server.Run(); err != nil {
45 | 		panic(err)
46 | 	}
47 | }
48 |
49 | // SearchAll fans the query out to one replica of every shard and merges the results.
50 | // todo: implement real-time update & delete endpoints
51 | func (s *SearchServer) SearchAll(query string, response *[]index.Doc) error {
52 | 	r, err := s.cluster.RouteShardingNode(FollowerSharding) //todo: cache router info
53 | 	if err != nil {
54 | 		return err
55 | 	}
56 |
57 | 	if len(r) == 0 {
58 | 		if r, err = s.cluster.RouteShardingNode(LeaderSharding); err != nil {
59 | 			return err
60 | 		}
61 | 	}
62 |
63 | 	result := make([]index.Doc, 0)
64 | 	for sharding, nodes := range r {
65 | 		n := rand.Intn(len(nodes))
66 |
67 | 		request := SearchRequest{
68 | 			Query:    query,
69 | 			Sharding: []int{sharding},
70 | 		}
71 | 		var reply []index.Doc
72 | 		if err = RpcCall(nodes[n].Host, "DataServer.Search", request, &reply); err != nil {
73 | 			return err
74 | 		}
75 | 		result = append(result, reply...)
76 | 	}
77 |
78 | 	// sort results by score in descending order; de-duplication is still a todo
79 | 	sort.Slice(result, func(i, j int) bool {
80 | 		return result[i].Score > result[j].Score
81 | 	})
82 | 	*response = result
83 | 	return nil
84 | }
--------------------------------------------------------------------------------
/cluster/searchserver_test.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"fmt"
5 | 	"testing"
6 | 	"time"
7 |
8 | 	"github.com/awesomefly/easysearch/config"
9 | 	"github.com/awesomefly/easysearch/index"
10 | 	"github.com/stretchr/testify/assert"
11 | )
12 |
13 | func TestSearchServer(t *testing.T) {
14 |
15 | 	var managerConfig = config.Config{
16 | 		Server: config.Server{
17 | 			Host: "127.0.0.1",
18 | 			Port: 1234,
19 | 		},
20 | 		Cluster: config.Cluster{
21 | 			ShardingNum:  10,
22 | 			ReplicateNum: 3,
23 | 		},
24 | 	}
25 |
26 | 	var dataSvrConfig = config.Config{
27 | 		Store: config.Storage{
28 | 			IndexFile: "../data/wiki_index",
29 | 		},
30 | 		Server: config.Server{
31 | 			Host: "127.0.0.1",
32 | 			Port: 1240,
33 | 		},
34 | 		Cluster: config.Cluster{
35 | 			ShardingNum:  10,
36 | 			ReplicateNum: 3,
37 | 			ManageServer: config.Server{
38 | 				Host: "127.0.0.1",
39 | 				Port: 1234,
40 | 			},
41 | 		},
42 | 	}
43 |
44 | 	var srhSvrConfig = config.Config{
45 | 		Server: config.Server{
46 | 			Host: "127.0.0.1",
47 | 			Port: 1235,
48 | 		},
49 | 		Cluster: config.Cluster{
50 | 			ShardingNum:  10,
51 | 			ReplicateNum: 3,
52 | 			ManageServer: config.Server{
53 | 				Host: "127.0.0.1",
54 | 				Port: 1234,
55 | 			},
56 | 		},
57 | 	}
58 |
59 | 	//start ManagerServer
60 | 	ms := NewManagerServer(&managerConfig)
61 | 	assert.NotNil(t, ms)
62 | 	go ms.Run()
63 | 	time.Sleep(1 * time.Second)
64 |
65 | 	//start DataServer
66 |
67 | 	ds := NewDataServer(&dataSvrConfig)
68 | 	assert.NotNil(t, ds)
69 | 	go ds.Run()
70 | 	time.Sleep(1 * time.Second)
71 |
72 | 	srh := NewSearchServer(&srhSvrConfig)
73 | 	var response []index.Doc
74 | 	err := srh.SearchAll("Jordan", &response)
75 | 	assert.Nil(t, err)
76 |
77 | 	fmt.Printf("%+v\n", response)
78 | }
79 |
--------------------------------------------------------------------------------
/cluster/server.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"io"
5 | 	"log"
6 | 	"net"
7 | 	"net/rpc"
8 | 	"os"
9 | 	"os/signal"
10 | 	"syscall"
11 | )
12 |
13 | type Server struct {
14 | 	name    string
15 | 	network string
16 | 	address string
17 |
18 | 	listener net.Listener
19 | 	handler  func(conn io.ReadWriteCloser)
20 | }
21 |
22 | func (s *Server) RegisterName(name string, rcvr interface{}) error {
23 | 	if err := rpc.RegisterName(name, rcvr); err != nil {
24 | 		return err
25 | 	}
26 |
27 | 	s.handler = func(conn io.ReadWriteCloser) {
28 | 		rpc.ServeConn(conn)
29 | 	}
30 | 	return nil
31 | }
32 |
33 | func (s *Server) Run() error {
34 | 	errChan := make(chan error, 1)
35 | 	go func() { errChan <- s.Start() }()
36 |
37 | 	if err := s.waitSignal(errChan); err != nil {
38 | 		log.Println("received error and exit: ", err.Error())
39 | 		return err
40 | 	}
41 |
42 | 	// stop server after user hooks
43 | 	if err := s.Stop(); err != nil {
44 | 		log.Println("stop server error: ", err.Error())
45 | 		return err
46 | 	}
47 | 	return nil
48 | }
49 |
50 | func (s *Server) Start() error {
51 | 	var err error
52 | 	s.listener, err = net.Listen(s.network, s.address)
53 | 	if err != nil {
54 | 		log.Println("Server ListenTCP error:", err)
55 | 		return err
56 | 	}
57 | 	log.Printf("%s Server Started.", s.name)
58 |
59 | 	for {
60 | 		conn, err := s.listener.Accept()
61 | 		if err != nil {
62 | 			log.Println("Accept error:", err)
63 | 			return err
64 | 		}
65 | 		go s.handler(conn) // serve each connection concurrently so a slow client cannot block others
66 | 	}
67 | }
68 |
69 | func (s *Server) waitSignal(errCh chan error) error {
70 | 	signals := make(chan os.Signal, 1)
71 | 	signal.Notify(signals, syscall.SIGINT, syscall.SIGHUP, syscall.SIGTERM)
72 |
73 | 	for {
74 | 		select {
75 | 		case sig := <-signals:
76 | 			switch sig {
77 | 			case syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM:
78 | 				return nil
79 | 			}
80 | 		case err := <-errCh:
81 | 			return err
82 | 		}
83 | 	}
84 | }
85 |
86 | func (s *Server) Stop() error {
87 | 	if s.listener != nil {
88 | 		return s.listener.Close()
89 | 	}
90 | 	return nil
91 | }
--------------------------------------------------------------------------------
/cluster/shardingindexer.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"fmt"
5 | 	"log"
6 | 	"os"
7 | 	"time"
8 |
9 | 	"github.com/awesomefly/easysearch/config"
10 |
11 | 	"github.com/awesomefly/easysearch/index"
12 | )
13 |
14 | func Index(conf *config.Config) {
15 | 	log.Println("Starting sharding index...")
16 |
17 | 	start := time.Now()
18 | 	docs, err := index.LoadDocuments(conf.Store.DumpFile)
19 | 	if err != nil {
20 | 		log.Fatal(err)
21 | 	}
22 | 	log.Printf("Loaded %d documents in %v", len(docs), time.Since(start))
23 |
24 | 	shards := conf.Cluster.ShardingNum
25 | 	idxes := make([]*index.BTreeIndex, 0, shards)
26 | 	for i := 0; i < shards; i++ {
27 | 		IndexFile := fmt.Sprintf("%s.%d", conf.Store.IndexFile, i)
28 | 		os.Remove(IndexFile + ".idx")
29 | 		os.Remove(IndexFile + ".kv")
30 | 		os.Remove(IndexFile + ".sum")
31 |
32 | 		idx := index.NewBTreeIndex(IndexFile)
33 | 		idxes = append(idxes, idx)
34 | 	}
35 |
36 | 	buf := make([][]index.Document, shards)
37 | 	for i := 0; i < len(buf); i++ {
38 | 		buf[i] = make([]index.Document, 0)
39 | 	}
40 |
41 | 	start = time.Now()
42 | 	for i := 0; i < len(docs); i++ {
43 | 		id := docs[i].ID % shards
44 | 		buf[id] = append(buf[id], docs[i])
45 | 		//log.Printf("keys:%s", docs[i].Text)
46 |
47 | 		if
len(buf[id]) > 20 { 48 | idxes[id].Add(buf[id]) 49 | buf[id] = make([]index.Document, 0) 50 | } 51 | } 52 | 53 | for i := 0; i < len(buf); i++ { 54 | if len(buf[i]) > 0 { 55 | idxes[i].Add(buf[i]) 56 | } 57 | } 58 | 59 | for i := 0; i < shards; i++ { 60 | idxes[i].BT.Drain() 61 | log.Printf("sharding index_%d has %d keys", i, idxes[i].BT.Count()) 62 | idxes[i].Close() 63 | } 64 | log.Printf("build index %d documents in %v", len(docs), time.Since(start)) 65 | } 66 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | Storage: 2 | IndexFile: ./data/wiki_index 3 | DumpFile: ./data/enwiki-latest-abstract18.xml.gz 4 | BM25: 5 | K1: 2 6 | B: 0.75 7 | Server: 8 | Host: 9 | Port: 10 | Cluster: 11 | ShardingNum: 10 12 | ReplicateNum: 3 13 | ManageServer: 14 | Host: 127.0.0.1 15 | Port: 1234 -------------------------------------------------------------------------------- /config/Config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "path/filepath" 7 | 8 | "gopkg.in/yaml.v2" 9 | ) 10 | 11 | type BM25Parameters struct { 12 | K1 float32 `yaml:"K1"` 13 | B float32 `yaml:"B"` 14 | } 15 | 16 | type Storage struct { 17 | DumpFile string `yaml:"DumpFile"` 18 | IndexFile string `yaml:"IndexFile"` 19 | ModelFile string `yaml:"ModelFile"` 20 | } 21 | 22 | type Cluster struct { 23 | ShardingNum int `yaml:"ShardingNum"` 24 | ReplicateNum int `yaml:"ReplicateNum"` 25 | ManageServer Server `yaml:"ManageServer"` 26 | SearchServer []Server `yaml:"SearchServer"` 27 | DataServer []Server `yaml:"DataServer"` 28 | } 29 | 30 | type Server struct { 31 | Host string `yaml:"Host"` 32 | Port int `yaml:"Port"` 33 | } 34 | 35 | func (s *Server) Address() string { 36 | return fmt.Sprint(s.Host, ":", s.Port) 37 | } 38 | 39 | type Config struct { 40 | Store Storage `yaml:"Storage"` 41 | BM25 BM25Parameters `yaml:"BM25"` 42 | Server Server `yaml:"Server"` 43 | Cluster Cluster `yaml:"Cluster"` 44 | } 45 | 46 | func InitClusterConfig(path string) *Cluster { 47 | file, _ := filepath.Abs(path) 48 | buffer, err := ioutil.ReadFile(file) 49 | if err != nil { 50 | panic(err.Error()) 51 | } 52 | 53 | cluster := Cluster{} 54 | if err = yaml.Unmarshal(buffer, &cluster); err != nil { 55 | panic(err.Error()) 56 | } 57 | fmt.Printf("cluster: %+v\n", cluster) 58 | return &cluster 59 | } 60 | 61 | func InitConfig(path string) *Config { 62 | file, _ := filepath.Abs(path) 63 | buffer, err := ioutil.ReadFile(file) 64 | if err != nil { 65 | panic(err.Error()) 66 | } 67 | 68 | config := Config{} 69 | if err = yaml.Unmarshal(buffer, &config); err != nil { 70 | panic(err.Error()) 71 | } 72 | fmt.Printf("config: %+v\n", config) 73 | return &config 74 | } 75 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/awesomefly/easysearch 2 | 3 | go 1.14 4 | 5 | require ( 6 | code.sajari.com/word2vec v1.0.0 7 | github.com/RoaringBitmap/roaring v1.2.1 8 | //github.com/awesomefly/gobtree v0.0.0-20211221104356-310dd71c2899 9 | github.com/awesomefly/gobtree v0.0.0 10 | github.com/davecgh/go-spew v1.1.1 // indirect 11 | github.com/go-nlp/bm25 v1.0.0 12 | github.com/go-nlp/tfidf v1.1.0 13 | github.com/gogo/protobuf v1.3.2 // indirect 14 | github.com/golang/protobuf v1.5.2 // indirect 15 | 
github.com/google/go-cmp v0.5.6 // indirect 16 | github.com/kljensen/snowball v0.6.0 17 | github.com/kr/pretty v0.2.0 // indirect 18 | github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b 19 | github.com/stretchr/testify v1.7.0 20 | github.com/xtgo/set v1.0.0 21 | github.com/ziutek/blas v0.0.0-20190227122918-da4ca23e90bb // indirect 22 | golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6 // indirect 23 | google.golang.org/protobuf v1.27.1 // indirect 24 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect 25 | gopkg.in/yaml.v2 v2.4.0 26 | gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect 27 | ) 28 | 29 | replace github.com/awesomefly/gobtree => ../gobtree 30 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | code.sajari.com/word2vec v1.0.0 h1:gg1Bk3ea3mGPZMS2/qh1iPJM5iotSSHyIxq4gUdlH+0= 2 | code.sajari.com/word2vec v1.0.0/go.mod h1:Ut8mx+2Q79Js3uGW1+HtbuuUIWoGMAMCOZK06xDly00= 3 | dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= 4 | github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= 5 | github.com/RoaringBitmap/roaring v1.2.1 h1:58/LJlg/81wfEHd5L9qsHduznOIhyv4qb1yWcSvVq9A= 6 | github.com/RoaringBitmap/roaring v1.2.1/go.mod h1:icnadbWcNyfEHlYdr+tDlOTih1Bf/h+rzPpv4sbomAA= 7 | github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= 8 | github.com/apache/arrow/go/arrow v0.0.0-20200909005831-30143fc493df h1:iXnL0pMIR/RDUWl0kCbc0CQ3UyehlyV+t/DYCLJTbFc= 9 | github.com/apache/arrow/go/arrow v0.0.0-20200909005831-30143fc493df/go.mod h1:QNYViu/X0HXDHw7m3KXzWSVXIbfUvJqBFe6Gj8/pYA0= 10 | github.com/bits-and-blooms/bitset v1.2.0 h1:Kn4yilvwNtMACtf1eYDlG8H77R07mZSPbMjLyS07ChA= 11 | github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= 12 | github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k= 13 | github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0= 14 | github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0= 15 | github.com/chewxy/math32 v1.0.6 h1:JWZYUNl2rtgVVui6z8JBsDgkOG2DYmfSODyo95yKfx4= 16 | github.com/chewxy/math32 v1.0.6/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= 17 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 18 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 19 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 20 | github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= 21 | github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= 22 | github.com/go-nlp/bm25 v1.0.0 h1:PqYFen7iJ9ELrwfX0bCDlHRFDI9Noh70L7Tiu4I5w0U= 23 | github.com/go-nlp/bm25 v1.0.0/go.mod h1:S6rXCa2AzMDkurLgYY1PDvjr0DhjBs2V6RwyI/jFDrc= 24 | github.com/go-nlp/tfidf v1.1.0 h1:o7WgFHu6ZhhB2t9r60ZOvJczRc9wIXcHpdihHPFzR4c= 25 | github.com/go-nlp/tfidf v1.1.0/go.mod h1:mp0+0R3dyswbtXFKrMO4cS134Qd5ZcTG8Z6QGFiSqBI= 26 | github.com/gogo/protobuf v1.3.0/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= 27 | github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= 28 | 
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= 29 | github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= 30 | github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 31 | github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= 32 | github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= 33 | github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= 34 | github.com/google/flatbuffers v1.11.0 h1:O7CEyB8Cb3/DmtxODGtLHcEvpr81Jm5qLg/hsHnxA2A= 35 | github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= 36 | github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 37 | github.com/google/go-cmp v0.5.6 h1:BKbKCqvP6I+rmFHt06ZmyQtvB8xAkWdhFyr0ZUNZcxQ= 38 | github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 39 | github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= 40 | github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= 41 | github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= 42 | github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= 43 | github.com/kljensen/snowball v0.6.0 h1:6DZLCcZeL0cLfodx+Md4/OLC6b/bfurWUOUGs1ydfOU= 44 | github.com/kljensen/snowball v0.6.0/go.mod h1:27N7E8fVU5H68RlUmnWwZCfxgt4POBJfENGMvNRhldw= 45 | github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs= 46 | github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= 47 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 48 | github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= 49 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 50 | github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= 51 | github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= 52 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 53 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 54 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 55 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 56 | github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b h1:h+3JX2VoWTFuyQEo87pStk/a99dzIO1mM9KxIyLPGTU= 57 | github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b/go.mod h1:/yeG0My1xr/u+HZrFQ1tOQQQQrOawfyMUH13ai5brBc= 58 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 59 | github.com/stretchr/testify v1.1.4/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 60 | github.com/stretchr/testify v1.2.0/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 61 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 62 | github.com/stretchr/testify v1.6.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 63 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= 64 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 65 | github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY= 
66 | github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8= 67 | github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 68 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 69 | github.com/ziutek/blas v0.0.0-20190227122918-da4ca23e90bb h1:uWiILQloLUVdtPYr1ZZo2zqtlpzo4G8vUpglo/Fs2H8= 70 | github.com/ziutek/blas v0.0.0-20190227122918-da4ca23e90bb/go.mod h1:J3xKssoVdrwZ2E29fIox/EKxOZWimS7AZ4fOTCFkOLo= 71 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 72 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 73 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 74 | golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 75 | golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 76 | golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 77 | golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 78 | golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6 h1:QE6XYQK6naiK1EPAe1g/ILLxN5RBoH5xkJk3CqlMI/Y= 79 | golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= 80 | golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= 81 | golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= 82 | golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= 83 | golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= 84 | golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= 85 | golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= 86 | golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 87 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 88 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 89 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 90 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 91 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 92 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 93 | golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 94 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 95 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 96 | golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 97 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 98 | golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 99 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 100 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 101 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 102 | golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 103 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 104 | golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 105 | golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 106 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 107 | golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= 108 | golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= 109 | golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= 110 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 111 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 112 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 113 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= 114 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 115 | gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= 116 | gonum.org/v1/gonum v0.7.0 h1:Hdks0L0hgznZLG9nzXb8vZ0rRvqNvAcgAp84y7Mwkgw= 117 | gonum.org/v1/gonum v0.7.0/go.mod h1:L02bwd0sqlsvRv41G7wGWFCsVNZFv/k1xzGIxeANHGM= 118 | gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc= 119 | gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= 120 | gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= 121 | google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= 122 | google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= 123 | google.golang.org/protobuf v1.27.1 h1:SnqbnDw1V7RiZcXPx5MEeqPv2s79L9i7BJUlG/+RurQ= 124 | google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= 125 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 126 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= 127 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 128 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 129 | gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= 130 | gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= 131 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 132 | gopkg.in/yaml.v3 
v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo= 133 | gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 134 | gorgonia.org/tensor v0.9.11 h1:L7C+syNtsIcZ/91tJFT0QnAzXJyFt6tWSW6+URIucDM= 135 | gorgonia.org/tensor v0.9.11/go.mod h1:fsbuoeL1vV3fe8N+HZxEXJ7WI4z1pPP3luMBCgn0HAA= 136 | gorgonia.org/vecf32 v0.9.0 h1:PClazic1r+JVJ1dEzRXgeiVl4g1/Hf/w+wUSqnco1Xg= 137 | gorgonia.org/vecf32 v0.9.0/go.mod h1:NCc+5D2oxddRL11hd+pCB1PEyXWOyiQxfZ/1wwhOXCA= 138 | gorgonia.org/vecf64 v0.9.0 h1:bgZDP5x0OzBF64PjMGC3EvTdOoMEcmfAh1VCUnZFm1A= 139 | gorgonia.org/vecf64 v0.9.0/go.mod h1:hp7IOWCnRiVQKON73kkC/AUMtEXyf9kGlVrtPQ9ccVA= 140 | rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= 141 | -------------------------------------------------------------------------------- /index/btreeindex.go: -------------------------------------------------------------------------------- 1 | // 2 | // btree inverted index's data structure 3 | // 4 | // |<-------------btree------------>| <--posting list-->| 5 | // |<-intermediate->|<--leaf node-->| 6 | // - --- --- --- --- --- --- 7 | // | | - | | | - | | | | | 8 | // - --- --- --- --- --- --- 9 | // / 10 | // - - --- --- --- --- --- --- 11 | // | | - | | - | | | - | | | | | 12 | // - - --- --- --- --- --- --- 13 | // / 14 | // - - - --- --- --- --- --- --- 15 | // | | - | | - | | - | | | - | | | | | 16 | // - - - --- --- --- --- --- --- 17 | // \ 18 | // - - --- --- --- --- --- --- 19 | // | | - | | - | | | - | | | | | 20 | // - - --- --- --- --- --- --- 21 | // |<--in memory--> | <-----------on disk-------------->| 22 | // 23 | // 24 | 25 | package index 26 | 27 | import ( 28 | "bytes" 29 | "encoding/binary" 30 | "io" 31 | "os" 32 | "sort" 33 | "unsafe" 34 | 35 | "github.com/awesomefly/easysearch/util" 36 | btree "github.com/awesomefly/gobtree" 37 | ) 38 | 39 | var DefaultConfig = btree.Config{ 40 | IndexConfig: btree.IndexConfig{ 41 | Sectorsize: 512, 42 | Flistsize: 1000 * btree.OFFSET_SIZE, 43 | Blocksize: 512, 44 | }, 45 | Maxlevel: 4, 46 | RebalanceThrs: 30, 47 | AppendRatio: 0.7, 48 | DrainRate: 100, 49 | MaxLeafCache: 0, // intermediate node in memory and leaf node in disk 50 | Sync: false, 51 | Nocache: false, 52 | } 53 | 54 | type BTreeIndex struct { 55 | //skip-list vs btree: 56 | //https://stackoverflow.com/questions/256511/skip-list-vs-binary-search-tree/28270537#28270537 57 | BT *btree.BTree 58 | IndexFile string 59 | 60 | property Property 61 | } 62 | 63 | func NewBTreeIndex(file string) *BTreeIndex { 64 | conf := DefaultConfig 65 | conf.Idxfile, conf.Kvfile = file+".idx", file+".kv" 66 | bt := BTreeIndex{ 67 | IndexFile: file, 68 | BT: btree.NewBTree(btree.NewStore(conf)), // todo: 索引文件太大,索引压缩、posting list压缩 69 | property: Property{ 70 | docNum: 0, 71 | tokenCount: 0, 72 | dataRange: DataRange{Start: 0, End: 0}, 73 | }, 74 | } 75 | 76 | bt.Load() 77 | return &bt 78 | } 79 | 80 | func (bt *BTreeIndex) Save() { 81 | file := bt.IndexFile + ".sum" 82 | os.Create(file) 83 | 84 | // Index store 85 | fd, err := os.OpenFile(file, os.O_RDWR|os.O_CREATE, 0660) 86 | if err != nil { 87 | panic(err.Error()) 88 | } 89 | 90 | buffer := bytes.NewBuffer([]byte{}) 91 | if err := binary.Write(buffer, binary.LittleEndian, int32(bt.property.docNum)); err != nil { 92 | panic(err) 93 | } 94 | 95 | if err := binary.Write(buffer, binary.LittleEndian, int32(bt.property.tokenCount)); err != nil { 96 | panic(err) 97 | } 98 | 99 | if err := binary.Write(buffer, 
binary.LittleEndian, int32(bt.property.dataRange.Start)); err != nil { 100 | panic(err) 101 | } 102 | if err := binary.Write(buffer, binary.LittleEndian, int32(bt.property.dataRange.End)); err != nil { 103 | panic(err) 104 | } 105 | 106 | if _, err := fd.Write(buffer.Bytes()); err != nil { 107 | panic(err) 108 | } 109 | fd.Close() 110 | } 111 | 112 | func (bt *BTreeIndex) Load() { 113 | // Index store 114 | file := bt.IndexFile + ".sum" 115 | fd, err := os.OpenFile(file, os.O_RDONLY|os.O_CREATE, 0660) 116 | if err != nil { 117 | panic(err.Error()) 118 | } 119 | 120 | data := make([]byte, unsafe.Sizeof(bt.property.docNum)+unsafe.Sizeof(bt.property.tokenCount)) 121 | if n, err := fd.Read(data); err != nil { 122 | if n == 0 && err == io.EOF { 123 | return 124 | } 125 | panic(err.Error()) 126 | } 127 | 128 | buffer := bytes.NewBuffer(data) 129 | if err := binary.Read(buffer, binary.LittleEndian, (*int32)(unsafe.Pointer(&bt.property.docNum))); err != nil { 130 | panic(err.Error()) 131 | } 132 | 133 | if err := binary.Read(buffer, binary.LittleEndian, (*int32)(unsafe.Pointer(&bt.property.tokenCount))); err != nil { 134 | panic(err.Error()) 135 | } 136 | if err := binary.Read(buffer, binary.LittleEndian, (*int32)(unsafe.Pointer(&bt.property.dataRange.Start))); err != nil { 137 | panic(err.Error()) 138 | } 139 | if err := binary.Read(buffer, binary.LittleEndian, (*int32)(unsafe.Pointer(&bt.property.dataRange.End))); err != nil { 140 | panic(err.Error()) 141 | } 142 | 143 | fd.Close() 144 | } 145 | 146 | func (bt *BTreeIndex) Close() { 147 | bt.BT.Drain() 148 | bt.BT.Close() 149 | bt.Save() 150 | } 151 | 152 | func (bt *BTreeIndex) Clear() { 153 | bt.Close() 154 | 155 | // delete deprecated index 156 | os.Remove(bt.IndexFile + ".sum") 157 | os.Remove(bt.IndexFile + ".idx") 158 | os.Remove(bt.IndexFile + ".kv") 159 | } 160 | 161 | func (bt *BTreeIndex) Keys() []string { 162 | keys := make(sort.StringSlice, bt.Property().tokenCount) 163 | 164 | ch := bt.BT.KeySet() 165 | for { 166 | key := <-ch 167 | if key == nil { 168 | break 169 | } 170 | keys = append(keys, string(key)) 171 | } 172 | return keys 173 | } 174 | 175 | func (bt *BTreeIndex) Lookup(token string, dirty bool) PostingList { 176 | key := &btree.TestKey{K: token} 177 | 178 | var ch chan []byte 179 | if dirty { 180 | ch = bt.BT.LookupDirty(key) 181 | } else { 182 | ch = bt.BT.Lookup(key) 183 | } 184 | values := make([][]byte, 0) 185 | for { 186 | x := <-ch 187 | if x == nil { 188 | break 189 | } 190 | values = append(values, x) 191 | } 192 | 193 | if len(values) == 0 { 194 | return nil 195 | } 196 | 197 | var p PostingList 198 | p.FromBytes(values[0]) 199 | return p 200 | } 201 | 202 | // Add 该方法比较低效,批量插入文档会在posting list后不段追加新文档,但postinglist并未预留空间, 203 | // 因此需要移动到新的空间,导致文件数据拷贝 204 | func (bt *BTreeIndex) Add(docs []Document) { 205 | for _, doc := range docs { 206 | tokens := util.Analyze(doc.Text) 207 | for _, token := range tokens { 208 | //log.Printf("token:%s", token) 209 | key := &btree.TestKey{K: token} 210 | postingList := bt.Lookup(token, true) 211 | if postingList != nil { 212 | if last := postingList.Find(doc.ID); last != nil { 213 | // Don't add same ID twice. 
But should update frequency 214 | last.TF++ 215 | last.QualityScore = CalDocScore(last.TF, 0) 216 | bt.BT.Insert(key, postingList) 217 | continue 218 | } 219 | } 220 | item := Doc{ 221 | ID: int32(doc.ID), 222 | DocLen: int32(len(tokens)), 223 | TF: 1, 224 | QualityScore: CalDocScore(1, 0), 225 | } 226 | //add to posting list & sort by score 227 | postingList = append(postingList, item) 228 | sort.Slice(postingList, func(i, j int) bool { 229 | return postingList[i].QualityScore > postingList[j].QualityScore 230 | }) 231 | bt.BT.Insert(key, postingList) 232 | } 233 | bt.property.docNum++ 234 | bt.property.tokenCount += len(tokens) 235 | } 236 | bt.BT.Drain() 237 | } 238 | 239 | func (bt *BTreeIndex) Insert(key string, pl PostingList) { 240 | bt.BT.Insert(&btree.TestKey{K: key}, pl) 241 | bt.property.docNum += pl.Len() 242 | bt.property.tokenCount++ 243 | } 244 | 245 | func (bt *BTreeIndex) Get(term string) []Doc { 246 | if postingList := bt.Lookup(term, false); postingList != nil { 247 | return postingList 248 | } 249 | return nil 250 | } 251 | 252 | func (bt *BTreeIndex) Property() *Property { 253 | return &bt.property 254 | } 255 | 256 | func (bt *BTreeIndex) SetProperty(p Property) { 257 | bt.property = p 258 | } 259 | 260 | func (bt *BTreeIndex) Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc { 261 | return DoRetrieval(bt, must, should, not, k, r, m) 262 | } -------------------------------------------------------------------------------- /index/btreeindex_test.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "fmt" 5 | "github.com/awesomefly/easysearch/util" 6 | "github.com/stretchr/testify/assert" 7 | "os" 8 | "testing" 9 | ) 10 | 11 | func GetIDs(docs []Doc) []int { 12 | var ids []int 13 | for _, doc := range docs { 14 | ids = append(ids, int(doc.ID)) 15 | } 16 | return ids 17 | } 18 | 19 | func TestBTreeIndex(t *testing.T) { 20 | os.Remove("../data/btree_idx_test.idx") 21 | os.Remove("../data/btree_idx_test.kv") 22 | os.Remove("../data/btree_idx_test.sum") 23 | 24 | idx := NewBTreeIndex("../data/btree_idx_test") 25 | idx.Add([]Document{{ID: 1, Text: "A donut on a glass plate. 
Only the."}}) 26 | idx.Add([]Document{{ID: 2, Text: "donut is a donut"}}) 27 | fmt.Printf("Count:%d\n", idx.BT.Count()) 28 | 29 | ch := idx.BT.FullSet() 30 | for { 31 | k := <-ch 32 | d := <-ch 33 | v := <-ch 34 | if k == nil || d == nil || v == nil { 35 | break 36 | } 37 | //id, err := strconv.ParseInt(string(d), 10, 64) // key's id 38 | //if err != nil { 39 | // panic(err) 40 | //} 41 | //fmt.Printf("id:%d\n", id) 42 | 43 | var nv PostingList 44 | nv.FromBytes(v) 45 | fmt.Printf("key:%s, val:%+v\n", k, nv) 46 | } 47 | 48 | 49 | fmt.Printf("Lookup: %+v\n", idx.Lookup("donut", false)) 50 | fmt.Printf("Retrieval: %+v\n", idx.Retrieval([]string{"glass"}, []string{"donut"}, nil, 100, 10, Boolean)) 51 | 52 | assert.Nil(t, idx.Retrieval([]string{"a"}, nil, nil, 100, 10, Boolean)) 53 | 54 | ids := GetIDs(idx.Retrieval([]string{"donut"}, nil, nil, 100, 10, Boolean)) 55 | assert.Equal(t, []int{2, 1}, ids) 56 | assert.Equal(t, []int{2, 1}, GetIDs(idx.Retrieval(util.Analyze("DoNuts"), nil, nil, 100, 10, Boolean))) 57 | assert.Equal(t, []int{1}, GetIDs(idx.Retrieval([]string{"glass"}, nil, nil, 100, 10, Boolean))) 58 | 59 | assert.Nil(t, GetIDs(idx.Retrieval([]string{"a"}, nil, nil, 100, 10, Boolean))) 60 | assert.Equal(t, []int{2, 1}, GetIDs(idx.Retrieval([]string{"donut"}, nil, nil, 100, 10, Boolean))) 61 | assert.Equal(t, []int{2, 1}, GetIDs(idx.Retrieval(util.Analyze("DoNuts"), nil, nil, 100, 10, Boolean))) 62 | assert.Equal(t, []int{1}, GetIDs(idx.Retrieval([]string{"glass"}, nil, nil, 100, 10, Boolean))) 63 | 64 | idx.Close() 65 | //time.Sleep(5*time.Second) 66 | } 67 | -------------------------------------------------------------------------------- /index/document.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "compress/gzip" 5 | "encoding/xml" 6 | "fmt" 7 | "io" 8 | "log" 9 | "os" 10 | "path/filepath" 11 | ) 12 | 13 | // Document represents a Wikipedia abstract dump document. 14 | type Document struct { 15 | Title string `xml:"title"` 16 | URL string `xml:"url"` 17 | Text string `xml:"abstract"` 18 | Timestamp int 19 | ID int 20 | } 21 | 22 | // LoadDocuments loads a Wikipedia abstract dump and returns a slice of documents. 
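// Note: LoadDocuments decodes the whole dump with a single xml.Decoder.Decode
// call, so every document is held in memory at once; for large dumps the
// streaming LoadDocumentStream below is preferable. A minimal usage sketch
// (the dump path is illustrative):
//
//	docs, err := LoadDocuments("../data/enwiki-latest-abstract1.xml.gz")
//	if err != nil {
//		log.Fatal(err)
//	}
//	log.Printf("loaded %d docs", len(docs))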
23 | // Dump example from https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract1.xml.gz 24 | func LoadDocuments(path string) ([]Document, error) { 25 | abspath, err := filepath.Abs(path) 26 | if err != nil { 27 | return nil, err 28 | } 29 | f, err := os.Open(abspath) 30 | if err != nil { 31 | return nil, err 32 | } 33 | defer f.Close() 34 | 35 | gz, err := gzip.NewReader(f) 36 | if err != nil { 37 | return nil, err 38 | } 39 | defer gz.Close() 40 | 41 | 42 | dump := struct { 43 | Documents []Document `xml:"doc"` 44 | }{} 45 | dec := xml.NewDecoder(gz) 46 | dec.Token() 47 | if err := dec.Decode(&dump); err != nil { 48 | return nil, err 49 | } 50 | docs := dump.Documents 51 | for i := range docs { 52 | docs[i].ID = i 53 | } 54 | return docs, nil 55 | } 56 | 57 | func LoadDocumentStream(path string) (chan *Document, error) { 58 | abspath, err := filepath.Abs(path) 59 | if err != nil { 60 | return nil, err 61 | } 62 | f, err := os.Open(abspath) 63 | if err != nil { 64 | return nil, err 65 | } 66 | 67 | gz, err := gzip.NewReader(f) 68 | if err != nil { 69 | return nil, err 70 | } 71 | 72 | ch := make(chan *Document, 10) 73 | 74 | dec := xml.NewDecoder(gz) 75 | go func() { 76 | defer f.Close() 77 | defer gz.Close() 78 | id := 0 79 | for { 80 | tok, err := dec.Token() 81 | if tok == nil && err == io.EOF { 82 | ch <- nil 83 | // EOF means we're done. 84 | log.Println("EOF means we're done.") 85 | break 86 | } else if err != nil { 87 | //log.Fatalf("Error decoding token: %s", err.Error()) 88 | panic(err) 89 | } 90 | 91 | switch ty := tok.(type) { 92 | case xml.StartElement: 93 | if ty.Name.Local == "doc" { 94 | // If this is a start element named "location", parse this element 95 | // fully. 96 | doc := Document{} 97 | if err = dec.DecodeElement(&doc, &ty); err != nil { 98 | //log.Fatalf("Error decoding item: %s", err.Error()) 99 | panic(err) 100 | } 101 | id++ 102 | doc.ID = id 103 | ch <- &doc 104 | if id % 5000 == 0 { 105 | fmt.Printf("load %d docs\n", id) 106 | } 107 | } 108 | default: 109 | } 110 | } 111 | }() 112 | return ch, nil 113 | } -------------------------------------------------------------------------------- /index/document_test.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | func TestLoadDocumentStream(t *testing.T) { 11 | ch, err := LoadDocumentStream("../data/tem.xml") 12 | if err != nil { 13 | log.Fatal(err) 14 | return 15 | } 16 | 17 | for { 18 | //timeout := time.NewTimer(1 * time.Second) 19 | select { 20 | case doc := <-ch: 21 | if doc == nil { 22 | fmt.Println("doc is nil") 23 | return 24 | } 25 | fmt.Println(doc) 26 | //fmt.Printf("recv doc: %v", *doc) 27 | 28 | continue 29 | //case <-timeout.C: 30 | // log.Printf("Read timeout. 
err: %s", err.Error()) 31 | // break 32 | } 33 | break 34 | } 35 | time.Sleep(5*time.Second) 36 | } 37 | -------------------------------------------------------------------------------- /index/hashmapindex.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "github.com/awesomefly/easysearch/util" 5 | "sort" 6 | ) 7 | 8 | func IfElseInt(condition bool, o1 int, o2 int) int { 9 | if condition { 10 | return o1 11 | } 12 | return o2 13 | } 14 | 15 | // CalDocScore 16 | // todo: calculate doc static score by PageRank + frequency 17 | func CalDocScore(frequency int32, pagerank int) float64 { 18 | return float64(frequency * 1.0) 19 | } 20 | 21 | // HashMapIndex is an inverted index. It maps tokens to document IDs. 22 | type HashMapIndex struct { 23 | tbl map[string]PostingList 24 | 25 | property Property 26 | } 27 | 28 | func NewHashMapIndex() *HashMapIndex { 29 | return &HashMapIndex{ 30 | tbl: make(map[string]PostingList), 31 | property: Property{ 32 | docNum: 0, 33 | tokenCount: 0, 34 | dataRange: DataRange{Start: 0, End: 0}, 35 | }, 36 | } 37 | } 38 | 39 | func (idx *HashMapIndex) Property() *Property { 40 | return &idx.property 41 | } 42 | 43 | func (idx *HashMapIndex) Map() map[string]PostingList { 44 | return idx.tbl 45 | } 46 | 47 | func (idx *HashMapIndex) Keys() []string { 48 | //keys := make(sort.StringSlice, idx.Property().tokenCount) 49 | var keys sort.StringSlice 50 | for k := range idx.tbl { //map 遍历访问是无序的 51 | keys = append(keys, k) 52 | } 53 | return keys 54 | } 55 | // Add adds documents to the index. 56 | // todo: Support indexing multiple document fields. 57 | func (idx *HashMapIndex) Add(docs []Document) { 58 | for _, doc := range docs { 59 | tokens := util.Analyze(doc.Text) 60 | for _, token := range tokens { 61 | postingList := idx.tbl[token] 62 | if postingList != nil { 63 | if last := postingList.Find(doc.ID); last != nil { 64 | // Don't add same ID twice. 
But should update frequency 65 | //last := &postingList[tokenCount(postingList)-1] 66 | last.TF++ 67 | last.QualityScore = CalDocScore(last.TF, 0) 68 | //idx.tbl[token] = postingList 69 | continue 70 | } 71 | } 72 | item := Doc{ 73 | ID: int32(doc.ID), 74 | DocLen: int32(len(tokens)), 75 | TF: 1, 76 | QualityScore: CalDocScore(1, 0), 77 | } 78 | //add to posting list 79 | idx.tbl[token] = append(postingList, item) 80 | } 81 | 82 | idx.property.docNum++ 83 | idx.property.tokenCount += len(tokens) 84 | } 85 | 86 | //sort by score 87 | for k, v := range idx.tbl { 88 | sort.Slice(v, func(i, j int) bool { 89 | return v[i].QualityScore > v[j].QualityScore 90 | }) 91 | idx.tbl[k] = v 92 | } 93 | } 94 | 95 | // Clear unsafe function 96 | func (idx *HashMapIndex) Clear() { 97 | idx.property.docNum = 0 98 | idx.property.tokenCount = 0 99 | idx.property.dataRange = DataRange{Start: 0, End: 0} 100 | idx.tbl = make(map[string]PostingList) 101 | } 102 | 103 | func (idx *HashMapIndex) Get(term string) []Doc { 104 | if postingList, ok := idx.tbl[term]; ok { 105 | return postingList 106 | } 107 | return nil 108 | } 109 | 110 | func (idx *HashMapIndex) Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc { 111 | return DoRetrieval(idx, must, should, not, k, r, m) 112 | } -------------------------------------------------------------------------------- /index/hashmapindex_test.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/awesomefly/easysearch/util" 8 | 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func TestIndex(t *testing.T) { 13 | idx := NewHashMapIndex() 14 | 15 | idx.Add([]Document{{ID: 1, Text: "A donut on a glass plate. 
Only the donut"}}) 16 | assert.Nil(t, idx.Retrieval([]string{"a"}, nil, nil, 100, 10, Boolean)) 17 | 18 | result := idx.Retrieval([]string{"donut"}, nil, nil, 100, 10, Boolean) 19 | assert.Equal(t, []int{1}, (PostingList)(result).IDs()) 20 | 21 | result = idx.Retrieval(util.Analyze("DoNuts"), nil, nil, 100, 10, Boolean) 22 | assert.Equal(t, []int{1}, (PostingList)(result).IDs()) 23 | 24 | result = idx.Retrieval([]string{"glass"}, nil, nil, 100, 10, Boolean) 25 | assert.Equal(t, []int{1}, (PostingList)(result).IDs()) 26 | 27 | for s, list := range idx.tbl { 28 | fmt.Printf("%s:%v\n", s, list) 29 | } 30 | 31 | //===================================================== 32 | idx.Add([]Document{{ID: 2, Text: "donut is a donut"}}) 33 | assert.Nil(t, idx.Retrieval([]string{"a"}, nil, nil, 100, 10, Boolean)) 34 | 35 | result = idx.Retrieval([]string{"donut"}, nil, nil, 100, 10, Boolean) 36 | assert.Equal(t, []int{1, 2}, (PostingList)(result).IDs()) 37 | 38 | result = idx.Retrieval(util.Analyze("DoNuts"), nil, nil, 100, 10, Boolean) 39 | assert.Equal(t, []int{1, 2}, (PostingList)(result).IDs()) 40 | 41 | result = idx.Retrieval([]string{"glass"}, nil, nil, 100, 10, Boolean) 42 | assert.Equal(t, []int{1}, (PostingList)(result).IDs()) 43 | 44 | for s, list := range idx.tbl { 45 | fmt.Printf("%s:%v\n", s, list) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /index/index.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "encoding/binary" 7 | "log" 8 | "os" 9 | "sort" 10 | ) 11 | 12 | type SearchModel int 13 | 14 | const ( 15 | Boolean SearchModel = iota 16 | VectorSpace 17 | BM25 18 | ) 19 | 20 | type KVPair struct { 21 | Key string 22 | Value PostingList 23 | } 24 | 25 | type Index interface { 26 | Property() *Property 27 | Keys() []string 28 | Clear() 29 | 30 | Add(docs []Document) 31 | Get(term string) []Doc 32 | 33 | Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc 34 | } 35 | 36 | // DoRetrieval returns top k docs sorted by boolean model 37 | // todo: compress posting list and opt intersection/union rt 38 | // https://blog.csdn.net/weixin_39890629/article/details/111268898 39 | func DoRetrieval(idx Index, must []string, should []string, not []string, k int, r int, model SearchModel) []Doc { 40 | tfidf := NewTFIDF() 41 | 42 | //query's term frequency 43 | tfidf.DOC2TF[VirtualQueryDocId] = make(TF, 0) 44 | 45 | calTFIDF := func(term string, dn, df int, plr PostingList) { 46 | tfidf.IDF[term] = CalIDF(dn, df) 47 | for _, doc := range plr { 48 | var tf TF 49 | if tf = tfidf.DOC2TF[doc.ID]; tf == nil { 50 | tf = make(TF, 0) 51 | } 52 | tf[term] = doc.TF 53 | tfidf.DOC2TF[doc.ID] = tf 54 | } 55 | } 56 | properties := idx.Property() 57 | 58 | var result PostingList 59 | for _, term := range must { 60 | tfidf.DOC2TF[VirtualQueryDocId][term]++ 61 | if pl := (PostingList)(idx.Get(term)); pl != nil { 62 | plr := pl[:IfElseInt(len(pl) > r, r, len(pl))] //胜者表按TF排序,截断前r个,加速归并 63 | sort.Sort(plr) //按docID排序 64 | if result == nil { 65 | result = plr 66 | } else { 67 | result.Inter(plr) 68 | } 69 | calTFIDF(term, properties.DocNum(), len(pl), plr) 70 | } else { 71 | // Token doesn't exist. 
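// Note: when a must-term has no posting list the loop just skips it, so an
// unknown must-term does not empty the result set (strict boolean AND
// semantics would).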
72 | continue
73 | }
74 | }
75 |
76 | for _, term := range should {
77 | tfidf.DOC2TF[VirtualQueryDocId][term]++
78 | if pl := (PostingList)(idx.Get(term)); pl != nil {
79 | plr := pl[:IfElseInt(len(pl) > r, r, len(pl))]
80 | sort.Sort(plr)
81 | if result == nil {
82 | result = plr //winner list, truncated to the top r entries
83 | } else {
84 | result.Union(plr)
85 | }
86 | calTFIDF(term, properties.DocNum(), len(pl), plr)
87 | } else {
88 | // Token doesn't exist.
89 | continue
90 | }
91 | }
92 |
93 | for _, term := range not {
94 | if pl := (PostingList)(idx.Get(term)); pl != nil {
95 | sort.Sort(pl)
96 | result.Filter(pl)
97 | } else {
98 | // Token doesn't exist.
99 | continue
100 | }
101 | }
102 |
103 | if model == BM25 {
104 | result = CalBM25(result, tfidf, properties.TokenCount(), properties.DocNum())
105 | } else if model == VectorSpace {
106 | result = CalCosine(result, tfidf)
107 | }
108 |
109 | //sort the result
110 | sort.Slice(result, func(i, j int) bool {
111 | return result[i].Score > result[j].Score //descending by score
112 | })
113 | log.Printf("result sorted:%+v", result)
114 |
115 | if len(result) > k {
116 | return result[:k]
117 | }
118 | return result
119 | }
120 |
121 | // Drain writes the index data to file, sorted by key.
122 | func Drain(idx Index, file string) {
123 | if idx.Property().docNum == 0 {
124 | return
125 | }
126 |
127 | fd, err := os.OpenFile(file, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0660)
128 | if err != nil {
129 | panic(err.Error())
130 | }
131 |
132 | writer := bufio.NewWriter(fd)
133 | defer func() {
134 | writer.Flush()
135 | fd.Close()
136 | }()
137 |
138 | keys := idx.Keys()
139 | sort.Strings(keys)
140 |
141 | buffer := bytes.NewBuffer([]byte{})
142 | for i := 0; i < len(keys); i++ {
143 | k := keys[i]
144 | pl := (PostingList)(idx.Get(k)).Bytes()
145 |
146 | buffer.Reset()
147 | l := int32(len(k))
148 | if err := binary.Write(buffer, binary.LittleEndian, l); err != nil {
149 | panic(err)
150 | }
151 |
152 | if err := binary.Write(buffer, binary.LittleEndian, []byte(k)); err != nil {
153 | panic(err)
154 | }
155 |
156 | l = int32(len(pl))
157 | if err := binary.Write(buffer, binary.LittleEndian, l); err != nil {
158 | panic(err)
159 | }
160 |
161 | if err := binary.Write(buffer, binary.LittleEndian, pl); err != nil {
162 | panic(err)
163 | }
164 |
165 | if _, err := writer.Write(buffer.Bytes()); err != nil {
166 | panic(err)
167 | }
168 | }
169 | }
170 |
171 | // Load file. 
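// Each record written by Drain above has, in little-endian order, the layout
// [int32 key length][key bytes][int32 posting-list length][posting-list bytes].
// Load streams these records back as KVPair values; a nil pointer on the
// returned channel signals end of file.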
172 | func Load(file string) (chan *KVPair, error) {
173 | ch := make(chan *KVPair, 10)
174 |
175 | fd, err := os.OpenFile(file, os.O_RDONLY|os.O_CREATE, 0660)
176 | if err != nil {
177 | return nil, err
178 | }
179 |
180 | ReadInt := func() (int, error) {
181 | buf := make([]byte, 4)
182 | if n, err := fd.Read(buf); err != nil {
183 | return 0, err
184 | } else {
185 | var leng int32
186 | if err = binary.Read(bytes.NewBuffer(buf[:n]), binary.LittleEndian, &leng); err != nil {
187 | panic(err)
188 | }
189 | return int(leng), nil
190 | }
191 |
192 | }
193 |
194 | ReadString := func(n int) (string, error) {
195 | buf := make([]byte, n)
196 | if n, err := fd.Read(buf); err != nil {
197 | return "", err
198 | } else {
199 | return string(buf[:n]), nil
200 | }
201 |
202 | }
203 |
204 | go func() {
205 | defer fd.Close()
206 |
207 | for {
208 | pair := KVPair{}
209 | n, err := ReadInt()
210 | if err != nil {
211 | ch <- nil
212 | break
213 | }
214 | if pair.Key, err = ReadString(n); err != nil {
215 | ch <- nil
216 | break
217 | }
218 | if n, err = ReadInt(); err != nil {
219 | ch <- nil
220 | break
221 | }
222 |
223 | var v string
224 | if v, err = ReadString(n); err != nil {
225 | ch <- nil
226 | break
227 | }
228 | pair.Value.FromBytes([]byte(v))
229 | ch <- &pair
230 | }
231 | }()
232 | return ch, nil
233 | }
234 |
--------------------------------------------------------------------------------
/index/postinglist.go:
--------------------------------------------------------------------------------
1 | package index
2 |
3 | import (
4 | "bytes"
5 | "encoding/binary"
6 | "sort"
7 |
8 | "github.com/xtgo/set"
9 | )
10 |
11 | type Term struct {
12 | K string //key
13 | Id int32 //key id
14 | DF int32 //Document Frequency
15 | }
16 |
17 | type Doc struct {
18 | ID int32 //doc id
19 | DocLen int32 //doc length
20 |
21 | TF int32 //term frequency; e.g. in the posting list term->[doc1,doc2,doc3] it is the term's frequency within docX only
22 | QualityScore float64 //static quality score of the doc
23 |
24 | Score float64 //bm25/Cosine score used by sort
25 | }
26 |
27 | func (doc Doc) Bytes() []byte {
28 | buffer := bytes.NewBuffer([]byte{})
29 | err := binary.Write(buffer, binary.LittleEndian, doc)
30 | if err != nil {
31 | panic(err)
32 | }
33 | return buffer.Bytes()
34 | }
35 |
36 | func (doc *Doc) FromBytes(b []byte) {
37 | buffer := bytes.NewBuffer(b)
38 |
39 | err := binary.Read(buffer, binary.LittleEndian, doc)
40 | if err != nil {
41 | panic(err)
42 | }
43 | }
44 |
45 | type PostingList []Doc
46 |
47 | func (pl PostingList) Len() int { return len(pl) }
48 | func (pl PostingList) Less(i, j int) bool { return pl[i].ID > pl[j].ID } //descending by doc ID (not by score; the set operations below rely on this order)
49 | func (pl PostingList) Swap(i, j int) {
50 | pl[i], pl[j] = pl[j], pl[i]
51 | }
52 |
53 | func (pl PostingList) Find(id int) *Doc {
54 | for i := 0; i < pl.Len(); i++ {
55 | if pl[i].ID == int32(id) {
56 | return &pl[i]
57 | }
58 | }
59 | return nil
60 | }
61 |
62 | func (pl PostingList) IDs() []int {
63 | ids := make([]int, 0, len(pl))
64 | for _, item := range pl {
65 | ids = append(ids, int(item.ID))
66 | }
67 | sort.Sort(sort.IntSlice(ids))
68 | return ids
69 | }
70 |
71 | func (pl *PostingList) Inter(docs []Doc) {
72 | l := len(*pl)
73 | *pl = append(*pl, docs...)
74 | size := set.Inter(pl, l)
75 | *pl = (*pl)[:size]
76 | }
77 |
78 | func (pl *PostingList) Union(docs []Doc) {
79 | l := len(*pl)
80 | *pl = append(*pl, docs...)
81 | size := set.Union(pl, l)
82 | *pl = (*pl)[:size]
83 | }
84 |
85 | func (pl *PostingList) Filter(docs []Doc) {
86 | l := len(*pl)
87 | join := append(*pl, docs...) 
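// The xtgo/set calls treat the halves join[:l] and join[l:] as the two
// operands: Inter keeps inter = *pl ∩ docs, and the Diff below then yields
// *pl \ inter, i.e. Filter drops every doc that also appears in docs.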
88 | size := set.Inter(join, l) 89 | inter := join[:size] 90 | 91 | *pl = append(*pl, inter...) 92 | size = set.Diff(pl, l) 93 | *pl = (*pl)[:size] 94 | } 95 | 96 | func (pl *PostingList) Append(docs ...Doc) { 97 | *pl = append(*pl, docs...) 98 | } 99 | 100 | func (pl PostingList) Bytes() []byte { 101 | buffer := bytes.NewBuffer([]byte{}) 102 | for _, v := range pl { 103 | err := binary.Write(buffer, binary.LittleEndian, v) 104 | if err != nil { 105 | panic(err) 106 | } 107 | } 108 | return buffer.Bytes() 109 | } 110 | 111 | func (pl *PostingList) FromBytes(buf []byte) { 112 | if buf == nil { 113 | return 114 | } 115 | 116 | buffer := bytes.NewBuffer(buf) 117 | for buffer.Len() > 0 { 118 | var item Doc 119 | binary.Read(buffer, binary.LittleEndian, &item) 120 | *pl = append(*pl, item) 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /index/postinglist_test.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestPostingList(t *testing.T) { 11 | it := Doc{ 12 | ID: 1, 13 | TF: 30, 14 | QualityScore: 10.11, 15 | } 16 | 17 | it2 := Doc{ 18 | ID: 2, 19 | TF: 20, 20 | QualityScore: 20.22, 21 | } 22 | 23 | bb := it.Bytes() 24 | it3 := Doc{} 25 | it3.FromBytes(bb) 26 | fmt.Printf("it3: %+v\n", it3) 27 | 28 | var pl PostingList 29 | pl = append(pl, it) 30 | pl = append(pl, it2) 31 | 32 | fmt.Printf("pl:%+v\n", pl) 33 | 34 | bytes := pl.Bytes() 35 | 36 | var pl2 PostingList 37 | pl2.FromBytes(bytes) 38 | fmt.Printf("pl2:%+v\n", pl2) 39 | assert.Equal(t, len(pl), len(pl2)) 40 | } 41 | -------------------------------------------------------------------------------- /index/property.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | type DataRange struct { 4 | Start int 5 | End int 6 | } 7 | 8 | type Property struct { 9 | // docNum is the count of documents 10 | docNum int 11 | 12 | // tokenCount is the total length of tokens 13 | tokenCount int 14 | 15 | //dataRange 16 | dataRange DataRange 17 | } 18 | 19 | func (idx *Property) DocNum() int { 20 | return idx.docNum 21 | } 22 | 23 | func (idx *Property) SetDocNum(num int) { 24 | idx.docNum = num 25 | } 26 | 27 | func (idx *Property) TokenCount() int { 28 | return idx.tokenCount 29 | } 30 | 31 | func (idx *Property) SetTokenCount(cnt int) { 32 | idx.tokenCount = cnt 33 | } 34 | 35 | func (idx *Property) DataRange() DataRange { 36 | return idx.dataRange 37 | } 38 | 39 | func (idx *Property) SetDataRange(d DataRange) { 40 | idx.dataRange = d 41 | } 42 | -------------------------------------------------------------------------------- /index/tfidf.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "strconv" 7 | ) 8 | 9 | type TF map[string]int32 10 | type TFIDF struct { 11 | IDF map[string]float64 12 | DOC2TF map[int32]TF 13 | } 14 | 15 | func NewTFIDF() *TFIDF { 16 | return &TFIDF{ 17 | IDF: make(map[string]float64), 18 | DOC2TF: make(map[int32]TF, 0), 19 | } 20 | } 21 | 22 | func CalIDF(docNum int, df int) float64 { 23 | return math.Log2(float64(docNum)/float64(df) + 1) 24 | } 25 | 26 | const VirtualQueryDocId int32 = -10000 27 | //CalCosine 余弦距离相似度 https://blog.csdn.net/weixin_42398658/article/details/85063004 28 | func CalCosine(hits []Doc, tfidf *TFIDF) []Doc { 29 | queryDocId := 
VirtualQueryDocId
30 |
31 | var querySum float64
32 | for term, tf := range tfidf.DOC2TF[queryDocId] {
33 | idf := tfidf.IDF[term]
34 | weight := float64(tf) * idf
35 | querySum += math.Pow(weight, 2)
36 | }
37 |
38 | for i, hit := range hits {
39 | var docSum, multiplySum float64
40 | for term, tf := range tfidf.DOC2TF[hit.ID] {
41 | idf := tfidf.IDF[term]
42 | docTermWeight := float64(tf) * idf
43 | queryTermWeight := float64(tfidf.DOC2TF[queryDocId][term]) * idf
44 |
45 | multiplySum += docTermWeight * queryTermWeight
46 | docSum += math.Pow(docTermWeight, 2)
47 | }
48 | hits[i].Score = multiplySum / math.Sqrt(querySum*docSum)
49 | hits[i].Score, _ = strconv.ParseFloat(fmt.Sprintf("%.4f", hits[i].Score), 64)
50 | }
51 | return hits
52 | }
53 |
54 | //CalBM25 computes the bm25 score of each hit; the caller sorts by it afterwards.
55 | //docLen is the total token count of the index, docNum is the total number of indexed documents.
56 | func CalBM25(hits []Doc, tfidf *TFIDF, docLen int, docNum int) []Doc {
57 | // bm25: score += idf * tf*(k1+1) / (tf + k1*(1-b+b*|d|/avgdl)); reference: https://www.jianshu.com/p/1e498888f505
58 | for i, hit := range hits {
59 | for term, tf := range tfidf.DOC2TF[hit.ID] { //a hit doc contains multiple terms
60 | d := float64(hit.DocLen) //length |d| of this document (docLen is the corpus total)
61 | avg := float64(docLen) / float64(docNum)
62 | idf := tfidf.IDF[term]
63 | k1 := float64(2)
64 | b := 0.75
65 | hits[i].Score += idf * float64(tf) * (k1 + 1) / (float64(tf) + k1*(1-b+b*d/avg))
66 | }
67 | hits[i].Score, _ = strconv.ParseFloat(fmt.Sprintf("%.4f", hits[i].Score), 64)
68 | }
69 | return hits
70 | }
71 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 | "fmt"
6 | "github.com/awesomefly/easysearch/cluster"
7 | "github.com/awesomefly/easysearch/config"
8 | "runtime"
9 | "runtime/pprof"
10 | "strings"
11 |
12 | "log"
13 | "os"
14 | "time"
15 |
16 | "github.com/awesomefly/easysearch/index"
17 | "github.com/awesomefly/easysearch/search"
18 | )
19 |
20 | func startStandaloneCluster() error {
21 | conf := config.InitClusterConfig("./cluster.yml")
22 | procAttr := &os.ProcAttr{
23 | Files: []*os.File{os.Stdin, os.Stdout, os.Stderr},
24 | }
25 |
26 | procs := make([]*os.Process, 0)
27 |
28 | //start manager server
29 | baseArgs := os.Args[0] + " -m cluster "
30 | argv := strings.Fields(baseArgs + "--servername=managerserver --host=" +
31 | conf.ManageServer.Host + " --port=" + fmt.Sprint(conf.ManageServer.Port))
32 | proc, err := os.StartProcess(os.Args[0], argv, procAttr)
33 | if err != nil {
34 | fmt.Println("start manager server process error:", err)
35 | return err
36 | }
37 | procs = append(procs, proc)
38 | time.Sleep(3 * time.Second)
39 |
40 | //start data server
41 | for i := 0; i < len(conf.DataServer); i++ {
42 | srv := conf.DataServer[i]
43 |
44 | argv = strings.Fields(baseArgs + "--servername=dataserver --host=" + srv.Host + " --port=" + fmt.Sprint(srv.Port))
45 | proc, err = os.StartProcess(os.Args[0], argv, procAttr)
46 | if err != nil {
47 | fmt.Println("start data server process error:", err)
48 | return err
49 | }
50 | procs = append(procs, proc)
51 | }
52 | time.Sleep(3 * time.Second)
53 |
54 | //start search server
55 | for i := 0; i < len(conf.SearchServer); i++ {
56 | srv := conf.SearchServer[i]
57 |
58 | argv = strings.Fields(baseArgs + "--servername=searchserver --host=" + srv.Host + " --port=" + fmt.Sprint(srv.Port))
59 | proc, err = os.StartProcess(os.Args[0], argv, procAttr)
60 | if err != nil {
61 | fmt.Println("start search server process error:", err)
62 | return err
63 | }
64 | procs = append(procs, proc)
65 | 
time.Sleep(100 * time.Millisecond) 66 | } 67 | 68 | for i := 0; i < len(procs); i++ { 69 | _, err = procs[i].Wait() 70 | if err != nil { 71 | fmt.Println("wait error:", err) 72 | return err 73 | } 74 | } 75 | return nil 76 | } 77 | 78 | func startProfile() { 79 | f, err := os.OpenFile("cpu.pprof", os.O_CREATE|os.O_RDWR, 0666) 80 | if err != nil { 81 | log.Fatal(err) 82 | return 83 | } 84 | defer f.Close() 85 | 86 | if err = pprof.StartCPUProfile(f); err != nil { 87 | log.Fatal(err) 88 | return 89 | } 90 | defer pprof.StopCPUProfile() 91 | } 92 | 93 | func main() { 94 | f, _ := os.Create("cpu.pprof") 95 | defer f.Close() 96 | pprof.StartCPUProfile(f) 97 | defer pprof.StopCPUProfile() 98 | 99 | log.SetOutput(os.Stdout) 100 | //log.Printf("args:%+v\n", os.Args) 101 | //runtime.GOMAXPROCS(2) 102 | log.Println("GOMAXPROCS:", runtime.GOMAXPROCS(0)) 103 | 104 | var module string 105 | flag.StringVar(&module, "m", "", "[indexer|searcher|merger|cluster]") 106 | 107 | //searcher 108 | var query, source, modelFile, searchModel string 109 | flag.StringVar(&query, "q", "Album Jordan", "search query") 110 | flag.StringVar(&source, "source", "", "[local|remote]") 111 | flag.StringVar(&searchModel, "search_model", "", "[boolean|bm25|vs]") 112 | flag.StringVar(&modelFile, "paraphrase_file", "", "paraphrase model file") 113 | 114 | //indexer 115 | var sharding bool 116 | flag.BoolVar(&sharding, "sharding", false, "true|false") 117 | 118 | //merger 119 | var srcPath, dstPath string 120 | flag.StringVar(&srcPath, "f", "", "src index file") 121 | flag.StringVar(&dstPath, "t", "", "dst index file") 122 | 123 | //server 124 | var servername string 125 | flag.StringVar(&servername, "servername", "", "[all|managerserver|dataserver|searchserver]") 126 | 127 | var host string 128 | var port int 129 | flag.StringVar(&host, "host", "", "server host") 130 | flag.IntVar(&port, "port", 0, "server port") 131 | flag.Parse() 132 | 133 | conf := config.InitConfig("./config.yml") 134 | if module == "indexer" { 135 | log.Println("Starting Index ...") 136 | if sharding { 137 | cluster.Index(conf) 138 | } else { 139 | search.Index(*conf) //todo: 构建索引耗时过长,性能分析下具体耗时原因 140 | } 141 | } else if module == "searcher" { 142 | start := time.Now() 143 | var matched []index.Doc 144 | var err error 145 | if source == "local" { 146 | log.Println("Starting local search..") 147 | searcher := search.NewSearcher(conf.Store.IndexFile) 148 | if modelFile != "" { 149 | searcher.InitParaphrase(modelFile) 150 | } 151 | log.Printf("index loaded %d keys in %v", searcher.Count() , time.Since(start)) 152 | matched = searcher.Search(query) 153 | } else if source == "remote" { 154 | log.Println("Starting remote search..") 155 | cli := cluster.NewSearchClient(&conf.Cluster.ManageServer) 156 | matched, err = cli.Search(query) 157 | if err != nil { 158 | log.Fatal(err) 159 | return 160 | } 161 | } 162 | log.Printf("Search found %d documents in %v", len(matched), time.Since(start)) 163 | } else if module == "merger" { 164 | search.Merge(srcPath, dstPath) 165 | } else if module == "cluster" { 166 | if host != "" && port != 0 { 167 | conf.Server.Host = host 168 | conf.Server.Port = port 169 | } 170 | if servername == "all" { 171 | log.Println("Starting Standalone Cluster..") 172 | if err := startStandaloneCluster(); err != nil { 173 | panic(err) 174 | } 175 | } else if servername == "managerserver" { 176 | log.Println("Starting ManagerServer..") 177 | svr := cluster.NewManagerServer(conf) 178 | svr.Run() 179 | } else if servername == "dataserver" { 180 | 
log.Println("Starting DataServer..") 181 | ds := cluster.NewDataServer(conf) 182 | ds.Run() 183 | } else if servername == "searchserver" { 184 | log.Println("Starting SearchServer..") 185 | srh := cluster.NewSearchServer(conf) 186 | srh.Run() 187 | } 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /paraphrase/serving/model.go: -------------------------------------------------------------------------------- 1 | package serving 2 | 3 | // https://pkg.go.dev/code.sajari.com/word2vec#section-readme 4 | //https://github.com/sajari/word2vec 5 | import ( 6 | "log" 7 | "os" 8 | 9 | "code.sajari.com/word2vec" 10 | ) 11 | 12 | type ParaphraseModel struct { 13 | path string 14 | mode *word2vec.Model 15 | } 16 | 17 | func NewModel(path string) *ParaphraseModel { 18 | return &ParaphraseModel{ 19 | path: path, 20 | mode: load(path), 21 | } 22 | } 23 | 24 | func load(path string) *word2vec.Model { 25 | // Load the model from an io.Reader (i.e. a file). 26 | file, err := os.Open(path) 27 | defer file.Close() 28 | 29 | if err != nil { 30 | log.Fatalf("error open file fail: %v", err) 31 | panic(err) 32 | } 33 | //r := bufio.NewReader(file) 34 | 35 | model, err := word2vec.FromReader(file) 36 | if err != nil { 37 | log.Fatalf("error loading model: %v", err) 38 | panic(err) 39 | } 40 | return model 41 | } 42 | 43 | //GetSimilar 语义改写、近义词 44 | func (m *ParaphraseModel) GetSimilar(positive []string, negative []string, n int) []string { 45 | // Create an expression. 46 | expr := word2vec.Expr{} 47 | for _, text := range positive { 48 | expr.Add(1, text) 49 | } 50 | for _, text := range negative { 51 | expr.Add(-1, text) 52 | } 53 | 54 | // Hit the most similar result by cosine similarity. 55 | matches, err := m.mode.CosN(expr, n) 56 | if err != nil { 57 | log.Fatalf("error evaluating cosine similarity: %v", err) 58 | } 59 | 60 | var result []string 61 | for _, match := range matches { 62 | result = append(result, match.Word) 63 | } 64 | return result 65 | } 66 | -------------------------------------------------------------------------------- /paraphrase/serving/model_test.go: -------------------------------------------------------------------------------- 1 | package serving 2 | 3 | import ( 4 | "fmt" 5 | "path/filepath" 6 | "testing" 7 | 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func TestGetSimilar(t *testing.T) { 12 | path := "../../data/word2vec.format.bin" 13 | x, _ := filepath.Abs(path) 14 | fmt.Println(x) 15 | model := NewModel(path) 16 | 17 | var ( 18 | positive = []string{"king", "woman"} 19 | negative = []string{"man"} 20 | ) 21 | out := model.GetSimilar(positive, negative, 3) 22 | for _, v := range out { 23 | println(v) 24 | } 25 | // assert.EqualValues(t, out, model.GetSimilar(positive, negative, 3)) 26 | assert.Equal(t, 1, 1) 27 | } 28 | -------------------------------------------------------------------------------- /paraphrase/train/wiki2txt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import sys 4 | from gensim.corpora import WikiCorpus 5 | import opencc 6 | 7 | dir_path = "data/" 8 | 9 | 10 | def read_sample(): 11 | i = 0 12 | with open(dir_path + "wiki_texts.txt", 'r') as f: 13 | for line in f: 14 | print(line) 15 | if i == 10: 16 | return 17 | i += 1 18 | 19 | 20 | # train corpus source https://dumps.wikimedia.org/enwiki/latest/ 21 | # xml to txt 22 | def wiki_to_txt(file): 23 | if file is None: 24 | return 25 | 26 | corpus_path = file 
#"~/Downloads/enwiki-latest-pages-articles11.xml-p6899367p7054859.bz2" 27 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 28 | 29 | output = open(dir_path + "wiki_texts.txt", 'w') 30 | wiki = WikiCorpus(corpus_path, processes=15, dictionary={}) 31 | i = 0 32 | for text in wiki.get_texts(): 33 | output.write(" ".join(text) + "\n") 34 | i += 1 35 | if i % 10000 == 0: 36 | logging.info("Saved " + str(i) + " articles") 37 | output.close() 38 | logging.info("Finished Saved " + str(i) + " articles") 39 | 40 | 41 | def convert2simple(): 42 | cc = opencc.OpenCC('t2s') 43 | for i in range(1, 5): 44 | src_file = dir_path + "wiki_texts" + str(i) + ".txt" 45 | des_file = dir_path + "wiki_simple" + str(i) + ".txt" 46 | des_f = open(des_file, 'w') 47 | with open(src_file, 'r') as f: 48 | for line in f: 49 | # print line.decode('utf-8') 50 | content = cc.convert(line.decode('utf-8')) 51 | print(content) 52 | des_f.write(content.encode('utf-8') + '\n') 53 | des_f.close() 54 | print(str(i) + " finished.") 55 | 56 | 57 | # /usr/bin/python3 paraphrase/train/wiki2txt.py --cmd=parse --file=$WIKI_FILE 58 | if __name__ == "__main__": 59 | import argparse 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument("--cmd", type=str, required=True) 62 | parser.add_argument("--file", type=str, required=False) 63 | args = parser.parse_args() 64 | if args.cmd == 'parse': 65 | wiki_to_txt(args.file) 66 | elif args.cmd == 'sample': 67 | read_sample() 68 | # convert2simple() 69 | -------------------------------------------------------------------------------- /paraphrase/train/word2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import string 3 | 4 | from gensim.models import word2vec, KeyedVectors 5 | import logging 6 | 7 | corpus_dir = "./data/" 8 | project_dir = "./data/" 9 | 10 | 11 | # 训练word2vec模型 12 | def train(corpus_file): 13 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 14 | 15 | # corpus_path = corpus_dir + 'wiki_texts.txt' 16 | model_path = project_dir + 'med200_less.model.bin' 17 | model_word2vec_format_path = project_dir + 'word2vec.format.bin' 18 | 19 | sentences = word2vec.LineSentence(corpus_file) 20 | model = word2vec.Word2Vec(sentences, vector_size=200) 21 | 22 | # 保存模型,供日後使用 23 | model.save(model_path) 24 | 25 | # 按word2vec格式存储向量信息 26 | model.wv.save_word2vec_format(model_word2vec_format_path, binary=True) 27 | 28 | 29 | def similar_test(positive=None, negative=None): 30 | # model_path = project_dir + 'med200_less.model.bin' 31 | model_word2vec_format_path = project_dir + 'word2vec.format.bin' 32 | 33 | # model = serving.Word2Vec.load(model_path) 34 | model = KeyedVectors.load_word2vec_format(model_word2vec_format_path, binary=True) 35 | 36 | try: 37 | # items = model.wv.most_similar(positive, negative, topn=10) 38 | items = model.most_similar(positive, negative, topn=10) 39 | for item in items: 40 | print(item[0].encode('utf-8'), item[1]) 41 | except Exception as e: 42 | print(repr(e)) 43 | 44 | 45 | # /usr/bin/python3 paraphrase/train/word2vec.py --cmd=train --corpus_file=./data/wiki_texts.txt 46 | if __name__ == "__main__": 47 | import argparse 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument("--cmd", type=str, required=True) 50 | parser.add_argument("--corpus_file", type=str, required=False) 51 | args = parser.parse_args() 52 | if args.cmd == 'train': 53 | train(args.corpus_file) 54 | elif args.cmd == 'test': 55 
| positive = ['king', 'woman'] 56 | negative = ['man'] 57 | similar_test(positive, negative) 58 | -------------------------------------------------------------------------------- /score/bm25.go: -------------------------------------------------------------------------------- 1 | package score 2 | 3 | import ( 4 | "github.com/go-nlp/bm25" 5 | "github.com/go-nlp/tfidf" 6 | "sort" 7 | ) 8 | 9 | type BM25Document []int //token id list 10 | func (d BM25Document) IDs() []int { return []int(d) } 11 | 12 | func newTFIDF(docs []BM25Document) *tfidf.TFIDF { 13 | tf := tfidf.New() 14 | 15 | for _, doc := range docs { 16 | tf.Add(doc) 17 | } 18 | tf.CalculateIDF() 19 | return tf 20 | } 21 | 22 | // MostSimilar 相关性计算 23 | // q query words, docs is doc id list, return most similar docs' id list 24 | func MostSimilar(docCorpus map[int]BM25Document, tokenCorpus map[string]int, q []string, docs []int, k int) []int { 25 | // sort by bm25 26 | // 相关性评分请先阅读:https://www.jianshu.com/p/1e498888f505 27 | // 废弃-词集过大时,docs无法完全放入内存,需要自行统计词频并计算score 28 | var corpus []BM25Document 29 | for _, ts := range docCorpus { 30 | corpus = append(corpus, ts) 31 | } 32 | tf := newTFIDF(corpus) 33 | 34 | var query BM25Document 35 | for _, term := range q { 36 | query = append(query, tokenCorpus[term]) 37 | } 38 | 39 | resultDocs := make([]tfidf.Document, 0, len(docs)) 40 | for _, id := range docs { 41 | resultDocs = append(resultDocs, docCorpus[id]) 42 | } 43 | 44 | // FIXME: IDF计算公式不对 45 | scores := bm25.BM25(tf, query, resultDocs, 2, 0.75) 46 | sort.Sort(scores) //order by asc 47 | //sort.Reverse(scores) //order by desc 48 | 49 | var final []int 50 | for i := len(scores) - 1; i >= 0 && k > 0; i-- { 51 | final = append(final, docs[scores[i].ID]) 52 | k-- 53 | } 54 | return final 55 | } 56 | -------------------------------------------------------------------------------- /score/bm25_test.go: -------------------------------------------------------------------------------- 1 | package score 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "strings" 7 | 8 | "github.com/go-nlp/bm25" 9 | "github.com/go-nlp/tfidf" 10 | ) 11 | 12 | var mobydick = []string{ 13 | "Call me Ishmael .", 14 | "Some years ago -- never mind how long precisely -- having little or no money in my purse , and nothing particular to interest me on shore , I thought I would sail about a little and see the watery part of the world .", 15 | "It is a way I have of driving off the spleen and regulating the circulation .", 16 | "Whenever I find myself growing grim about the mouth ; ", 17 | "whenever it is a damp , drizzly November in my soul ; ", 18 | "whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; ", 19 | "and especially whenever my hypos get such an upper hand of me , that it requires a strong moral principle to prevent me from deliberately stepping into the street , and methodically knocking people's hats off -- then , I account it high time to get to sea as soon as I can .", 20 | "This is my substitute for pistol and ball . ", 21 | "With a philosophical flourish Cato throws himself upon his sword ; ", 22 | "I quietly take to the ship . 
There is nothing surprising in this .", 23 | "If they but knew it , almost all men in their degree , some time or other , cherish very nearly the same feelings towards the ocean with me .", 24 | } 25 | 26 | type doc []int 27 | 28 | func (d doc) IDs() []int { return []int(d) } 29 | 30 | func makeCorpus(a []string) (map[string]int, []string) { 31 | retVal := make(map[string]int) 32 | invRetVal := make([]string, 0) 33 | var id int 34 | for _, s := range a { 35 | for _, f := range strings.Fields(s) { 36 | f = strings.ToLower(f) 37 | if _, ok := retVal[f]; !ok { 38 | retVal[f] = id 39 | invRetVal = append(invRetVal, f) 40 | id++ 41 | } 42 | } 43 | } 44 | return retVal, invRetVal 45 | } 46 | 47 | func makeDocuments(a []string, c map[string]int) []tfidf.Document { 48 | retVal := make([]tfidf.Document, 0, len(a)) 49 | for _, s := range a { 50 | var ts []int 51 | for _, f := range strings.Fields(s) { 52 | f = strings.ToLower(f) 53 | id := c[f] 54 | ts = append(ts, id) 55 | } 56 | retVal = append(retVal, doc(ts)) 57 | } 58 | return retVal 59 | } 60 | 61 | func Example_BM25() { 62 | corpus, _ := makeCorpus(mobydick) 63 | docs := makeDocuments(mobydick, corpus) 64 | tf := tfidf.New() 65 | 66 | for _, doc := range docs { 67 | tf.Add(doc) 68 | } 69 | tf.CalculateIDF() 70 | 71 | // now we search 72 | 73 | // "ishmael" is a query 74 | ishmael := doc{corpus["ishmael"]} 75 | 76 | // "whenever i find" is another query 77 | whenever := doc{corpus["whenever"]} 78 | 79 | ishmaelScores := bm25.BM25(tf, ishmael, docs, 1.5, 0.75) 80 | wheneverScores := bm25.BM25(tf, whenever, docs, 1.5, 0.75) 81 | 82 | sort.Sort(sort.Reverse(ishmaelScores)) 83 | sort.Sort(sort.Reverse(wheneverScores)) 84 | 85 | fmt.Printf("Top 3 Relevant Docs to \"Ishmael\":\n") 86 | for _, d := range ishmaelScores[:3] { 87 | fmt.Printf("\tID : %d\n\tScore: %1.3f\n\tDoc : %q\n", d.ID, d.Score, mobydick[d.ID]) 88 | } 89 | fmt.Println("") 90 | fmt.Printf("Top 3 Relevant Docs to \"whenever i find\":\n") 91 | for _, d := range wheneverScores[:3] { 92 | fmt.Printf("\tID : %d\n\tScore: %1.3f\n\tDoc : %q\n", d.ID, d.Score, mobydick[d.ID]) 93 | } 94 | // Output: 95 | // Top 3 Relevant Docs to "Ishmael": 96 | // ID : 0 97 | // QualityScore: 3.706 98 | // Doc : "Call me Ishmael ." 99 | // ID : 1 100 | // QualityScore: 0.000 101 | // Doc : "Some years ago -- never mind how long precisely -- having little or no money in my purse , and nothing particular to interest me on shore , I thought I would sail about a little and see the watery part of the world ." 102 | // ID : 2 103 | // QualityScore: 0.000 104 | // Doc : "It is a way I have of driving off the spleen and regulating the circulation ." 
105 | // 106 | // Top 3 Relevant Docs to "whenever i find": 107 | // ID : 3 108 | // QualityScore: 2.031 109 | // Doc : "Whenever I find myself growing grim about the mouth ; " 110 | // ID : 4 111 | // QualityScore: 1.982 112 | // Doc : "whenever it is a damp , drizzly November in my soul ; " 113 | // ID : 5 114 | // QualityScore: 1.810 115 | // Doc : "whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; " 116 | 117 | } 118 | -------------------------------------------------------------------------------- /search/indexer.go: -------------------------------------------------------------------------------- 1 | package search 2 | 3 | import ( 4 | "fmt" 5 | "io/fs" 6 | "log" 7 | "os" 8 | "path/filepath" 9 | "regexp" 10 | "sort" 11 | "time" 12 | 13 | "github.com/awesomefly/easysearch/config" 14 | 15 | "github.com/awesomefly/easysearch/index" 16 | ) 17 | 18 | type Indexer interface { 19 | // Drain data to file. sort by key 20 | Drain(file string) 21 | Merge(file string) 22 | } 23 | 24 | const SpiltThresholdDocNum int = 50000 25 | func Index(c config.Config) { 26 | log.Println("Starting index...") 27 | 28 | //remove old index files 29 | IndexDir := filepath.Dir(c.Store.IndexFile) 30 | IndexPathPrefix := "_tmp." + filepath.Base(c.Store.IndexFile) 31 | reg, _ := regexp.Compile(IndexPathPrefix + ".*") 32 | if err := Remove(IndexDir, reg); err != nil { 33 | log.Fatal(err) 34 | return 35 | } 36 | 37 | reg1, _ := regexp.Compile("^" + filepath.Base(c.Store.IndexFile) + ".*") 38 | if err := Remove(filepath.Dir(c.Store.IndexFile), reg1); err != nil { 39 | log.Fatal(err) 40 | return 41 | } 42 | 43 | //文件太大,先拆分生成小文件,在内存中构造到排表,最后再归并到一个索引文件 44 | //无法直接在文件中构建构建索引,因为posting list在文件中是连续存储的,随着posting list逐渐变长,需要不断的拷贝到新空间 45 | Spilt(c, IndexDir+"/"+IndexPathPrefix) 46 | 47 | //归并合并 48 | files, err := Walk(IndexDir, reg) 49 | if err != nil { 50 | panic(err) 51 | } 52 | MergeAll(c, files) 53 | } 54 | 55 | func Spilt(c config.Config, filePrefix string) (files []string) { 56 | start := time.Now() 57 | //1. spilt to small file. 58 | ch, err := index.LoadDocumentStream(c.Store.DumpFile) 59 | if err != nil { 60 | log.Fatal(err) 61 | return 62 | } 63 | 64 | //2. index and dump posting list 65 | idx := index.NewHashMapIndex() 66 | 67 | WriteToFile := func() string { 68 | file := fmt.Sprintf("%s.%d", filePrefix, time.Now().Nanosecond()) 69 | fmt.Printf("Loaded all docs, Drain to file: %s \n", file) 70 | 71 | index.Drain(idx, file) 72 | return file 73 | } 74 | 75 | for { 76 | //timeout := time.NewTimer(1 * time.Second) 77 | select { 78 | case doc := <-ch: 79 | if doc == nil { 80 | file := WriteToFile() 81 | files = append(files, file) 82 | break 83 | } 84 | 85 | idx.Add([]index.Document{*doc}) //内存中操作 86 | if idx.Property().DocNum() >= SpiltThresholdDocNum { 87 | file := WriteToFile() 88 | files = append(files, file) 89 | 90 | idx.Clear() 91 | } 92 | continue 93 | //case <-timeout.C: 94 | // log.Printf("Read timeout. err: %s", err.Error()) 95 | // break 96 | } 97 | break 98 | } 99 | log.Printf("Dump all documents in %v.", time.Since(start)) 100 | return files 101 | } 102 | 103 | func MergeAll(c config.Config, files []string) { 104 | var chs []chan *index.KVPair 105 | for i := 0; i < len(files); i++ { 106 | chl, err := index.Load(files[i]) 107 | if err != nil { 108 | panic(err) 109 | } 110 | chs = append(chs, chl) 111 | } 112 | 113 | start := time.Now() 114 | bt := index.NewBTreeIndex(c.Store.IndexFile) 115 | 116 | //3. 
/search/indexer.go:
--------------------------------------------------------------------------------
1 | package search
2 | 
3 | import (
4 |     "fmt"
5 |     "io/fs"
6 |     "log"
7 |     "os"
8 |     "path/filepath"
9 |     "regexp"
10 |     "sort"
11 |     "time"
12 | 
13 |     "github.com/awesomefly/easysearch/config"
14 | 
15 |     "github.com/awesomefly/easysearch/index"
16 | )
17 | 
18 | type Indexer interface {
19 |     // Drain data to file, sorted by key.
20 |     Drain(file string)
21 |     Merge(file string)
22 | }
23 | 
24 | const SpiltThresholdDocNum int = 50000
25 | func Index(c config.Config) {
26 |     log.Println("Starting index...")
27 | 
28 |     //remove old index files
29 |     IndexDir := filepath.Dir(c.Store.IndexFile)
30 |     IndexPathPrefix := "_tmp." + filepath.Base(c.Store.IndexFile)
31 |     reg, _ := regexp.Compile(IndexPathPrefix + ".*")
32 |     if err := Remove(IndexDir, reg); err != nil {
33 |         log.Fatal(err)
34 |         return
35 |     }
36 | 
37 |     reg1, _ := regexp.Compile("^" + filepath.Base(c.Store.IndexFile) + ".*")
38 |     if err := Remove(filepath.Dir(c.Store.IndexFile), reg1); err != nil {
39 |         log.Fatal(err)
40 |         return
41 |     }
42 | 
43 |     //The dump file is too large to index in one pass: split it into small files first, build the inverted lists in memory, and finally merge them into a single index file.
44 |     //We cannot build the index directly on disk, because posting lists are stored contiguously in the file; as a posting list grows it must be copied to new space again and again.
45 |     Spilt(c, IndexDir+"/"+IndexPathPrefix)
46 | 
47 |     //k-way merge the sorted runs into the final index
48 |     files, err := Walk(IndexDir, reg)
49 |     if err != nil {
50 |         panic(err)
51 |     }
52 |     MergeAll(c, files)
53 | }
54 | 
55 | func Spilt(c config.Config, filePrefix string) (files []string) {
56 |     start := time.Now()
57 |     //1. split into small files.
58 |     ch, err := index.LoadDocumentStream(c.Store.DumpFile)
59 |     if err != nil {
60 |         log.Fatal(err)
61 |         return
62 |     }
63 | 
64 |     //2. index and dump posting list
65 |     idx := index.NewHashMapIndex()
66 | 
67 |     WriteToFile := func() string {
68 |         file := fmt.Sprintf("%s.%d", filePrefix, time.Now().Nanosecond())
69 |         fmt.Printf("Loaded all docs, Drain to file: %s \n", file)
70 | 
71 |         index.Drain(idx, file)
72 |         return file
73 |     }
74 | 
75 |     for {
76 |         //timeout := time.NewTimer(1 * time.Second)
77 |         select {
78 |         case doc := <-ch:
79 |             if doc == nil {
80 |                 file := WriteToFile()
81 |                 files = append(files, file)
82 |                 break
83 |             }
84 | 
85 |             idx.Add([]index.Document{*doc}) //in-memory operation
86 |             if idx.Property().DocNum() >= SpiltThresholdDocNum {
87 |                 file := WriteToFile()
88 |                 files = append(files, file)
89 | 
90 |                 idx.Clear()
91 |             }
92 |             continue
93 |         //case <-timeout.C:
94 |         //  log.Printf("Read timeout. err: %s", err.Error())
95 |         //  break
96 |         }
97 |         break
98 |     }
99 |     log.Printf("Dump all documents in %v.", time.Since(start))
100 |     return files
101 | }
102 | 
103 | func MergeAll(c config.Config, files []string) {
104 |     var chs []chan *index.KVPair
105 |     for i := 0; i < len(files); i++ {
106 |         chl, err := index.Load(files[i])
107 |         if err != nil {
108 |             panic(err)
109 |         }
110 |         chs = append(chs, chl)
111 |     }
112 | 
113 |     start := time.Now()
114 |     bt := index.NewBTreeIndex(c.Store.IndexFile)
115 | 
116 |     //3. merge posting lists
117 |     //Appending docs to a posting list one at a time leaves its originally allocated space insufficient, so the posting list keeps getting copied to new space, causing heavy file I/O.
118 |     //So the runs must be merged first and only then written into the index.
119 |     finished := make(map[int]bool)
120 |     pairs := make([]*index.KVPair, len(files))
121 |     for {
122 |         pivot := -1
123 |         for i := 0; i < len(pairs); i++ {
124 |             if pairs[i] == nil && chs[i] != nil {
125 |                 timeout := time.NewTimer(1000 * time.Millisecond)
126 |                 select {
127 |                 case kv := <-chs[i]:
128 |                     if kv == nil {
129 |                         close(chs[i])
130 |                         chs[i] = nil
131 |                     }
132 |                     pairs[i] = kv
133 |                 case <-timeout.C:
134 |                     close(chs[i])
135 |                     chs[i] = nil
136 |                 }
137 |             }
138 | 
139 |             if pairs[i] == nil { //this run is exhausted
140 |                 finished[i] = true
141 |                 continue
142 |             } else if pivot == -1 { //take the first non-nil pair as the pivot
143 |                 pivot = i
144 |                 continue
145 |             }
146 | 
147 |             if pairs[i].Key < pairs[pivot].Key {
148 |                 pivot = i
149 |             } else if pairs[i].Key == pairs[pivot].Key {
150 |                 pairs[pivot].Value.Append(pairs[i].Value...)
151 |                 pairs[i] = nil
152 |             }
153 |         }
154 |         if len(finished) == len(files) { //all finished
155 |             break
156 |         }
157 | 
158 |         //4. insert "word->posting list"
159 |         sort.Sort(pairs[pivot].Value)
160 |         bt.Insert(pairs[pivot].Key, pairs[pivot].Value)
161 |         pairs[pivot] = nil
162 |     }
163 |     log.Printf("Indexed %d documents and %d keys in %v", bt.Property().DocNum(), bt.BT.Count(), time.Since(start))
164 | 
165 |     bt.BT.Stats(true)
166 |     bt.Close()
167 |     time.Sleep(5*time.Second)
168 | }
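MergeAll finds the smallest head key by scanning all k runs linearly on every step. A container/heap-based k-way merge is the usual O(log k)-per-step alternative; the sketch below (item and minHeap are hypothetical helper types, not part of this package) emits keys in sorted order, and for brevity omits coalescing equal keys the way MergeAll's Append branch does.

package main

import (
	"container/heap"
	"fmt"
)

// item is one head-of-run entry: the key and which run it came from.
type item struct {
	key string
	run int
}

type minHeap []item

func (h minHeap) Len() int            { return len(h) }
func (h minHeap) Less(i, j int) bool  { return h[i].key < h[j].key }
func (h minHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *minHeap) Push(x interface{}) { *h = append(*h, x.(item)) }
func (h *minHeap) Pop() interface{} {
	old := *h
	x := old[len(old)-1]
	*h = old[:len(old)-1]
	return x
}

func main() {
	runs := [][]string{{"apple", "dog"}, {"bird", "cat"}, {"ant", "zebra"}}
	next := make([]int, len(runs)) // cursor into each run

	h := &minHeap{}
	for i := range runs {
		heap.Push(h, item{runs[i][0], i})
		next[i] = 1
	}
	for h.Len() > 0 {
		top := heap.Pop(h).(item)
		fmt.Println(top.key) // here MergeAll would append/insert the posting list
		if next[top.run] < len(runs[top.run]) {
			heap.Push(h, item{runs[top.run][next[top.run]], top.run})
			next[top.run]++
		}
	}
}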
169 | 
170 | func Remove(dir string, reg *regexp.Regexp) error {
171 |     files, err := Walk(dir, reg)
172 |     if err != nil {
173 |         return err
174 |     }
175 |     fmt.Printf("remove %d files.\n", len(files))
176 |     for i := 0; i < len(files); i++ {
177 |         //fmt.Println(files[i])
178 |         os.Remove(files[i])
179 |     }
180 |     return nil
181 | }
182 | 
183 | func Walk(dir string, re *regexp.Regexp) ([]string, error) {
184 |     // Collect the files under dir whose names match
185 |     // the given pattern.
186 |     var files []string
187 | 
188 |     walk := func(path string, d fs.DirEntry, err error) error {
189 |         if err != nil {
190 |             return err
191 |         }
192 |         if !re.MatchString(d.Name()) {
193 |             return nil
194 |         }
195 |         if d.IsDir() {
196 |             fmt.Println(path + string(os.PathSeparator))
197 |         } else {
198 |             //fmt.Println(path)
199 |             files = append(files, path)
200 |         }
201 |         return nil
202 |     }
203 |     if err := filepath.WalkDir(dir, walk); err != nil {
204 |         return nil, err
205 |     }
206 |     return files, nil
207 | }
208 | 
--------------------------------------------------------------------------------
/search/indexer_test.go:
--------------------------------------------------------------------------------
1 | package search
2 | 
3 | import (
4 |     "fmt"
5 |     "regexp"
6 |     "runtime"
7 |     "testing"
8 | 
9 |     "github.com/awesomefly/easysearch/config"
10 |     "github.com/awesomefly/easysearch/index"
11 | )
12 | 
13 | func TestIndexer(t *testing.T) {
14 |     reg, _ := regexp.Compile("enwiki-latest-abstract.*.xml.gz")
15 |     files, _ := Walk("../data", reg)
16 |     fmt.Println(files)
17 | }
18 | 
19 | func TestIndex(t *testing.T) {
20 |     runtime.GOMAXPROCS(2)
21 |     conf := config.Config{
22 |         Store: config.Storage{
23 |             DumpFile:  "../data/enwiki-latest-abstract27.xml.gz",
24 |             IndexFile: "../data/enwiki_idx",
25 |         },
26 |     }
27 | 
28 |     Index(conf)
29 |     //r := recover()
30 |     //assert.Nil(t, r)
31 | 
32 |     bt := index.NewBTreeIndex(conf.Store.IndexFile)
33 |     bt.BT.Stats(true)
34 | }
--------------------------------------------------------------------------------
/search/merger.go:
--------------------------------------------------------------------------------
1 | package search
2 | 
3 | import (
4 |     "log"
5 |     "sort"
6 |     "strconv"
7 |     "time"
8 | 
9 |     "github.com/awesomefly/easysearch/index"
10 |     btree "github.com/awesomefly/gobtree"
11 | )
12 | 
13 | func Merge(srcPath, dstPath string) {
14 |     log.Println("Starting merge ...")
15 | 
16 |     start := time.Now()
17 |     idx := index.NewBTreeIndex(srcPath)
18 |     log.Printf("Source index loaded %d keys in %v", idx.BT.Count(), time.Since(start))
19 | 
20 |     start = time.Now()
21 |     dstIdx := index.NewBTreeIndex(dstPath)
22 |     log.Printf("Dst index loaded %d keys in %v", dstIdx.BT.Count(), time.Since(start))
23 | 
24 |     start = time.Now()
25 |     ch := idx.BT.FullSet()
26 |     for {
27 |         k := <-ch
28 |         d := <-ch
29 |         v := <-ch
30 |         if k == nil || d == nil || v == nil {
31 |             break
32 |         }
33 | 
34 |         var src index.PostingList
35 |         src.FromBytes(v)
36 | 
37 |         dst := dstIdx.Lookup(string(k), true)
38 |         dst = append(dst, src...)
39 |         sort.Sort(dst)
40 | 
41 |         id, err := strconv.ParseInt(string(d), 10, 64) //corresponds to TestKey.Docid()
42 |         if err != nil {
43 |             panic(err)
44 |         }
45 | 
46 |         key := &btree.TestKey{K: string(k), Id: id}
47 |         dstIdx.BT.Insert(key, &dst)
48 |     }
49 |     log.Printf("merge %s to %s in %v", srcPath, dstPath, time.Since(start))
50 |     idx.Close()
51 |     dstIdx.Close()
52 | }
53 | 
--------------------------------------------------------------------------------
/search/searcher.go:
--------------------------------------------------------------------------------
1 | package search
2 | 
3 | import (
4 |     "log"
5 |     "sort"
6 |     "strconv"
7 |     "sync"
8 |     "sync/atomic"
9 |     "time"
10 |     "unsafe"
11 | 
12 |     "github.com/xtgo/set"
13 | 
14 |     "github.com/RoaringBitmap/roaring"
15 | 
16 |     "github.com/awesomefly/easysearch/index"
17 |     "github.com/awesomefly/easysearch/paraphrase/serving"
18 |     "github.com/awesomefly/easysearch/util"
19 | )
20 | 
21 | type IndexType int
22 | 
23 | const (
24 |     FullIndex IndexType = iota
25 |     AuxIndex
26 | )
27 | 
28 | type MsgType int
29 | 
30 | const (
31 |     STOP MsgType = iota
32 |     FLUSH
33 | )
34 | 
35 | type Message struct {
36 |     MsgType MsgType
37 |     Msg     string
38 | }
39 | 
40 | type DoubleBuffer struct {
41 |     CurrentIdx uint32 //current write index
42 |     msgChan    chan Message
43 | 
44 |     Indices []*index.HashMapIndex
45 |     Queues  []chan index.Document
46 | }
47 | 
48 | func NewDoubleBuffer() *DoubleBuffer {
49 |     buf := DoubleBuffer{}
50 |     atomic.StoreUint32(&buf.CurrentIdx, 0)
51 | 
52 |     for i := 0; i < 2; i++ {
53 |         idx := index.NewHashMapIndex()
54 |         buf.Indices = append(buf.Indices, idx)
55 |         buf.Queues = append(buf.Queues, make(chan index.Document, 100))
56 |     }
57 | 
58 |     buf.msgChan = buf.Start()
59 |     return &buf
60 | }
61 | 
62 | func (b *DoubleBuffer) WithDataRange(timestamp int64) *DoubleBuffer {
63 |     t := time.Now()
64 |     if timestamp != 0 {
65 |         t = time.Unix(timestamp, 0)
66 |     }
67 |     start := time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, t.Location()).Unix()
68 | 
69 |     t = t.AddDate(0, 0, 1)
70 |     end := time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, t.Location()).Unix()
71 | 
72 |     for i := 0; i < len(b.Indices); i++ {
73 |         b.Indices[i].Property().SetDataRange(index.DataRange{Start: int(start), End: int(end)})
74 |     }
75 |     return b
76 | }
77 | 
78 | func (b *DoubleBuffer) Start() chan Message {
79 |     msgChan := make(chan Message, 10)
80 |     go func() {
81 |         for {
82 |             select {
83 |             case msg := <-msgChan:
84 |                 switch msg.MsgType {
85 |                 case STOP:
86 |                     log.Printf("stop double buffer. msg:%s\n", msg.Msg)
87 |                     return
88 |                 case FLUSH:
89 |                     b.DoFlush()
90 |                 }
91 |             default:
92 |                 b.DoAdd()
93 |             }
94 |         }
95 |     }()
96 |     return msgChan
97 | }
98 | 
99 | func (b *DoubleBuffer) Stop() {
100 |     b.msgChan <- Message{
101 |         MsgType: STOP,
102 |         Msg:     "stop",
103 |     }
104 | }
105 | 
106 | // DoFlush drains both queues into their indices; not safe for concurrent use.
107 | func (b *DoubleBuffer) DoFlush() {
108 |     for i := 0; i < len(b.Indices); i++ {
109 |         idx := b.Indices[i]
110 |         docs := make([]index.Document, 0)
111 |         for {
112 |             select {
113 |             case doc := <-b.Queues[i]:
114 |                 docs = append(docs, doc)
115 |                 continue
116 |             default:
117 |                 break
118 |             }
119 |             break
120 |         }
121 | 
122 |         if len(docs) > 0 {
123 |             idx.Add(docs)
124 |         }
125 |     }
126 | }
127 | 
128 | func (b *DoubleBuffer) DoAdd() {
129 |     writeIdx := atomic.LoadUint32(&b.CurrentIdx)
130 | 
131 |     //single writer goroutine, so no locking is needed
132 |     idx := b.Indices[writeIdx]
133 |     docs := make([]index.Document, 0)
134 |     for {
135 |         timeout := time.NewTimer(1 * time.Millisecond)
136 |         select {
137 |         case doc := <-b.Queues[writeIdx]:
138 |             docs = append(docs, doc)
139 |             continue
140 |         case <-timeout.C:
141 |             break
142 |         }
143 |         break
144 |     }
145 | 
146 |     if len(docs) > 0 {
147 |         idx.Add(docs)
148 |     }
149 | 
150 |     if len(b.Queues[1-writeIdx]) > 10 {
151 |         atomic.CompareAndSwapUint32(&b.CurrentIdx, writeIdx, 1-writeIdx)
152 |         //sleep briefly so in-flight reads and writes on the old buffer can finish, avoiding concurrent read/write
153 |         time.Sleep(100 * time.Millisecond)
154 |     }
155 | }
156 | 
157 | func (b *DoubleBuffer) Add(doc index.Document) {
158 |     for i := 0; i < len(b.Queues); i++ {
159 |         b.Queues[i] <- doc
160 |     }
161 | }
162 | 
163 | func (b *DoubleBuffer) ReadIndex() *index.HashMapIndex {
164 |     writeIdx := atomic.LoadUint32(&b.CurrentIdx)
165 |     return b.Indices[1-writeIdx]
166 | }
167 | 
168 | func (b *DoubleBuffer) Flush() {
169 |     b.msgChan <- Message{
170 |         MsgType: FLUSH,
171 |         Msg:     "force flush",
172 |     }
173 | }
174 | 
175 | func (b *DoubleBuffer) Clear() {
176 | }
177 | 
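A minimal sketch of the intended read/write pattern for the double buffer above, written against the exported API defined in this file (the document text and timings are arbitrary): writers enqueue through Add, the background goroutine batches docs into the current write buffer, and readers always consult the other buffer, so neither side needs a lock.

package search

import (
	"testing"
	"time"

	"github.com/awesomefly/easysearch/index"
)

func TestDoubleBufferSketch(t *testing.T) {
	buf := NewDoubleBuffer().WithDataRange(0)
	defer buf.Stop()

	buf.Add(index.Document{ID: 42, Text: "a donut on a glass plate"})
	buf.Flush()                       // ask the background goroutine to drain both queues
	time.Sleep(50 * time.Millisecond) // give it time to process the message

	// The read side never touches the buffer currently being written.
	if pl := buf.ReadIndex().Get("donut"); pl != nil {
		t.Logf("doc visible on the read side: %v", pl)
	}
}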
178 | type IndexArray struct {
179 |     lock    sync.RWMutex
180 |     indices []*index.BTreeIndex
181 | }
182 | 
183 | func NewIndexArray() *IndexArray {
184 |     return &IndexArray{
185 |         indices: make([]*index.BTreeIndex, 0),
186 |     }
187 | }
188 | 
189 | func (b *IndexArray) WithFile(file string) *IndexArray {
190 |     idx := index.NewBTreeIndex(file)
191 | 
192 |     t := time.Now()
193 |     start := time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, t.Location()).Unix()
194 |     t = t.AddDate(0, 0, 1)
195 |     end := time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, t.Location()).Unix()
196 |     idx.Property().SetDataRange(index.DataRange{Start: int(start), End: int(end)})
197 | 
198 |     b.lock.Lock()
199 |     defer b.lock.Unlock()
200 | 
201 |     b.indices = append(b.indices, idx)
202 |     return b
203 | }
204 | 
205 | func (b *IndexArray) Indices() []*index.BTreeIndex {
206 |     b.lock.RLock()
207 |     defer b.lock.RUnlock()
208 | 
209 |     // copy into a fresh slice so callers don't race with later mutations
210 |     copyData := make([]*index.BTreeIndex, len(b.indices))
211 |     copy(copyData, b.indices)
212 |     return copyData
213 | }
214 | 
215 | func (b *IndexArray) Add(idx *index.BTreeIndex) {
216 |     b.lock.Lock()
217 |     defer b.lock.Unlock()
218 | 
219 |     b.indices = append(b.indices, idx)
220 | }
221 | 
222 | // Hit returns the index whose data range contains dr.
223 | func (b *IndexArray) Hit(dr index.DataRange) *index.BTreeIndex {
224 |     b.lock.RLock()
225 |     defer b.lock.RUnlock()
226 | 
227 |     for i := 0; i < len(b.indices); i++ {
228 |         r := b.indices[i].Property().DataRange()
229 |         if dr.Start >= r.Start && dr.End <= r.End { //dr lies within this index's range
230 |             return b.indices[i]
231 |         }
232 |     }
233 |     return nil
234 | }
235 | 
236 | func (b *IndexArray) Swap(old *index.BTreeIndex, new *index.BTreeIndex) bool {
237 |     b.lock.Lock()
238 |     defer b.lock.Unlock()
239 |     for i := 0; i < len(b.indices); i++ {
240 |         if b.indices[i] == old { //found the old index
241 |             b.indices[i] = new
242 |             return true
243 |         }
244 |     }
245 |     return false
246 | }
247 | 
248 | // Evict removes and returns all indices whose ranges fall within dr.
249 | func (b *IndexArray) Evict(dr index.DataRange) []*index.BTreeIndex {
250 |     b.lock.Lock()
251 |     defer b.lock.Unlock()
252 | 
253 |     var evicts []*index.BTreeIndex
254 |     for i := 0; i < len(b.indices); {
255 |         r := b.indices[i].Property().DataRange()
256 |         if dr.Start <= r.Start && dr.End >= r.End { //this index is fully covered by dr
257 |             evicts = append(evicts, b.indices[i])
258 |             b.indices = append(b.indices[:i], b.indices[i+1:]...) //remove element i
259 |         } else {
260 |             i++
261 |         }
262 |     }
263 |     return evicts
264 | }
265 | 
266 | type Searcher struct {
267 |     //full/primary index; statically rebuilding it over all historical data is expensive
268 |     fullIndex unsafe.Pointer
269 | 
270 |     // auxiliary indices: the full index is large and inconvenient to rebuild, so recently added data is built into smaller incremental indices.
271 |     // e.g. rebuild the full index only for data older than one day, and build the current day's data into an incremental index
272 |     auxIndex unsafe.Pointer //*IndexArray: an array of indices, each tagged with a time range
273 | 
274 |     // temporary (incremental) index supporting real-time updates; built in memory on a double buffer, allowing lock-free concurrent reads and writes;
275 |     // merged into an auxiliary index when memory runs low
276 |     incrIndex unsafe.Pointer
277 | 
278 |     //deleteList []index.Doc //delete docs list. update doc = delete old doc and create new one
279 |     //BloomFilter *bloom.Filter //a cuckoo filter would be even more efficient
280 |     roaringFilter *roaring.Bitmap //todo: how to purge expired entries
281 | 
282 |     model *serving.ParaphraseModel //todo: better moved to the search server
283 | 
284 |     indexFile string
285 | }
286 | 
287 | func NewSearcher(file string) *Searcher {
288 |     srh := &Searcher{
289 |         fullIndex: unsafe.Pointer(index.NewBTreeIndex(file)),
290 |         auxIndex:  unsafe.Pointer(NewIndexArray().WithFile(file + ".aux." + strconv.Itoa(int(time.Now().Unix())))),
291 |         incrIndex: unsafe.Pointer(NewDoubleBuffer().WithDataRange(0)),
292 |         //deleteList: make([]index.Doc, 0),
293 |         //BloomFilter: bloom.New(10000, 1000),
294 |         roaringFilter: roaring.New(),
295 |         model:         nil,
296 |         indexFile:     file,
297 |     }
298 |     return srh
299 | }
300 | 
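Putting the three tiers together, a hedged usage sketch mirroring what searcher_test.go below exercises (the index path is hypothetical, and the sleep only gives the asynchronous Drain goroutine time to fold the in-memory buffer into a time-ranged auxiliary index):

package search

import (
	"fmt"
	"time"

	"github.com/awesomefly/easysearch/index"
)

func ExampleSearcher_tiers() {
	srh := NewSearcher("../data/example_idx") // hypothetical index path

	// New docs land in the in-memory incremental index first.
	srh.Add(index.Document{ID: 1, Text: "a donut on a glass plate", Timestamp: int(time.Now().Unix())})

	// Drain folds the in-memory double buffer into a time-ranged
	// auxiliary index; it runs asynchronously, hence the sleep.
	srh.Drain(0)
	time.Sleep(2 * time.Second)

	// Search consults the full index, every auxiliary index, and the
	// incremental index, and unions the results.
	fmt.Println(len(srh.Search("donut")) > 0)
	srh.Clear()
}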
301 | func (srh *Searcher) InitParaphrase(file string) {
302 |     srh.model = serving.NewModel(file)
303 | }
304 | 
305 | func (srh *Searcher) Paraphrase(texts []string, n int) []string {
306 |     if srh.model == nil {
307 |         return nil
308 |     }
309 |     var (
310 |         positive = texts
311 |         negative []string
312 |     )
313 |     l := len(texts)
314 |     sim := srh.model.GetSimilar(positive, negative, l+n)
315 |     return sim[l:]
316 | }
317 | 
318 | // Add appends a doc to the incremental index's double buffer asynchronously.
319 | // Writes go through the queue; reads need no lock.
320 | func (srh *Searcher) Add(doc index.Document) {
321 |     incr := (*DoubleBuffer)(atomic.LoadPointer(&srh.incrIndex))
322 | 
323 |     //crossed a day boundary: persist the old buffer and start a new incremental index
324 |     end := incr.ReadIndex().Property().DataRange().End
325 |     if doc.Timestamp > end {
326 |         srh.Drain(end)
327 |     }
328 | 
329 |     //Drain may have swapped the buffer, so reload the pointer
330 |     (*DoubleBuffer)(atomic.LoadPointer(&srh.incrIndex)).Add(doc)
331 | }
332 | 
333 | // Del removes a doc from the index.
334 | func (srh *Searcher) Del(doc index.Document) {
335 |     //todo: needs locking
336 |     srh.roaringFilter.Add(uint32(doc.ID))
337 | }
338 | 
339 | func (srh *Searcher) Count() int {
340 |     a := (*index.BTreeIndex)(atomic.LoadPointer(&srh.fullIndex)).Property().DocNum()
341 |     copyData := (*IndexArray)(atomic.LoadPointer(&srh.auxIndex)).Indices()
342 |     for i := 0; i < len(copyData); i++ {
343 |         a += copyData[i].Property().DocNum()
344 |     }
345 |     a += (*DoubleBuffer)(atomic.LoadPointer(&srh.incrIndex)).ReadIndex().Property().DocNum()
346 |     return a
347 | }
348 | 
349 | func (srh *Searcher) Clear() {
350 |     (*index.BTreeIndex)(atomic.LoadPointer(&srh.fullIndex)).Clear()
351 |     copyData := (*IndexArray)(atomic.LoadPointer(&srh.auxIndex)).Indices()
352 |     for i := 0; i < len(copyData); i++ {
353 |         copyData[i].Clear()
354 |     }
355 | }
356 | 
357 | // Drain persists the incremental index to disk.
358 | // A true in-place update strategy would need spare space reserved at the tail of each posting list; otherwise many posting lists must be relocated, which is even slower.
359 | // When disk space allows, a re-merge strategy is used instead: simple to implement and friendly to concurrent access, but it needs enough memory.
360 | func (srh *Searcher) Drain(timestamp int) {
361 |     oldIncr := (*DoubleBuffer)(atomic.SwapPointer(&srh.incrIndex, unsafe.Pointer(NewDoubleBuffer().WithDataRange(int64(timestamp)))))
362 |     go func() {
363 |         //flush after a short sleep
364 |         time.Sleep(100 * time.Millisecond)
365 |         oldIncr.Flush()
366 |         oldIncr.Stop()
367 | 
368 |         oldIncrDR := oldIncr.ReadIndex().Property().DataRange()
369 |         auxIdxArray := (*IndexArray)(atomic.LoadPointer(&srh.auxIndex))
370 |         oldAux := auxIdxArray.Hit(oldIncrDR)
371 |         if oldAux != nil {
372 |             //merge the key sets
373 |             keys := make(sort.StringSlice, 0, len(oldIncr.ReadIndex().Map())+int(oldAux.BT.Count()))
374 |             for k := range oldIncr.ReadIndex().Map() {
375 |                 keys = append(keys, k)
376 |             }
377 |             ch := oldAux.BT.KeySet()
378 |             for {
379 |                 key := <-ch
380 |                 if key == nil {
381 |                     break
382 |                 }
383 |                 keys = append(keys, string(key))
384 |             }
385 |             sort.Strings(keys)
386 |             keys = keys[:set.Uniq(keys)]
387 | 
388 |             //merge into a new index
389 |             newAux := index.NewBTreeIndex(srh.indexFile + ".aux." + strconv.Itoa(int(time.Now().Unix())))
390 |             for i := 0; i < keys.Len(); i++ {
391 |                 key := keys[i]
392 |                 pl := oldAux.Lookup(key, false)
393 |                 if pl2 := oldIncr.ReadIndex().Get(key); pl2 != nil {
394 |                     pl = append(pl, pl2...)
395 |                 }
396 |                 if len(pl) > 0 {
397 |                     newAux.Insert(key, pl)
398 |                 }
399 |             }
400 |             newAux.SetProperty(*oldAux.Property())
401 |             newAux.Property().SetDocNum(oldIncr.ReadIndex().Property().DocNum() + oldAux.Property().DocNum())
402 |             newAux.Property().SetTokenCount(oldIncr.ReadIndex().Property().TokenCount() + oldAux.Property().TokenCount())
403 |             newAux.BT.Drain()
404 | 
405 |             //oldAux = (*index.BTreeIndex)(atomic.SwapPointer(&srh.auxIndex, unsafe.Pointer(newAux)))
406 |             if auxIdxArray.Swap(oldAux, newAux) {
407 |                 oldAux.Clear()
408 |                 oldIncr.Clear()
409 |             }
410 |         } else {
411 |             idx := index.NewBTreeIndex(srh.indexFile + ".aux." + strconv.Itoa(oldIncrDR.Start))
412 |             idx.Property().SetDataRange(oldIncrDR)
413 |             auxIdxArray.Add(idx)
414 |         }
415 |     }()
416 | }
417 | 
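Drain's re-merge branch boils down to a union of two sorted key sets followed by one rebuild pass per key. A small self-contained sketch of just that key-union step, using plain sort plus in-place deduplication (mergeKeys is illustrative, not part of the package):

package main

import (
	"fmt"
	"sort"
)

// mergeKeys unions the key sets of an on-disk auxiliary index and an
// in-memory index so each posting list gets rebuilt exactly once.
func mergeKeys(aux, incr []string) []string {
	keys := make([]string, 0, len(aux)+len(incr)) // length 0, capacity pre-sized
	keys = append(keys, aux...)
	keys = append(keys, incr...)
	sort.Strings(keys)
	// de-duplicate in place
	out := keys[:0]
	for i, k := range keys {
		if i == 0 || k != keys[i-1] {
			out = append(out, k)
		}
	}
	return out
}

func main() {
	fmt.Println(mergeKeys([]string{"cat", "dog"}, []string{"bird", "cat"}))
	// [bird cat dog]
}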
418 | // Load loads an index file; used when an index has been rebuilt offline.
419 | func (srh *Searcher) Load(file string, flag IndexType) {
420 |     newIndex := index.NewBTreeIndex(file)
421 |     auxIdxArray := (*IndexArray)(atomic.LoadPointer(&srh.auxIndex))
422 | 
423 |     evicts := auxIdxArray.Evict(newIndex.Property().DataRange())
424 |     switch flag {
425 |     case FullIndex:
426 |         old := (*index.BTreeIndex)(atomic.SwapPointer(&srh.fullIndex, unsafe.Pointer(newIndex)))
427 |         evicts = append(evicts, old)
428 |     case AuxIndex:
429 |         auxIdxArray.Add(newIndex) //evict first, then add, so the new index cannot evict itself
430 |         //old = (*index.BTreeIndex)(atomic.SwapPointer(&srh.auxIndex, unsafe.Pointer(newIndex)))
431 |     }
432 | 
433 |     for i := 0; i < len(evicts); i++ {
434 |         evicts[i].Clear()
435 |     }
436 | }
437 | 
438 | //SearchTips todo: support search-as-you-type suggestions.
439 | //A trie suits an English dictionary; but when the system holds many strings with hardly any common prefixes, the trie becomes very memory-hungry.
440 | //A double-array trie suits a Chinese dictionary, with a small memory footprint.
441 | func (srh *Searcher) SearchTips() []string {
442 |     //could be backed by a trie or an FST
443 |     return nil
444 | }
445 | 
446 | func (srh *Searcher) Retrieval(terms []string, ext []string, model index.SearchModel) []index.Doc {
447 |     var result []index.Doc
448 | 
449 |     fullIdx := (*index.BTreeIndex)(atomic.LoadPointer(&srh.fullIndex))
450 |     auxIdxArray := (*IndexArray)(atomic.LoadPointer(&srh.auxIndex))
451 |     incrIdx := (*DoubleBuffer)(atomic.LoadPointer(&srh.incrIndex)).ReadIndex()
452 | 
453 |     result = fullIdx.Retrieval(terms, ext, nil, 10, 1000, model)
454 | 
455 |     copyData := auxIdxArray.Indices()
456 |     for i := 0; i < len(copyData); i++ {
457 |         y := copyData[i].Retrieval(terms, ext, nil, 10, 1000, model)
458 |         (*index.PostingList)(&result).Union(y)
459 |     }
460 | 
461 |     z := incrIdx.Retrieval(terms, ext, nil, 10, 1000, model)
462 |     (*index.PostingList)(&result).Union(z)
463 |     return result
464 | }
465 | 
466 | // Filter drops docs that have been marked deleted.
467 | func (srh *Searcher) Filter(docs []index.Doc) []index.Doc {
468 |     var result []index.Doc
469 |     for _, doc := range docs {
470 |         hit := srh.roaringFilter.Contains(uint32(doc.ID))
471 |         if !hit {
472 |             result = append(result, doc)
473 |         }
474 |     }
475 |     return result
476 | }
477 | 
478 | // Search queries the index for the given text.
479 | // todo: retrieval (multi-channel recall) -> coarse ranking (CTR via LR) -> fine ranking (CVR via DNN) -> topN (heap sort)
480 | func (srh *Searcher) Search(query string) []index.Doc {
481 |     //todo: support prefix search
482 |     //Reference: Lucene builds an inverted index using Skip-Lists on disk,
483 |     //and then loads a mapping for the indexed terms into memory using a Finite State Transducer (FST).
484 | 
485 |     //1. Query rewrite. todo: support spelling correction and intent recognition
486 |     //1.1 text preprocessing: tokenization, stop-word removal, stemming
487 |     terms := util.Analyze(query)
488 |     //1.2 semantic expansion: synonyms and near-synonyms
489 |     ext := srh.Paraphrase(terms, 3)
490 | 
491 |     //2. todo: multi-channel recall (traditional retrieval + vector retrieval)
492 |     r := srh.Retrieval(terms, ext, index.BM25)
493 | 
494 |     //3. filter out deleted documents
495 |     r = srh.Filter(r)
496 |     return r
497 | }
498 | 
--------------------------------------------------------------------------------
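Del and Filter above implement tombstone-style deletion: deleted doc IDs go into a roaring bitmap, and retrieval results are screened against it at query time, so nothing has to be rewritten in the posting lists. A self-contained sketch of the same pattern using the RoaringBitmap package already imported by searcher.go:

package main

import (
	"fmt"

	"github.com/RoaringBitmap/roaring"
)

func main() {
	deleted := roaring.New()
	deleted.Add(7) // mark doc 7 as deleted

	docs := []uint32{3, 7, 11}
	kept := docs[:0]
	for _, id := range docs {
		if !deleted.Contains(id) {
			kept = append(kept, id)
		}
	}
	fmt.Println(kept) // [3 11]
}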
/search/searcher_test.go:
--------------------------------------------------------------------------------
1 | package search
2 | 
3 | import (
4 |     "fmt"
5 |     "github.com/stretchr/testify/assert"
6 |     "math/rand"
7 |     "runtime"
8 |     "sync/atomic"
9 |     "testing"
10 |     "time"
11 | 
12 |     "github.com/awesomefly/easysearch/index"
13 | )
14 | 
15 | var letters = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
16 | 
17 | func randSeq(n int) string {
18 |     b := make([]rune, n)
19 |     for i := range b {
20 |         b[i] = letters[rand.Intn(len(letters))]
21 |     }
22 |     return string(b)
23 | }
24 | 
25 | func get(srh *Searcher, text string) []int {
26 |     rt := (*DoubleBuffer)(atomic.LoadPointer(&srh.incrIndex))
27 |     return (index.PostingList)(rt.ReadIndex().Get(text)).IDs()
28 | }
29 | 
30 | var searcher = NewSearcher("../data/test_insread") //must be a package-level variable
31 | func BenchmarkDoubleBuffer(b *testing.B) {
32 |     //b.N = 10000
33 |     rand.Seed(time.Now().UnixNano())
34 |     fmt.Println(runtime.GOMAXPROCS(0))
35 | 
36 |     fmt.Println("start")
37 |     searcher.Add(index.Document{ID: 1, Text: "A donut on a glass plate. Only the donuts."})
38 |     for i := 0; i < b.N; i++ {
39 |         searcher.Add(index.Document{ID: 1, Text: randSeq(5)})
40 |         get(searcher, "donut")
41 |     }
42 |     fmt.Println("done")
43 | }
44 | 
45 | func BenchmarkDoubleBufferParallel(b *testing.B) {
46 |     rand.Seed(time.Now().UnixNano())
47 |     fmt.Println(runtime.GOMAXPROCS(0))
48 |     searcher.Add(index.Document{ID: 1, Text: "A donut on a glass plate. Only the donuts."})
49 | 
50 |     // exercise the object under many concurrent goroutines to check it is safe
51 |     b.SetParallelism(10000) //total goroutines: b.parallelism * runtime.GOMAXPROCS(0)
52 |     b.RunParallel(func(pb *testing.PB) {
53 |         for pb.Next() { //the b.N iterations are shared out across the goroutines
54 |             t := randSeq(5)
55 |             searcher.Add(index.Document{ID: 1, Text: t})
56 |             get(searcher, "donut")
57 |             get(searcher, t)
58 |         }
59 |     })
60 | }
61 | func TestSearcherLoad(t *testing.T) {
62 |     searcher.Add(index.Document{ID: 1, Text: "A donut on a glass plate.", Timestamp: int(time.Now().Unix())})                 //today's document
63 |     searcher.Add(index.Document{ID: 2, Text: "Only the donuts.", Timestamp: int(time.Now().AddDate(0, 0, 1).Unix())})          //next day's document
64 |     time.Sleep(2 * time.Second)
65 | 
66 |     copyData := (*IndexArray)(atomic.LoadPointer(&searcher.auxIndex)).Indices()
67 |     fmt.Printf("1index len:%d\n", len(copyData))
68 |     assert.Equal(t, 1, len(copyData)) //triggers an index split; today's index file has been persisted by now
69 | 
70 | 
71 |     searcher.Drain(0)
72 |     time.Sleep(2 * time.Second)
73 |     copyData = (*IndexArray)(atomic.LoadPointer(&searcher.auxIndex)).Indices()
74 |     fmt.Printf("2index len:%d\n", len(copyData))
75 |     assert.Equal(t, 2, len(copyData)) //manually persist the next day's document
76 | 
77 | 
78 |     newIndex := index.NewBTreeIndex("../data/test_insread_xxx")
79 | 
80 |     ts := time.Now()
81 |     start := time.Date(ts.Year(), ts.Month(), ts.Day(), 0, 0, 0, 0, ts.Location()).Unix()
82 |     ts = ts.AddDate(0, 0, 3)
83 |     end := time.Date(ts.Year(), ts.Month(), ts.Day(), 0, 0, 0, 0, ts.Location()).Unix()
84 |     newIndex.Property().SetDataRange(index.DataRange{Start: int(start), End: int(end)})
85 | 
86 |     newIndex.Add([]index.Document{{ID: 3, Text: "god is girl."}})
87 |     newIndex.Close()
88 | 
89 |     searcher.Load("../data/test_insread_xxx", AuxIndex)
90 |     copyData = (*IndexArray)(atomic.LoadPointer(&searcher.auxIndex)).Indices()
91 |     fmt.Printf("3index len:%d\n", len(copyData))
92 |     assert.Equal(t, 1, len(copyData)) //the loaded index covers and evicts both older aux indices
93 | 
94 |     rst := searcher.Search("girl")
95 |     fmt.Printf("%+v", rst)
96 |     assert.Equal(t, 1, len(rst))
97 | 
98 |     searcher.Clear()
99 | }
100 | 
101 | func TestSearcher(t *testing.T) {
102 |     searcher.Add(index.Document{ID: 1, Text: "A donut on a glass plate. Only the donuts."})
103 |     for i := 0; i < 12; i++ {
104 |         searcher.Add(index.Document{ID: 10 + i, Text: randSeq(5)})
105 |     }
106 |     time.Sleep(2 * time.Second)
107 |     fmt.Printf("count:%d\n", searcher.Count())
108 |     assert.Equal(t, 13, searcher.Count()) //by default the double buffer swaps once 10 docs queue up
109 | 
110 |     searcher.Drain(0)
111 |     time.Sleep(2 * time.Second)
112 | 
113 |     var a int
114 |     copyData := (*IndexArray)(atomic.LoadPointer(&searcher.auxIndex)).Indices()
115 |     for i := 0; i < len(copyData); i++ {
116 |         a += copyData[i].Property().DocNum()
117 |     }
118 |     fmt.Printf("index len:%d\n", len(copyData))
119 |     fmt.Printf("auxIndex count:%d\n", a)
120 |     assert.Equal(t, 13, a)
121 | 
122 |     rst := searcher.Search("donut")
123 |     fmt.Printf("%+v", rst)
124 |     assert.Equal(t, 1, len(rst))
125 | 
126 |     //Del&Filter
127 |     searcher.Del(index.Document{ID: 1})
128 |     rst = searcher.Search("donut")
129 |     assert.Equal(t, 0, len(rst))
130 | 
131 |     //Clear
132 |     searcher.Clear()
133 | }
--------------------------------------------------------------------------------
/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | CURDIR=$(cd "$(dirname "$0")"; pwd)
3 | echo "$CURDIR"
4 | 
5 | BIN=easysearch
6 | 
7 | if [ "$1" = "standalone" ]; then
8 |     ps -ef|grep $BIN|grep -v "grep"|awk -F " " '{print $2}'|xargs kill -9
9 |     sleep 1
10 | 
11 |     ./$BIN -m cluster --servername=all >> ${CURDIR}/err.log 2>&1 &
12 |     echo "started standalone cluster "
13 | elif [ "$1" = "manager" ]; then
14 |     ./$BIN -m cluster --servername=managerserver >> ${CURDIR}/err.log 2>&1 &
15 | elif [ "$1" = "data" ]; then
16 |     ./$BIN -m cluster --servername=dataserver >> ${CURDIR}/err.log 2>&1 &
17 | elif [ "$1" = "search" ]; then
18 |     ./$BIN -m cluster --servername=searchserver >> ${CURDIR}/err.log 2>&1 &
19 | elif [ "$1" = "kill" ]; then
20 |     ps -ef|grep $BIN|grep -v "grep"|awk -F " " '{print $2}'|xargs kill -9
21 | fi
22 | 
23 | 
24 | 
25 | 
--------------------------------------------------------------------------------
/util/collection.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | // InterInt returns the set intersection between a and b.
4 | // a and b have to be sorted in ascending order and contain no duplicates.
5 | func InterInt(a []int, b []int) []int {
6 |     maxLen := len(a)
7 |     if len(b) > maxLen {
8 |         maxLen = len(b)
9 |     }
10 |     r := make([]int, 0, maxLen)
11 |     var i, j int
12 |     for i < len(a) && j < len(b) {
13 |         if a[i] < b[j] {
14 |             i++
15 |         } else if a[i] > b[j] {
16 |             j++
17 |         } else {
18 |             r = append(r, a[i])
19 |             i++
20 |             j++
21 |         }
22 |     }
23 |     return r
24 | }
25 | 
26 | // MergeInt returns the unique set union of a and b.
27 | // a and b have to be sorted in ascending order and contain no duplicates.
28 | func MergeInt(a []int, b []int) []int {
29 |     if a == nil {
30 |         return b
31 |     }
32 |     if b == nil {
33 |         return a
34 |     }
35 |     r := make([]int, 0, len(a)+len(b))
36 |     var i, j int
37 |     for i < len(a) && j < len(b) {
38 |         if a[i] < b[j] {
39 |             r = append(r, a[i])
40 |             i++
41 |         } else if a[i] > b[j] {
42 |             r = append(r, b[j])
43 |             j++
44 |         } else {
45 |             r = append(r, a[i])
46 |             i++
47 |             j++
48 |         }
49 |     }
50 |     r = append(r, a[i:]...) //append whichever tail remains
51 |     r = append(r, b[j:]...)
52 |     return r
53 | }
54 | 
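InterInt and MergeInt are the AND/OR primitives for combining sorted, duplicate-free posting lists at query time. A quick example-style sketch in package util:

package util

import "fmt"

func ExampleMergeInt() {
	and := InterInt([]int{1, 3, 5}, []int{3, 5, 8}) // docs matching both terms
	or := MergeInt([]int{1, 3, 5}, []int{3, 5, 8})  // docs matching either term
	fmt.Println(and, or)
	// Output: [3 5] [1 3 5 8]
}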
55 | // DiffInt returns the symmetric difference of a and b.
56 | // a and b have to be sorted in ascending order and contain no duplicates.
57 | func DiffInt(a []int, b []int) []int {
58 |     minLen := len(a)
59 |     if len(b) < minLen {
60 |         minLen = len(b)
61 |     }
62 |     r := make([]int, 0, minLen)
63 |     var i, j int
64 |     for i < len(a) && j < len(b) {
65 |         if a[i] < b[j] {
66 |             r = append(r, a[i])
67 |             i++
68 |         } else if a[i] > b[j] {
69 |             r = append(r, b[j])
70 |             j++
71 |         } else {
72 |             i++
73 |             j++
74 |         }
75 |     }
76 |     r = append(r, a[i:]...) //append whichever tail remains
77 |     r = append(r, b[j:]...)
78 |     return r
79 | }
80 | 
81 | // FilterInt returns the elements of a that are not in b (a minus b).
82 | // a and b have to be sorted in ascending order, contain no duplicates,
83 | // and hold non-negative values; note that a is modified in place.
84 | func FilterInt(a []int, b []int) []int {
85 |     var i, j int
86 |     for i < len(a) && j < len(b) {
87 |         if a[i] < b[j] {
88 |             i++
89 |         } else if a[i] > b[j] {
90 |             j++
91 |         } else {
92 |             a[i] = -1 //mark for removal
93 |             i++
94 |             j++
95 |         }
96 |     }
97 |     r := make([]int, 0)
98 |     for _, v := range a {
99 |         if v != -1 {
100 |             r = append(r, v)
101 |         }
102 |     }
103 |     return r
104 | }
--------------------------------------------------------------------------------
/util/condition.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | func IfElseInt(condition bool, o1 int, o2 int) int {
4 |     if condition {
5 |         return o1
6 |     }
7 |     return o2
8 | }
--------------------------------------------------------------------------------
/util/filter.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | import (
4 |     "strings"
5 | 
6 |     snowballeng "github.com/kljensen/snowball/english"
7 | )
8 | 
9 | // lowercaseFilter returns a slice of tokens normalized to lower case.
10 | func lowercaseFilter(tokens []string) []string {
11 |     r := make([]string, len(tokens))
12 |     for i, token := range tokens {
13 |         r[i] = strings.ToLower(token)
14 |     }
15 |     return r
16 | }
17 | 
18 | // stopwordFilter returns a slice of tokens with stop words removed.
19 | func stopwordFilter(tokens []string) []string {
20 |     var stopwords = map[string]struct{}{
21 |         "a": {}, "and": {}, "be": {}, "have": {}, "i": {},
22 |         "in": {}, "of": {}, "that": {}, "the": {}, "to": {},
23 |     }
24 |     r := make([]string, 0, len(tokens))
25 |     for _, token := range tokens {
26 |         if _, ok := stopwords[token]; !ok {
27 |             r = append(r, token)
28 |         }
29 |     }
30 |     return r
31 | }
32 | 
33 | // stemmerFilter returns a slice of stemmed tokens.
34 | func stemmerFilter(tokens []string) []string {
35 |     r := make([]string, len(tokens))
36 |     for i, token := range tokens {
37 |         r[i] = snowballeng.Stem(token, false)
38 |     }
39 |     return r
40 | }
41 | 
--------------------------------------------------------------------------------
/util/filter_test.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | import (
4 |     "testing"
5 | 
6 |     "github.com/stretchr/testify/assert"
7 | )
8 | 
9 | func TestLowercaseFilter(t *testing.T) {
10 |     var (
11 |         in  = []string{"Cat", "DOG", "fish"}
12 |         out = []string{"cat", "dog", "fish"}
13 |     )
14 |     assert.Equal(t, out, lowercaseFilter(in))
15 | }
16 | 
17 | func TestStopwordFilter(t *testing.T) {
18 |     var (
19 |         in  = []string{"i", "am", "the", "cat"}
20 |         out = []string{"am", "cat"}
21 |     )
22 |     assert.Equal(t, out, stopwordFilter(in))
23 | }
24 | 
25 | func TestStemmerFilter(t *testing.T) {
26 |     var (
27 |         in  = []string{"cat", "cats", "fish", "fishing", "fished", "airline"}
28 |         out = []string{"cat", "cat", "fish", "fish", "fish", "airlin"}
29 |     )
30 |     assert.Equal(t, out, stemmerFilter(in))
31 | }
--------------------------------------------------------------------------------
/util/net.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | import (
4 |     "fmt"
5 |     "net"
6 | )
7 | 
8 | func GetLocalIP() string {
9 |     list, err := net.Interfaces()
10 |     if err != nil {
11 |         panic(err)
12 |     }
13 | 
14 |     for i, iface := range list {
15 |         fmt.Printf("%d name=%s %v\n", i, iface.Name, iface)
16 |         addrs, err := iface.Addrs()
17 |         if err != nil {
18 |             panic(err)
19 |         }
20 |         for j, addr := range addrs {
21 |             fmt.Printf(" %d %v\n", j, addr)
22 |             var ip net.IP
23 |             switch v := addr.(type) {
24 |             case *net.IPNet:
25 |                 ip = v.IP
26 |             case *net.IPAddr:
27 |                 ip = v.IP
28 |             }
29 | 
30 |             if !ip.IsLoopback() && ip.To4() != nil {
31 |                 return ip.String()
32 |             }
33 |         }
34 |     }
35 |     return ""
36 | }
--------------------------------------------------------------------------------
/util/tokenizer.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | import (
4 |     "strings"
5 |     "unicode"
6 | )
7 | 
8 | // tokenize returns a slice of tokens for the given text.
9 | func tokenize(text string) []string {
10 |     return strings.FieldsFunc(text, func(r rune) bool {
11 |         // Split on any character that is not a letter or a number.
12 |         return !unicode.IsLetter(r) && !unicode.IsNumber(r)
13 |     })
14 | }
15 | 
16 | // Analyze analyzes the text and returns a slice of tokens.
17 | func Analyze(text string) []string {
18 |     tokens := tokenize(text)
19 |     tokens = lowercaseFilter(tokens)
20 |     tokens = stopwordFilter(tokens)
21 |     tokens = stemmerFilter(tokens) //stemming: smiling -> smile
22 |     return tokens
23 | }
--------------------------------------------------------------------------------
/util/tokenizer_test.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | import (
4 |     "testing"
5 | 
6 |     "github.com/stretchr/testify/assert"
7 | )
8 | 
9 | func TestTokenizer(t *testing.T) {
10 |     testCases := []struct {
11 |         text   string
12 |         tokens []string
13 |     }{
14 |         {
15 |             text:   "",
16 |             tokens: []string{},
17 |         },
18 |         {
19 |             text:   "a",
20 |             tokens: []string{"a"},
21 |         },
22 |         {
23 |             text:   "small wild,cat!",
24 |             tokens: []string{"small", "wild", "cat"},
25 |         },
26 |     }
27 | 
28 |     for _, tc := range testCases {
29 |         t.Run(tc.text, func(st *testing.T) {
30 |             assert.EqualValues(st, tc.tokens, tokenize(tc.text))
31 |         })
32 |     }
33 | }
34 | 
--------------------------------------------------------------------------------
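End to end, Analyze chains the tokenizer with the filters from util/filter.go: tokenize, lowercase, drop stop words, stem. A small example sketch; the expected output follows the stemming behavior asserted in filter_test.go ("fishing" -> "fish", "cats" -> "cat") and the "smiling -> smile" behavior noted in the comment above.

package util

import "fmt"

func ExampleAnalyze() {
	fmt.Println(Analyze("Fishing cats, smiling!"))
	// Output: [fish cat smile]
}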