├── .gitignore
├── EasySearch.jpg
├── LICENSE
├── README.md
├── cluster.yml
├── cluster
│   ├── cluster.go
│   ├── dataserver.go
│   ├── dataserver_test.go
│   ├── managerserver.go
│   ├── managerserver_test.go
│   ├── searchclient.go
│   ├── searchclient_test.go
│   ├── searchserver.go
│   ├── searchserver_test.go
│   ├── server.go
│   └── shardingindexer.go
├── config.yml
├── config
│   └── Config.go
├── go.mod
├── go.sum
├── index
│   ├── btreeindex.go
│   ├── btreeindex_test.go
│   ├── document.go
│   ├── document_test.go
│   ├── hashmapindex.go
│   ├── hashmapindex_test.go
│   ├── index.go
│   ├── postinglist.go
│   ├── postinglist_test.go
│   ├── property.go
│   └── tfidf.go
├── main.go
├── paraphrase
│   ├── serving
│   │   ├── model.go
│   │   └── model_test.go
│   └── train
│       ├── wiki2txt.py
│       └── word2vec.py
├── score
│   ├── bm25.go
│   └── bm25_test.go
├── search
│   ├── indexer.go
│   ├── indexer_test.go
│   ├── merger.go
│   ├── searcher.go
│   └── searcher_test.go
├── start.sh
└── util
    ├── collection.go
    ├── condition.go
    ├── filter.go
    ├── filter_test.go
    ├── net.go
    ├── tokenizer.go
    └── tokenizer_test.go
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | easysearch
2 | err.log
3 | cpu.pprof
4 | /.idea
5 | /data
--------------------------------------------------------------------------------
/EasySearch.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awesomefly/easysearch/6f23c6b3f8dc4ef071ff7c4e7b7ac2bc35363d0e/EasySearch.jpg
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution.
You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE.
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Easy Full-Text Search Engine
2 |
3 | # Overview
4 | EasySearch is a distributed full-text search engine. It supports both in-memory and on-disk retrieval, with targeted performance optimizations for each.
5 |
6 | ## Features
7 |
8 | 1. Builds an inverted index from Wikipedia documents
9 | 2. Supports both hash-table and B-tree index structures
10 | 3. Combines a full index with an incremental index: the incremental index is built in memory on a hash table, supports real-time updates, and is periodically merged into the full index; double-buffered updates improve query performance (see the sketch after this list)
11 | 4. The full index is split into three tiers: SmallSegment, MiddleSegment and BigSegment. Once several SmallSegments reach a size threshold they are merged into a MiddleSegment, and so on; splitting by size or time also lowers the cost of rebuilding the full index
12 | 5. Retrieval acceleration: supports approximate top-k retrieval; when posting lists are merged, each list can be pre-truncated to its top r entries by a static score such as term frequency (champion lists) to speed up the merge, and the merged result can be truncated again
13 | 6. Relevance scoring: BM25 ranking
14 | 7. Semantic rewriting of search queries
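
To illustrate the double-buffer idea in item 3, here is a minimal, hypothetical Go sketch (the `DoubleBuffer` and `Index` names are illustrative, not the engine's actual types): readers always query a complete snapshot while a rebuilt index is published atomically.

```
package main

import (
	"fmt"
	"sync/atomic"
)

// Index is a stand-in for an in-memory inverted index: term -> doc IDs.
type Index map[string][]int

// DoubleBuffer keeps the active index snapshot behind an atomic.Value,
// so queries never block while a freshly built index is swapped in.
type DoubleBuffer struct {
	active atomic.Value // always holds an Index
}

func NewDoubleBuffer() *DoubleBuffer {
	db := &DoubleBuffer{}
	db.active.Store(Index{})
	return db
}

// Search reads whichever snapshot is active at call time.
func (db *DoubleBuffer) Search(term string) []int {
	return db.active.Load().(Index)[term]
}

// Swap publishes a new snapshot; in-flight readers finish on the old one.
func (db *DoubleBuffer) Swap(next Index) {
	db.active.Store(next)
}

func main() {
	db := NewDoubleBuffer()
	db.Swap(Index{"jordan": {10, 605}})
	fmt.Println(db.Search("jordan")) // [10 605]
}
```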
15 |
16 | ## Requirement
17 | - go 1.16.5 or later
18 |
19 |
20 | ## Quick Start
21 | ### Download
22 |
23 | - Clone the project into your working directory:
24 |
25 | ```
26 | git clone https://github.com/awesomefly/easysearch.git
27 | ```
28 |
29 | - Update dependencies with go mod:
30 |
31 | ```
32 | cd $PROJECT_DIR
33 | go mod tidy
34 | ```
35 |
36 | - Build the project:
37 | ```
38 | go build
39 | ```
40 |
41 | ### Local index
42 | - Download a Wikipedia dump to a local path. Here we use the abstract dataset and build an inverted index over the abstracts. [Download link](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract1.xml.gz)
43 | ```
44 | cd $PROJECT_DIR/data
45 | wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract1.xml.gz
46 | ```
47 | - To build the index files, create a config.yml in the project root and add the indexing options
48 | ```
49 | cd $PROJECT_DIR
50 | vim config.yml
51 | ```
52 | - Example configuration:
53 | ```
54 | Storage:
55 |   IndexFile: ./data/wiki_index # where the index files are stored
56 |   DumpFile: ./data/enwiki-latest-abstract1.xml.gz # path of the document dump
57 | BM25:
58 |   K1: 2
59 |   B: 0.75
60 | ```
61 | - Build the index
62 | ```
63 | cd $PROJECT_DIR
64 | ./easysearch -m indexer
65 | ```
66 | If the build succeeds, three files appear under $PROJECT_DIR/data: wiki_index.idx, wiki_index.kv and wiki_index.sum
67 | - Local search: look up documents by keyword
68 | ```
69 | ./easysearch -m searcher -q "Album Jordan" --source=local
70 | ```
71 |
72 | ### Semantic rewriting [reference](https://github.com/dwt0317/QueryRewritingService/tree/master/embedding)
73 | - requirement
74 |   - python 3.8+
75 | - Download the training corpus [download link](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2)
76 | ```
77 | cd $PROJECT_DIR/data
78 | wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
79 | ```
80 | - Text preprocessing: parse the wiki dump and extract the token set into data/wiki_texts.txt
81 | ```
82 | /usr/bin/python3 paraphrase/train/wiki2txt.py --cmd=parse --file=$WIKI_FILE
83 | ```
84 | - Model training: train with Python's gensim word2vec and save the model together with the word-vector set
85 | ```
86 | /usr/bin/python3 paraphrase/train/word2vec.py --cmd=train --corpus_file=./data/wiki_texts.txt
87 | ```
88 | A successful run produces:
89 |
90 | the model file ./data/med200_less.model.bin
91 |
92 | the word-vector file ./data/word2vec.format.bin
93 | - Using the model
94 |   - In Go, the code.sajari.com/word2vec library can load the trained word vectors and look up synonyms of a query term through its API (see the sketch below)
95 |   - See the unit test paraphrase/serving/model_test.go
96 |   - For local search, add the --model_file option
97 | ```
98 | ./easysearch -m searcher -q "Album Jordan" --source=local --model_file=./data/word2vec.format.bin
99 | ```
100 |   - For cluster search, add this configuration entry
101 | ```
102 | Storage:
103 |   IndexFile: ./data/wiki_index # where the index files are stored
104 |   DumpFile: ./data/enwiki-latest-abstract1.xml.gz # path of the document dump
105 |   ModelFile: ./data/word2vec.format.bin
106 | ```
107 |
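A minimal sketch of loading the vectors with code.sajari.com/word2vec and querying the nearest neighbours of a term (illustrative only; it assumes the vector file produced by the training step above, and the engine's own wrapper lives in paraphrase/serving/model.go):

```
package main

import (
	"fmt"
	"log"
	"os"

	"code.sajari.com/word2vec"
)

func main() {
	// Load the word-vector set produced by word2vec.py above.
	f, err := os.Open("./data/word2vec.format.bin")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	model, err := word2vec.FromReader(f)
	if err != nil {
		log.Fatal(err)
	}

	// Find the 5 words most similar to "album" by cosine similarity.
	expr := word2vec.Expr{}
	expr.Add(1, "album")
	matches, err := model.CosN(expr, 5)
	if err != nil {
		log.Fatal(err)
	}
	for _, m := range matches {
		fmt.Printf("%s %.4f\n", m.Word, m.Score)
	}
}
```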
108 | ### Distributed
109 |
110 | #### Architecture
111 | ![](EasySearch.jpg)
112 | - ManagerServer: manages service membership and metadata
113 | - DataServer: stores the index data; each node hosts several index shards
114 | - SearchServer: handles query requests only
115 |
116 |
117 | #### Build the sharded index
118 | - Edit the configuration
119 | ```
120 | cd $PROJECT_DIR
121 | vim config.yml
122 | ```
123 | - Add the sharding options:
124 | ```
125 | Storage:
126 |   IndexFile: ./data/wiki_index # where the index files are stored
127 |   DumpFile: ./data/enwiki-latest-abstract1.xml.gz # path of the document dump
128 | BM25:
129 |   K1: 2
130 |   B: 0.75
131 | Cluster:
132 |   ShardingNum: 10
133 |
134 | ```
135 | - Build the index shards
136 | ```
137 | cd $PROJECT_DIR
138 | ./easysearch -m indexer --sharding=true
139 |
140 | ```
141 | #### Create a cluster
142 |
143 | ###### Create a single-machine standalone cluster
144 |
145 | - Create the cluster config file (cluster.yml is only used for creating standalone clusters)
146 | ```
147 | cd $PROJECT_DIR
148 | vim cluster.yml
149 | ```
150 | - Configuration (1 manager node, 10 data nodes, 2 search nodes)
151 | ```
152 | ManageServer:
153 |   Host: 127.0.0.1
154 |   Port: 1234
155 | SearchServer:
156 |   - Host: 127.0.0.1
157 |     Port: 1235
158 |   - Host: 127.0.0.1
159 |     Port: 1236
160 | DataServer:
161 |   - Host: 127.0.0.1
162 |     Port: 1240
163 |   - Host: 127.0.0.1
164 |     Port: 1241
165 |   - Host: 127.0.0.1
166 |     Port: 1242
167 |   - Host: 127.0.0.1
168 |     Port: 1243
169 |   - Host: 127.0.0.1
170 |     Port: 1244
171 |   - Host: 127.0.0.1
172 |     Port: 1245
173 |   - Host: 127.0.0.1
174 |     Port: 1246
175 |   - Host: 127.0.0.1
176 |     Port: 1247
177 |   - Host: 127.0.0.1
178 |     Port: 1248
179 |   - Host: 127.0.0.1
180 |     Port: 1249
181 | ```
182 | - Update config.yml as follows
183 | ```
184 | Storage:
185 |   IndexFile: ./data/wiki_index # where the index files are stored
186 |   DumpFile: ./data/enwiki-latest-abstract1.xml.gz # path of the document dump
187 | BM25:
188 |   K1: 2
189 |   B: 0.75
190 | Cluster:
191 |   ShardingNum: 10
192 |   ManageServer: # keep host/port consistent with the cluster config
193 |     Host: 127.0.0.1
194 |     Port: 1234
195 | ```
196 | - Start the cluster
197 | ```
198 | bash start.sh standalone
199 | ```
200 | ###### Create a distributed cluster
201 | - To build your own cluster, provision machine instances and start each service on a separate node
202 | - Start order: ManagerServer -> DataServer -> SearchServer
203 | - Start the ManagerServer
204 |   - Configuration
205 | ```
206 | Server: # ManagerServer host and port
207 |   Host: 127.0.0.1
208 |   Port: 1234
209 | Cluster:
210 |   ShardingNum: 10
211 |   ReplicateNum: 3
212 | ```
213 |   - Run
214 | ```
215 | ./easysearch -m cluster --servername=managerserver
216 | ```
217 |
218 | - Start the DataServer
219 |   - Configuration
220 | ```
221 | Storage:
222 |   IndexFile: ./data/wiki_index # where the index files are stored
223 |   DumpFile: ./data/enwiki-latest-abstract1.xml.gz # path of the document dump
224 | BM25:
225 |   K1: 2
226 |   B: 0.75
227 | Server: # DataServer host and port
228 |   Host: 127.0.0.1
229 |   Port: 1240
230 | Cluster:
231 |   ShardingNum: 10
232 |   ReplicateNum: 3
233 |   ManageServer: # keep the ManagerServer host/port consistent with the cluster config
234 |     Host: 127.0.0.1
235 |     Port: 1234
236 | ```
237 |   - Run
238 | ```
239 | ./easysearch -m cluster --servername=dataserver
240 | ```
241 | - Start the SearchServer
242 |   - Configuration
243 | ```
244 | Server: # SearchServer host and port
245 |   Host: 127.0.0.1
246 |   Port: 1235
247 | Cluster:
248 |   ShardingNum: 10
249 |   ReplicateNum: 3
250 |   ManageServer: # keep the ManagerServer host/port consistent with the cluster config
251 |     Host: 127.0.0.1
252 |     Port: 1234
253 | ```
254 |   - Run
255 | ```
256 | ./easysearch -m cluster --servername=searchserver
257 | ```
258 | #### Distributed search
259 |
260 | - Query
261 | ```
262 | ./easysearch -m searcher -q "Album Jordan" --source=remote
263 | ```
264 | - OUTPUT:
265 | ```
266 | 2021/12/20 19:45:03 Starting remote search..
267 | 2021/12/20 19:45:04 Search found 5 documents in 611.645503ms
268 | 2021/12/20 19:45:04 10 The Great Session is an album led by pianist Duke Jordan recorded in 1978 and released on the Danish SteepleChase label in 1981.Duke Jordan discography, accessed March 24, 2015SteepleChase Records discography, accessed March 24, 2015
269 | 2021/12/20 19:45:04 605 Thinking of You is an album led by pianist Duke Jordan recorded in 1979 in Denmark (with one track from 1978) and released on the Danish SteepleChase label in 1982.Duke Jordan discography, accessed March 24, 2015SteepleChase Records discography, accessed March 24, 2015
270 | 2021/12/20 19:45:04 613 Change a Pace is an album led by pianist Duke Jordan recorded in 1979 in Denmark and released on the Danish SteepleChase label in 1980.Duke Jordan discography, accessed March 24, 2015SteepleChase Records discography, accessed March 24, 2015
271 | 2021/12/20 19:45:04 597 Flight to Japan is an album led by the pianist Duke Jordan, recorded in 1976 in Tokyo and released on the Danish SteepleChase label in 1978.Duke Jordan discography, accessed March 24, 2015- SteepleChase Records discography, accessed March 24, 2015
272 | 2021/12/20 19:45:04 564 Suburbs is an album by the American New wave band The Suburbs, released in 1986. It was their first and only release on A&M Records.
273 |
274 | ```
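
Besides the CLI, the cluster can be queried programmatically through the SearchClient RPC wrapper in cluster/searchclient.go, pointed at the ManagerServer address from the config above (a minimal sketch, mirroring cluster/searchclient_test.go):

```
package main

import (
	"fmt"

	"github.com/awesomefly/easysearch/cluster"
	"github.com/awesomefly/easysearch/config"
)

func main() {
	// The client pulls the cluster topology from the ManagerServer,
	// then routes the query to a random SearchServer node.
	cli := cluster.NewSearchClient(&config.Server{Host: "127.0.0.1", Port: 1234})
	docs, err := cli.Search("Album Jordan")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", docs)
}
```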
275 |
276 |
277 | ## TODO
278 | - Posting-list compression and faster merging
279 | - Dictionary/index compression to reduce storage
280 | - LR and DNN models for fine-grained ranking
281 | - Vector retrieval as an additional recall channel
282 |
283 | ## References
284 |
285 | - [skip-list vs btree](https://stackoverflow.com/questions/256511/skip-list-vs-binary-search-tree/28270537#28270537)
286 | - [simple fts](https://artem.krylysov.com/blog/2020/07/28/lets-build-a-full-text-search-engine/)
287 |
288 |
289 |
--------------------------------------------------------------------------------
/cluster.yml:
--------------------------------------------------------------------------------
1 | ManageServer:
2 |   Host: 127.0.0.1
3 |   Port: 1234
4 | SearchServer:
5 |   - Host: 127.0.0.1
6 |     Port: 1235
7 |   - Host: 127.0.0.1
8 |     Port: 1236
9 | DataServer:
10 |   - Host: 127.0.0.1
11 |     Port: 1240
12 |   - Host: 127.0.0.1
13 |     Port: 1241
14 |   - Host: 127.0.0.1
15 |     Port: 1242
16 |   - Host: 127.0.0.1
17 |     Port: 1243
18 |   - Host: 127.0.0.1
19 |     Port: 1244
20 |   - Host: 127.0.0.1
21 |     Port: 1245
22 |   - Host: 127.0.0.1
23 |     Port: 1246
24 |   - Host: 127.0.0.1
25 |     Port: 1247
26 |   - Host: 127.0.0.1
27 |     Port: 1248
28 |   - Host: 127.0.0.1
29 |     Port: 1249
--------------------------------------------------------------------------------
/cluster/cluster.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"errors"
5 | 	"math/rand"
6 | )
7 |
8 | const (
9 | 	ManagerNode = 1
10 | 	DataNode    = 2
11 | 	SearchNode  = 3
12 | )
13 |
14 | type Node struct {
15 | 	ID   int
16 | 	Type int
17 | 	Host string // ip:port
18 |
19 | 	LeaderSharding   []int // shards this node leads (primary copies)
20 | 	FollowerSharding []int // shards this node replicates (follower copies)
21 | }
22 |
23 | type Cluster struct {
24 | 	ShardingNum  int // number of shards
25 | 	ReplicateNum int // number of copies kept of each shard's data
26 |
27 | 	SearchNodeCorpus []Node
28 | 	DataNodeCorpus   map[string]Node
29 | }
30 |
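// NewCluster creates an empty topology with `shard` shards and `replicate` copies of each
// shard's data. The ManagerServer later assigns every shard one leader and up to
// ReplicateNum-1 followers across the registered data nodes (see ManagerServer.ReBalance);
// SearchServer reads prefer follower shards and fall back to leaders (see SearchServer.SearchAll).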
31 | func NewCluster(shard, replicate int) *Cluster {
32 | 	return &Cluster{
33 | 		ShardingNum:      shard,
34 | 		ReplicateNum:     replicate,
35 | 		SearchNodeCorpus: make([]Node, 0),
36 | 		DataNodeCorpus:   make(map[string]Node, 0),
37 | 	}
38 | }
39 |
40 | func (c *Cluster) Add(node Node) error {
41 | 	switch node.Type {
42 | 	case DataNode:
43 | 		c.DataNodeCorpus[node.Host] = node
44 | 	case SearchNode:
45 | 		c.SearchNodeCorpus = append(c.SearchNodeCorpus, node)
46 | 	default:
47 | 		return errors.New("invalid node type")
48 | 	}
49 | 	return nil
50 | }
51 |
52 | const (
53 | 	LeaderSharding   = 1
54 | 	FollowerSharding = 2
55 | )
56 |
57 | type Sharding2Node map[int][]Node
58 |
59 | func (c *Cluster) RouteShardingNode(flag int) (Sharding2Node, error) {
60 | 	result := make(Sharding2Node, 0)
61 | 	for _, node := range c.DataNodeCorpus {
62 | 		switch flag {
63 | 		case LeaderSharding:
64 | 			for _, shard := range node.LeaderSharding {
65 | 				result[shard] = append(result[shard], node)
66 | 			}
67 | 		case FollowerSharding:
68 | 			for _, shard := range node.FollowerSharding {
69 | 				result[shard] = append(result[shard], node)
70 | 			}
71 | 		}
72 | 	}
73 | 	return result, nil
74 | }
75 |
76 | func (c *Cluster) RouteSearchNode() Node {
77 | 	n := rand.Intn(len(c.SearchNodeCorpus))
78 | 	return c.SearchNodeCorpus[n]
79 | }
--------------------------------------------------------------------------------
/cluster/dataserver.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"fmt"
5 | 	"math/rand"
6 |
7 | 	"github.com/awesomefly/easysearch/config"
8 |
9 | 	"github.com/awesomefly/easysearch/index"
10 | 	"github.com/awesomefly/easysearch/search"
11 | )
12 |
13 | type DataServer struct {
14 | 	self    Node
15 | 	cluster Cluster
16 |
17 | 	sharding map[int]*search.Searcher
18 | 	server   *Server
19 | }
20 |
21 | func NewDataServer(config *config.Config) *DataServer {
22 | 	ds := DataServer{
23 | 		self: Node{
24 | 			ID:   rand.Intn(10000), //todo: support uuid
25 | 			Type: DataNode,
26 | 			Host: config.Server.Address(),
27 | 		},
28 | 		server:   &Server{name: "Data", network: "tcp", address: config.Server.Address()},
29 | 		sharding: make(map[int]*search.Searcher, 0),
30 | 	}
31 |
32 | 	n := Node{}
33 | 	err := RpcCall(config.Cluster.ManageServer.Address(), "ManagerServer.AddServer", ds.self, &n)
34 | 	if err != nil {
35 | 		panic(err)
36 | 	}
37 | 	ds.self = n
38 |
39 | 	c := Cluster{}
40 | 	err = RpcCall(config.Cluster.ManageServer.Address(), "ManagerServer.GetCluster", ds.self.Host, &c)
41 | 	if err != nil {
42 | 		panic(err)
43 | 	}
44 | 	ds.cluster = c
45 | 	//fmt.Printf("DataServer:%+v\n", ds)
46 |
47 | 	if len(config.Store.IndexFile) == 0 {
48 | 		panic("index file is empty.")
49 | 	}
50 |
51 | 	for _, shard := range ds.self.LeaderSharding {
52 | 		searcher := search.NewSearcher(fmt.Sprintf("%s.%d", config.Store.IndexFile, shard))
53 | 		if config.Store.ModelFile != "" {
54 | 			searcher.InitParaphrase(config.Store.ModelFile)
55 | 		}
56 | 		ds.sharding[shard] = searcher
57 | 	}
58 |
59 | 	for _, shard := range ds.self.FollowerSharding {
60 | 		searcher := search.NewSearcher(fmt.Sprintf("%s.%d", config.Store.IndexFile, shard))
61 | 		if config.Store.ModelFile != "" {
62 | 			searcher.InitParaphrase(config.Store.ModelFile)
63 | 		}
64 | 		ds.sharding[shard] = searcher
65 | 	}
66 | 	return &ds
67 | }
68 |
69 | func (s *DataServer) Run() {
70 | 	if err := s.server.RegisterName("DataServer", s); err != nil {
71 | 		panic(err)
72 | 	}
73 | 	if err := s.server.Run(); err != nil {
74 | 		panic(err)
75 | 	}
76 | }
77 |
78 | type SearchRequest struct {
79 | 	Query    string
80 | 	Sharding []int
81 | }
82 |
83 | // Search runs the query against every requested shard held by this node and concatenates the hits.
84 | func (s *DataServer) Search(request SearchRequest, response *[]index.Doc) error {
85 | 	result := make([]index.Doc, 0)
86 | 	for _, shard := range request.Sharding {
87 | 		srh := s.sharding[shard]
88 | 		if srh == nil {
89 | 			continue
90 | 		}
91 | 		x := srh.Search(request.Query)
92 | 		result = append(result, x...)
93 | 	}
94 | 	*response = result
95 | 	return nil
96 | }
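// Note: the offline sharding indexer (cluster/shardingindexer.go) and the real-time
// Add/Del below all route a document with doc.ID % ShardingNum, so online writes land
// on the shard that already owns the document.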
97 |
98 | // Add applies a real-time document update to the shard that owns it.
99 | func (s *DataServer) Add(doc index.Document) {
100 | 	shard := doc.ID % s.cluster.ShardingNum
101 | 	srh := s.sharding[shard]
102 | 	srh.Add(doc)
103 | }
104 |
105 | // Del removes a document in real time.
106 | func (s *DataServer) Del(doc index.Document) {
107 | 	shard := doc.ID % s.cluster.ShardingNum
108 | 	srh := s.sharding[shard]
109 | 	srh.Del(doc)
110 | }
111 |
112 | // KeepAlive todo: followers should heartbeat their shard's leader; once the leader is found dead, start an election or ask the ManagerServer to reassign the leader.
113 | /*
114 | func (s *DataServer) KeepAlive() {
115 | 	for _, shardId := range s.self.FollowerSharding {
116 | 		key, ok := s.cluster.consistentHash.GetNode(fmt.Sprintf("%d", shardId))
117 | 		if !ok {
118 | 			panic("")
119 | 		}
120 | 		ip := s.cluster.DataNodeCorpus[key].IP
121 | 		port := s.cluster.DataNodeCorpus[key].Port
122 |
123 | 		ok := KeepAlive(ip, port)
124 | 		if !ok {
125 | 			keys, ok := s.cluster.consistentHash.GetNodes(fmt.Sprintf("%d", shardId), s.cluster.ReplicateNum+1)
126 | 			if !ok {
127 | 				panic("")
128 | 			}
129 | 			followers := GetNodes(keys)
130 | 			StartElection(followers)
131 | 		}
132 | 	}
133 | }
134 | */
135 |
--------------------------------------------------------------------------------
/cluster/dataserver_test.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"fmt"
5 | 	"testing"
6 | 	"time"
7 |
8 | 	"github.com/awesomefly/easysearch/config"
9 | 	"github.com/awesomefly/easysearch/index"
10 | 	"github.com/stretchr/testify/assert"
11 | )
12 |
13 | func TestDataServer(t *testing.T) {
14 | 	managerConfig := config.Config{
15 | 		Server: config.Server{
16 | 			Host: "127.0.0.1",
17 | 			Port: 1234,
18 | 		},
19 | 		Cluster: config.Cluster{
20 | 			ShardingNum:  10,
21 | 			ReplicateNum: 3,
22 | 		},
23 | 	}
24 | 	server := NewManagerServer(&managerConfig)
25 | 	assert.NotNil(t, server)
26 | 	go server.Run()
27 | 	time.Sleep(1 * time.Second)
28 |
29 | 	dataSvrConfig := config.Config{
30 | 		Store: config.Storage{
31 | 			IndexFile: "../data/wiki_index",
32 | 		},
33 | 		Server: config.Server{
34 | 			Host: "127.0.0.1",
35 | 			Port: 1240,
36 | 		},
37 | 		Cluster: config.Cluster{
38 | 			ShardingNum:  10,
39 | 			ReplicateNum: 3,
40 | 			ManageServer: config.Server{
41 | 				Host: "127.0.0.1",
42 | 				Port: 1234,
43 | 			},
44 | 		},
45 | 	}
46 | 	ds := NewDataServer(&dataSvrConfig)
47 | 	assert.NotNil(t, ds)
48 |
49 | 	var response []index.Doc
50 | 	err := ds.Search(SearchRequest{Query: "Jordan", Sharding: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}}, &response)
51 | 	assert.Nil(t, err)
52 |
53 | 	fmt.Printf("%+v\n", response)
54 | }
--------------------------------------------------------------------------------
/cluster/managerserver.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"errors"
5 | 	"fmt"
6 | 	"log"
7 |
8 | 	"github.com/awesomefly/easysearch/util"
9 |
10 | 	"github.com/serialx/hashring"
11 |
12 | 	"github.com/awesomefly/easysearch/config"
13 | )
14 |
15 | type ManagerServer struct {
16 | 	cluster *Cluster
17 | 	hash    *hashring.HashRing
18 |
19 | 	server *Server
20 | }
21 |
22 | func NewManagerServer(config *config.Config) *ManagerServer {
23 | 	srv := &ManagerServer{
24 | 		cluster: NewCluster(config.Cluster.ShardingNum, config.Cluster.ReplicateNum),
25 | 		hash:    hashring.New(make([]string, 0)),
26 | 		server:  &Server{name: "Manage", network: "tcp", address: config.Server.Address()},
27 | 	}
28 | 	return srv
29 | }
30 |
31 | func (m *ManagerServer) Run() {
32 | 	if err := m.server.RegisterName("ManagerServer", m); err != nil {
33 | 		panic(err)
34 | 	}
35 | 	if err := m.server.Run(); err != nil {
36 | 		panic(err)
37 | 	}
38 | }
39 |
40 | // AddServer registers a node with the cluster; called by DataServer and SearchServer on startup.
41 | func (m *ManagerServer) AddServer(request Node, response *Node) error {
42 | 	log.Print("AddServer from ", request.Host)
43 |
44 | 	m.cluster.Add(request)
45 | 	if request.Type == DataNode {
46 | 		m.hash = m.hash.AddNode(request.Host)
47 | 		if err := m.ReBalance(); err != nil {
48 | 			return err
49 | 		}
50 |
51 | 		go func() {
52 | 			// todo: notify nodes whose shard assignment changed via a channel
53 | 		}()
54 |
55 | 		*response = m.cluster.DataNodeCorpus[request.Host]
56 | 	}
57 | 	return nil
58 | }
59 |
60 | // GetCluster returns the current cluster topology; called by data nodes, search nodes and clients.
61 | func (m *ManagerServer) GetCluster(request string, response *Cluster) error {
62 | 	log.Print("GetCluster from ", request)
63 | 	*response = *m.cluster
64 | 	return nil
65 | }
66 |
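// ReBalance (below) recomputes shard placement with consistent hashing: each shard id is
// hashed onto the ring of data-node hosts; the first node found becomes the shard's leader
// and the next ReplicateNum-1 distinct nodes become its followers. Because placement is a
// function of the ring, adding a node only moves a small fraction of the shards.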
67 | func (m *ManagerServer) ReBalance() error {
68 | 	for k, node := range m.cluster.DataNodeCorpus {
69 | 		node.LeaderSharding = make([]int, 0)
70 | 		node.FollowerSharding = make([]int, 0)
71 | 		m.cluster.DataNodeCorpus[k] = node
72 | 	}
73 |
74 | 	size := util.IfElseInt(len(m.cluster.DataNodeCorpus) < m.cluster.ReplicateNum, len(m.cluster.DataNodeCorpus), m.cluster.ReplicateNum)
75 | 	for i := 0; i < m.cluster.ShardingNum; i++ {
76 | 		nodes, ok := m.hash.GetNodes(fmt.Sprint(i), size)
77 | 		if !ok {
78 | 			return errors.New("get nodes err: invalid replicate num")
79 | 		}
80 | 		if len(nodes) < size {
81 | 			return errors.New("unexpected nodes size err")
82 | 		}
83 |
84 | 		n := m.cluster.DataNodeCorpus[nodes[0]]
85 | 		n.LeaderSharding = append(n.LeaderSharding, i)
86 | 		m.cluster.DataNodeCorpus[n.Host] = n
87 | 		for _, k := range nodes[1:] {
88 | 			n = m.cluster.DataNodeCorpus[k]
89 | 			n.FollowerSharding = append(n.FollowerSharding, i)
90 | 			m.cluster.DataNodeCorpus[n.Host] = n
91 | 		}
92 | 	}
93 | 	return nil
94 | }
--------------------------------------------------------------------------------
/cluster/managerserver_test.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"fmt"
5 | 	"log"
6 | 	"net/rpc"
7 | 	"runtime"
8 | 	"testing"
9 | 	"time"
10 |
11 | 	"github.com/awesomefly/easysearch/config"
12 | 	"github.com/stretchr/testify/assert"
13 | )
14 |
15 | func addServer(host string) error {
16 | 	client, err := rpc.Dial("tcp", ":1234")
17 | 	if err != nil {
18 | 		log.Println("dialing:", err)
19 | 		return err
20 | 	}
21 | 	var request = Node{
22 | 		Host: host,
23 | 	}
24 | 	var response Node
25 | 	err = client.Call("ManagerServer.AddServer", request, &response)
26 | 	if err != nil {
27 | 		log.Println(err)
28 | 		return err
29 | 	}
30 |
31 | 	fmt.Printf("resp:%+v\n", response)
32 | 	client.Close()
33 | 	return nil
34 | }
35 |
36 | func getCluster() error {
37 | 	client, err := rpc.Dial("tcp", ":1234")
38 | 	if err != nil {
39 | 		log.Println("dialing:", err)
40 | 		return err
41 | 	}
42 |
43 | 	var response Cluster
44 | 	err = client.Call("ManagerServer.GetCluster", "local", &response)
45 | 	if err != nil {
46 | 		log.Println(err)
47 | 		return err
48 | 	}
49 |
50 | 	fmt.Printf("resp:%+v\n", response)
51 | 	client.Close()
52 | 	return nil
53 | }
54 |
55 | func TestManageServer(t *testing.T) {
56 | 	conf := config.Config{
57 | 		Server: config.Server{
58 | 			Host: "127.0.0.1",
59 | 			Port: 1234,
60 | 		},
61 |
62 | 		Cluster: config.Cluster{
63 | 			ShardingNum:  10,
64 | 			ReplicateNum: 3,
65 | 		},
66 | 	}
67 | 	server := NewManagerServer(&conf)
68 | 	go server.Run()
69 | 	runtime.Gosched()
70 |
71 | 	time.Sleep(1 * time.Second)
72 | 	assert.NotNil(t, server)
73 |
addServer("127.0.0.1:8801")) 75 | assert.Equal(t, nil, getCluster()) 76 | assert.Equal(t, nil, addServer("127.0.0.1:8802")) 77 | assert.Equal(t, nil, getCluster()) 78 | assert.Equal(t, nil, addServer("127.0.0.1:8803")) 79 | assert.Equal(t, nil, getCluster()) 80 | assert.Equal(t, nil, addServer("127.0.0.1:8804")) 81 | assert.Equal(t, nil, getCluster()) 82 | } 83 | -------------------------------------------------------------------------------- /cluster/searchclient.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "log" 5 | "net/rpc" 6 | 7 | "github.com/awesomefly/easysearch/util" 8 | 9 | "github.com/awesomefly/easysearch/config" 10 | "github.com/awesomefly/easysearch/index" 11 | ) 12 | 13 | //RpcCall RPC方法必须满足Go语言的RPC规则:方法只能有两个可序列化的参数,其中第二个参数是指针类型,并且返回一个error类型,同时必须是公开的方法 14 | func RpcCall(host string, method string, request interface{}, response interface{}) error { 15 | client, err := rpc.Dial("tcp", host) 16 | if err != nil { 17 | log.Fatal("dialing:", err) 18 | return err 19 | } 20 | 21 | switch v := response.(type) { 22 | case *Node: 23 | err = client.Call(method, request, v) 24 | case *Cluster: 25 | err = client.Call(method, request, v) 26 | case *[]index.Doc: 27 | err = client.Call(method, request, v) 28 | } 29 | if err != nil { 30 | log.Fatal(err) 31 | return err 32 | } 33 | client.Close() 34 | 35 | log.Printf("RPC Response:%+v", response) 36 | return nil 37 | } 38 | 39 | type SearchClient struct { 40 | ServerConfig *config.Server //manager server config 41 | cluster *Cluster //todo: cached and refresh cluster info 42 | } 43 | 44 | func NewSearchClient(config *config.Server) *SearchClient { 45 | client := SearchClient{ 46 | ServerConfig: config, 47 | cluster: &Cluster{}, 48 | } 49 | 50 | err := RpcCall(client.ServerConfig.Address(), "ManagerServer.GetCluster", util.GetLocalIP(), client.cluster) 51 | if err != nil { 52 | panic(err) 53 | } 54 | return &client 55 | } 56 | 57 | func (c *SearchClient) Search(query string) ([]index.Doc, error) { 58 | response := make([]index.Doc, 0) 59 | if err := RpcCall(c.cluster.RouteSearchNode().Host, "SearchServer.SearchAll", query, &response); err != nil { 60 | return response, err 61 | } 62 | return response, nil 63 | } 64 | -------------------------------------------------------------------------------- /cluster/searchclient_test.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/awesomefly/easysearch/config" 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func TestSearchClient(t *testing.T) { 12 | config := &config.Server{ 13 | Host: "127.0.0.1", 14 | Port: 1234, 15 | } 16 | cli := NewSearchClient(config) 17 | result, err := cli.Search("Album Jordan") 18 | assert.Nil(t, err) 19 | assert.NotNil(t, result) 20 | fmt.Printf("result:%+v\n", result) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /cluster/searchserver.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "math/rand" 5 | "sort" 6 | 7 | "github.com/awesomefly/easysearch/config" 8 | 9 | "github.com/awesomefly/easysearch/index" 10 | ) 11 | 12 | type SearchServer struct { 13 | cluster Cluster 14 | server *Server 15 | } 16 | 17 | func NewSearchServer(config *config.Config) *SearchServer { 18 | self := Node{ 19 | ID: rand.Intn(10000), //todo: support uuid 20 | Type: 
20 | 		Type: SearchNode,
21 | 		Host: config.Server.Address(),
22 | 	}
23 | 	err := RpcCall(config.Cluster.ManageServer.Address(), "ManagerServer.AddServer", self, &Node{})
24 | 	if err != nil {
25 | 		panic(err)
26 | 	}
27 |
28 | 	c := Cluster{}
29 | 	err = RpcCall(config.Cluster.ManageServer.Address(), "ManagerServer.GetCluster", "", &c)
30 | 	if err != nil {
31 | 		panic(err)
32 | 	}
33 |
34 | 	return &SearchServer{
35 | 		cluster: c,
36 | 		server:  &Server{name: "Search", network: "tcp", address: config.Server.Address()},
37 | 	}
38 | }
39 |
40 | func (s *SearchServer) Run() {
41 | 	if err := s.server.RegisterName("SearchServer", s); err != nil {
42 | 		panic(err)
43 | 	}
44 | 	if err := s.server.Run(); err != nil {
45 | 		panic(err)
46 | 	}
47 | }
48 |
49 | // SearchAll fans the query out to one replica of every shard and merges the results.
50 | // todo: implement real-time update & delete endpoints
51 | func (s *SearchServer) SearchAll(query string, response *[]index.Doc) error {
52 | 	r, err := s.cluster.RouteShardingNode(FollowerSharding) //todo: cache router info
53 | 	if err != nil {
54 | 		return err
55 | 	}
56 |
57 | 	if len(r) == 0 {
58 | 		if r, err = s.cluster.RouteShardingNode(LeaderSharding); err != nil {
59 | 			return err
60 | 		}
61 | 	}
62 |
63 | 	result := make([]index.Doc, 0)
64 | 	for sharding, nodes := range r {
65 | 		n := rand.Intn(len(nodes))
66 |
67 | 		request := SearchRequest{
68 | 			Query:    query,
69 | 			Sharding: []int{sharding},
70 | 		}
71 | 		var reply []index.Doc
72 | 		if err = RpcCall(nodes[n].Host, "DataServer.Search", request, &reply); err != nil {
73 | 			return err
74 | 		}
75 | 		result = append(result, reply...)
76 | 	}
77 |
78 | 	// sort results by score in descending order; de-duplication is still a todo
79 | 	sort.Slice(result, func(i, j int) bool {
80 | 		return result[i].Score > result[j].Score
81 | 	})
82 | 	*response = result
83 | 	return nil
84 | }
--------------------------------------------------------------------------------
/cluster/searchserver_test.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"fmt"
5 | 	"testing"
6 | 	"time"
7 |
8 | 	"github.com/awesomefly/easysearch/config"
9 | 	"github.com/awesomefly/easysearch/index"
10 | 	"github.com/stretchr/testify/assert"
11 | )
12 |
13 | func TestSearchServer(t *testing.T) {
14 |
15 | 	var managerConfig = config.Config{
16 | 		Server: config.Server{
17 | 			Host: "127.0.0.1",
18 | 			Port: 1234,
19 | 		},
20 | 		Cluster: config.Cluster{
21 | 			ShardingNum:  10,
22 | 			ReplicateNum: 3,
23 | 		},
24 | 	}
25 |
26 | 	var dataSvrConfig = config.Config{
27 | 		Store: config.Storage{
28 | 			IndexFile: "../data/wiki_index",
29 | 		},
30 | 		Server: config.Server{
31 | 			Host: "127.0.0.1",
32 | 			Port: 1240,
33 | 		},
34 | 		Cluster: config.Cluster{
35 | 			ShardingNum:  10,
36 | 			ReplicateNum: 3,
37 | 			ManageServer: config.Server{
38 | 				Host: "127.0.0.1",
39 | 				Port: 1234,
40 | 			},
41 | 		},
42 | 	}
43 |
44 | 	var srhSvrConfig = config.Config{
45 | 		Server: config.Server{
46 | 			Host: "127.0.0.1",
47 | 			Port: 1235,
48 | 		},
49 | 		Cluster: config.Cluster{
50 | 			ShardingNum:  10,
51 | 			ReplicateNum: 3,
52 | 			ManageServer: config.Server{
53 | 				Host: "127.0.0.1",
54 | 				Port: 1234,
55 | 			},
56 | 		},
57 | 	}
58 |
59 | 	//start ManagerServer
60 | 	ms := NewManagerServer(&managerConfig)
61 | 	assert.NotNil(t, ms)
62 | 	go ms.Run()
63 | 	time.Sleep(1 * time.Second)
64 |
65 | 	//start DataServer
66 |
67 | 	ds := NewDataServer(&dataSvrConfig)
68 | 	assert.NotNil(t, ds)
69 | 	go ds.Run()
70 | 	time.Sleep(1 * time.Second)
71 |
72 | 	srh := NewSearchServer(&srhSvrConfig)
73 | 	var response []index.Doc
74 | 	err := srh.SearchAll("Jordan", &response)
75 | 	assert.Nil(t, err)
76 |
77 | 	fmt.Printf("%+v\n", response)
78 | }
79 |
--------------------------------------------------------------------------------
/cluster/server.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"io"
5 | 	"log"
6 | 	"net"
7 | 	"net/rpc"
8 | 	"os"
9 | 	"os/signal"
10 | 	"syscall"
11 | )
12 |
13 | type Server struct {
14 | 	name    string
15 | 	network string
16 | 	address string
17 |
18 | 	listener net.Listener
19 | 	handler  func(conn io.ReadWriteCloser)
20 | }
21 |
22 | func (s *Server) RegisterName(name string, rcvr interface{}) error {
23 | 	if err := rpc.RegisterName(name, rcvr); err != nil {
24 | 		return err
25 | 	}
26 |
27 | 	s.handler = func(conn io.ReadWriteCloser) {
28 | 		rpc.ServeConn(conn)
29 | 	}
30 | 	return nil
31 | }
32 |
33 | func (s *Server) Run() error {
34 | 	errChan := make(chan error, 1)
35 | 	go func() { errChan <- s.Start() }()
36 |
37 | 	if err := s.waitSignal(errChan); err != nil {
38 | 		log.Println("received error and exit: ", err.Error())
39 | 		return err
40 | 	}
41 |
42 | 	// stop server after user hooks
43 | 	if err := s.Stop(); err != nil {
44 | 		log.Println("stop server error: ", err.Error())
45 | 		return err
46 | 	}
47 | 	return nil
48 | }
49 |
50 | func (s *Server) Start() error {
51 | 	var err error
52 | 	s.listener, err = net.Listen(s.network, s.address)
53 | 	if err != nil {
54 | 		log.Println("Server ListenTCP error:", err)
55 | 		return err
56 | 	}
57 | 	log.Printf("%s Server Started.", s.name)
58 |
59 | 	for {
60 | 		conn, err := s.listener.Accept()
61 | 		if err != nil {
62 | 			log.Println("Accept error:", err)
63 | 			return err
64 | 		}
65 | 		go s.handler(conn) // serve each connection concurrently so a slow client cannot block others
66 | 	}
67 | }
68 |
69 | func (s *Server) waitSignal(errCh chan error) error {
70 | 	signals := make(chan os.Signal, 1)
71 | 	signal.Notify(signals, syscall.SIGINT, syscall.SIGHUP, syscall.SIGTERM)
72 |
73 | 	for {
74 | 		select {
75 | 		case sig := <-signals:
76 | 			switch sig {
77 | 			case syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM:
78 | 				return nil
79 | 			}
80 | 		case err := <-errCh:
81 | 			return err
82 | 		}
83 | 	}
84 | }
85 |
86 | func (s *Server) Stop() error {
87 | 	if s.listener != nil {
88 | 		return s.listener.Close()
89 | 	}
90 | 	return nil
91 | }
--------------------------------------------------------------------------------
/cluster/shardingindexer.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | 	"fmt"
5 | 	"log"
6 | 	"os"
7 | 	"time"
8 |
9 | 	"github.com/awesomefly/easysearch/config"
10 |
11 | 	"github.com/awesomefly/easysearch/index"
12 | )
13 |
14 | func Index(conf *config.Config) {
15 | 	log.Println("Starting sharding index...")
16 |
17 | 	start := time.Now()
18 | 	docs, err := index.LoadDocuments(conf.Store.DumpFile)
19 | 	if err != nil {
20 | 		log.Fatal(err)
21 | 	}
22 | 	log.Printf("Loaded %d documents in %v", len(docs), time.Since(start))
23 |
24 | 	shards := conf.Cluster.ShardingNum
25 | 	idxes := make([]*index.BTreeIndex, 0, shards)
26 | 	for i := 0; i < shards; i++ {
27 | 		IndexFile := fmt.Sprintf("%s.%d", conf.Store.IndexFile, i)
28 | 		os.Remove(IndexFile + ".idx")
29 | 		os.Remove(IndexFile + ".kv")
30 | 		os.Remove(IndexFile + ".sum")
31 |
32 | 		idx := index.NewBTreeIndex(IndexFile)
33 | 		idxes = append(idxes, idx)
34 | 	}
35 |
36 | 	buf := make([][]index.Document, shards)
37 | 	for i := 0; i < len(buf); i++ {
38 | 		buf[i] = make([]index.Document, 0)
39 | 	}
40 |
41 | 	start = time.Now()
42 | 	for i := 0; i < len(docs); i++ {
43 | 		id := docs[i].ID % shards
44 | 		buf[id] = append(buf[id], docs[i])
45 | 		//log.Printf("keys:%s", docs[i].Text)
46 |
47 | 		if
len(buf[id]) > 20 { 48 | idxes[id].Add(buf[id]) 49 | buf[id] = make([]index.Document, 0) 50 | } 51 | } 52 | 53 | for i := 0; i < len(buf); i++ { 54 | if len(buf[i]) > 0 { 55 | idxes[i].Add(buf[i]) 56 | } 57 | } 58 | 59 | for i := 0; i < shards; i++ { 60 | idxes[i].BT.Drain() 61 | log.Printf("sharding index_%d has %d keys", i, idxes[i].BT.Count()) 62 | idxes[i].Close() 63 | } 64 | log.Printf("build index %d documents in %v", len(docs), time.Since(start)) 65 | } 66 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | Storage: 2 | IndexFile: ./data/wiki_index 3 | DumpFile: ./data/enwiki-latest-abstract18.xml.gz 4 | BM25: 5 | K1: 2 6 | B: 0.75 7 | Server: 8 | Host: 9 | Port: 10 | Cluster: 11 | ShardingNum: 10 12 | ReplicateNum: 3 13 | ManageServer: 14 | Host: 127.0.0.1 15 | Port: 1234 -------------------------------------------------------------------------------- /config/Config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "path/filepath" 7 | 8 | "gopkg.in/yaml.v2" 9 | ) 10 | 11 | type BM25Parameters struct { 12 | K1 float32 `yaml:"K1"` 13 | B float32 `yaml:"B"` 14 | } 15 | 16 | type Storage struct { 17 | DumpFile string `yaml:"DumpFile"` 18 | IndexFile string `yaml:"IndexFile"` 19 | ModelFile string `yaml:"ModelFile"` 20 | } 21 | 22 | type Cluster struct { 23 | ShardingNum int `yaml:"ShardingNum"` 24 | ReplicateNum int `yaml:"ReplicateNum"` 25 | ManageServer Server `yaml:"ManageServer"` 26 | SearchServer []Server `yaml:"SearchServer"` 27 | DataServer []Server `yaml:"DataServer"` 28 | } 29 | 30 | type Server struct { 31 | Host string `yaml:"Host"` 32 | Port int `yaml:"Port"` 33 | } 34 | 35 | func (s *Server) Address() string { 36 | return fmt.Sprint(s.Host, ":", s.Port) 37 | } 38 | 39 | type Config struct { 40 | Store Storage `yaml:"Storage"` 41 | BM25 BM25Parameters `yaml:"BM25"` 42 | Server Server `yaml:"Server"` 43 | Cluster Cluster `yaml:"Cluster"` 44 | } 45 | 46 | func InitClusterConfig(path string) *Cluster { 47 | file, _ := filepath.Abs(path) 48 | buffer, err := ioutil.ReadFile(file) 49 | if err != nil { 50 | panic(err.Error()) 51 | } 52 | 53 | cluster := Cluster{} 54 | if err = yaml.Unmarshal(buffer, &cluster); err != nil { 55 | panic(err.Error()) 56 | } 57 | fmt.Printf("cluster: %+v\n", cluster) 58 | return &cluster 59 | } 60 | 61 | func InitConfig(path string) *Config { 62 | file, _ := filepath.Abs(path) 63 | buffer, err := ioutil.ReadFile(file) 64 | if err != nil { 65 | panic(err.Error()) 66 | } 67 | 68 | config := Config{} 69 | if err = yaml.Unmarshal(buffer, &config); err != nil { 70 | panic(err.Error()) 71 | } 72 | fmt.Printf("config: %+v\n", config) 73 | return &config 74 | } 75 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/awesomefly/easysearch 2 | 3 | go 1.14 4 | 5 | require ( 6 | code.sajari.com/word2vec v1.0.0 7 | github.com/RoaringBitmap/roaring v1.2.1 8 | //github.com/awesomefly/gobtree v0.0.0-20211221104356-310dd71c2899 9 | github.com/awesomefly/gobtree v0.0.0 10 | github.com/davecgh/go-spew v1.1.1 // indirect 11 | github.com/go-nlp/bm25 v1.0.0 12 | github.com/go-nlp/tfidf v1.1.0 13 | github.com/gogo/protobuf v1.3.2 // indirect 14 | github.com/golang/protobuf v1.5.2 // indirect 15 | 
github.com/google/go-cmp v0.5.6 // indirect 16 | github.com/kljensen/snowball v0.6.0 17 | github.com/kr/pretty v0.2.0 // indirect 18 | github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b 19 | github.com/stretchr/testify v1.7.0 20 | github.com/xtgo/set v1.0.0 21 | github.com/ziutek/blas v0.0.0-20190227122918-da4ca23e90bb // indirect 22 | golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6 // indirect 23 | google.golang.org/protobuf v1.27.1 // indirect 24 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect 25 | gopkg.in/yaml.v2 v2.4.0 26 | gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect 27 | ) 28 | 29 | replace github.com/awesomefly/gobtree => ../gobtree 30 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | code.sajari.com/word2vec v1.0.0 h1:gg1Bk3ea3mGPZMS2/qh1iPJM5iotSSHyIxq4gUdlH+0= 2 | code.sajari.com/word2vec v1.0.0/go.mod h1:Ut8mx+2Q79Js3uGW1+HtbuuUIWoGMAMCOZK06xDly00= 3 | dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= 4 | github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= 5 | github.com/RoaringBitmap/roaring v1.2.1 h1:58/LJlg/81wfEHd5L9qsHduznOIhyv4qb1yWcSvVq9A= 6 | github.com/RoaringBitmap/roaring v1.2.1/go.mod h1:icnadbWcNyfEHlYdr+tDlOTih1Bf/h+rzPpv4sbomAA= 7 | github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= 8 | github.com/apache/arrow/go/arrow v0.0.0-20200909005831-30143fc493df h1:iXnL0pMIR/RDUWl0kCbc0CQ3UyehlyV+t/DYCLJTbFc= 9 | github.com/apache/arrow/go/arrow v0.0.0-20200909005831-30143fc493df/go.mod h1:QNYViu/X0HXDHw7m3KXzWSVXIbfUvJqBFe6Gj8/pYA0= 10 | github.com/bits-and-blooms/bitset v1.2.0 h1:Kn4yilvwNtMACtf1eYDlG8H77R07mZSPbMjLyS07ChA= 11 | github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= 12 | github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k= 13 | github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0= 14 | github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0= 15 | github.com/chewxy/math32 v1.0.6 h1:JWZYUNl2rtgVVui6z8JBsDgkOG2DYmfSODyo95yKfx4= 16 | github.com/chewxy/math32 v1.0.6/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= 17 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 18 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 19 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 20 | github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= 21 | github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= 22 | github.com/go-nlp/bm25 v1.0.0 h1:PqYFen7iJ9ELrwfX0bCDlHRFDI9Noh70L7Tiu4I5w0U= 23 | github.com/go-nlp/bm25 v1.0.0/go.mod h1:S6rXCa2AzMDkurLgYY1PDvjr0DhjBs2V6RwyI/jFDrc= 24 | github.com/go-nlp/tfidf v1.1.0 h1:o7WgFHu6ZhhB2t9r60ZOvJczRc9wIXcHpdihHPFzR4c= 25 | github.com/go-nlp/tfidf v1.1.0/go.mod h1:mp0+0R3dyswbtXFKrMO4cS134Qd5ZcTG8Z6QGFiSqBI= 26 | github.com/gogo/protobuf v1.3.0/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= 27 | github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= 28 | 
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= 29 | github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= 30 | github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 31 | github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= 32 | github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= 33 | github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= 34 | github.com/google/flatbuffers v1.11.0 h1:O7CEyB8Cb3/DmtxODGtLHcEvpr81Jm5qLg/hsHnxA2A= 35 | github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= 36 | github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 37 | github.com/google/go-cmp v0.5.6 h1:BKbKCqvP6I+rmFHt06ZmyQtvB8xAkWdhFyr0ZUNZcxQ= 38 | github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 39 | github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= 40 | github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= 41 | github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= 42 | github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= 43 | github.com/kljensen/snowball v0.6.0 h1:6DZLCcZeL0cLfodx+Md4/OLC6b/bfurWUOUGs1ydfOU= 44 | github.com/kljensen/snowball v0.6.0/go.mod h1:27N7E8fVU5H68RlUmnWwZCfxgt4POBJfENGMvNRhldw= 45 | github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs= 46 | github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= 47 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 48 | github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= 49 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 50 | github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= 51 | github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= 52 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 53 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 54 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 55 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 56 | github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b h1:h+3JX2VoWTFuyQEo87pStk/a99dzIO1mM9KxIyLPGTU= 57 | github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b/go.mod h1:/yeG0My1xr/u+HZrFQ1tOQQQQrOawfyMUH13ai5brBc= 58 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 59 | github.com/stretchr/testify v1.1.4/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 60 | github.com/stretchr/testify v1.2.0/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 61 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 62 | github.com/stretchr/testify v1.6.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 63 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= 64 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 65 | github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY= 
66 | github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8= 67 | github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 68 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 69 | github.com/ziutek/blas v0.0.0-20190227122918-da4ca23e90bb h1:uWiILQloLUVdtPYr1ZZo2zqtlpzo4G8vUpglo/Fs2H8= 70 | github.com/ziutek/blas v0.0.0-20190227122918-da4ca23e90bb/go.mod h1:J3xKssoVdrwZ2E29fIox/EKxOZWimS7AZ4fOTCFkOLo= 71 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 72 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 73 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 74 | golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 75 | golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 76 | golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 77 | golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 78 | golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6 h1:QE6XYQK6naiK1EPAe1g/ILLxN5RBoH5xkJk3CqlMI/Y= 79 | golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= 80 | golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= 81 | golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= 82 | golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= 83 | golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= 84 | golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= 85 | golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= 86 | golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 87 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 88 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 89 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 90 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 91 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 92 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 93 | golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 94 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 95 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 96 | golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 97 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 98 | golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 99 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 100 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 101 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 102 | golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 103 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 104 | golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 105 | golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 106 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 107 | golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= 108 | golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= 109 | golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= 110 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 111 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 112 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 113 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= 114 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 115 | gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= 116 | gonum.org/v1/gonum v0.7.0 h1:Hdks0L0hgznZLG9nzXb8vZ0rRvqNvAcgAp84y7Mwkgw= 117 | gonum.org/v1/gonum v0.7.0/go.mod h1:L02bwd0sqlsvRv41G7wGWFCsVNZFv/k1xzGIxeANHGM= 118 | gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc= 119 | gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= 120 | gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= 121 | google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= 122 | google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= 123 | google.golang.org/protobuf v1.27.1 h1:SnqbnDw1V7RiZcXPx5MEeqPv2s79L9i7BJUlG/+RurQ= 124 | google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= 125 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 126 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= 127 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 128 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 129 | gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= 130 | gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= 131 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 132 | gopkg.in/yaml.v3 
v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo= 133 | gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 134 | gorgonia.org/tensor v0.9.11 h1:L7C+syNtsIcZ/91tJFT0QnAzXJyFt6tWSW6+URIucDM= 135 | gorgonia.org/tensor v0.9.11/go.mod h1:fsbuoeL1vV3fe8N+HZxEXJ7WI4z1pPP3luMBCgn0HAA= 136 | gorgonia.org/vecf32 v0.9.0 h1:PClazic1r+JVJ1dEzRXgeiVl4g1/Hf/w+wUSqnco1Xg= 137 | gorgonia.org/vecf32 v0.9.0/go.mod h1:NCc+5D2oxddRL11hd+pCB1PEyXWOyiQxfZ/1wwhOXCA= 138 | gorgonia.org/vecf64 v0.9.0 h1:bgZDP5x0OzBF64PjMGC3EvTdOoMEcmfAh1VCUnZFm1A= 139 | gorgonia.org/vecf64 v0.9.0/go.mod h1:hp7IOWCnRiVQKON73kkC/AUMtEXyf9kGlVrtPQ9ccVA= 140 | rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= 141 | -------------------------------------------------------------------------------- /index/btreeindex.go: -------------------------------------------------------------------------------- 1 | // 2 | // btree inverted index's data structure 3 | // 4 | // |<-------------btree------------>| <--posting list-->| 5 | // |<-intermediate->|<--leaf node-->| 6 | // - --- --- --- --- --- --- 7 | // | | - | | | - | | | | | 8 | // - --- --- --- --- --- --- 9 | // / 10 | // - - --- --- --- --- --- --- 11 | // | | - | | - | | | - | | | | | 12 | // - - --- --- --- --- --- --- 13 | // / 14 | // - - - --- --- --- --- --- --- 15 | // | | - | | - | | - | | | - | | | | | 16 | // - - - --- --- --- --- --- --- 17 | // \ 18 | // - - --- --- --- --- --- --- 19 | // | | - | | - | | | - | | | | | 20 | // - - --- --- --- --- --- --- 21 | // |<--in memory--> | <-----------on disk-------------->| 22 | // 23 | // 24 | 25 | package index 26 | 27 | import ( 28 | "bytes" 29 | "encoding/binary" 30 | "io" 31 | "os" 32 | "sort" 33 | "unsafe" 34 | 35 | "github.com/awesomefly/easysearch/util" 36 | btree "github.com/awesomefly/gobtree" 37 | ) 38 | 39 | var DefaultConfig = btree.Config{ 40 | IndexConfig: btree.IndexConfig{ 41 | Sectorsize: 512, 42 | Flistsize: 1000 * btree.OFFSET_SIZE, 43 | Blocksize: 512, 44 | }, 45 | Maxlevel: 4, 46 | RebalanceThrs: 30, 47 | AppendRatio: 0.7, 48 | DrainRate: 100, 49 | MaxLeafCache: 0, // intermediate node in memory and leaf node in disk 50 | Sync: false, 51 | Nocache: false, 52 | } 53 | 54 | type BTreeIndex struct { 55 | //skip-list vs btree: 56 | //https://stackoverflow.com/questions/256511/skip-list-vs-binary-search-tree/28270537#28270537 57 | BT *btree.BTree 58 | IndexFile string 59 | 60 | property Property 61 | } 62 | 63 | func NewBTreeIndex(file string) *BTreeIndex { 64 | conf := DefaultConfig 65 | conf.Idxfile, conf.Kvfile = file+".idx", file+".kv" 66 | bt := BTreeIndex{ 67 | IndexFile: file, 68 | BT: btree.NewBTree(btree.NewStore(conf)), // todo: 索引文件太大,索引压缩、posting list压缩 69 | property: Property{ 70 | docNum: 0, 71 | tokenCount: 0, 72 | dataRange: DataRange{Start: 0, End: 0}, 73 | }, 74 | } 75 | 76 | bt.Load() 77 | return &bt 78 | } 79 | 80 | func (bt *BTreeIndex) Save() { 81 | file := bt.IndexFile + ".sum" 82 | os.Create(file) 83 | 84 | // Index store 85 | fd, err := os.OpenFile(file, os.O_RDWR|os.O_CREATE, 0660) 86 | if err != nil { 87 | panic(err.Error()) 88 | } 89 | 90 | buffer := bytes.NewBuffer([]byte{}) 91 | if err := binary.Write(buffer, binary.LittleEndian, int32(bt.property.docNum)); err != nil { 92 | panic(err) 93 | } 94 | 95 | if err := binary.Write(buffer, binary.LittleEndian, int32(bt.property.tokenCount)); err != nil { 96 | panic(err) 97 | } 98 | 99 | if err := binary.Write(buffer, 
binary.LittleEndian, int32(bt.property.dataRange.Start)); err != nil { 100 | panic(err) 101 | } 102 | if err := binary.Write(buffer, binary.LittleEndian, int32(bt.property.dataRange.End)); err != nil { 103 | panic(err) 104 | } 105 | 106 | if _, err := fd.Write(buffer.Bytes()); err != nil { 107 | panic(err) 108 | } 109 | fd.Close() 110 | } 111 | 112 | func (bt *BTreeIndex) Load() { 113 | // Index store 114 | file := bt.IndexFile + ".sum" 115 | fd, err := os.OpenFile(file, os.O_RDONLY|os.O_CREATE, 0660) 116 | if err != nil { 117 | panic(err.Error()) 118 | } 119 | 120 | data := make([]byte, unsafe.Sizeof(bt.property.docNum)+unsafe.Sizeof(bt.property.tokenCount)) 121 | if n, err := fd.Read(data); err != nil { 122 | if n == 0 && err == io.EOF { 123 | return 124 | } 125 | panic(err.Error()) 126 | } 127 | 128 | buffer := bytes.NewBuffer(data) 129 | if err := binary.Read(buffer, binary.LittleEndian, (*int32)(unsafe.Pointer(&bt.property.docNum))); err != nil { 130 | panic(err.Error()) 131 | } 132 | 133 | if err := binary.Read(buffer, binary.LittleEndian, (*int32)(unsafe.Pointer(&bt.property.tokenCount))); err != nil { 134 | panic(err.Error()) 135 | } 136 | if err := binary.Read(buffer, binary.LittleEndian, (*int32)(unsafe.Pointer(&bt.property.dataRange.Start))); err != nil { 137 | panic(err.Error()) 138 | } 139 | if err := binary.Read(buffer, binary.LittleEndian, (*int32)(unsafe.Pointer(&bt.property.dataRange.End))); err != nil { 140 | panic(err.Error()) 141 | } 142 | 143 | fd.Close() 144 | } 145 | 146 | func (bt *BTreeIndex) Close() { 147 | bt.BT.Drain() 148 | bt.BT.Close() 149 | bt.Save() 150 | } 151 | 152 | func (bt *BTreeIndex) Clear() { 153 | bt.Close() 154 | 155 | // delete deprecated index 156 | os.Remove(bt.IndexFile + ".sum") 157 | os.Remove(bt.IndexFile + ".idx") 158 | os.Remove(bt.IndexFile + ".kv") 159 | } 160 | 161 | func (bt *BTreeIndex) Keys() []string { 162 | keys := make(sort.StringSlice, bt.Property().tokenCount) 163 | 164 | ch := bt.BT.KeySet() 165 | for { 166 | key := <-ch 167 | if key == nil { 168 | break 169 | } 170 | keys = append(keys, string(key)) 171 | } 172 | return keys 173 | } 174 | 175 | func (bt *BTreeIndex) Lookup(token string, dirty bool) PostingList { 176 | key := &btree.TestKey{K: token} 177 | 178 | var ch chan []byte 179 | if dirty { 180 | ch = bt.BT.LookupDirty(key) 181 | } else { 182 | ch = bt.BT.Lookup(key) 183 | } 184 | values := make([][]byte, 0) 185 | for { 186 | x := <-ch 187 | if x == nil { 188 | break 189 | } 190 | values = append(values, x) 191 | } 192 | 193 | if len(values) == 0 { 194 | return nil 195 | } 196 | 197 | var p PostingList 198 | p.FromBytes(values[0]) 199 | return p 200 | } 201 | 202 | // Add 该方法比较低效,批量插入文档会在posting list后不段追加新文档,但postinglist并未预留空间, 203 | // 因此需要移动到新的空间,导致文件数据拷贝 204 | func (bt *BTreeIndex) Add(docs []Document) { 205 | for _, doc := range docs { 206 | tokens := util.Analyze(doc.Text) 207 | for _, token := range tokens { 208 | //log.Printf("token:%s", token) 209 | key := &btree.TestKey{K: token} 210 | postingList := bt.Lookup(token, true) 211 | if postingList != nil { 212 | if last := postingList.Find(doc.ID); last != nil { 213 | // Don't add same ID twice. 
But should update frequency 214 | last.TF++ 215 | last.QualityScore = CalDocScore(last.TF, 0) 216 | bt.BT.Insert(key, postingList) 217 | continue 218 | } 219 | } 220 | item := Doc{ 221 | ID: int32(doc.ID), 222 | DocLen: int32(len(tokens)), 223 | TF: 1, 224 | QualityScore: CalDocScore(1, 0), 225 | } 226 | //add to posting list & sort by score 227 | postingList = append(postingList, item) 228 | sort.Slice(postingList, func(i, j int) bool { 229 | return postingList[i].QualityScore > postingList[j].QualityScore 230 | }) 231 | bt.BT.Insert(key, postingList) 232 | } 233 | bt.property.docNum++ 234 | bt.property.tokenCount += len(tokens) 235 | } 236 | bt.BT.Drain() 237 | } 238 | 239 | func (bt *BTreeIndex) Insert(key string, pl PostingList) { 240 | bt.BT.Insert(&btree.TestKey{K: key}, pl) 241 | bt.property.docNum += pl.Len() 242 | bt.property.tokenCount++ 243 | } 244 | 245 | func (bt *BTreeIndex) Get(term string) []Doc { 246 | if postingList := bt.Lookup(term, false); postingList != nil { 247 | return postingList 248 | } 249 | return nil 250 | } 251 | 252 | func (bt *BTreeIndex) Property() *Property { 253 | return &bt.property 254 | } 255 | 256 | func (bt *BTreeIndex) SetProperty(p Property) { 257 | bt.property = p 258 | } 259 | 260 | func (bt *BTreeIndex) Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc { 261 | return DoRetrieval(bt, must, should, not, k, r, m) 262 | } -------------------------------------------------------------------------------- /index/btreeindex_test.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "fmt" 5 | "github.com/awesomefly/easysearch/util" 6 | "github.com/stretchr/testify/assert" 7 | "os" 8 | "testing" 9 | ) 10 | 11 | func GetIDs(docs []Doc) []int { 12 | var ids []int 13 | for _, doc := range docs { 14 | ids = append(ids, int(doc.ID)) 15 | } 16 | return ids 17 | } 18 | 19 | func TestBTreeIndex(t *testing.T) { 20 | os.Remove("../data/btree_idx_test.idx") 21 | os.Remove("../data/btree_idx_test.kv") 22 | os.Remove("../data/btree_idx_test.sum") 23 | 24 | idx := NewBTreeIndex("../data/btree_idx_test") 25 | idx.Add([]Document{{ID: 1, Text: "A donut on a glass plate. 
Only the."}}) 26 | idx.Add([]Document{{ID: 2, Text: "donut is a donut"}}) 27 | fmt.Printf("Count:%d\n", idx.BT.Count()) 28 | 29 | ch := idx.BT.FullSet() 30 | for { 31 | k := <-ch 32 | d := <-ch 33 | v := <-ch 34 | if k == nil || d == nil || v == nil { 35 | break 36 | } 37 | //id, err := strconv.ParseInt(string(d), 10, 64) // key's id 38 | //if err != nil { 39 | // panic(err) 40 | //} 41 | //fmt.Printf("id:%d\n", id) 42 | 43 | var nv PostingList 44 | nv.FromBytes(v) 45 | fmt.Printf("key:%s, val:%+v\n", k, nv) 46 | } 47 | 48 | 49 | fmt.Printf("Lookup: %+v\n", idx.Lookup("donut", false)) 50 | fmt.Printf("Retrieval: %+v\n", idx.Retrieval([]string{"glass"}, []string{"donut"}, nil, 100, 10, Boolean)) 51 | 52 | assert.Nil(t, idx.Retrieval([]string{"a"}, nil, nil, 100, 10, Boolean)) 53 | 54 | ids := GetIDs(idx.Retrieval([]string{"donut"}, nil, nil, 100, 10, Boolean)) 55 | assert.Equal(t, []int{2, 1}, ids) 56 | assert.Equal(t, []int{2, 1}, GetIDs(idx.Retrieval(util.Analyze("DoNuts"), nil, nil, 100, 10, Boolean))) 57 | assert.Equal(t, []int{1}, GetIDs(idx.Retrieval([]string{"glass"}, nil, nil, 100, 10, Boolean))) 58 | 59 | assert.Nil(t, GetIDs(idx.Retrieval([]string{"a"}, nil, nil, 100, 10, Boolean))) 60 | assert.Equal(t, []int{2, 1}, GetIDs(idx.Retrieval([]string{"donut"}, nil, nil, 100, 10, Boolean))) 61 | assert.Equal(t, []int{2, 1}, GetIDs(idx.Retrieval(util.Analyze("DoNuts"), nil, nil, 100, 10, Boolean))) 62 | assert.Equal(t, []int{1}, GetIDs(idx.Retrieval([]string{"glass"}, nil, nil, 100, 10, Boolean))) 63 | 64 | idx.Close() 65 | //time.Sleep(5*time.Second) 66 | } 67 | -------------------------------------------------------------------------------- /index/document.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "compress/gzip" 5 | "encoding/xml" 6 | "fmt" 7 | "io" 8 | "log" 9 | "os" 10 | "path/filepath" 11 | ) 12 | 13 | // Document represents a Wikipedia abstract dump document. 14 | type Document struct { 15 | Title string `xml:"title"` 16 | URL string `xml:"url"` 17 | Text string `xml:"abstract"` 18 | Timestamp int 19 | ID int 20 | } 21 | 22 | // LoadDocuments loads a Wikipedia abstract dump and returns a slice of documents. 
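// Note: LoadDocuments decodes the whole dump with a single xml.Decoder.Decode
// call, so every document is held in memory at once; for large dumps the
// streaming LoadDocumentStream below is preferable. A minimal usage sketch
// (the dump path is illustrative):
//
//	docs, err := LoadDocuments("../data/enwiki-latest-abstract1.xml.gz")
//	if err != nil {
//		log.Fatal(err)
//	}
//	log.Printf("loaded %d docs", len(docs))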
23 | // Dump example from https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract1.xml.gz 24 | func LoadDocuments(path string) ([]Document, error) { 25 | abspath, err := filepath.Abs(path) 26 | if err != nil { 27 | return nil, err 28 | } 29 | f, err := os.Open(abspath) 30 | if err != nil { 31 | return nil, err 32 | } 33 | defer f.Close() 34 | 35 | gz, err := gzip.NewReader(f) 36 | if err != nil { 37 | return nil, err 38 | } 39 | defer gz.Close() 40 | 41 | 42 | dump := struct { 43 | Documents []Document `xml:"doc"` 44 | }{} 45 | dec := xml.NewDecoder(gz) 46 | dec.Token() 47 | if err := dec.Decode(&dump); err != nil { 48 | return nil, err 49 | } 50 | docs := dump.Documents 51 | for i := range docs { 52 | docs[i].ID = i 53 | } 54 | return docs, nil 55 | } 56 | 57 | func LoadDocumentStream(path string) (chan *Document, error) { 58 | abspath, err := filepath.Abs(path) 59 | if err != nil { 60 | return nil, err 61 | } 62 | f, err := os.Open(abspath) 63 | if err != nil { 64 | return nil, err 65 | } 66 | 67 | gz, err := gzip.NewReader(f) 68 | if err != nil { 69 | return nil, err 70 | } 71 | 72 | ch := make(chan *Document, 10) 73 | 74 | dec := xml.NewDecoder(gz) 75 | go func() { 76 | defer f.Close() 77 | defer gz.Close() 78 | id := 0 79 | for { 80 | tok, err := dec.Token() 81 | if tok == nil && err == io.EOF { 82 | ch <- nil 83 | // EOF means we're done. 84 | log.Println("EOF means we're done.") 85 | break 86 | } else if err != nil { 87 | //log.Fatalf("Error decoding token: %s", err.Error()) 88 | panic(err) 89 | } 90 | 91 | switch ty := tok.(type) { 92 | case xml.StartElement: 93 | if ty.Name.Local == "doc" { 94 | // If this is a start element named "location", parse this element 95 | // fully. 96 | doc := Document{} 97 | if err = dec.DecodeElement(&doc, &ty); err != nil { 98 | //log.Fatalf("Error decoding item: %s", err.Error()) 99 | panic(err) 100 | } 101 | id++ 102 | doc.ID = id 103 | ch <- &doc 104 | if id % 5000 == 0 { 105 | fmt.Printf("load %d docs\n", id) 106 | } 107 | } 108 | default: 109 | } 110 | } 111 | }() 112 | return ch, nil 113 | } -------------------------------------------------------------------------------- /index/document_test.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | func TestLoadDocumentStream(t *testing.T) { 11 | ch, err := LoadDocumentStream("../data/tem.xml") 12 | if err != nil { 13 | log.Fatal(err) 14 | return 15 | } 16 | 17 | for { 18 | //timeout := time.NewTimer(1 * time.Second) 19 | select { 20 | case doc := <-ch: 21 | if doc == nil { 22 | fmt.Println("doc is nil") 23 | return 24 | } 25 | fmt.Println(doc) 26 | //fmt.Printf("recv doc: %v", *doc) 27 | 28 | continue 29 | //case <-timeout.C: 30 | // log.Printf("Read timeout. 
err: %s", err.Error()) 31 | // break 32 | } 33 | break 34 | } 35 | time.Sleep(5*time.Second) 36 | } 37 | -------------------------------------------------------------------------------- /index/hashmapindex.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "github.com/awesomefly/easysearch/util" 5 | "sort" 6 | ) 7 | 8 | func IfElseInt(condition bool, o1 int, o2 int) int { 9 | if condition { 10 | return o1 11 | } 12 | return o2 13 | } 14 | 15 | // CalDocScore 16 | // todo: calculate doc static score by PageRank + frequency 17 | func CalDocScore(frequency int32, pagerank int) float64 { 18 | return float64(frequency * 1.0) 19 | } 20 | 21 | // HashMapIndex is an inverted index. It maps tokens to document IDs. 22 | type HashMapIndex struct { 23 | tbl map[string]PostingList 24 | 25 | property Property 26 | } 27 | 28 | func NewHashMapIndex() *HashMapIndex { 29 | return &HashMapIndex{ 30 | tbl: make(map[string]PostingList), 31 | property: Property{ 32 | docNum: 0, 33 | tokenCount: 0, 34 | dataRange: DataRange{Start: 0, End: 0}, 35 | }, 36 | } 37 | } 38 | 39 | func (idx *HashMapIndex) Property() *Property { 40 | return &idx.property 41 | } 42 | 43 | func (idx *HashMapIndex) Map() map[string]PostingList { 44 | return idx.tbl 45 | } 46 | 47 | func (idx *HashMapIndex) Keys() []string { 48 | //keys := make(sort.StringSlice, idx.Property().tokenCount) 49 | var keys sort.StringSlice 50 | for k := range idx.tbl { //map 遍历访问是无序的 51 | keys = append(keys, k) 52 | } 53 | return keys 54 | } 55 | // Add adds documents to the index. 56 | // todo: Support indexing multiple document fields. 57 | func (idx *HashMapIndex) Add(docs []Document) { 58 | for _, doc := range docs { 59 | tokens := util.Analyze(doc.Text) 60 | for _, token := range tokens { 61 | postingList := idx.tbl[token] 62 | if postingList != nil { 63 | if last := postingList.Find(doc.ID); last != nil { 64 | // Don't add same ID twice. 
But should update frequency 65 | //last := &postingList[tokenCount(postingList)-1] 66 | last.TF++ 67 | last.QualityScore = CalDocScore(last.TF, 0) 68 | //idx.tbl[token] = postingList 69 | continue 70 | } 71 | } 72 | item := Doc{ 73 | ID: int32(doc.ID), 74 | DocLen: int32(len(tokens)), 75 | TF: 1, 76 | QualityScore: CalDocScore(1, 0), 77 | } 78 | //add to posting list 79 | idx.tbl[token] = append(postingList, item) 80 | } 81 | 82 | idx.property.docNum++ 83 | idx.property.tokenCount += len(tokens) 84 | } 85 | 86 | //sort by score 87 | for k, v := range idx.tbl { 88 | sort.Slice(v, func(i, j int) bool { 89 | return v[i].QualityScore > v[j].QualityScore 90 | }) 91 | idx.tbl[k] = v 92 | } 93 | } 94 | 95 | // Clear unsafe function 96 | func (idx *HashMapIndex) Clear() { 97 | idx.property.docNum = 0 98 | idx.property.tokenCount = 0 99 | idx.property.dataRange = DataRange{Start: 0, End: 0} 100 | idx.tbl = make(map[string]PostingList) 101 | } 102 | 103 | func (idx *HashMapIndex) Get(term string) []Doc { 104 | if postingList, ok := idx.tbl[term]; ok { 105 | return postingList 106 | } 107 | return nil 108 | } 109 | 110 | func (idx *HashMapIndex) Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc { 111 | return DoRetrieval(idx, must, should, not, k, r, m) 112 | } -------------------------------------------------------------------------------- /index/hashmapindex_test.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/awesomefly/easysearch/util" 8 | 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func TestIndex(t *testing.T) { 13 | idx := NewHashMapIndex() 14 | 15 | idx.Add([]Document{{ID: 1, Text: "A donut on a glass plate. 
Only the donut"}}) 16 | assert.Nil(t, idx.Retrieval([]string{"a"}, nil, nil, 100, 10, Boolean)) 17 | 18 | result := idx.Retrieval([]string{"donut"}, nil, nil, 100, 10, Boolean) 19 | assert.Equal(t, []int{1}, (PostingList)(result).IDs()) 20 | 21 | result = idx.Retrieval(util.Analyze("DoNuts"), nil, nil, 100, 10, Boolean) 22 | assert.Equal(t, []int{1}, (PostingList)(result).IDs()) 23 | 24 | result = idx.Retrieval([]string{"glass"}, nil, nil, 100, 10, Boolean) 25 | assert.Equal(t, []int{1}, (PostingList)(result).IDs()) 26 | 27 | for s, list := range idx.tbl { 28 | fmt.Printf("%s:%v\n", s, list) 29 | } 30 | 31 | //===================================================== 32 | idx.Add([]Document{{ID: 2, Text: "donut is a donut"}}) 33 | assert.Nil(t, idx.Retrieval([]string{"a"}, nil, nil, 100, 10, Boolean)) 34 | 35 | result = idx.Retrieval([]string{"donut"}, nil, nil, 100, 10, Boolean) 36 | assert.Equal(t, []int{1, 2}, (PostingList)(result).IDs()) 37 | 38 | result = idx.Retrieval(util.Analyze("DoNuts"), nil, nil, 100, 10, Boolean) 39 | assert.Equal(t, []int{1, 2}, (PostingList)(result).IDs()) 40 | 41 | result = idx.Retrieval([]string{"glass"}, nil, nil, 100, 10, Boolean) 42 | assert.Equal(t, []int{1}, (PostingList)(result).IDs()) 43 | 44 | for s, list := range idx.tbl { 45 | fmt.Printf("%s:%v\n", s, list) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /index/index.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "encoding/binary" 7 | "log" 8 | "os" 9 | "sort" 10 | ) 11 | 12 | type SearchModel int 13 | 14 | const ( 15 | Boolean SearchModel = iota 16 | VectorSpace 17 | BM25 18 | ) 19 | 20 | type KVPair struct { 21 | Key string 22 | Value PostingList 23 | } 24 | 25 | type Index interface { 26 | Property() *Property 27 | Keys() []string 28 | Clear() 29 | 30 | Add(docs []Document) 31 | Get(term string) []Doc 32 | 33 | Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc 34 | } 35 | 36 | // DoRetrieval returns top k docs sorted by boolean model 37 | // todo: compress posting list and opt intersection/union rt 38 | // https://blog.csdn.net/weixin_39890629/article/details/111268898 39 | func DoRetrieval(idx Index, must []string, should []string, not []string, k int, r int, model SearchModel) []Doc { 40 | tfidf := NewTFIDF() 41 | 42 | //query's term frequency 43 | tfidf.DOC2TF[VirtualQueryDocId] = make(TF, 0) 44 | 45 | calTFIDF := func(term string, dn, df int, plr PostingList) { 46 | tfidf.IDF[term] = CalIDF(dn, df) 47 | for _, doc := range plr { 48 | var tf TF 49 | if tf = tfidf.DOC2TF[doc.ID]; tf == nil { 50 | tf = make(TF, 0) 51 | } 52 | tf[term] = doc.TF 53 | tfidf.DOC2TF[doc.ID] = tf 54 | } 55 | } 56 | properties := idx.Property() 57 | 58 | var result PostingList 59 | for _, term := range must { 60 | tfidf.DOC2TF[VirtualQueryDocId][term]++ 61 | if pl := (PostingList)(idx.Get(term)); pl != nil { 62 | plr := pl[:IfElseInt(len(pl) > r, r, len(pl))] //胜者表按TF排序,截断前r个,加速归并 63 | sort.Sort(plr) //按docID排序 64 | if result == nil { 65 | result = plr 66 | } else { 67 | result.Inter(plr) 68 | } 69 | calTFIDF(term, properties.DocNum(), len(pl), plr) 70 | } else { 71 | // Token doesn't exist. 
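// Note: when a must-term has no posting list the loop just skips it, so an
// unknown must-term does not empty the result set (strict boolean AND
// semantics would).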
72 | continue
73 | }
74 | }
75 |
76 | for _, term := range should {
77 | tfidf.DOC2TF[VirtualQueryDocId][term]++
78 | if pl := (PostingList)(idx.Get(term)); pl != nil {
79 | plr := pl[:IfElseInt(len(pl) > r, r, len(pl))]
80 | sort.Sort(plr)
81 | if result == nil {
82 | result = plr //winner list, truncated to the top r entries
83 | } else {
84 | result.Union(plr)
85 | }
86 | calTFIDF(term, properties.DocNum(), len(pl), plr)
87 | } else {
88 | // Token doesn't exist.
89 | continue
90 | }
91 | }
92 |
93 | for _, term := range not {
94 | if pl := (PostingList)(idx.Get(term)); pl != nil {
95 | sort.Sort(pl)
96 | result.Filter(pl)
97 | } else {
98 | // Token doesn't exist.
99 | continue
100 | }
101 | }
102 |
103 | if model == BM25 {
104 | result = CalBM25(result, tfidf, properties.TokenCount(), properties.DocNum())
105 | } else if model == VectorSpace {
106 | result = CalCosine(result, tfidf)
107 | }
108 |
109 | //sort the result
110 | sort.Slice(result, func(i, j int) bool {
111 | return result[i].Score > result[j].Score //descending by score
112 | })
113 | log.Printf("result sorted:%+v", result)
114 |
115 | if len(result) > k {
116 | return result[:k]
117 | }
118 | return result
119 | }
120 |
121 | // Drain writes the index data to file, sorted by key.
122 | func Drain(idx Index, file string) {
123 | if idx.Property().docNum == 0 {
124 | return
125 | }
126 |
127 | fd, err := os.OpenFile(file, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0660)
128 | if err != nil {
129 | panic(err.Error())
130 | }
131 |
132 | writer := bufio.NewWriter(fd)
133 | defer func() {
134 | writer.Flush()
135 | fd.Close()
136 | }()
137 |
138 | keys := idx.Keys()
139 | sort.Strings(keys)
140 |
141 | buffer := bytes.NewBuffer([]byte{})
142 | for i := 0; i < len(keys); i++ {
143 | k := keys[i]
144 | pl := (PostingList)(idx.Get(k)).Bytes()
145 |
146 | buffer.Reset()
147 | l := int32(len(k))
148 | if err := binary.Write(buffer, binary.LittleEndian, l); err != nil {
149 | panic(err)
150 | }
151 |
152 | if err := binary.Write(buffer, binary.LittleEndian, []byte(k)); err != nil {
153 | panic(err)
154 | }
155 |
156 | l = int32(len(pl))
157 | if err := binary.Write(buffer, binary.LittleEndian, l); err != nil {
158 | panic(err)
159 | }
160 |
161 | if err := binary.Write(buffer, binary.LittleEndian, pl); err != nil {
162 | panic(err)
163 | }
164 |
165 | if _, err := writer.Write(buffer.Bytes()); err != nil {
166 | panic(err)
167 | }
168 | }
169 | }
170 |
171 | // Load file. 
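// Each record written by Drain above has, in little-endian order, the layout
// [int32 key length][key bytes][int32 posting-list length][posting-list bytes].
// Load streams these records back as KVPair values; a nil pointer on the
// returned channel signals end of file.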
172 | func Load(file string) (chan *KVPair, error) {
173 | ch := make(chan *KVPair, 10)
174 |
175 | fd, err := os.OpenFile(file, os.O_RDONLY|os.O_CREATE, 0660)
176 | if err != nil {
177 | return nil, err
178 | }
179 |
180 | ReadInt := func() (int, error) {
181 | buf := make([]byte, 4)
182 | if n, err := fd.Read(buf); err != nil {
183 | return 0, err
184 | } else {
185 | var leng int32
186 | if err = binary.Read(bytes.NewBuffer(buf[:n]), binary.LittleEndian, &leng); err != nil {
187 | panic(err)
188 | }
189 | return int(leng), nil
190 | }
191 |
192 | }
193 |
194 | ReadString := func(n int) (string, error) {
195 | buf := make([]byte, n)
196 | if n, err := fd.Read(buf); err != nil {
197 | return "", err
198 | } else {
199 | return string(buf[:n]), nil
200 | }
201 |
202 | }
203 |
204 | go func() {
205 | defer fd.Close()
206 |
207 | for {
208 | pair := KVPair{}
209 | n, err := ReadInt()
210 | if err != nil {
211 | ch <- nil
212 | break
213 | }
214 | if pair.Key, err = ReadString(n); err != nil {
215 | ch <- nil
216 | break
217 | }
218 | if n, err = ReadInt(); err != nil {
219 | ch <- nil
220 | break
221 | }
222 |
223 | var v string
224 | if v, err = ReadString(n); err != nil {
225 | ch <- nil
226 | break
227 | }
228 | pair.Value.FromBytes([]byte(v))
229 | ch <- &pair
230 | }
231 | }()
232 | return ch, nil
233 | }
234 |
--------------------------------------------------------------------------------
/index/postinglist.go:
--------------------------------------------------------------------------------
1 | package index
2 |
3 | import (
4 | "bytes"
5 | "encoding/binary"
6 | "sort"
7 |
8 | "github.com/xtgo/set"
9 | )
10 |
11 | type Term struct {
12 | K string //key
13 | Id int32 //key id
14 | DF int32 //Document Frequency
15 | }
16 |
17 | type Doc struct {
18 | ID int32 //doc id
19 | DocLen int32 //doc length
20 |
21 | TF int32 //term frequency; e.g. in the posting list term->[doc1,doc2,doc3] it is the term's frequency within docX only
22 | QualityScore float64 //static quality score of the doc
23 |
24 | Score float64 //bm25/Cosine score used by sort
25 | }
26 |
27 | func (doc Doc) Bytes() []byte {
28 | buffer := bytes.NewBuffer([]byte{})
29 | err := binary.Write(buffer, binary.LittleEndian, doc)
30 | if err != nil {
31 | panic(err)
32 | }
33 | return buffer.Bytes()
34 | }
35 |
36 | func (doc *Doc) FromBytes(b []byte) {
37 | buffer := bytes.NewBuffer(b)
38 |
39 | err := binary.Read(buffer, binary.LittleEndian, doc)
40 | if err != nil {
41 | panic(err)
42 | }
43 | }
44 |
45 | type PostingList []Doc
46 |
47 | func (pl PostingList) Len() int { return len(pl) }
48 | func (pl PostingList) Less(i, j int) bool { return pl[i].ID > pl[j].ID } //descending by doc ID (not by score; the set operations below rely on this order)
49 | func (pl PostingList) Swap(i, j int) {
50 | pl[i], pl[j] = pl[j], pl[i]
51 | }
52 |
53 | func (pl PostingList) Find(id int) *Doc {
54 | for i := 0; i < pl.Len(); i++ {
55 | if pl[i].ID == int32(id) {
56 | return &pl[i]
57 | }
58 | }
59 | return nil
60 | }
61 |
62 | func (pl PostingList) IDs() []int {
63 | ids := make([]int, 0, len(pl))
64 | for _, item := range pl {
65 | ids = append(ids, int(item.ID))
66 | }
67 | sort.Sort(sort.IntSlice(ids))
68 | return ids
69 | }
70 |
71 | func (pl *PostingList) Inter(docs []Doc) {
72 | l := len(*pl)
73 | *pl = append(*pl, docs...)
74 | size := set.Inter(pl, l)
75 | *pl = (*pl)[:size]
76 | }
77 |
78 | func (pl *PostingList) Union(docs []Doc) {
79 | l := len(*pl)
80 | *pl = append(*pl, docs...)
81 | size := set.Union(pl, l)
82 | *pl = (*pl)[:size]
83 | }
84 |
85 | func (pl *PostingList) Filter(docs []Doc) {
86 | l := len(*pl)
87 | join := append(*pl, docs...) 
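// The xtgo/set calls treat the halves join[:l] and join[l:] as the two
// operands: Inter keeps inter = *pl ∩ docs, and the Diff below then yields
// *pl \ inter, i.e. Filter drops every doc that also appears in docs.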
88 | size := set.Inter(join, l) 89 | inter := join[:size] 90 | 91 | *pl = append(*pl, inter...) 92 | size = set.Diff(pl, l) 93 | *pl = (*pl)[:size] 94 | } 95 | 96 | func (pl *PostingList) Append(docs ...Doc) { 97 | *pl = append(*pl, docs...) 98 | } 99 | 100 | func (pl PostingList) Bytes() []byte { 101 | buffer := bytes.NewBuffer([]byte{}) 102 | for _, v := range pl { 103 | err := binary.Write(buffer, binary.LittleEndian, v) 104 | if err != nil { 105 | panic(err) 106 | } 107 | } 108 | return buffer.Bytes() 109 | } 110 | 111 | func (pl *PostingList) FromBytes(buf []byte) { 112 | if buf == nil { 113 | return 114 | } 115 | 116 | buffer := bytes.NewBuffer(buf) 117 | for buffer.Len() > 0 { 118 | var item Doc 119 | binary.Read(buffer, binary.LittleEndian, &item) 120 | *pl = append(*pl, item) 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /index/postinglist_test.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestPostingList(t *testing.T) { 11 | it := Doc{ 12 | ID: 1, 13 | TF: 30, 14 | QualityScore: 10.11, 15 | } 16 | 17 | it2 := Doc{ 18 | ID: 2, 19 | TF: 20, 20 | QualityScore: 20.22, 21 | } 22 | 23 | bb := it.Bytes() 24 | it3 := Doc{} 25 | it3.FromBytes(bb) 26 | fmt.Printf("it3: %+v\n", it3) 27 | 28 | var pl PostingList 29 | pl = append(pl, it) 30 | pl = append(pl, it2) 31 | 32 | fmt.Printf("pl:%+v\n", pl) 33 | 34 | bytes := pl.Bytes() 35 | 36 | var pl2 PostingList 37 | pl2.FromBytes(bytes) 38 | fmt.Printf("pl2:%+v\n", pl2) 39 | assert.Equal(t, len(pl), len(pl2)) 40 | } 41 | -------------------------------------------------------------------------------- /index/property.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | type DataRange struct { 4 | Start int 5 | End int 6 | } 7 | 8 | type Property struct { 9 | // docNum is the count of documents 10 | docNum int 11 | 12 | // tokenCount is the total length of tokens 13 | tokenCount int 14 | 15 | //dataRange 16 | dataRange DataRange 17 | } 18 | 19 | func (idx *Property) DocNum() int { 20 | return idx.docNum 21 | } 22 | 23 | func (idx *Property) SetDocNum(num int) { 24 | idx.docNum = num 25 | } 26 | 27 | func (idx *Property) TokenCount() int { 28 | return idx.tokenCount 29 | } 30 | 31 | func (idx *Property) SetTokenCount(cnt int) { 32 | idx.tokenCount = cnt 33 | } 34 | 35 | func (idx *Property) DataRange() DataRange { 36 | return idx.dataRange 37 | } 38 | 39 | func (idx *Property) SetDataRange(d DataRange) { 40 | idx.dataRange = d 41 | } 42 | -------------------------------------------------------------------------------- /index/tfidf.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "strconv" 7 | ) 8 | 9 | type TF map[string]int32 10 | type TFIDF struct { 11 | IDF map[string]float64 12 | DOC2TF map[int32]TF 13 | } 14 | 15 | func NewTFIDF() *TFIDF { 16 | return &TFIDF{ 17 | IDF: make(map[string]float64), 18 | DOC2TF: make(map[int32]TF, 0), 19 | } 20 | } 21 | 22 | func CalIDF(docNum int, df int) float64 { 23 | return math.Log2(float64(docNum)/float64(df) + 1) 24 | } 25 | 26 | const VirtualQueryDocId int32 = -10000 27 | //CalCosine 余弦距离相似度 https://blog.csdn.net/weixin_42398658/article/details/85063004 28 | func CalCosine(hits []Doc, tfidf *TFIDF) []Doc { 29 | queryDocId := 
VirtualQueryDocId
30 |
31 | var querySum float64
32 | for term, tf := range tfidf.DOC2TF[queryDocId] {
33 | idf := tfidf.IDF[term]
34 | weight := float64(tf) * idf
35 | querySum += math.Pow(weight, 2)
36 | }
37 |
38 | for i, hit := range hits {
39 | var docSum, multiplySum float64
40 | for term, tf := range tfidf.DOC2TF[hit.ID] {
41 | idf := tfidf.IDF[term]
42 | docTermWeight := float64(tf) * idf
43 | queryTermWeight := float64(tfidf.DOC2TF[queryDocId][term]) * idf
44 |
45 | multiplySum += docTermWeight * queryTermWeight
46 | docSum += math.Pow(docTermWeight, 2)
47 | }
48 | hits[i].Score = multiplySum / math.Sqrt(querySum*docSum)
49 | hits[i].Score, _ = strconv.ParseFloat(fmt.Sprintf("%.4f", hits[i].Score), 64)
50 | }
51 | return hits
52 | }
53 |
54 | //CalBM25 computes the bm25 score of each hit; the caller sorts by it afterwards.
55 | //docLen is the total token count of the index, docNum is the total number of indexed documents.
56 | func CalBM25(hits []Doc, tfidf *TFIDF, docLen int, docNum int) []Doc {
57 | // bm25: score += idf * tf*(k1+1) / (tf + k1*(1-b+b*|d|/avgdl)); reference: https://www.jianshu.com/p/1e498888f505
58 | for i, hit := range hits {
59 | for term, tf := range tfidf.DOC2TF[hit.ID] { //a hit doc contains multiple terms
60 | d := float64(hit.DocLen) //length |d| of this document (docLen is the corpus total)
61 | avg := float64(docLen) / float64(docNum)
62 | idf := tfidf.IDF[term]
63 | k1 := float64(2)
64 | b := 0.75
65 | hits[i].Score += idf * float64(tf) * (k1 + 1) / (float64(tf) + k1*(1-b+b*d/avg))
66 | }
67 | hits[i].Score, _ = strconv.ParseFloat(fmt.Sprintf("%.4f", hits[i].Score), 64)
68 | }
69 | return hits
70 | }
71 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 | "fmt"
6 | "github.com/awesomefly/easysearch/cluster"
7 | "github.com/awesomefly/easysearch/config"
8 | "runtime"
9 | "runtime/pprof"
10 | "strings"
11 |
12 | "log"
13 | "os"
14 | "time"
15 |
16 | "github.com/awesomefly/easysearch/index"
17 | "github.com/awesomefly/easysearch/search"
18 | )
19 |
20 | func startStandaloneCluster() error {
21 | conf := config.InitClusterConfig("./cluster.yml")
22 | procAttr := &os.ProcAttr{
23 | Files: []*os.File{os.Stdin, os.Stdout, os.Stderr},
24 | }
25 |
26 | procs := make([]*os.Process, 0)
27 |
28 | //start manager server
29 | baseArgs := os.Args[0] + " -m cluster "
30 | argv := strings.Fields(baseArgs + "--servername=managerserver --host=" +
31 | conf.ManageServer.Host + " --port=" + fmt.Sprint(conf.ManageServer.Port))
32 | proc, err := os.StartProcess(os.Args[0], argv, procAttr)
33 | if err != nil {
34 | fmt.Println("start manager server process error:", err)
35 | return err
36 | }
37 | procs = append(procs, proc)
38 | time.Sleep(3 * time.Second)
39 |
40 | //start data server
41 | for i := 0; i < len(conf.DataServer); i++ {
42 | srv := conf.DataServer[i]
43 |
44 | argv = strings.Fields(baseArgs + "--servername=dataserver --host=" + srv.Host + " --port=" + fmt.Sprint(srv.Port))
45 | proc, err = os.StartProcess(os.Args[0], argv, procAttr)
46 | if err != nil {
47 | fmt.Println("start data server process error:", err)
48 | return err
49 | }
50 | procs = append(procs, proc)
51 | }
52 | time.Sleep(3 * time.Second)
53 |
54 | //start search server
55 | for i := 0; i < len(conf.SearchServer); i++ {
56 | srv := conf.SearchServer[i]
57 |
58 | argv = strings.Fields(baseArgs + "--servername=searchserver --host=" + srv.Host + " --port=" + fmt.Sprint(srv.Port))
59 | proc, err = os.StartProcess(os.Args[0], argv, procAttr)
60 | if err != nil {
61 | fmt.Println("start search server process error:", err)
62 | return err
63 | }
64 | procs = append(procs, proc)
65 | 
time.Sleep(100 * time.Millisecond) 66 | } 67 | 68 | for i := 0; i < len(procs); i++ { 69 | _, err = procs[i].Wait() 70 | if err != nil { 71 | fmt.Println("wait error:", err) 72 | return err 73 | } 74 | } 75 | return nil 76 | } 77 | 78 | func startProfile() { 79 | f, err := os.OpenFile("cpu.pprof", os.O_CREATE|os.O_RDWR, 0666) 80 | if err != nil { 81 | log.Fatal(err) 82 | return 83 | } 84 | defer f.Close() 85 | 86 | if err = pprof.StartCPUProfile(f); err != nil { 87 | log.Fatal(err) 88 | return 89 | } 90 | defer pprof.StopCPUProfile() 91 | } 92 | 93 | func main() { 94 | f, _ := os.Create("cpu.pprof") 95 | defer f.Close() 96 | pprof.StartCPUProfile(f) 97 | defer pprof.StopCPUProfile() 98 | 99 | log.SetOutput(os.Stdout) 100 | //log.Printf("args:%+v\n", os.Args) 101 | //runtime.GOMAXPROCS(2) 102 | log.Println("GOMAXPROCS:", runtime.GOMAXPROCS(0)) 103 | 104 | var module string 105 | flag.StringVar(&module, "m", "", "[indexer|searcher|merger|cluster]") 106 | 107 | //searcher 108 | var query, source, modelFile, searchModel string 109 | flag.StringVar(&query, "q", "Album Jordan", "search query") 110 | flag.StringVar(&source, "source", "", "[local|remote]") 111 | flag.StringVar(&searchModel, "search_model", "", "[boolean|bm25|vs]") 112 | flag.StringVar(&modelFile, "paraphrase_file", "", "paraphrase model file") 113 | 114 | //indexer 115 | var sharding bool 116 | flag.BoolVar(&sharding, "sharding", false, "true|false") 117 | 118 | //merger 119 | var srcPath, dstPath string 120 | flag.StringVar(&srcPath, "f", "", "src index file") 121 | flag.StringVar(&dstPath, "t", "", "dst index file") 122 | 123 | //server 124 | var servername string 125 | flag.StringVar(&servername, "servername", "", "[all|managerserver|dataserver|searchserver]") 126 | 127 | var host string 128 | var port int 129 | flag.StringVar(&host, "host", "", "server host") 130 | flag.IntVar(&port, "port", 0, "server port") 131 | flag.Parse() 132 | 133 | conf := config.InitConfig("./config.yml") 134 | if module == "indexer" { 135 | log.Println("Starting Index ...") 136 | if sharding { 137 | cluster.Index(conf) 138 | } else { 139 | search.Index(*conf) //todo: 构建索引耗时过长,性能分析下具体耗时原因 140 | } 141 | } else if module == "searcher" { 142 | start := time.Now() 143 | var matched []index.Doc 144 | var err error 145 | if source == "local" { 146 | log.Println("Starting local search..") 147 | searcher := search.NewSearcher(conf.Store.IndexFile) 148 | if modelFile != "" { 149 | searcher.InitParaphrase(modelFile) 150 | } 151 | log.Printf("index loaded %d keys in %v", searcher.Count() , time.Since(start)) 152 | matched = searcher.Search(query) 153 | } else if source == "remote" { 154 | log.Println("Starting remote search..") 155 | cli := cluster.NewSearchClient(&conf.Cluster.ManageServer) 156 | matched, err = cli.Search(query) 157 | if err != nil { 158 | log.Fatal(err) 159 | return 160 | } 161 | } 162 | log.Printf("Search found %d documents in %v", len(matched), time.Since(start)) 163 | } else if module == "merger" { 164 | search.Merge(srcPath, dstPath) 165 | } else if module == "cluster" { 166 | if host != "" && port != 0 { 167 | conf.Server.Host = host 168 | conf.Server.Port = port 169 | } 170 | if servername == "all" { 171 | log.Println("Starting Standalone Cluster..") 172 | if err := startStandaloneCluster(); err != nil { 173 | panic(err) 174 | } 175 | } else if servername == "managerserver" { 176 | log.Println("Starting ManagerServer..") 177 | svr := cluster.NewManagerServer(conf) 178 | svr.Run() 179 | } else if servername == "dataserver" { 180 | 
log.Println("Starting DataServer..") 181 | ds := cluster.NewDataServer(conf) 182 | ds.Run() 183 | } else if servername == "searchserver" { 184 | log.Println("Starting SearchServer..") 185 | srh := cluster.NewSearchServer(conf) 186 | srh.Run() 187 | } 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /paraphrase/serving/model.go: -------------------------------------------------------------------------------- 1 | package serving 2 | 3 | // https://pkg.go.dev/code.sajari.com/word2vec#section-readme 4 | //https://github.com/sajari/word2vec 5 | import ( 6 | "log" 7 | "os" 8 | 9 | "code.sajari.com/word2vec" 10 | ) 11 | 12 | type ParaphraseModel struct { 13 | path string 14 | mode *word2vec.Model 15 | } 16 | 17 | func NewModel(path string) *ParaphraseModel { 18 | return &ParaphraseModel{ 19 | path: path, 20 | mode: load(path), 21 | } 22 | } 23 | 24 | func load(path string) *word2vec.Model { 25 | // Load the model from an io.Reader (i.e. a file). 26 | file, err := os.Open(path) 27 | defer file.Close() 28 | 29 | if err != nil { 30 | log.Fatalf("error open file fail: %v", err) 31 | panic(err) 32 | } 33 | //r := bufio.NewReader(file) 34 | 35 | model, err := word2vec.FromReader(file) 36 | if err != nil { 37 | log.Fatalf("error loading model: %v", err) 38 | panic(err) 39 | } 40 | return model 41 | } 42 | 43 | //GetSimilar 语义改写、近义词 44 | func (m *ParaphraseModel) GetSimilar(positive []string, negative []string, n int) []string { 45 | // Create an expression. 46 | expr := word2vec.Expr{} 47 | for _, text := range positive { 48 | expr.Add(1, text) 49 | } 50 | for _, text := range negative { 51 | expr.Add(-1, text) 52 | } 53 | 54 | // Hit the most similar result by cosine similarity. 55 | matches, err := m.mode.CosN(expr, n) 56 | if err != nil { 57 | log.Fatalf("error evaluating cosine similarity: %v", err) 58 | } 59 | 60 | var result []string 61 | for _, match := range matches { 62 | result = append(result, match.Word) 63 | } 64 | return result 65 | } 66 | -------------------------------------------------------------------------------- /paraphrase/serving/model_test.go: -------------------------------------------------------------------------------- 1 | package serving 2 | 3 | import ( 4 | "fmt" 5 | "path/filepath" 6 | "testing" 7 | 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func TestGetSimilar(t *testing.T) { 12 | path := "../../data/word2vec.format.bin" 13 | x, _ := filepath.Abs(path) 14 | fmt.Println(x) 15 | model := NewModel(path) 16 | 17 | var ( 18 | positive = []string{"king", "woman"} 19 | negative = []string{"man"} 20 | ) 21 | out := model.GetSimilar(positive, negative, 3) 22 | for _, v := range out { 23 | println(v) 24 | } 25 | // assert.EqualValues(t, out, model.GetSimilar(positive, negative, 3)) 26 | assert.Equal(t, 1, 1) 27 | } 28 | -------------------------------------------------------------------------------- /paraphrase/train/wiki2txt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import sys 4 | from gensim.corpora import WikiCorpus 5 | import opencc 6 | 7 | dir_path = "data/" 8 | 9 | 10 | def read_sample(): 11 | i = 0 12 | with open(dir_path + "wiki_texts.txt", 'r') as f: 13 | for line in f: 14 | print(line) 15 | if i == 10: 16 | return 17 | i += 1 18 | 19 | 20 | # train corpus source https://dumps.wikimedia.org/enwiki/latest/ 21 | # xml to txt 22 | def wiki_to_txt(file): 23 | if file is None: 24 | return 25 | 26 | corpus_path = file 
#"~/Downloads/enwiki-latest-pages-articles11.xml-p6899367p7054859.bz2" 27 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 28 | 29 | output = open(dir_path + "wiki_texts.txt", 'w') 30 | wiki = WikiCorpus(corpus_path, processes=15, dictionary={}) 31 | i = 0 32 | for text in wiki.get_texts(): 33 | output.write(" ".join(text) + "\n") 34 | i += 1 35 | if i % 10000 == 0: 36 | logging.info("Saved " + str(i) + " articles") 37 | output.close() 38 | logging.info("Finished Saved " + str(i) + " articles") 39 | 40 | 41 | def convert2simple(): 42 | cc = opencc.OpenCC('t2s') 43 | for i in range(1, 5): 44 | src_file = dir_path + "wiki_texts" + str(i) + ".txt" 45 | des_file = dir_path + "wiki_simple" + str(i) + ".txt" 46 | des_f = open(des_file, 'w') 47 | with open(src_file, 'r') as f: 48 | for line in f: 49 | # print line.decode('utf-8') 50 | content = cc.convert(line.decode('utf-8')) 51 | print(content) 52 | des_f.write(content.encode('utf-8') + '\n') 53 | des_f.close() 54 | print(str(i) + " finished.") 55 | 56 | 57 | # /usr/bin/python3 paraphrase/train/wiki2txt.py --cmd=parse --file=$WIKI_FILE 58 | if __name__ == "__main__": 59 | import argparse 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument("--cmd", type=str, required=True) 62 | parser.add_argument("--file", type=str, required=False) 63 | args = parser.parse_args() 64 | if args.cmd == 'parse': 65 | wiki_to_txt(args.file) 66 | elif args.cmd == 'sample': 67 | read_sample() 68 | # convert2simple() 69 | -------------------------------------------------------------------------------- /paraphrase/train/word2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import string 3 | 4 | from gensim.models import word2vec, KeyedVectors 5 | import logging 6 | 7 | corpus_dir = "./data/" 8 | project_dir = "./data/" 9 | 10 | 11 | # 训练word2vec模型 12 | def train(corpus_file): 13 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 14 | 15 | # corpus_path = corpus_dir + 'wiki_texts.txt' 16 | model_path = project_dir + 'med200_less.model.bin' 17 | model_word2vec_format_path = project_dir + 'word2vec.format.bin' 18 | 19 | sentences = word2vec.LineSentence(corpus_file) 20 | model = word2vec.Word2Vec(sentences, vector_size=200) 21 | 22 | # 保存模型,供日後使用 23 | model.save(model_path) 24 | 25 | # 按word2vec格式存储向量信息 26 | model.wv.save_word2vec_format(model_word2vec_format_path, binary=True) 27 | 28 | 29 | def similar_test(positive=None, negative=None): 30 | # model_path = project_dir + 'med200_less.model.bin' 31 | model_word2vec_format_path = project_dir + 'word2vec.format.bin' 32 | 33 | # model = serving.Word2Vec.load(model_path) 34 | model = KeyedVectors.load_word2vec_format(model_word2vec_format_path, binary=True) 35 | 36 | try: 37 | # items = model.wv.most_similar(positive, negative, topn=10) 38 | items = model.most_similar(positive, negative, topn=10) 39 | for item in items: 40 | print(item[0].encode('utf-8'), item[1]) 41 | except Exception as e: 42 | print(repr(e)) 43 | 44 | 45 | # /usr/bin/python3 paraphrase/train/word2vec.py --cmd=train --corpus_file=./data/wiki_texts.txt 46 | if __name__ == "__main__": 47 | import argparse 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument("--cmd", type=str, required=True) 50 | parser.add_argument("--corpus_file", type=str, required=False) 51 | args = parser.parse_args() 52 | if args.cmd == 'train': 53 | train(args.corpus_file) 54 | elif args.cmd == 'test': 55 
| positive = ['king', 'woman'] 56 | negative = ['man'] 57 | similar_test(positive, negative) 58 | -------------------------------------------------------------------------------- /score/bm25.go: -------------------------------------------------------------------------------- 1 | package score 2 | 3 | import ( 4 | "github.com/go-nlp/bm25" 5 | "github.com/go-nlp/tfidf" 6 | "sort" 7 | ) 8 | 9 | type BM25Document []int //token id list 10 | func (d BM25Document) IDs() []int { return []int(d) } 11 | 12 | func newTFIDF(docs []BM25Document) *tfidf.TFIDF { 13 | tf := tfidf.New() 14 | 15 | for _, doc := range docs { 16 | tf.Add(doc) 17 | } 18 | tf.CalculateIDF() 19 | return tf 20 | } 21 | 22 | // MostSimilar 相关性计算 23 | // q query words, docs is doc id list, return most similar docs' id list 24 | func MostSimilar(docCorpus map[int]BM25Document, tokenCorpus map[string]int, q []string, docs []int, k int) []int { 25 | // sort by bm25 26 | // 相关性评分请先阅读:https://www.jianshu.com/p/1e498888f505 27 | // 废弃-词集过大时,docs无法完全放入内存,需要自行统计词频并计算score 28 | var corpus []BM25Document 29 | for _, ts := range docCorpus { 30 | corpus = append(corpus, ts) 31 | } 32 | tf := newTFIDF(corpus) 33 | 34 | var query BM25Document 35 | for _, term := range q { 36 | query = append(query, tokenCorpus[term]) 37 | } 38 | 39 | resultDocs := make([]tfidf.Document, 0, len(docs)) 40 | for _, id := range docs { 41 | resultDocs = append(resultDocs, docCorpus[id]) 42 | } 43 | 44 | // FIXME: IDF计算公式不对 45 | scores := bm25.BM25(tf, query, resultDocs, 2, 0.75) 46 | sort.Sort(scores) //order by asc 47 | //sort.Reverse(scores) //order by desc 48 | 49 | var final []int 50 | for i := len(scores) - 1; i >= 0 && k > 0; i-- { 51 | final = append(final, docs[scores[i].ID]) 52 | k-- 53 | } 54 | return final 55 | } 56 | -------------------------------------------------------------------------------- /score/bm25_test.go: -------------------------------------------------------------------------------- 1 | package score 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "strings" 7 | 8 | "github.com/go-nlp/bm25" 9 | "github.com/go-nlp/tfidf" 10 | ) 11 | 12 | var mobydick = []string{ 13 | "Call me Ishmael .", 14 | "Some years ago -- never mind how long precisely -- having little or no money in my purse , and nothing particular to interest me on shore , I thought I would sail about a little and see the watery part of the world .", 15 | "It is a way I have of driving off the spleen and regulating the circulation .", 16 | "Whenever I find myself growing grim about the mouth ; ", 17 | "whenever it is a damp , drizzly November in my soul ; ", 18 | "whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; ", 19 | "and especially whenever my hypos get such an upper hand of me , that it requires a strong moral principle to prevent me from deliberately stepping into the street , and methodically knocking people's hats off -- then , I account it high time to get to sea as soon as I can .", 20 | "This is my substitute for pistol and ball . ", 21 | "With a philosophical flourish Cato throws himself upon his sword ; ", 22 | "I quietly take to the ship . 
There is nothing surprising in this .", 23 | "If they but knew it , almost all men in their degree , some time or other , cherish very nearly the same feelings towards the ocean with me .", 24 | } 25 | 26 | type doc []int 27 | 28 | func (d doc) IDs() []int { return []int(d) } 29 | 30 | func makeCorpus(a []string) (map[string]int, []string) { 31 | retVal := make(map[string]int) 32 | invRetVal := make([]string, 0) 33 | var id int 34 | for _, s := range a { 35 | for _, f := range strings.Fields(s) { 36 | f = strings.ToLower(f) 37 | if _, ok := retVal[f]; !ok { 38 | retVal[f] = id 39 | invRetVal = append(invRetVal, f) 40 | id++ 41 | } 42 | } 43 | } 44 | return retVal, invRetVal 45 | } 46 | 47 | func makeDocuments(a []string, c map[string]int) []tfidf.Document { 48 | retVal := make([]tfidf.Document, 0, len(a)) 49 | for _, s := range a { 50 | var ts []int 51 | for _, f := range strings.Fields(s) { 52 | f = strings.ToLower(f) 53 | id := c[f] 54 | ts = append(ts, id) 55 | } 56 | retVal = append(retVal, doc(ts)) 57 | } 58 | return retVal 59 | } 60 | 61 | func Example_BM25() { 62 | corpus, _ := makeCorpus(mobydick) 63 | docs := makeDocuments(mobydick, corpus) 64 | tf := tfidf.New() 65 | 66 | for _, doc := range docs { 67 | tf.Add(doc) 68 | } 69 | tf.CalculateIDF() 70 | 71 | // now we search 72 | 73 | // "ishmael" is a query 74 | ishmael := doc{corpus["ishmael"]} 75 | 76 | // "whenever i find" is another query 77 | whenever := doc{corpus["whenever"]} 78 | 79 | ishmaelScores := bm25.BM25(tf, ishmael, docs, 1.5, 0.75) 80 | wheneverScores := bm25.BM25(tf, whenever, docs, 1.5, 0.75) 81 | 82 | sort.Sort(sort.Reverse(ishmaelScores)) 83 | sort.Sort(sort.Reverse(wheneverScores)) 84 | 85 | fmt.Printf("Top 3 Relevant Docs to \"Ishmael\":\n") 86 | for _, d := range ishmaelScores[:3] { 87 | fmt.Printf("\tID : %d\n\tScore: %1.3f\n\tDoc : %q\n", d.ID, d.Score, mobydick[d.ID]) 88 | } 89 | fmt.Println("") 90 | fmt.Printf("Top 3 Relevant Docs to \"whenever i find\":\n") 91 | for _, d := range wheneverScores[:3] { 92 | fmt.Printf("\tID : %d\n\tScore: %1.3f\n\tDoc : %q\n", d.ID, d.Score, mobydick[d.ID]) 93 | } 94 | // Output: 95 | // Top 3 Relevant Docs to "Ishmael": 96 | // ID : 0 97 | // QualityScore: 3.706 98 | // Doc : "Call me Ishmael ." 99 | // ID : 1 100 | // QualityScore: 0.000 101 | // Doc : "Some years ago -- never mind how long precisely -- having little or no money in my purse , and nothing particular to interest me on shore , I thought I would sail about a little and see the watery part of the world ." 102 | // ID : 2 103 | // QualityScore: 0.000 104 | // Doc : "It is a way I have of driving off the spleen and regulating the circulation ." 
105 | // 106 | // Top 3 Relevant Docs to "whenever i find": 107 | // ID : 3 108 | // QualityScore: 2.031 109 | // Doc : "Whenever I find myself growing grim about the mouth ; " 110 | // ID : 4 111 | // QualityScore: 1.982 112 | // Doc : "whenever it is a damp , drizzly November in my soul ; " 113 | // ID : 5 114 | // QualityScore: 1.810 115 | // Doc : "whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; " 116 | 117 | } 118 | -------------------------------------------------------------------------------- /search/indexer.go: -------------------------------------------------------------------------------- 1 | package search 2 | 3 | import ( 4 | "fmt" 5 | "io/fs" 6 | "log" 7 | "os" 8 | "path/filepath" 9 | "regexp" 10 | "sort" 11 | "time" 12 | 13 | "github.com/awesomefly/easysearch/config" 14 | 15 | "github.com/awesomefly/easysearch/index" 16 | ) 17 | 18 | type Indexer interface { 19 | // Drain data to file. sort by key 20 | Drain(file string) 21 | Merge(file string) 22 | } 23 | 24 | const SpiltThresholdDocNum int = 50000 25 | func Index(c config.Config) { 26 | log.Println("Starting index...") 27 | 28 | //remove old index files 29 | IndexDir := filepath.Dir(c.Store.IndexFile) 30 | IndexPathPrefix := "_tmp." + filepath.Base(c.Store.IndexFile) 31 | reg, _ := regexp.Compile(IndexPathPrefix + ".*") 32 | if err := Remove(IndexDir, reg); err != nil { 33 | log.Fatal(err) 34 | return 35 | } 36 | 37 | reg1, _ := regexp.Compile("^" + filepath.Base(c.Store.IndexFile) + ".*") 38 | if err := Remove(filepath.Dir(c.Store.IndexFile), reg1); err != nil { 39 | log.Fatal(err) 40 | return 41 | } 42 | 43 | //文件太大,先拆分生成小文件,在内存中构造到排表,最后再归并到一个索引文件 44 | //无法直接在文件中构建构建索引,因为posting list在文件中是连续存储的,随着posting list逐渐变长,需要不断的拷贝到新空间 45 | Spilt(c, IndexDir+"/"+IndexPathPrefix) 46 | 47 | //归并合并 48 | files, err := Walk(IndexDir, reg) 49 | if err != nil { 50 | panic(err) 51 | } 52 | MergeAll(c, files) 53 | } 54 | 55 | func Spilt(c config.Config, filePrefix string) (files []string) { 56 | start := time.Now() 57 | //1. spilt to small file. 58 | ch, err := index.LoadDocumentStream(c.Store.DumpFile) 59 | if err != nil { 60 | log.Fatal(err) 61 | return 62 | } 63 | 64 | //2. index and dump posting list 65 | idx := index.NewHashMapIndex() 66 | 67 | WriteToFile := func() string { 68 | file := fmt.Sprintf("%s.%d", filePrefix, time.Now().Nanosecond()) 69 | fmt.Printf("Loaded all docs, Drain to file: %s \n", file) 70 | 71 | index.Drain(idx, file) 72 | return file 73 | } 74 | 75 | for { 76 | //timeout := time.NewTimer(1 * time.Second) 77 | select { 78 | case doc := <-ch: 79 | if doc == nil { 80 | file := WriteToFile() 81 | files = append(files, file) 82 | break 83 | } 84 | 85 | idx.Add([]index.Document{*doc}) //内存中操作 86 | if idx.Property().DocNum() >= SpiltThresholdDocNum { 87 | file := WriteToFile() 88 | files = append(files, file) 89 | 90 | idx.Clear() 91 | } 92 | continue 93 | //case <-timeout.C: 94 | // log.Printf("Read timeout. err: %s", err.Error()) 95 | // break 96 | } 97 | break 98 | } 99 | log.Printf("Dump all documents in %v.", time.Since(start)) 100 | return files 101 | } 102 | 103 | func MergeAll(c config.Config, files []string) { 104 | var chs []chan *index.KVPair 105 | for i := 0; i < len(files); i++ { 106 | chl, err := index.Load(files[i]) 107 | if err != nil { 108 | panic(err) 109 | } 110 | chs = append(chs, chl) 111 | } 112 | 113 | start := time.Now() 114 | bt := index.NewBTreeIndex(c.Store.IndexFile) 115 | 116 | //3. 
/search/indexer.go:
--------------------------------------------------------------------------------
1 | package search
2 | 
3 | import (
4 |     "fmt"
5 |     "io/fs"
6 |     "log"
7 |     "os"
8 |     "path/filepath"
9 |     "regexp"
10 |     "sort"
11 |     "time"
12 | 
13 |     "github.com/awesomefly/easysearch/config"
14 | 
15 |     "github.com/awesomefly/easysearch/index"
16 | )
17 | 
18 | type Indexer interface {
19 |     // Drain data to file, sorted by key.
20 |     Drain(file string)
21 |     Merge(file string)
22 | }
23 | 
24 | const SpiltThresholdDocNum int = 50000
25 | func Index(c config.Config) {
26 |     log.Println("Starting index...")
27 | 
28 |     //remove old index files
29 |     IndexDir := filepath.Dir(c.Store.IndexFile)
30 |     IndexPathPrefix := "_tmp." + filepath.Base(c.Store.IndexFile)
31 |     reg, _ := regexp.Compile(IndexPathPrefix + ".*")
32 |     if err := Remove(IndexDir, reg); err != nil {
33 |         log.Fatal(err)
34 |         return
35 |     }
36 | 
37 |     reg1, _ := regexp.Compile("^" + filepath.Base(c.Store.IndexFile) + ".*")
38 |     if err := Remove(filepath.Dir(c.Store.IndexFile), reg1); err != nil {
39 |         log.Fatal(err)
40 |         return
41 |     }
42 | 
43 |     //The dump file is too large to index in one pass: split it into small files first, build the inverted lists in memory, and finally merge them into a single index file.
44 |     //We cannot build the index directly on disk, because posting lists are stored contiguously in the file; as a posting list grows it must be copied to new space again and again.
45 |     Spilt(c, IndexDir+"/"+IndexPathPrefix)
46 | 
47 |     //k-way merge the sorted runs into the final index
48 |     files, err := Walk(IndexDir, reg)
49 |     if err != nil {
50 |         panic(err)
51 |     }
52 |     MergeAll(c, files)
53 | }
54 | 
55 | func Spilt(c config.Config, filePrefix string) (files []string) {
56 |     start := time.Now()
57 |     //1. split into small files.
58 |     ch, err := index.LoadDocumentStream(c.Store.DumpFile)
59 |     if err != nil {
60 |         log.Fatal(err)
61 |         return
62 |     }
63 | 
64 |     //2. index and dump posting list
65 |     idx := index.NewHashMapIndex()
66 | 
67 |     WriteToFile := func() string {
68 |         file := fmt.Sprintf("%s.%d", filePrefix, time.Now().Nanosecond())
69 |         fmt.Printf("Loaded all docs, Drain to file: %s \n", file)
70 | 
71 |         index.Drain(idx, file)
72 |         return file
73 |     }
74 | 
75 |     for {
76 |         //timeout := time.NewTimer(1 * time.Second)
77 |         select {
78 |         case doc := <-ch:
79 |             if doc == nil {
80 |                 file := WriteToFile()
81 |                 files = append(files, file)
82 |                 break
83 |             }
84 | 
85 |             idx.Add([]index.Document{*doc}) //in-memory operation
86 |             if idx.Property().DocNum() >= SpiltThresholdDocNum {
87 |                 file := WriteToFile()
88 |                 files = append(files, file)
89 | 
90 |                 idx.Clear()
91 |             }
92 |             continue
93 |         //case <-timeout.C:
94 |         //  log.Printf("Read timeout. err: %s", err.Error())
95 |         //  break
96 |         }
97 |         break
98 |     }
99 |     log.Printf("Dump all documents in %v.", time.Since(start))
100 |     return files
101 | }
102 | 
103 | func MergeAll(c config.Config, files []string) {
104 |     var chs []chan *index.KVPair
105 |     for i := 0; i < len(files); i++ {
106 |         chl, err := index.Load(files[i])
107 |         if err != nil {
108 |             panic(err)
109 |         }
110 |         chs = append(chs, chl)
111 |     }
112 | 
113 |     start := time.Now()
114 |     bt := index.NewBTreeIndex(c.Store.IndexFile)
115 | 
116 |     //3. merge posting lists
117 |     //Appending docs to a posting list one at a time leaves its originally allocated space insufficient, so the posting list keeps getting copied to new space, causing heavy file I/O.
118 |     //So the runs must be merged first and only then written into the index.
119 |     finished := make(map[int]bool)
120 |     pairs := make([]*index.KVPair, len(files))
121 |     for {
122 |         pivot := -1
123 |         for i := 0; i < len(pairs); i++ {
124 |             if pairs[i] == nil && chs[i] != nil {
125 |                 timeout := time.NewTimer(1000 * time.Millisecond)
126 |                 select {
127 |                 case kv := <-chs[i]:
128 |                     if kv == nil {
129 |                         close(chs[i])
130 |                         chs[i] = nil
131 |                     }
132 |                     pairs[i] = kv
133 |                 case <-timeout.C:
134 |                     close(chs[i])
135 |                     chs[i] = nil
136 |                 }
137 |             }
138 | 
139 |             if pairs[i] == nil { //this run is exhausted
140 |                 finished[i] = true
141 |                 continue
142 |             } else if pivot == -1 { //take the first non-nil pair as the pivot
143 |                 pivot = i
144 |                 continue
145 |             }
146 | 
147 |             if pairs[i].Key < pairs[pivot].Key {
148 |                 pivot = i
149 |             } else if pairs[i].Key == pairs[pivot].Key {
150 |                 pairs[pivot].Value.Append(pairs[i].Value...)
151 |                 pairs[i] = nil
152 |             }
153 |         }
154 |         if len(finished) == len(files) { //all finished
155 |             break
156 |         }
157 | 
158 |         //4. insert "word->posting list"
159 |         sort.Sort(pairs[pivot].Value)
160 |         bt.Insert(pairs[pivot].Key, pairs[pivot].Value)
161 |         pairs[pivot] = nil
162 |     }
163 |     log.Printf("Indexed %d documents and %d keys in %v", bt.Property().DocNum(), bt.BT.Count(), time.Since(start))
164 | 
165 |     bt.BT.Stats(true)
166 |     bt.Close()
167 |     time.Sleep(5*time.Second)
168 | }
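MergeAll finds the smallest head key by scanning all k runs linearly on every step. A container/heap-based k-way merge is the usual O(log k)-per-step alternative; the sketch below (item and minHeap are hypothetical helper types, not part of this package) emits keys in sorted order, and for brevity omits coalescing equal keys the way MergeAll's Append branch does.

package main

import (
	"container/heap"
	"fmt"
)

// item is one head-of-run entry: the key and which run it came from.
type item struct {
	key string
	run int
}

type minHeap []item

func (h minHeap) Len() int            { return len(h) }
func (h minHeap) Less(i, j int) bool  { return h[i].key < h[j].key }
func (h minHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *minHeap) Push(x interface{}) { *h = append(*h, x.(item)) }
func (h *minHeap) Pop() interface{} {
	old := *h
	x := old[len(old)-1]
	*h = old[:len(old)-1]
	return x
}

func main() {
	runs := [][]string{{"apple", "dog"}, {"bird", "cat"}, {"ant", "zebra"}}
	next := make([]int, len(runs)) // cursor into each run

	h := &minHeap{}
	for i := range runs {
		heap.Push(h, item{runs[i][0], i})
		next[i] = 1
	}
	for h.Len() > 0 {
		top := heap.Pop(h).(item)
		fmt.Println(top.key) // here MergeAll would append/insert the posting list
		if next[top.run] < len(runs[top.run]) {
			heap.Push(h, item{runs[top.run][next[top.run]], top.run})
			next[top.run]++
		}
	}
}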
169 | 
170 | func Remove(dir string, reg *regexp.Regexp) error {
171 |     files, err := Walk(dir, reg)
172 |     if err != nil {
173 |         return err
174 |     }
175 |     fmt.Printf("remove %d files.\n", len(files))
176 |     for i := 0; i < len(files); i++ {
177 |         //fmt.Println(files[i])
178 |         os.Remove(files[i])
179 |     }
180 |     return nil
181 | }
182 | 
183 | func Walk(dir string, re *regexp.Regexp) ([]string, error) {
184 |     // Collect the files under dir whose names match
185 |     // the given pattern.
186 |     var files []string
187 | 
188 |     walk := func(path string, d fs.DirEntry, err error) error {
189 |         if err != nil {
190 |             return err
191 |         }
192 |         if !re.MatchString(d.Name()) {
193 |             return nil
194 |         }
195 |         if d.IsDir() {
196 |             fmt.Println(path + string(os.PathSeparator))
197 |         } else {
198 |             //fmt.Println(path)
199 |             files = append(files, path)
200 |         }
201 |         return nil
202 |     }
203 |     if err := filepath.WalkDir(dir, walk); err != nil {
204 |         return nil, err
205 |     }
206 |     return files, nil
207 | }
208 | 
--------------------------------------------------------------------------------
/search/indexer_test.go:
--------------------------------------------------------------------------------
1 | package search
2 | 
3 | import (
4 |     "fmt"
5 |     "regexp"
6 |     "runtime"
7 |     "testing"
8 | 
9 |     "github.com/awesomefly/easysearch/config"
10 |     "github.com/awesomefly/easysearch/index"
11 | )
12 | 
13 | func TestIndexer(t *testing.T) {
14 |     reg, _ := regexp.Compile("enwiki-latest-abstract.*.xml.gz")
15 |     files, _ := Walk("../data", reg)
16 |     fmt.Println(files)
17 | }
18 | 
19 | func TestIndex(t *testing.T) {
20 |     runtime.GOMAXPROCS(2)
21 |     conf := config.Config{
22 |         Store: config.Storage{
23 |             DumpFile:  "../data/enwiki-latest-abstract27.xml.gz",
24 |             IndexFile: "../data/enwiki_idx",
25 |         },
26 |     }
27 | 
28 |     Index(conf)
29 |     //r := recover()
30 |     //assert.Nil(t, r)
31 | 
32 |     bt := index.NewBTreeIndex(conf.Store.IndexFile)
33 |     bt.BT.Stats(true)
34 | }
--------------------------------------------------------------------------------
/search/merger.go:
--------------------------------------------------------------------------------
1 | package search
2 | 
3 | import (
4 |     "log"
5 |     "sort"
6 |     "strconv"
7 |     "time"
8 | 
9 |     "github.com/awesomefly/easysearch/index"
10 |     btree "github.com/awesomefly/gobtree"
11 | )
12 | 
13 | func Merge(srcPath, dstPath string) {
14 |     log.Println("Starting merge ...")
15 | 
16 |     start := time.Now()
17 |     idx := index.NewBTreeIndex(srcPath)
18 |     log.Printf("Source index loaded %d keys in %v", idx.BT.Count(), time.Since(start))
19 | 
20 |     start = time.Now()
21 |     dstIdx := index.NewBTreeIndex(dstPath)
22 |     log.Printf("Dst index loaded %d keys in %v", dstIdx.BT.Count(), time.Since(start))
23 | 
24 |     start = time.Now()
25 |     ch := idx.BT.FullSet()
26 |     for {
27 |         k := <-ch
28 |         d := <-ch
29 |         v := <-ch
30 |         if k == nil || d == nil || v == nil {
31 |             break
32 |         }
33 | 
34 |         var src index.PostingList
35 |         src.FromBytes(v)
36 | 
37 |         dst := dstIdx.Lookup(string(k), true)
38 |         dst = append(dst, src...)
39 |         sort.Sort(dst)
40 | 
41 |         id, err := strconv.ParseInt(string(d), 10, 64) //corresponds to TestKey.Docid()
42 |         if err != nil {
43 |             panic(err)
44 |         }
45 | 
46 |         key := &btree.TestKey{K: string(k), Id: id}
47 |         dstIdx.BT.Insert(key, &dst)
48 |     }
49 |     log.Printf("merge %s to %s in %v", srcPath, dstPath, time.Since(start))
50 |     idx.Close()
51 |     dstIdx.Close()
52 | }
53 | 
--------------------------------------------------------------------------------
/search/searcher.go:
--------------------------------------------------------------------------------
1 | package search
2 | 
3 | import (
4 |     "log"
5 |     "sort"
6 |     "strconv"
7 |     "sync"
8 |     "sync/atomic"
9 |     "time"
10 |     "unsafe"
11 | 
12 |     "github.com/xtgo/set"
13 | 
14 |     "github.com/RoaringBitmap/roaring"
15 | 
16 |     "github.com/awesomefly/easysearch/index"
17 |     "github.com/awesomefly/easysearch/paraphrase/serving"
18 |     "github.com/awesomefly/easysearch/util"
19 | )
20 | 
21 | type IndexType int
22 | 
23 | const (
24 |     FullIndex IndexType = iota
25 |     AuxIndex
26 | )
27 | 
28 | type MsgType int
29 | 
30 | const (
31 |     STOP MsgType = iota
32 |     FLUSH
33 | )
34 | 
35 | type Message struct {
36 |     MsgType MsgType
37 |     Msg     string
38 | }
39 | 
40 | type DoubleBuffer struct {
41 |     CurrentIdx uint32 //current write index
42 |     msgChan    chan Message
43 | 
44 |     Indices []*index.HashMapIndex
45 |     Queues  []chan index.Document
46 | }
47 | 
48 | func NewDoubleBuffer() *DoubleBuffer {
49 |     buf := DoubleBuffer{}
50 |     atomic.StoreUint32(&buf.CurrentIdx, 0)
51 | 
52 |     for i := 0; i < 2; i++ {
53 |         idx := index.NewHashMapIndex()
54 |         buf.Indices = append(buf.Indices, idx)
55 |         buf.Queues = append(buf.Queues, make(chan index.Document, 100))
56 |     }
57 | 
58 |     buf.msgChan = buf.Start()
59 |     return &buf
60 | }
61 | 
62 | func (b *DoubleBuffer) WithDataRange(timestamp int64) *DoubleBuffer {
63 |     t := time.Now()
64 |     if timestamp != 0 {
65 |         t = time.Unix(timestamp, 0)
66 |     }
67 |     start := time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, t.Location()).Unix()
68 | 
69 |     t = t.AddDate(0, 0, 1)
70 |     end := time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, t.Location()).Unix()
71 | 
72 |     for i := 0; i < len(b.Indices); i++ {
73 |         b.Indices[i].Property().SetDataRange(index.DataRange{Start: int(start), End: int(end)})
74 |     }
75 |     return b
76 | }
77 | 
78 | func (b *DoubleBuffer) Start() chan Message {
79 |     msgChan := make(chan Message, 10)
80 |     go func() {
81 |         for {
82 |             select {
83 |             case msg := <-msgChan:
84 |                 switch msg.MsgType {
85 |                 case STOP:
86 |                     log.Printf("stop double buffer. msg:%s\n", msg.Msg)
87 |                     return
88 |                 case FLUSH:
89 |                     b.DoFlush()
90 |                 }
91 |             default:
92 |                 b.DoAdd()
93 |             }
94 |         }
95 |     }()
96 |     return msgChan
97 | }
98 | 
99 | func (b *DoubleBuffer) Stop() {
100 |     b.msgChan <- Message{
101 |         MsgType: STOP,
102 |         Msg:     "stop",
103 |     }
104 | }
105 | 
106 | // DoFlush drains both queues into their indices; not safe for concurrent use.
107 | func (b *DoubleBuffer) DoFlush() {
108 |     for i := 0; i < len(b.Indices); i++ {
109 |         idx := b.Indices[i]
110 |         docs := make([]index.Document, 0)
111 |         for {
112 |             select {
113 |             case doc := <-b.Queues[i]:
114 |                 docs = append(docs, doc)
115 |                 continue
116 |             default:
117 |                 break
118 |             }
119 |             break
120 |         }
121 | 
122 |         if len(docs) > 0 {
123 |             idx.Add(docs)
124 |         }
125 |     }
126 | }
127 | 
128 | func (b *DoubleBuffer) DoAdd() {
129 |     writeIdx := atomic.LoadUint32(&b.CurrentIdx)
130 | 
131 |     //single writer goroutine, so no locking is needed
132 |     idx := b.Indices[writeIdx]
133 |     docs := make([]index.Document, 0)
134 |     for {
135 |         timeout := time.NewTimer(1 * time.Millisecond)
136 |         select {
137 |         case doc := <-b.Queues[writeIdx]:
138 |             docs = append(docs, doc)
139 |             continue
140 |         case <-timeout.C:
141 |             break
142 |         }
143 |         break
144 |     }
145 | 
146 |     if len(docs) > 0 {
147 |         idx.Add(docs)
148 |     }
149 | 
150 |     if len(b.Queues[1-writeIdx]) > 10 {
151 |         atomic.CompareAndSwapUint32(&b.CurrentIdx, writeIdx, 1-writeIdx)
152 |         //sleep briefly so in-flight reads and writes on the old buffer can finish, avoiding concurrent read/write
153 |         time.Sleep(100 * time.Millisecond)
154 |     }
155 | }
156 | 
157 | func (b *DoubleBuffer) Add(doc index.Document) {
158 |     for i := 0; i < len(b.Queues); i++ {
159 |         b.Queues[i] <- doc
160 |     }
161 | }
162 | 
163 | func (b *DoubleBuffer) ReadIndex() *index.HashMapIndex {
164 |     writeIdx := atomic.LoadUint32(&b.CurrentIdx)
165 |     return b.Indices[1-writeIdx]
166 | }
167 | 
168 | func (b *DoubleBuffer) Flush() {
169 |     b.msgChan <- Message{
170 |         MsgType: FLUSH,
171 |         Msg:     "force flush",
172 |     }
173 | }
174 | 
175 | func (b *DoubleBuffer) Clear() {
176 | }
177 | 
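A minimal sketch of the intended read/write pattern for the double buffer above, written against the exported API defined in this file (the document text and timings are arbitrary): writers enqueue through Add, the background goroutine batches docs into the current write buffer, and readers always consult the other buffer, so neither side needs a lock.

package search

import (
	"testing"
	"time"

	"github.com/awesomefly/easysearch/index"
)

func TestDoubleBufferSketch(t *testing.T) {
	buf := NewDoubleBuffer().WithDataRange(0)
	defer buf.Stop()

	buf.Add(index.Document{ID: 42, Text: "a donut on a glass plate"})
	buf.Flush()                       // ask the background goroutine to drain both queues
	time.Sleep(50 * time.Millisecond) // give it time to process the message

	// The read side never touches the buffer currently being written.
	if pl := buf.ReadIndex().Get("donut"); pl != nil {
		t.Logf("doc visible on the read side: %v", pl)
	}
}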
178 | type IndexArray struct {
179 |     lock    sync.RWMutex
180 |     indices []*index.BTreeIndex
181 | }
182 | 
183 | func NewIndexArray() *IndexArray {
184 |     return &IndexArray{
185 |         indices: make([]*index.BTreeIndex, 0),
186 |     }
187 | }
188 | 
189 | func (b *IndexArray) WithFile(file string) *IndexArray {
190 |     idx := index.NewBTreeIndex(file)
191 | 
192 |     t := time.Now()
193 |     start := time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, t.Location()).Unix()
194 |     t = t.AddDate(0, 0, 1)
195 |     end := time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, t.Location()).Unix()
196 |     idx.Property().SetDataRange(index.DataRange{Start: int(start), End: int(end)})
197 | 
198 |     b.lock.Lock()
199 |     defer b.lock.Unlock()
200 | 
201 |     b.indices = append(b.indices, idx)
202 |     return b
203 | }
204 | 
205 | func (b *IndexArray) Indices() []*index.BTreeIndex {
206 |     b.lock.RLock()
207 |     defer b.lock.RUnlock()
208 | 
209 |     // copy into a fresh slice so callers don't race with later mutations
210 |     copyData := make([]*index.BTreeIndex, len(b.indices))
211 |     copy(copyData, b.indices)
212 |     return copyData
213 | }
214 | 
215 | func (b *IndexArray) Add(idx *index.BTreeIndex) {
216 |     b.lock.Lock()
217 |     defer b.lock.Unlock()
218 | 
219 |     b.indices = append(b.indices, idx)
220 | }
221 | 
222 | // Hit returns the index whose data range contains dr.
223 | func (b *IndexArray) Hit(dr index.DataRange) *index.BTreeIndex {
224 |     b.lock.RLock()
225 |     defer b.lock.RUnlock()
226 | 
227 |     for i := 0; i < len(b.indices); i++ {
228 |         r := b.indices[i].Property().DataRange()
229 |         if dr.Start >= r.Start && dr.End <= r.End { //dr lies within this index's range
230 |             return b.indices[i]
231 |         }
232 |     }
233 |     return nil
234 | }
235 | 
236 | func (b *IndexArray) Swap(old *index.BTreeIndex, new *index.BTreeIndex) bool {
237 |     b.lock.Lock()
238 |     defer b.lock.Unlock()
239 |     for i := 0; i < len(b.indices); i++ {
240 |         if b.indices[i] == old { //found the old index
241 |             b.indices[i] = new
242 |             return true
243 |         }
244 |     }
245 |     return false
246 | }
247 | 
248 | // Evict removes and returns all indices whose ranges fall within dr.
249 | func (b *IndexArray) Evict(dr index.DataRange) []*index.BTreeIndex {
250 |     b.lock.Lock()
251 |     defer b.lock.Unlock()
252 | 
253 |     var evicts []*index.BTreeIndex
254 |     for i := 0; i < len(b.indices); {
255 |         r := b.indices[i].Property().DataRange()
256 |         if dr.Start <= r.Start && dr.End >= r.End { //this index is fully covered by dr
257 |             evicts = append(evicts, b.indices[i])
258 |             b.indices = append(b.indices[:i], b.indices[i+1:]...) //remove element i
259 |         } else {
260 |             i++
261 |         }
262 |     }
263 |     return evicts
264 | }
265 | 
266 | type Searcher struct {
267 |     //full/primary index; statically rebuilding it over all historical data is expensive
268 |     fullIndex unsafe.Pointer
269 | 
270 |     // auxiliary indices: the full index is large and inconvenient to rebuild, so recently added data is built into smaller incremental indices.
271 |     // e.g. rebuild the full index only for data older than one day, and build the current day's data into an incremental index
272 |     auxIndex unsafe.Pointer //*IndexArray: an array of indices, each tagged with a time range
273 | 
274 |     // temporary (incremental) index supporting real-time updates; built in memory on a double buffer, allowing lock-free concurrent reads and writes;
275 |     // merged into an auxiliary index when memory runs low
276 |     incrIndex unsafe.Pointer
277 | 
278 |     //deleteList []index.Doc //delete docs list. update doc = delete old doc and create new one
279 |     //BloomFilter *bloom.Filter //a cuckoo filter would be even more efficient
280 |     roaringFilter *roaring.Bitmap //todo: how to purge expired entries
281 | 
282 |     model *serving.ParaphraseModel //todo: better moved to the search server
283 | 
284 |     indexFile string
285 | }
286 | 
287 | func NewSearcher(file string) *Searcher {
288 |     srh := &Searcher{
289 |         fullIndex: unsafe.Pointer(index.NewBTreeIndex(file)),
290 |         auxIndex:  unsafe.Pointer(NewIndexArray().WithFile(file + ".aux." + strconv.Itoa(int(time.Now().Unix())))),
291 |         incrIndex: unsafe.Pointer(NewDoubleBuffer().WithDataRange(0)),
292 |         //deleteList: make([]index.Doc, 0),
293 |         //BloomFilter: bloom.New(10000, 1000),
294 |         roaringFilter: roaring.New(),
295 |         model:         nil,
296 |         indexFile:     file,
297 |     }
298 |     return srh
299 | }
300 | 
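Putting the three tiers together, a hedged usage sketch mirroring what searcher_test.go below exercises (the index path is hypothetical, and the sleep only gives the asynchronous Drain goroutine time to fold the in-memory buffer into a time-ranged auxiliary index):

package search

import (
	"fmt"
	"time"

	"github.com/awesomefly/easysearch/index"
)

func ExampleSearcher_tiers() {
	srh := NewSearcher("../data/example_idx") // hypothetical index path

	// New docs land in the in-memory incremental index first.
	srh.Add(index.Document{ID: 1, Text: "a donut on a glass plate", Timestamp: int(time.Now().Unix())})

	// Drain folds the in-memory double buffer into a time-ranged
	// auxiliary index; it runs asynchronously, hence the sleep.
	srh.Drain(0)
	time.Sleep(2 * time.Second)

	// Search consults the full index, every auxiliary index, and the
	// incremental index, and unions the results.
	fmt.Println(len(srh.Search("donut")) > 0)
	srh.Clear()
}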
301 | func (srh *Searcher) InitParaphrase(file string) {
302 |     srh.model = serving.NewModel(file)
303 | }
304 | 
305 | func (srh *Searcher) Paraphrase(texts []string, n int) []string {
306 |     if srh.model == nil {
307 |         return nil
308 |     }
309 |     var (
310 |         positive = texts
311 |         negative []string
312 |     )
313 |     l := len(texts)
314 |     sim := srh.model.GetSimilar(positive, negative, l+n)
315 |     return sim[l:]
316 | }
317 | 
318 | // Add appends a doc to the incremental index's double buffer asynchronously.
319 | // Writes go through the queue; reads need no lock.
320 | func (srh *Searcher) Add(doc index.Document) {
321 |     incr := (*DoubleBuffer)(atomic.LoadPointer(&srh.incrIndex))
322 | 
323 |     //crossed a day boundary: persist the old buffer and start a new incremental index
324 |     end := incr.ReadIndex().Property().DataRange().End
325 |     if doc.Timestamp > end {
326 |         srh.Drain(end)
327 |     }
328 | 
329 |     //Drain may have swapped the buffer, so reload the pointer
330 |     (*DoubleBuffer)(atomic.LoadPointer(&srh.incrIndex)).Add(doc)
331 | }
332 | 
333 | // Del removes a doc from the index.
334 | func (srh *Searcher) Del(doc index.Document) {
335 |     //todo: needs locking
336 |     srh.roaringFilter.Add(uint32(doc.ID))
337 | }
338 | 
339 | func (srh *Searcher) Count() int {
340 |     a := (*index.BTreeIndex)(atomic.LoadPointer(&srh.fullIndex)).Property().DocNum()
341 |     copyData := (*IndexArray)(atomic.LoadPointer(&srh.auxIndex)).Indices()
342 |     for i := 0; i < len(copyData); i++ {
343 |         a += copyData[i].Property().DocNum()
344 |     }
345 |     a += (*DoubleBuffer)(atomic.LoadPointer(&srh.incrIndex)).ReadIndex().Property().DocNum()
346 |     return a
347 | }
348 | 
349 | func (srh *Searcher) Clear() {
350 |     (*index.BTreeIndex)(atomic.LoadPointer(&srh.fullIndex)).Clear()
351 |     copyData := (*IndexArray)(atomic.LoadPointer(&srh.auxIndex)).Indices()
352 |     for i := 0; i < len(copyData); i++ {
353 |         copyData[i].Clear()
354 |     }
355 | }
356 | 
357 | // Drain persists the incremental index to disk.
358 | // A true in-place update strategy would need spare space reserved at the tail of each posting list; otherwise many posting lists must be relocated, which is even slower.
359 | // When disk space allows, a re-merge strategy is used instead: simple to implement and friendly to concurrent access, but it needs enough memory.
360 | func (srh *Searcher) Drain(timestamp int) {
361 |     oldIncr := (*DoubleBuffer)(atomic.SwapPointer(&srh.incrIndex, unsafe.Pointer(NewDoubleBuffer().WithDataRange(int64(timestamp)))))
362 |     go func() {
363 |         //flush after a short sleep
364 |         time.Sleep(100 * time.Millisecond)
365 |         oldIncr.Flush()
366 |         oldIncr.Stop()
367 | 
368 |         oldIncrDR := oldIncr.ReadIndex().Property().DataRange()
369 |         auxIdxArray := (*IndexArray)(atomic.LoadPointer(&srh.auxIndex))
370 |         oldAux := auxIdxArray.Hit(oldIncrDR)
371 |         if oldAux != nil {
372 |             //merge the key sets
373 |             keys := make(sort.StringSlice, 0, len(oldIncr.ReadIndex().Map())+int(oldAux.BT.Count()))
374 |             for k := range oldIncr.ReadIndex().Map() {
375 |                 keys = append(keys, k)
376 |             }
377 |             ch := oldAux.BT.KeySet()
378 |             for {
379 |                 key := <-ch
380 |                 if key == nil {
381 |                     break
382 |                 }
383 |                 keys = append(keys, string(key))
384 |             }
385 |             sort.Strings(keys)
386 |             keys = keys[:set.Uniq(keys)]
387 | 
388 |             //merge into a new index
389 |             newAux := index.NewBTreeIndex(srh.indexFile + ".aux." + strconv.Itoa(int(time.Now().Unix())))
390 |             for i := 0; i < keys.Len(); i++ {
391 |                 key := keys[i]
392 |                 pl := oldAux.Lookup(key, false)
393 |                 if pl2 := oldIncr.ReadIndex().Get(key); pl2 != nil {
394 |                     pl = append(pl, pl2...)
395 |                 }
396 |                 if len(pl) > 0 {
397 |                     newAux.Insert(key, pl)
398 |                 }
399 |             }
400 |             newAux.SetProperty(*oldAux.Property())
401 |             newAux.Property().SetDocNum(oldIncr.ReadIndex().Property().DocNum() + oldAux.Property().DocNum())
402 |             newAux.Property().SetTokenCount(oldIncr.ReadIndex().Property().TokenCount() + oldAux.Property().TokenCount())
403 |             newAux.BT.Drain()
404 | 
405 |             //oldAux = (*index.BTreeIndex)(atomic.SwapPointer(&srh.auxIndex, unsafe.Pointer(newAux)))
406 |             if auxIdxArray.Swap(oldAux, newAux) {
407 |                 oldAux.Clear()
408 |                 oldIncr.Clear()
409 |             }
410 |         } else {
411 |             idx := index.NewBTreeIndex(srh.indexFile + ".aux." + strconv.Itoa(oldIncrDR.Start))
412 |             idx.Property().SetDataRange(oldIncrDR)
413 |             auxIdxArray.Add(idx)
414 |         }
415 |     }()
416 | }
417 | 
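Drain's re-merge branch boils down to a union of two sorted key sets followed by one rebuild pass per key. A small self-contained sketch of just that key-union step, using plain sort plus in-place deduplication (mergeKeys is illustrative, not part of the package):

package main

import (
	"fmt"
	"sort"
)

// mergeKeys unions the key sets of an on-disk auxiliary index and an
// in-memory index so each posting list gets rebuilt exactly once.
func mergeKeys(aux, incr []string) []string {
	keys := make([]string, 0, len(aux)+len(incr)) // length 0, capacity pre-sized
	keys = append(keys, aux...)
	keys = append(keys, incr...)
	sort.Strings(keys)
	// de-duplicate in place
	out := keys[:0]
	for i, k := range keys {
		if i == 0 || k != keys[i-1] {
			out = append(out, k)
		}
	}
	return out
}

func main() {
	fmt.Println(mergeKeys([]string{"cat", "dog"}, []string{"bird", "cat"}))
	// [bird cat dog]
}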
418 | // Load loads an index file; used when an index has been rebuilt offline.
419 | func (srh *Searcher) Load(file string, flag IndexType) {
420 |     newIndex := index.NewBTreeIndex(file)
421 |     auxIdxArray := (*IndexArray)(atomic.LoadPointer(&srh.auxIndex))
422 | 
423 |     evicts := auxIdxArray.Evict(newIndex.Property().DataRange())
424 |     switch flag {
425 |     case FullIndex:
426 |         old := (*index.BTreeIndex)(atomic.SwapPointer(&srh.fullIndex, unsafe.Pointer(newIndex)))
427 |         evicts = append(evicts, old)
428 |     case AuxIndex:
429 |         auxIdxArray.Add(newIndex) //evict first, then add, so the new index cannot evict itself
430 |         //old = (*index.BTreeIndex)(atomic.SwapPointer(&srh.auxIndex, unsafe.Pointer(newIndex)))
431 |     }
432 | 
433 |     for i := 0; i < len(evicts); i++ {
434 |         evicts[i].Clear()
435 |     }
436 | }
437 | 
438 | //SearchTips todo: support search-as-you-type suggestions.
439 | //A trie suits an English dictionary; but when the system holds many strings with hardly any common prefixes, the trie becomes very memory-hungry.
440 | //A double-array trie suits a Chinese dictionary, with a small memory footprint.
441 | func (srh *Searcher) SearchTips() []string {
442 |     //could be backed by a trie or an FST
443 |     return nil
444 | }
445 | 
446 | func (srh *Searcher) Retrieval(terms []string, ext []string, model index.SearchModel) []index.Doc {
447 |     var result []index.Doc
448 | 
449 |     fullIdx := (*index.BTreeIndex)(atomic.LoadPointer(&srh.fullIndex))
450 |     auxIdxArray := (*IndexArray)(atomic.LoadPointer(&srh.auxIndex))
451 |     incrIdx := (*DoubleBuffer)(atomic.LoadPointer(&srh.incrIndex)).ReadIndex()
452 | 
453 |     result = fullIdx.Retrieval(terms, ext, nil, 10, 1000, model)
454 | 
455 |     copyData := auxIdxArray.Indices()
456 |     for i := 0; i < len(copyData); i++ {
457 |         y := copyData[i].Retrieval(terms, ext, nil, 10, 1000, model)
458 |         (*index.PostingList)(&result).Union(y)
459 |     }
460 | 
461 |     z := incrIdx.Retrieval(terms, ext, nil, 10, 1000, model)
462 |     (*index.PostingList)(&result).Union(z)
463 |     return result
464 | }
465 | 
466 | // Filter drops docs that have been marked deleted.
467 | func (srh *Searcher) Filter(docs []index.Doc) []index.Doc {
468 |     var result []index.Doc
469 |     for _, doc := range docs {
470 |         hit := srh.roaringFilter.Contains(uint32(doc.ID))
471 |         if !hit {
472 |             result = append(result, doc)
473 |         }
474 |     }
475 |     return result
476 | }
477 | 
478 | // Search queries the index for the given text.
479 | // todo: retrieval (multi-channel recall) -> coarse ranking (CTR via LR) -> fine ranking (CVR via DNN) -> topN (heap sort)
480 | func (srh *Searcher) Search(query string) []index.Doc {
481 |     //todo: support prefix search
482 |     //Reference: Lucene builds an inverted index using Skip-Lists on disk,
483 |     //and then loads a mapping for the indexed terms into memory using a Finite State Transducer (FST).
484 | 
485 |     //1. Query rewrite. todo: support spelling correction and intent recognition
486 |     //1.1 text preprocessing: tokenization, stop-word removal, stemming
487 |     terms := util.Analyze(query)
488 |     //1.2 semantic expansion: synonyms and near-synonyms
489 |     ext := srh.Paraphrase(terms, 3)
490 | 
491 |     //2. todo: multi-channel recall (traditional retrieval + vector retrieval)
492 |     r := srh.Retrieval(terms, ext, index.BM25)
493 | 
494 |     //3. filter out deleted documents
495 |     r = srh.Filter(r)
496 |     return r
497 | }
498 | 
--------------------------------------------------------------------------------
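Del and Filter above implement tombstone-style deletion: deleted doc IDs go into a roaring bitmap, and retrieval results are screened against it at query time, so nothing has to be rewritten in the posting lists. A self-contained sketch of the same pattern using the RoaringBitmap package already imported by searcher.go:

package main

import (
	"fmt"

	"github.com/RoaringBitmap/roaring"
)

func main() {
	deleted := roaring.New()
	deleted.Add(7) // mark doc 7 as deleted

	docs := []uint32{3, 7, 11}
	kept := docs[:0]
	for _, id := range docs {
		if !deleted.Contains(id) {
			kept = append(kept, id)
		}
	}
	fmt.Println(kept) // [3 11]
}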
/search/searcher_test.go:
--------------------------------------------------------------------------------
1 | package search
2 | 
3 | import (
4 |     "fmt"
5 |     "github.com/stretchr/testify/assert"
6 |     "math/rand"
7 |     "runtime"
8 |     "sync/atomic"
9 |     "testing"
10 |     "time"
11 | 
12 |     "github.com/awesomefly/easysearch/index"
13 | )
14 | 
15 | var letters = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
16 | 
17 | func randSeq(n int) string {
18 |     b := make([]rune, n)
19 |     for i := range b {
20 |         b[i] = letters[rand.Intn(len(letters))]
21 |     }
22 |     return string(b)
23 | }
24 | 
25 | func get(srh *Searcher, text string) []int {
26 |     rt := (*DoubleBuffer)(atomic.LoadPointer(&srh.incrIndex))
27 |     return (index.PostingList)(rt.ReadIndex().Get(text)).IDs()
28 | }
29 | 
30 | var searcher = NewSearcher("../data/test_insread") //must be a package-level variable
31 | func BenchmarkDoubleBuffer(b *testing.B) {
32 |     //b.N = 10000
33 |     rand.Seed(time.Now().UnixNano())
34 |     fmt.Println(runtime.GOMAXPROCS(0))
35 | 
36 |     fmt.Println("start")
37 |     searcher.Add(index.Document{ID: 1, Text: "A donut on a glass plate. Only the donuts."})
38 |     for i := 0; i < b.N; i++ {
39 |         searcher.Add(index.Document{ID: 1, Text: randSeq(5)})
40 |         get(searcher, "donut")
41 |     }
42 |     fmt.Println("done")
43 | }
44 | 
45 | func BenchmarkDoubleBufferParallel(b *testing.B) {
46 |     rand.Seed(time.Now().UnixNano())
47 |     fmt.Println(runtime.GOMAXPROCS(0))
48 |     searcher.Add(index.Document{ID: 1, Text: "A donut on a glass plate. Only the donuts."})
49 | 
50 |     // exercise the object under many concurrent goroutines to check it is safe
51 |     b.SetParallelism(10000) //total goroutines: b.parallelism * runtime.GOMAXPROCS(0)
52 |     b.RunParallel(func(pb *testing.PB) {
53 |         for pb.Next() { //the b.N iterations are shared out across the goroutines
54 |             t := randSeq(5)
55 |             searcher.Add(index.Document{ID: 1, Text: t})
56 |             get(searcher, "donut")
57 |             get(searcher, t)
58 |         }
59 |     })
60 | }
61 | func TestSearcherLoad(t *testing.T) {
62 |     searcher.Add(index.Document{ID: 1, Text: "A donut on a glass plate.", Timestamp: int(time.Now().Unix())})                 //today's document
63 |     searcher.Add(index.Document{ID: 2, Text: "Only the donuts.", Timestamp: int(time.Now().AddDate(0, 0, 1).Unix())})          //next day's document
64 |     time.Sleep(2 * time.Second)
65 | 
66 |     copyData := (*IndexArray)(atomic.LoadPointer(&searcher.auxIndex)).Indices()
67 |     fmt.Printf("1index len:%d\n", len(copyData))
68 |     assert.Equal(t, 1, len(copyData)) //triggers an index split; today's index file has been persisted by now
69 | 
70 | 
71 |     searcher.Drain(0)
72 |     time.Sleep(2 * time.Second)
73 |     copyData = (*IndexArray)(atomic.LoadPointer(&searcher.auxIndex)).Indices()
74 |     fmt.Printf("2index len:%d\n", len(copyData))
75 |     assert.Equal(t, 2, len(copyData)) //manually persist the next day's document
76 | 
77 | 
78 |     newIndex := index.NewBTreeIndex("../data/test_insread_xxx")
79 | 
80 |     ts := time.Now()
81 |     start := time.Date(ts.Year(), ts.Month(), ts.Day(), 0, 0, 0, 0, ts.Location()).Unix()
82 |     ts = ts.AddDate(0, 0, 3)
83 |     end := time.Date(ts.Year(), ts.Month(), ts.Day(), 0, 0, 0, 0, ts.Location()).Unix()
84 |     newIndex.Property().SetDataRange(index.DataRange{Start: int(start), End: int(end)})
85 | 
86 |     newIndex.Add([]index.Document{{ID: 3, Text: "god is girl."}})
87 |     newIndex.Close()
88 | 
89 |     searcher.Load("../data/test_insread_xxx", AuxIndex)
90 |     copyData = (*IndexArray)(atomic.LoadPointer(&searcher.auxIndex)).Indices()
91 |     fmt.Printf("3index len:%d\n", len(copyData))
92 |     assert.Equal(t, 1, len(copyData)) //the loaded index covers and evicts both older aux indices
93 | 
94 |     rst := searcher.Search("girl")
95 |     fmt.Printf("%+v", rst)
96 |     assert.Equal(t, 1, len(rst))
97 | 
98 |     searcher.Clear()
99 | }
100 | 
101 | func TestSearcher(t *testing.T) {
102 |     searcher.Add(index.Document{ID: 1, Text: "A donut on a glass plate. Only the donuts."})
103 |     for i := 0; i < 12; i++ {
104 |         searcher.Add(index.Document{ID: 10 + i, Text: randSeq(5)})
105 |     }
106 |     time.Sleep(2 * time.Second)
107 |     fmt.Printf("count:%d\n", searcher.Count())
108 |     assert.Equal(t, 13, searcher.Count()) //by default the double buffer swaps once 10 docs queue up
109 | 
110 |     searcher.Drain(0)
111 |     time.Sleep(2 * time.Second)
112 | 
113 |     var a int
114 |     copyData := (*IndexArray)(atomic.LoadPointer(&searcher.auxIndex)).Indices()
115 |     for i := 0; i < len(copyData); i++ {
116 |         a += copyData[i].Property().DocNum()
117 |     }
118 |     fmt.Printf("index len:%d\n", len(copyData))
119 |     fmt.Printf("auxIndex count:%d\n", a)
120 |     assert.Equal(t, 13, a)
121 | 
122 |     rst := searcher.Search("donut")
123 |     fmt.Printf("%+v", rst)
124 |     assert.Equal(t, 1, len(rst))
125 | 
126 |     //Del&Filter
127 |     searcher.Del(index.Document{ID: 1})
128 |     rst = searcher.Search("donut")
129 |     assert.Equal(t, 0, len(rst))
130 | 
131 |     //Clear
132 |     searcher.Clear()
133 | }
--------------------------------------------------------------------------------
/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | CURDIR=$(cd "$(dirname "$0")"; pwd)
3 | echo "$CURDIR"
4 | 
5 | BIN=easysearch
6 | 
7 | if [ "$1" = "standalone" ]; then
8 |     ps -ef|grep $BIN|grep -v "grep"|awk -F " " '{print $2}'|xargs kill -9
9 |     sleep 1
10 | 
11 |     ./$BIN -m cluster --servername=all >> ${CURDIR}/err.log 2>&1 &
12 |     echo "started standalone cluster "
13 | elif [ "$1" = "manager" ]; then
14 |     ./$BIN -m cluster --servername=managerserver >> ${CURDIR}/err.log 2>&1 &
15 | elif [ "$1" = "data" ]; then
16 |     ./$BIN -m cluster --servername=dataserver >> ${CURDIR}/err.log 2>&1 &
17 | elif [ "$1" = "search" ]; then
18 |     ./$BIN -m cluster --servername=searchserver >> ${CURDIR}/err.log 2>&1 &
19 | elif [ "$1" = "kill" ]; then
20 |     ps -ef|grep $BIN|grep -v "grep"|awk -F " " '{print $2}'|xargs kill -9
21 | fi
22 | 
23 | 
24 | 
25 | 
--------------------------------------------------------------------------------
/util/collection.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | // InterInt returns the set intersection between a and b.
4 | // a and b have to be sorted in ascending order and contain no duplicates.
5 | func InterInt(a []int, b []int) []int {
6 |     maxLen := len(a)
7 |     if len(b) > maxLen {
8 |         maxLen = len(b)
9 |     }
10 |     r := make([]int, 0, maxLen)
11 |     var i, j int
12 |     for i < len(a) && j < len(b) {
13 |         if a[i] < b[j] {
14 |             i++
15 |         } else if a[i] > b[j] {
16 |             j++
17 |         } else {
18 |             r = append(r, a[i])
19 |             i++
20 |             j++
21 |         }
22 |     }
23 |     return r
24 | }
25 | 
26 | // MergeInt returns the unique set union of a and b.
27 | // a and b have to be sorted in ascending order and contain no duplicates.
28 | func MergeInt(a []int, b []int) []int {
29 |     if a == nil {
30 |         return b
31 |     }
32 |     if b == nil {
33 |         return a
34 |     }
35 |     r := make([]int, 0, len(a)+len(b))
36 |     var i, j int
37 |     for i < len(a) && j < len(b) {
38 |         if a[i] < b[j] {
39 |             r = append(r, a[i])
40 |             i++
41 |         } else if a[i] > b[j] {
42 |             r = append(r, b[j])
43 |             j++
44 |         } else {
45 |             r = append(r, a[i])
46 |             i++
47 |             j++
48 |         }
49 |     }
50 |     r = append(r, a[i:]...) //append whichever tail remains
51 |     r = append(r, b[j:]...)
52 |     return r
53 | }
54 | 
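InterInt and MergeInt are the AND/OR primitives for combining sorted, duplicate-free posting lists at query time. A quick example-style sketch in package util:

package util

import "fmt"

func ExampleMergeInt() {
	and := InterInt([]int{1, 3, 5}, []int{3, 5, 8}) // docs matching both terms
	or := MergeInt([]int{1, 3, 5}, []int{3, 5, 8})  // docs matching either term
	fmt.Println(and, or)
	// Output: [3 5] [1 3 5 8]
}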
55 | // DiffInt returns the symmetric difference of a and b.
56 | // a and b have to be sorted in ascending order and contain no duplicates.
57 | func DiffInt(a []int, b []int) []int {
58 |     minLen := len(a)
59 |     if len(b) < minLen {
60 |         minLen = len(b)
61 |     }
62 |     r := make([]int, 0, minLen)
63 |     var i, j int
64 |     for i < len(a) && j < len(b) {
65 |         if a[i] < b[j] {
66 |             r = append(r, a[i])
67 |             i++
68 |         } else if a[i] > b[j] {
69 |             r = append(r, b[j])
70 |             j++
71 |         } else {
72 |             i++
73 |             j++
74 |         }
75 |     }
76 |     r = append(r, a[i:]...) //append whichever tail remains
77 |     r = append(r, b[j:]...)
78 |     return r
79 | }
80 | 
81 | // FilterInt returns the elements of a that are not in b (a minus b).
82 | // a and b have to be sorted in ascending order, contain no duplicates,
83 | // and hold non-negative values; note that a is modified in place.
84 | func FilterInt(a []int, b []int) []int {
85 |     var i, j int
86 |     for i < len(a) && j < len(b) {
87 |         if a[i] < b[j] {
88 |             i++
89 |         } else if a[i] > b[j] {
90 |             j++
91 |         } else {
92 |             a[i] = -1 //mark for removal
93 |             i++
94 |             j++
95 |         }
96 |     }
97 |     r := make([]int, 0)
98 |     for _, v := range a {
99 |         if v != -1 {
100 |             r = append(r, v)
101 |         }
102 |     }
103 |     return r
104 | }
--------------------------------------------------------------------------------
/util/condition.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | func IfElseInt(condition bool, o1 int, o2 int) int {
4 |     if condition {
5 |         return o1
6 |     }
7 |     return o2
8 | }
--------------------------------------------------------------------------------
/util/filter.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | import (
4 |     "strings"
5 | 
6 |     snowballeng "github.com/kljensen/snowball/english"
7 | )
8 | 
9 | // lowercaseFilter returns a slice of tokens normalized to lower case.
10 | func lowercaseFilter(tokens []string) []string {
11 |     r := make([]string, len(tokens))
12 |     for i, token := range tokens {
13 |         r[i] = strings.ToLower(token)
14 |     }
15 |     return r
16 | }
17 | 
18 | // stopwordFilter returns a slice of tokens with stop words removed.
19 | func stopwordFilter(tokens []string) []string {
20 |     var stopwords = map[string]struct{}{
21 |         "a": {}, "and": {}, "be": {}, "have": {}, "i": {},
22 |         "in": {}, "of": {}, "that": {}, "the": {}, "to": {},
23 |     }
24 |     r := make([]string, 0, len(tokens))
25 |     for _, token := range tokens {
26 |         if _, ok := stopwords[token]; !ok {
27 |             r = append(r, token)
28 |         }
29 |     }
30 |     return r
31 | }
32 | 
33 | // stemmerFilter returns a slice of stemmed tokens.
34 | func stemmerFilter(tokens []string) []string {
35 |     r := make([]string, len(tokens))
36 |     for i, token := range tokens {
37 |         r[i] = snowballeng.Stem(token, false)
38 |     }
39 |     return r
40 | }
41 | 
--------------------------------------------------------------------------------
/util/filter_test.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | import (
4 |     "testing"
5 | 
6 |     "github.com/stretchr/testify/assert"
7 | )
8 | 
9 | func TestLowercaseFilter(t *testing.T) {
10 |     var (
11 |         in  = []string{"Cat", "DOG", "fish"}
12 |         out = []string{"cat", "dog", "fish"}
13 |     )
14 |     assert.Equal(t, out, lowercaseFilter(in))
15 | }
16 | 
17 | func TestStopwordFilter(t *testing.T) {
18 |     var (
19 |         in  = []string{"i", "am", "the", "cat"}
20 |         out = []string{"am", "cat"}
21 |     )
22 |     assert.Equal(t, out, stopwordFilter(in))
23 | }
24 | 
25 | func TestStemmerFilter(t *testing.T) {
26 |     var (
27 |         in  = []string{"cat", "cats", "fish", "fishing", "fished", "airline"}
28 |         out = []string{"cat", "cat", "fish", "fish", "fish", "airlin"}
29 |     )
30 |     assert.Equal(t, out, stemmerFilter(in))
31 | }
--------------------------------------------------------------------------------
/util/net.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | import (
4 |     "fmt"
5 |     "net"
6 | )
7 | 
8 | func GetLocalIP() string {
9 |     list, err := net.Interfaces()
10 |     if err != nil {
11 |         panic(err)
12 |     }
13 | 
14 |     for i, iface := range list {
15 |         fmt.Printf("%d name=%s %v\n", i, iface.Name, iface)
16 |         addrs, err := iface.Addrs()
17 |         if err != nil {
18 |             panic(err)
19 |         }
20 |         for j, addr := range addrs {
21 |             fmt.Printf(" %d %v\n", j, addr)
22 |             var ip net.IP
23 |             switch v := addr.(type) {
24 |             case *net.IPNet:
25 |                 ip = v.IP
26 |             case *net.IPAddr:
27 |                 ip = v.IP
28 |             }
29 | 
30 |             if !ip.IsLoopback() && ip.To4() != nil {
31 |                 return ip.String()
32 |             }
33 |         }
34 |     }
35 |     return ""
36 | }
--------------------------------------------------------------------------------
/util/tokenizer.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | import (
4 |     "strings"
5 |     "unicode"
6 | )
7 | 
8 | // tokenize returns a slice of tokens for the given text.
9 | func tokenize(text string) []string {
10 |     return strings.FieldsFunc(text, func(r rune) bool {
11 |         // Split on any character that is not a letter or a number.
12 |         return !unicode.IsLetter(r) && !unicode.IsNumber(r)
13 |     })
14 | }
15 | 
16 | // Analyze analyzes the text and returns a slice of tokens.
17 | func Analyze(text string) []string {
18 |     tokens := tokenize(text)
19 |     tokens = lowercaseFilter(tokens)
20 |     tokens = stopwordFilter(tokens)
21 |     tokens = stemmerFilter(tokens) //stemming: smiling -> smile
22 |     return tokens
23 | }
--------------------------------------------------------------------------------
/util/tokenizer_test.go:
--------------------------------------------------------------------------------
1 | package util
2 | 
3 | import (
4 |     "testing"
5 | 
6 |     "github.com/stretchr/testify/assert"
7 | )
8 | 
9 | func TestTokenizer(t *testing.T) {
10 |     testCases := []struct {
11 |         text   string
12 |         tokens []string
13 |     }{
14 |         {
15 |             text:   "",
16 |             tokens: []string{},
17 |         },
18 |         {
19 |             text:   "a",
20 |             tokens: []string{"a"},
21 |         },
22 |         {
23 |             text:   "small wild,cat!",
24 |             tokens: []string{"small", "wild", "cat"},
25 |         },
26 |     }
27 | 
28 |     for _, tc := range testCases {
29 |         t.Run(tc.text, func(st *testing.T) {
30 |             assert.EqualValues(st, tc.tokens, tokenize(tc.text))
31 |         })
32 |     }
33 | }
34 | 
--------------------------------------------------------------------------------
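End to end, Analyze chains the tokenizer with the filters from util/filter.go: tokenize, lowercase, drop stop words, stem. A small example sketch; the expected output follows the stemming behavior asserted in filter_test.go ("fishing" -> "fish", "cats" -> "cat") and the "smiling -> smile" behavior noted in the comment above.

package util

import "fmt"

func ExampleAnalyze() {
	fmt.Println(Analyze("Fishing cats, smiling!"))
	// Output: [fish cat smile]
}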