├── .github └── workflows │ └── rust.yml ├── .gitignore ├── CODEOWNERS ├── CONTRIBUTING.md ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── all-pairs-hamming ├── Cargo.toml ├── README.md ├── src │ ├── bitset64.rs │ ├── chunked_join.rs │ ├── errors.rs │ ├── lib.rs │ ├── multi_sort.rs │ ├── simple_join.rs │ └── sketch.rs └── timeperf │ ├── Cargo.toml │ └── src │ └── main.rs ├── figures ├── f1_reuters.svg ├── mae_reuters.svg └── recall_reuters.svg ├── find-simdoc-cli ├── Cargo.toml └── src │ ├── cosine.rs │ ├── dump.rs │ ├── jaccard.rs │ └── minhash_acc.rs ├── find-simdoc ├── Cargo.toml ├── README.md ├── examples │ ├── find_cosine.rs │ └── find_jaccard.rs └── src │ ├── cosine.rs │ ├── errors.rs │ ├── feature.rs │ ├── jaccard.rs │ ├── lib.rs │ ├── lsh.rs │ ├── lsh │ ├── minhash.rs │ └── simhash.rs │ ├── shingling.rs │ └── tfidf.rs └── scripts └── load_nltk_dataset.py /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | name: Check on ${{ matrix.rust }} 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | rust: 19 | - stable 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Install latest stable 23 | uses: actions-rs/toolchain@v1 24 | with: 25 | toolchain: ${{ matrix.rust }} 26 | override: true 27 | components: rustfmt, clippy 28 | 29 | - name: Run cargo check 30 | uses: actions-rs/cargo@v1 31 | with: 32 | command: check 33 | 34 | - name: Run cargo fmt 35 | uses: actions-rs/cargo@v1 36 | with: 37 | command: fmt 38 | args: --all -- --check 39 | 40 | - name: Run cargo clippy 41 | uses: actions-rs/cargo@v1 42 | with: 43 | command: clippy 44 | args: -- -D warnings -W clippy::nursery 45 | 46 | - name: Run cargo test 47 | uses: actions-rs/cargo@v1 48 | continue-on-error: ${{ matrix.rust == 'nightly' }} 49 | with: 50 | command: test 51 | args: --release --all-features 52 | 53 | - name: Run cargo doc 54 | uses: actions-rs/cargo@v1 55 | continue-on-error: ${{ matrix.rust == 'nightly' }} 56 | with: 57 | command: doc 58 | args: --no-deps 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | 12 | 13 | # Added by cargo 14 | 15 | /target 16 | 17 | # My def 18 | reuters.txt 19 | reuters.5k.txt 20 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @kampersanda 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | We'd love to accept your patches and contributions to this project. 4 | There are just a few small guidelines you need to follow. 
5 | 6 | - You need to acknowledge that your patches and contributions never conflict 7 | with any intellectual properties held by others than LegalForce. 8 | - All submissions, including submissions by project members, require review. 9 | We use GitHub pull requests for this purpose. 10 | Consult GitHub Help for more information on using pull requests. -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "all-pairs-hamming", 4 | "all-pairs-hamming/timeperf", 5 | "find-simdoc", 6 | "find-simdoc-cli", 7 | ] 8 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 legalforce-research 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Finding all pairs of similar documents 2 | 3 | [![Crates.io](https://img.shields.io/crates/v/find-simdoc)](https://crates.io/crates/find-simdoc) 4 | [![Documentation](https://docs.rs/find-simdoc/badge.svg)](https://docs.rs/find-simdoc) 5 | ![Build Status](https://github.com/legalforce-research/find-simdoc/actions/workflows/rust.yml/badge.svg) 6 | 7 | This software provides time- and memory-efficient all pairs similarity searches in documents. 8 | 9 | ## Problem definition 10 | 11 | - Input 12 | - List of documents $D = (d_1, d_2, \dots, d_n)$ 13 | - Distance function $\delta: D \times D \rightarrow [0,1]$ 14 | - Radius threshold $r \in [0,1]$ 15 | - Output 16 | - All pairs of similar document ids $R = \\{ (i,j): i < j, \delta(d_i, d_j) \leq r \\}$ 17 | 18 | ## Features 19 | 20 | ### Easy to use 21 | 22 | This software supports all essential steps of document similarity search, 23 | from feature extraction to output of similar pairs. 24 | Therefore, you can immediately try the fast all pairs similarity search using your document files. 25 | 26 | ### Flexible tokenization 27 | 28 | You can specify any delimiter when splitting words in tokenization for feature extraction. 29 | This can be useful in languages where multiple definitions of words exist, such as Japanese or Chinese. 30 | 31 | ### Time and memory efficiency 32 | 33 | The time and memory complexities are *linear* over the numbers of input documents and output results 34 | on the basis of the ideas behind the locality sensitive hashing (LSH) and [sketch sorting approach](https://proceedings.mlr.press/v13/tabei10a.html). 35 | 36 | ### Tunable search performance 37 | 38 | LSH allows tuning of performance in accuracy, time, and memory, through a manual parameter specifying search dimensions. 39 | You can flexibly perform searches depending on your dataset and machine environment. 40 | - Specifying lower dimensions allows for faster and rougher searches with less memory usage. 41 | - Specifying higher dimensions allows for more accurate searches with more memory usage. 42 | 43 | ### Pure Rust 44 | 45 | This software is implemented in Rust, achieving safe and fast performance. 46 | 47 | ## Running example 48 | 49 | Here, we describe the basic usage of this software through an example of running the CLI tool. 50 | 51 | First of all, install `rustc` and `cargo` following the [official instructions](https://www.rust-lang.org/tools/install) since this software is implemented in Rust. 52 | 53 | ### 1. Data preparation 54 | 55 | You have to prepare a text file containing documents line by line (NOT including empty lines). 56 | 57 | To produce an example file used throughout this description, you can use `scripts/load_nltk_dataset.py` that downloads the Reuters Corpus provided by NLTK. 58 | Run the following command. 59 | 60 | ``` 61 | $ ./scripts/load_nltk_dataset.py reuters 62 | ``` 63 | 64 | `reuters.txt` will be output. 65 | 66 | ``` 67 | $ head reuters.txt 68 | hre properties & lt ; hre > 1st qtr jan 31 net shr 38 cts vs 47 cts net 2 , 253 , 664 vs 2 , 806 , 820 gross income 5 , 173 , 318 vs 5 , 873 , 904 note : net includes gains on sale of real estate of 126 , 117 dlrs vs 29 , 812 dlrs . 
69 | the firm , however , is supplying temporary financing , and sources close to the transaction disputed the claim that the firm will not end up paying for its equity position .
70 | conoco , which has completed geological prospecting for the tunisian government , has transferred one third of its option rights in the region to ina , it said .
71 | " willis faber ' s stake in morgan grenfell has been a very successful investment ," it said .
72 | china reports 700 mln dlr two - month trade deficit china ' s trade deficit totalled 700 mln dlrs in the first two months of this year , according to figures released by the state statistics bureau .
73 | the treasury said baker and stoltenberg " are consulting with their g - 7 colleagues and are confident that this will enable them to foster exchange rate stability around current levels ."
74 | u . s . tariffs are due to take effect on april 17 .
75 | some dealers said there were growing signs the united states wanted the dollar to fall further .
76 | since last august smart has been leading talks to open up japan to purchases of more u . s .- made automotive parts .
77 | the resulting association will operate under the name of charter and will be based in bristol .
78 | ```
79 | 
80 | Fully duplicated documents in `reuters.txt` are removed because they add noise to the evaluation of similarity searches.
81 | In this process, the output lines are also shuffled, so your file will not be identical to the example above.
82 | 
83 | ### 2. Finding all pairs of similar documents
84 | 
85 | The workspace `find-simdoc-cli` provides CLI tools for fast all pairs similarity searches in documents.
86 | The approach consists of three steps:
87 | 
88 | 1. Extract features from documents
89 |    - Set representation of character or word ngrams
90 |    - Tfidf-weighted vector representation of character or word ngrams
91 | 2. Convert the features into binary sketches through locality sensitive hashing (LSH)
92 |    - [1-bit minwise hashing](https://dl.acm.org/doi/abs/10.1145/1772690.1772759) for the Jaccard similarity
93 |    - [Simplified simhash](https://dl.acm.org/doi/10.1145/1242572.1242592) for the Cosine similarity
94 | 3. Search for similar sketches in the Hamming space using a modified variant of the [sketch sorting approach](https://proceedings.mlr.press/v13/tabei10a.html)
95 | 
96 | #### 2.1 Jaccard space
97 | 
98 | The executable `jaccard` provides a similarity search in the [Jaccard space](https://en.wikipedia.org/wiki/Jaccard_index).
99 | You can check the arguments with the following command.
100 | 
101 | ```
102 | $ cargo run --release -p find-simdoc-cli --bin jaccard -- --help
103 | ```
104 | 
105 | Run the following command if you want to search for `reuters.txt` with
106 | 
107 | - search radius `0.1`,
108 | - tokens of character `5`-grams, and
109 | - `15*64=960` dimensions in the Hamming space.
110 | 
111 | ```
112 | $ cargo run --release -p find-simdoc-cli --bin jaccard -- -i reuters.txt -r 0.1 -w 5 -c 15 > result-jaccard.csv
113 | ```
114 | 
115 | Argument `-c` specifies the number of 64-bit chunks, i.e., the number of dimensions in the Hamming space,
116 | a trade-off parameter between approximation accuracy and search speed.
117 | The larger this value, the higher the accuracy, but the longer the search takes.
118 | [This section](#4-testing-the-accuracy-of-1-bit-minwise-hashing) describes how to examine the approximation accuracy for the number of dimensions.
119 | 
120 | Pairs of similar documents (indicated by zero-origin line numbers) and their distances are reported.
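Each record in the CSV is a plain `i,j,dist` triple, so the output is easy to post-process. As a minimal sketch (illustrative only, not part of this repository), the following standalone Rust program reads the CSV from stdin and keeps only pairs within an extra, tighter threshold:

```
use std::io::{self, BufRead};

// Reads `i,j,dist` records from stdin (skipping the header line)
// and prints only the pairs whose distance is at most 0.05.
fn main() {
    for line in io::stdin().lock().lines().skip(1) {
        let line = line.unwrap();
        let mut fields = line.split(',');
        let i: usize = fields.next().unwrap().parse().unwrap();
        let j: usize = fields.next().unwrap().parse().unwrap();
        let dist: f64 = fields.next().unwrap().parse().unwrap();
        if dist <= 0.05 {
            println!("{i},{j},{dist}");
        }
    }
}
```

The first records of `result-jaccard.csv` look like the following.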
121 | 122 | ``` 123 | $ head result-jaccard.csv 124 | i,j,dist 125 | 191,29637,0.07291666666666667 126 | 199,38690,0.0375 127 | 274,10048,0.07083333333333333 128 | 294,27675,0.04791666666666667 129 | 311,13812,0.04583333333333333 130 | 361,50938,0.08958333333333333 131 | 469,6360,0.035416666666666666 132 | 546,10804,0.0875 133 | 690,28281,0.0875 134 | ``` 135 | 136 | #### 2.2 Cosine space 137 | 138 | The executable `cosine` provides a similarity search in the [Cosine space](https://en.wikipedia.org/wiki/Cosine_similarity). 139 | You can check the arguments with the following command. 140 | 141 | ``` 142 | $ cargo run --release -p find-simdoc-cli --bin cosine -- --help 143 | ``` 144 | 145 | Run the following command if you want to search for `reuters.txt` with 146 | 147 | - search radius `0.1`, 148 | - tokens of word `3`-grams, 149 | - word delimiter `" "` (i.e., a space), 150 | - `10*64=640` dimensions in the Hamming space, and 151 | - weighting using the standard TF and the smoothed IDF. 152 | 153 | ``` 154 | $ cargo run --release -p find-simdoc-cli --bin cosine -- -i reuters.txt -r 0.1 -d " " -w 3 -c 10 -T standard -I smooth > result-cosine.csv 155 | ``` 156 | 157 | Pairs of similar documents (indicated by zero-origin line numbers) and their distances are reported. 158 | 159 | ``` 160 | $ head result-cosine.csv 161 | i,j,dist 162 | 542,49001,0.084375 163 | 964,24198,0.09375 164 | 1872,3024,0.0859375 165 | 1872,6823,0.090625 166 | 1872,8462,0.0953125 167 | 1872,11402,0.090625 168 | 1872,18511,0.0859375 169 | 1872,41491,0.0875 170 | 1872,48344,0.0859375 171 | ``` 172 | 173 | ### 3. Printing similar documents 174 | 175 | The executable `dump` prints similar documents from an output CSV file. 176 | 177 | If you want to print similar documents in `reuters.txt` with the result `result-jaccard.csv`, 178 | run the following command. 179 | 180 | ``` 181 | $ cargo run --release -p find-simdoc-cli --bin dump -- -i reuters.txt -s result-jaccard.csv 182 | [i=191,j=29637,dist=0.07291666666666667] 183 | pending its deliberations , harper and row ' s board has postponed indefinitely a special meeting of stockholders that had been scheduled for april 2 to discuss a proposal to recapitalize the company ' s stock to create two classes of shares with different voting rights . 184 | pending its deliberations , harper and row ' s board has postponed indefinitely a special meeting of stockholders that had been scheduled for april 2 to discuss a proposal to recapitalize the company ' s stock in order to create two classes of shares with different votinmg rights . 185 | [i=199,j=38690,dist=0.0375] 186 | government officials had no immediate comment on the report , which advised a reduction in the overall size of the public investment programme and greater emphasis on the preservation of peru ' s export potential . 187 | government officials had no immediate comment on the report , which advised a reduction in the overall size of the public investment program and greater emphasis on the preservation of peru ' s export potential . 188 | [i=274,j=10048,dist=0.07083333333333333] 189 | the measure was adopted as part of a wide - ranging trade bill that will be considered by the full house in april before it moves on to the senate . 190 | the measure was adopted as part of a wide - ranging trade bill that will be considered by the full house in april before it moves onto the senate . 
191 | [i=294,j=27675,dist=0.04791666666666667] 192 | the company said the start - up was necessitated by continuing strong demand for aluminum and dwindling worldwide inventories , and that the metal is needed to supply reynolds ' various fabricating businesses . 193 | the company said the start up was necessitated by continuing strong demand for aluminum and dwindling worldwide inventories , and that the metal is needed to supply reynolds ' various fabricating businesses . 194 | [i=311,j=13812,dist=0.04583333333333333] 195 | he said in an interview with reuter that after a few years it was likely south korea would drop barriers to foreign goods and move toward a more balanced trade position . 196 | he said in an interview with reuters that after a few years it was likely south korea would drop barriers to foreign goods and move toward a more balanced trade position . 197 | [i=361,j=50938,dist=0.08958333333333333] 198 | hog and cattle slaughter guesstimates chicago mercantile exchange floor traders and commission house representatives are guesstimating today ' s hog slaughter at about 295 , 000 to 305 , 000 head versus 307 , 000 week ago and 311 , 000 a year ago . 199 | hog and cattle slaughter guesstimates chicago mercantile exchange floor traders and commission house representatives are guesstimating today ' s hog slaughter at about 295 , 000 to 308 , 000 head versus 305 , 000 week ago and 308 , 000 a year ago . 200 | [i=469,j=6360,dist=0.035416666666666666] 201 | the national planning department forecast that in 1987 coffee , colombia ' s traditional major export , will account for only one - third of total exports , or about 1 . 5 billion dlrs . 202 | the national planning department forecast that in 1987 coffee , colombia ' s traditional major export , will account for only one third of total exports , or about 1 . 5 billion dlrs . 203 | ... 204 | ``` 205 | 206 | ### 4. Testing the accuracy of 1-bit minwise hashing 207 | 208 | LSH is an approximate solution, and you may want to know the accuracy. 209 | The executable `minhash_acc` allows you to examine 210 | - the mean absolute error that is the averaged gap between the normalized Hamming distance and the actual Jaccard distance; and 211 | - the number of true results, precisions, recalls, and F1-scores for search radii {0.01, 0.02, 0.05, 0.1, 0.2, 0.5}. 212 | 213 | To use this executable, we recommend extracting a small subset from your dataset 214 | because it exactly computes distances for all possible pairs (although the computation is accelerated with parallelization). 215 | 216 | ``` 217 | $ head -5000 reuters.txt > reuters.5k.txt 218 | ``` 219 | 220 | You can test the number of Hamming dimensions from 64 to 6400 221 | (i.e., the number of chunks from 1 to 100 indicated with `-c`) 222 | with the following command. 223 | The arguments for feature extraction are the same as those of `jaccard`. 224 | 225 | ``` 226 | $ cargo run --release -p find-simdoc-cli --bin minhash_acc -- -i reuters.5k.txt -w 5 > acc.csv 227 | ``` 228 | 229 | ## Approximation accuracy of 1-bit minwise hashing 230 | 231 | LSH is an approximate solution, and the number of dimensions in the Hamming space 232 | (indicated with the command line argument `-c`) is related to the approximation accuracy. 233 | As a hint for choosing a parameter of `-c`, we show experimental results obtained from `reuters.txt` of 51,535 documents when setting `-w 5`. 
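Here, following the definition in the previous section, the MAE is the average absolute gap between the estimated and exact distances over all pairs of the $n$ documents, where the estimate $\hat{\delta}(d_i, d_j)$ is the normalized Hamming distance between the two sketches:

$$ \mathrm{MAE} = \frac{2}{n(n-1)} \sum_{i < j} \left| \hat{\delta}(d_i, d_j) - \delta(d_i, d_j) \right| $$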
234 | 
235 | ### Mean absolute error (MAE)
236 | 
237 | The following figure shows MAEs while varying the number of Hamming dimensions from 64 to 6400 (i.e., the number of chunks from 1 to 100 indicated with `-c`).
238 | 
239 | ![](./figures/mae_reuters.svg)
240 | 
241 | As expected, the larger the number, the higher the accuracy. For example, when the number of dimensions is 1024 (setting the argument `-c 16`), we achieve an MAE of around 2.5%.
242 | 
243 | ### Recall
244 | 
245 | Of the precision, recall, and F1 score, the most interesting would be the recall.
246 | This is because false positives can be filtered out in post-processing.
247 | 
248 | The following figure shows recalls in search with radii 0.05, 0.1, and 0.2 (indicated with the argument `-r`).
249 | 
250 | ![](./figures/recall_reuters.svg)
251 | 
252 | For radii 0.1 and 0.2, recalls of over 90% are achieved in most cases.
253 | For the smaller radius 0.05, recalls of 75-90% are obtained because the MAE becomes larger relative to the radius.
254 | 
255 | For reference, the numbers of true results are 50, 201, and 626 for radii 0.05, 0.1, and 0.2, respectively.
256 | 
257 | ### F1 score
258 | 
259 | The following figure shows F1 scores in search with radii 0.05, 0.1, and 0.2 (indicated with the argument `-r`).
260 | 
261 | ![](./figures/f1_reuters.svg)
262 | 
263 | - For radius 0.05, scores of over 90% are achieved from 3520 dimensions (setting `-c 55`).
264 | - For radius 0.1, scores of over 90% are achieved from 704 dimensions (setting `-c 11`).
265 | - For radius 0.2, scores of over 90% are achieved from 448 dimensions (setting `-c 7`).
266 | 
267 | ## Disclaimer
268 | 
269 | This software is developed by LegalForce, Inc.,
270 | but it is not an officially supported LegalForce product.
271 | 
272 | ## License
273 | 
274 | Licensed under either of
275 | 
276 |  * Apache License, Version 2.0
277 |    ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
278 |  * MIT license
279 |    ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
280 | 
281 | at your option.
282 | 
283 | ## Contribution
284 | 
285 | Unless you explicitly state otherwise, any contribution intentionally submitted
286 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
287 | dual licensed as above, without any additional terms or conditions.
288 | 
--------------------------------------------------------------------------------
/all-pairs-hamming/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "all-pairs-hamming"
3 | version = "0.1.0"
4 | edition = "2021"
5 | authors = ["Shunsuke Kanda "]
6 | description = "All pairs similarity search on binary sketches in the Hamming space."
7 | license = "MIT OR Apache-2.0"
8 | homepage = "https://github.com/legalforce-research/find-simdoc"
9 | repository = "https://github.com/legalforce-research/find-simdoc"
10 | readme = "README.md"
11 | keywords = ["search", "similarity", "all-pairs", "lsh"]
12 | categories = ["text-processing", "algorithms"]
13 | 
14 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
15 | 
16 | [dependencies]
17 | num-traits = "0.2.15" # MIT or Apache-2.0
18 | hashbrown = "0.12.3" # MIT or Apache-2.0
--------------------------------------------------------------------------------
/all-pairs-hamming/README.md:
--------------------------------------------------------------------------------
1 | # All pairs similarity search on binary sketches in the Hamming space
2 | 
3 | This library provides a fast and compact all pairs similarity search (or *similarity self-join*)
4 | on binary sketches in the Hamming space.
5 | The algorithm employs a modified variant of the [sketch sorting approach](https://proceedings.mlr.press/v13/tabei10a.html),
6 | a combination of the [multiple sorting](https://doi.org/10.1007/s10115-009-0271-6)
7 | and the [multi-index approach](https://doi.org/10.1109/TKDE.2019.2899597).
8 | 
9 | This library is a part of [find-simdoc](https://github.com/legalforce-research/find-simdoc).
10 | 
11 | ## API documentation
12 | 
13 | https://docs.rs/all-pairs-hamming
--------------------------------------------------------------------------------
/all-pairs-hamming/src/bitset64.rs:
--------------------------------------------------------------------------------
1 | #[derive(Clone, Copy)]
2 | pub struct Bitset64(u64);
3 | 
4 | impl Bitset64 {
5 |     #[inline(always)]
6 |     pub const fn new() -> Self {
7 |         Self(0)
8 |     }
9 | 
10 |     #[inline(always)]
11 |     pub const fn add(mut self, i: usize) -> Self {
12 |         assert!(i < 64);
13 |         self.0 |= 1 << i;
14 |         self
15 |     }
16 | 
17 |     #[inline(always)]
18 |     pub const fn max(&self) -> Option<usize> {
19 |         if self.0 == 0 {
20 |             None
21 |         } else {
22 |             Some(63 - self.0.leading_zeros() as usize)
23 |         }
24 |     }
25 | 
26 |     #[inline(always)]
27 |     pub const fn inverse(mut self) -> Self {
28 |         self.0 = !self.0;
29 |         self
30 |     }
31 | 
32 |     #[inline(always)]
33 |     pub const fn iter(&self) -> Bitset64Iter {
34 |         Bitset64Iter(self.0)
35 |     }
36 | 
37 |     #[inline(always)]
38 |     pub const fn len(&self) -> usize {
39 |         self.0.count_ones() as usize
40 |     }
41 | 
42 |     #[inline(always)]
43 |     #[allow(dead_code)]
44 |     pub const fn is_empty(&self) -> bool {
45 |         self.len() == 0
46 |     }
47 | }
48 | 
49 | pub struct Bitset64Iter(u64);
50 | 
51 | impl Iterator for Bitset64Iter {
52 |     type Item = usize;
53 | 
54 |     #[inline(always)]
55 |     fn next(&mut self) -> Option<Self::Item> {
56 |         if self.0 == 0 {
57 |             return None;
58 |         }
59 |         let numtz = self.0.trailing_zeros() as usize;
60 |         let mask = 1 << numtz;
61 |         self.0 ^= mask;
62 |         Some(numtz)
63 |     }
64 | }
65 | 
66 | #[cfg(test)]
67 | mod tests {
68 |     use super::*;
69 | 
70 |     #[test]
71 |     fn test_basic() {
72 |         // {}
73 |         let mut s = Bitset64::new();
74 |         assert_eq!(s.len(), 0);
75 |         assert_eq!(s.is_empty(), true);
76 |         assert_eq!(s.max(), None);
77 |         assert_eq!(s.iter().collect::<Vec<usize>>(), vec![]);
78 | 
79 |         // {2}
80 |         s = s.add(2);
81 |         assert_eq!(s.len(), 1);
82 |         assert_eq!(s.is_empty(), false);
83 |         assert_eq!(s.max(), Some(2));
84 |         assert_eq!(s.iter().collect::<Vec<usize>>(), vec![2]);
85 | 
86 |         // {2,9}
87 |         s = s.add(9);
88 |         assert_eq!(s.len(), 2);
89 |         assert_eq!(s.is_empty(), false);
90 |         assert_eq!(s.max(), Some(9));
91 |         assert_eq!(s.iter().collect::<Vec<usize>>(), vec![2, 9]);
92 | 
93 |         // {2,5,9}
94 |         s = s.add(5);
95 |         assert_eq!(s.len(), 3);
96 |         assert_eq!(s.is_empty(), false);
97 |         assert_eq!(s.max(), Some(9));
98 |         assert_eq!(s.iter().collect::<Vec<usize>>(), vec![2, 5, 9]);
99 | 
100 |         // {2,5,9}
101 |         s = s.add(9);
102 |         assert_eq!(s.len(), 3);
103 |         assert_eq!(s.is_empty(), false);
104 |         assert_eq!(s.max(), Some(9));
105 |         assert_eq!(s.iter().collect::<Vec<usize>>(), vec![2, 5, 9]);
106 | 
107 |         // !{2,5,9}
108 |         s = s.inverse();
109 |         assert_eq!(s.len(), 61);
110 |         assert_eq!(s.is_empty(), false);
111 |         assert_eq!(s.max(), Some(63));
112 | 
113 |         let mut expected = vec![0, 1, 3, 4, 6, 7, 8];
114 |         expected.extend(10..64);
115 |         assert_eq!(s.iter().collect::<Vec<usize>>(), expected);
116 |     }
117 | }
--------------------------------------------------------------------------------
/all-pairs-hamming/src/chunked_join.rs:
--------------------------------------------------------------------------------
1 | //! A fast and compact implementation of similarity self-join on binary sketches in the Hamming space.
2 | use hashbrown::HashSet;
3 | 
4 | use crate::errors::{AllPairsHammingError, Result};
5 | use crate::multi_sort::MultiSort;
6 | use crate::sketch::Sketch;
7 | 
8 | /// A fast and compact implementation of similarity self-join on binary sketches in the Hamming space.
9 | /// The algorithm employs a modified variant of the sketch sorting approach combined with the multi-index approach.
10 | ///
11 | /// # Complexities
12 | ///
13 | /// The time and memory complexities are linear in the input and output size.
14 | ///
15 | /// # Examples
16 | ///
17 | /// ```
18 | /// use all_pairs_hamming::ChunkedJoiner;
19 | ///
20 | /// let mut joiner = ChunkedJoiner::<u8>::new(2);
21 | /// joiner.add([0b1111, 0b1001]);
22 | /// joiner.add([0b1101, 0b1001]);
23 | /// joiner.add([0b0101, 0b0001]);
24 | ///
25 | /// let results = joiner.similar_pairs(0.15);
26 | /// assert_eq!(results, vec![(0, 1, 0.0625), (1, 2, 0.125)]);
27 | /// ```
28 | ///
29 | /// # References
30 | ///
31 | /// - Tabei, Uno, Sugiyama, and Tsuda.
32 | ///   [Single versus Multiple Sorting in All Pairs Similarity Search](https://proceedings.mlr.press/v13/tabei10a.html).
33 | ///   ACML, 2010
34 | /// - J. Qin et al.
35 | ///   [Generalizing the Pigeonhole Principle for Similarity Search in Hamming Space](https://doi.org/10.1109/TKDE.2019.2899597).
36 | ///   IEEE Transactions on Knowledge and Data Engineering, 2021
37 | pub struct ChunkedJoiner<S> {
38 |     chunks: Vec<Vec<S>>,
39 |     shows_progress: bool,
40 | }
41 | 
42 | impl<S> ChunkedJoiner<S>
43 | where
44 |     S: Sketch,
45 | {
46 |     /// Creates an instance, handling sketches of `num_chunks` chunks, i.e.,
47 |     /// in `S::dim() * num_chunks` dimensions.
48 |     pub fn new(num_chunks: usize) -> Self {
49 |         Self {
50 |             chunks: vec![vec![]; num_chunks],
51 |             shows_progress: false,
52 |         }
53 |     }
54 | 
55 |     /// Sets whether to print progress to stderr.
56 |     pub const fn shows_progress(mut self, yes: bool) -> Self {
57 |         self.shows_progress = yes;
58 |         self
59 |     }
60 | 
61 |     /// Appends a sketch of [`Self::num_chunks()`] chunks.
62 |     /// The first [`Self::num_chunks()`] elements of the input iterator are stored.
63 |     /// If the iterator is exhausted before yielding that many elements, an error is returned.
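    ///
    /// # Examples
    ///
    /// A usage sketch added for illustration, mirroring the crate's own tests:
    ///
    /// ```
    /// use all_pairs_hamming::ChunkedJoiner;
    ///
    /// let mut joiner = ChunkedJoiner::<u64>::new(2);
    /// // A sketch with two chunks is accepted...
    /// assert!(joiner.add([0b01, 0b10]).is_ok());
    /// // ...but one with fewer chunks is rejected.
    /// assert!(joiner.add([0b01]).is_err());
    /// ```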
64 |     pub fn add<I>(&mut self, sketch: I) -> Result<()>
65 |     where
66 |         I: IntoIterator<Item = S>,
67 |     {
68 |         let num_chunks = self.num_chunks();
69 |         let mut iter = sketch.into_iter();
70 |         for chunk in self.chunks.iter_mut() {
71 |             chunk.push(iter.next().ok_or_else(|| {
72 |                 let msg = format!("The input sketch must include at least {num_chunks} chunks.");
73 |                 AllPairsHammingError::input(msg)
74 |             })?);
75 |         }
76 |         Ok(())
77 |     }
78 | 
79 |     /// Finds all similar pairs whose normalized Hamming distance is within `radius`,
80 |     /// returning triplets of the left-side id, the right-side id, and their distance.
81 |     pub fn similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)> {
82 |         let dimension = S::dim() * self.num_chunks();
83 |         let hamradius = (dimension as f64 * radius).ceil() as usize;
84 |         if self.shows_progress {
85 |             eprintln!(
86 |                 "[ChunkedJoiner::similar_pairs] #dimensions={dimension}, hamradius={hamradius}"
87 |             );
88 |         }
89 | 
90 |         // TODO: Threading.
91 |         let mut candidates = HashSet::new();
92 |         for (j, chunk) in self.chunks.iter().enumerate() {
93 |             // Based on the general pigeonhole principle.
94 |             // https://doi.org/10.1109/TKDE.2019.2899597
95 |             if j + hamradius + 1 < self.chunks.len() {
96 |                 continue;
97 |             }
98 |             let r = (j + hamradius + 1 - self.chunks.len()) / self.chunks.len();
99 |             MultiSort::new().similar_pairs(chunk, r, &mut candidates);
100 | 
101 |             if self.shows_progress {
102 |                 eprintln!(
103 |                     "[ChunkedJoiner::similar_pairs] Processed {}/{}...",
104 |                     j + 1,
105 |                     self.chunks.len()
106 |                 );
107 |                 eprintln!(
108 |                     "[ChunkedJoiner::similar_pairs] #candidates={}",
109 |                     candidates.len()
110 |                 );
111 |             }
112 |         }
113 |         if self.shows_progress {
114 |             eprintln!("[ChunkedJoiner::similar_pairs] Done");
115 |         }
116 | 
117 |         let mut candidates: Vec<_> = candidates.into_iter().collect();
118 |         candidates.sort_unstable();
119 | 
120 |         let bound = (dimension as f64 * radius) as usize;
121 |         let mut matched = vec![];
122 | 
123 |         for (i, j) in candidates {
124 |             if let Some(dist) = self.hamming_distance(i, j, bound) {
125 |                 let dist = dist as f64 / dimension as f64;
126 |                 if dist <= radius {
127 |                     matched.push((i, j, dist));
128 |                 }
129 |             }
130 |         }
131 |         if self.shows_progress {
132 |             eprintln!("[ChunkedJoiner::similar_pairs] #matched={}", matched.len());
133 |         }
134 |         matched
135 |     }
136 | 
137 |     /// Gets the number of chunks.
138 |     pub fn num_chunks(&self) -> usize {
139 |         self.chunks.len()
140 |     }
141 | 
142 |     /// Gets the number of stored sketches.
143 |     pub fn num_sketches(&self) -> usize {
144 |         self.chunks.first().map(|v| v.len()).unwrap_or(0)
145 |     }
146 | 
147 |     /// Gets the memory usage in bytes.
148 |     pub fn memory_in_bytes(&self) -> usize {
149 |         self.num_chunks() * self.num_sketches() * std::mem::size_of::<S>()
150 |     }
151 | 
152 |     fn hamming_distance(&self, i: usize, j: usize, bound: usize) -> Option<usize> {
153 |         let mut dist = 0;
154 |         for chunk in &self.chunks {
155 |             dist += chunk[i].hamdist(chunk[j]);
156 |             if bound < dist {
157 |                 return None;
158 |             }
159 |         }
160 |         Some(dist)
161 |     }
162 | }
163 | 
164 | #[cfg(test)]
165 | mod tests {
166 |     use super::*;
167 | 
168 |     fn example_sketches() -> Vec<u16> {
169 |         vec![
170 |             0b_1110_0011_1111_1011, // 0
171 |             0b_0001_0111_0111_1101, // 1
172 |             0b_1100_1101_1000_1100, // 2
173 |             0b_1100_1101_0001_0100, // 3
174 |             0b_1010_1110_0010_1010, // 4
175 |             0b_0111_1001_0011_1111, // 5
176 |             0b_1110_0011_0001_0000, // 6
177 |             0b_1000_0111_1001_0101, // 7
178 |             0b_1110_1101_1000_1101, // 8
179 |             0b_0111_1001_0011_1001, // 9
180 |         ]
181 |     }
182 | 
183 |     fn naive_search(sketches: &[u16], radius: f64) -> Vec<(usize, usize, f64)> {
184 |         let mut results = vec![];
185 |         for i in 0..sketches.len() {
186 |             let x = sketches[i];
187 |             for j in i + 1..sketches.len() {
188 |                 let y = sketches[j];
189 |                 let dist = x.hamdist(y);
190 |                 let dist = dist as f64 / 16.;
191 |                 if dist <= radius {
192 |                     results.push((i, j, dist));
193 |                 }
194 |             }
195 |         }
196 |         results
197 |     }
198 | 
199 |     fn test_similar_pairs(radius: f64) {
200 |         let sketches = example_sketches();
201 |         let expected = naive_search(&sketches, radius);
202 | 
203 |         let mut joiner = ChunkedJoiner::new(2);
204 |         for s in sketches {
205 |             joiner.add([(s & 0xFF) as u8, (s >> 8) as u8]).unwrap();
206 |         }
207 |         let mut results = joiner.similar_pairs(radius);
208 |         results.sort_by_key(|&(i, j, _)| (i, j));
209 |         assert_eq!(results, expected);
210 |     }
211 | 
212 |     #[test]
213 |     fn test_similar_pairs_for_all() {
214 |         for radius in 0..=10 {
215 |             test_similar_pairs(radius as f64 / 10.);
216 |         }
217 |     }
218 | 
219 |     #[test]
220 |     fn test_short_sketch() {
221 |         let mut joiner = ChunkedJoiner::new(2);
222 |         let result = joiner.add([0u64]);
223 |         assert!(result.is_err());
224 |     }
225 | }
--------------------------------------------------------------------------------
/all-pairs-hamming/src/errors.rs:
--------------------------------------------------------------------------------
1 | //! Error definitions.
2 | use std::error::Error;
3 | use std::{fmt, result};
4 | 
5 | /// A specialized Result type for this library.
6 | pub type Result<T> = result::Result<T, AllPairsHammingError>;
7 | 
8 | /// Errors in this library.
9 | #[derive(Debug)]
10 | pub enum AllPairsHammingError {
11 |     /// Contains [`InputError`].
12 |     Input(InputError),
13 | }
14 | 
15 | impl fmt::Display for AllPairsHammingError {
16 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
17 |         match self {
18 |             Self::Input(e) => e.fmt(f),
19 |         }
20 |     }
21 | }
22 | 
23 | impl Error for AllPairsHammingError {}
24 | 
25 | impl AllPairsHammingError {
26 |     pub(crate) const fn input(msg: String) -> Self {
27 |         Self::Input(InputError { msg })
28 |     }
29 | }
30 | 
31 | /// Error used when the input argument is invalid.
32 | #[derive(Debug)]
33 | pub struct InputError {
34 |     msg: String,
35 | }
36 | 
37 | impl fmt::Display for InputError {
38 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
39 |         write!(f, "InputError: {}", self.msg)
40 |     }
41 | }
--------------------------------------------------------------------------------
/all-pairs-hamming/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! This library provides a fast and compact all pairs similarity search (or *similarity self-join*)
2 | //! on binary sketches in the Hamming space.
3 | //! The algorithm employs a modified variant of the [sketch sorting approach](https://proceedings.mlr.press/v13/tabei10a.html),
4 | //! a combination of the [multiple sorting](https://doi.org/10.1007/s10115-009-0271-6)
5 | //! and the [multi-index approach](https://doi.org/10.1109/TKDE.2019.2899597).
6 | #![deny(missing_docs)]
7 | 
8 | mod bitset64;
9 | pub mod chunked_join;
10 | pub mod errors;
11 | pub mod multi_sort;
12 | pub mod simple_join;
13 | pub mod sketch;
14 | 
15 | pub use chunked_join::ChunkedJoiner;
--------------------------------------------------------------------------------
/all-pairs-hamming/src/multi_sort.rs:
--------------------------------------------------------------------------------
1 | //! The core part of [`crate::ChunkedJoiner`].
2 | use std::cell::RefCell;
3 | use std::ops::Range;
4 | 
5 | use hashbrown::HashSet;
6 | 
7 | use crate::bitset64::Bitset64;
8 | use crate::sketch::Sketch;
9 | 
10 | const SORT_SHIFT: usize = 8;
11 | const SORT_MASK: usize = (1 << SORT_SHIFT) - 1;
12 | const DEFAULT_THRESHOLD_IN_SORT: usize = 1000;
13 | 
14 | #[derive(Clone, Debug, Default)]
15 | struct Record<S> {
16 |     id: usize,
17 |     sketch: S,
18 | }
19 | 
20 | /// The core part of [`crate::ChunkedJoiner`]
21 | /// implementing the multiple sorting algorithm for short binary sketches.
22 | ///
23 | /// # Complexities
24 | ///
25 | /// The time and memory complexities are linear in the input and output size.
26 | ///
27 | /// # References
28 | ///
29 | /// - Uno.
30 | ///   [Multi-sorting algorithm for finding pairs of similar short substrings from large-scale string data](https://doi.org/10.1007/s10115-009-0271-6).
31 | ///   Knowl Inf Syst 25, 229–251 (2010).
32 | #[derive(Clone, Debug)]
33 | pub struct MultiSort<S> {
34 |     radius: usize,
35 |     num_blocks: usize,
36 |     masks: Vec<S>,
37 |     offsets: Vec<usize>,
38 |     // For radix sort
39 |     threshold_in_sort: usize,
40 |     buckets: RefCell<[usize; SORT_MASK + 1]>,
41 |     sorted: RefCell<Vec<Record<S>>>,
42 | }
43 | 
44 | impl<S> Default for MultiSort<S>
45 | where
46 |     S: Sketch,
47 | {
48 |     fn default() -> Self {
49 |         Self::new()
50 |     }
51 | }
52 | 
53 | impl<S> MultiSort<S>
54 | where
55 |     S: Sketch,
56 | {
57 |     /// Creates an instance.
58 |     pub const fn new() -> Self {
59 |         Self {
60 |             radius: 0,
61 |             num_blocks: 0,
62 |             masks: vec![],
63 |             offsets: vec![],
64 |             threshold_in_sort: DEFAULT_THRESHOLD_IN_SORT,
65 |             buckets: RefCell::new([0usize; SORT_MASK + 1]),
66 |             sorted: RefCell::new(vec![]),
67 |         }
68 |     }
69 | 
70 |     /// Sets the number of blocks.
71 |     pub fn num_blocks(mut self, num_blocks: usize) -> Self {
72 |         if num_blocks <= S::dim() {
73 |             self.num_blocks = num_blocks;
74 |         }
75 |         self
76 |     }
77 | 
78 |     /// Sets the size threshold for partial sorting.
79 |     /// If the partial size is smaller than the threshold, a quicksort is used;
80 |     /// otherwise, a radix sort is used.
81 |     pub const fn threshold_in_sort(mut self, threshold_in_sort: usize) -> Self {
82 |         self.threshold_in_sort = threshold_in_sort;
83 |         self
84 |     }
85 | 
86 |     /// Finds all similar pairs whose Hamming distance is within `radius`,
87 |     /// inserting the results in a given hash table.
88 |     pub fn similar_pairs(
89 |         mut self,
90 |         sketches: &[S],
91 |         radius: usize,
92 |         results: &mut HashSet<(usize, usize)>,
93 |     ) {
94 |         if self.num_blocks == 0 || self.num_blocks < radius {
95 |             // Following Tabei's paper.
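            // Explanatory note (added): with B blocks, two sketches within
            // Hamming distance r must agree exactly on at least B - r blocks
            // (pigeonhole), so B = r + 3 guarantees three block-level
            // collisions per surviving candidate pair.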
96 |             self.num_blocks = S::dim().min(radius + 3);
97 |         }
98 | 
99 |         self.build_masks_and_offsets();
100 |         self.radius = radius;
101 |         self.sorted = RefCell::new(Vec::with_capacity(sketches.len()));
102 | 
103 |         let mut records: Vec<_> = sketches
104 |             .iter()
105 |             .enumerate()
106 |             .map(|(id, &sketch)| Record { id, sketch })
107 |             .collect();
108 |         self.similar_pairs_recur(&mut records, Bitset64::new(), results);
109 |     }
110 | 
111 |     fn build_masks_and_offsets(&mut self) {
112 |         let mut masks = vec![S::default(); self.num_blocks];
113 |         let mut offsets = vec![0; self.num_blocks + 1];
114 |         let mut i = 0;
115 |         for (b, mask) in masks.iter_mut().enumerate().take(self.num_blocks) {
116 |             let dim = (b + S::dim()) / self.num_blocks;
117 |             *mask = S::mask(i..i + dim);
118 |             i += dim;
119 |             offsets[b + 1] = i;
120 |         }
121 |         self.masks = masks;
122 |         self.offsets = offsets;
123 |     }
124 | 
125 |     fn similar_pairs_recur(
126 |         &self,
127 |         records: &mut [Record<S>],
128 |         blocks: Bitset64,
129 |         results: &mut HashSet<(usize, usize)>,
130 |     ) {
131 |         if blocks.len() == self.num_blocks - self.radius {
132 |             self.verify_all_pairs(records, blocks, results);
133 |             return;
134 |         }
135 | 
136 |         let mut ranges = vec![];
137 |         let max_block = blocks.max().map(|x| x + 1).unwrap_or(0);
138 | 
139 |         for b in max_block..self.num_blocks {
140 |             self.sort_sketches(b, records);
141 |             self.collision_ranges(b, records, &mut ranges);
142 |             for r in ranges.iter().cloned() {
143 |                 self.similar_pairs_recur(&mut records[r], blocks.add(b), results);
144 |             }
145 |         }
146 |     }
147 | 
148 |     fn verify_all_pairs(
149 |         &self,
150 |         records: &[Record<S>],
151 |         blocks: Bitset64,
152 |         results: &mut HashSet<(usize, usize)>,
153 |     ) {
154 |         for i in 0..records.len() {
155 |             let x = &records[i];
156 |             for y in records.iter().skip(i + 1) {
157 |                 debug_assert!(self.debug_block_collisions(x.sketch, y.sketch, blocks));
158 |                 if x.sketch.hamdist(y.sketch) <= self.radius
159 |                     && self.check_canonical(x.sketch, y.sketch, blocks)
160 |                 {
161 |                     debug_assert_ne!(x.id, y.id);
162 |                     // Keeps the tuple order to ease debug.
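                    // (Added note: pairs are normalized as (smaller id, larger id);
                    // the HashSet absorbs duplicates if a pair is reached twice.)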
163 |                     results.insert((x.id.min(y.id), x.id.max(y.id)));
164 |                 }
165 |             }
166 |         }
167 |     }
168 | 
169 |     fn check_canonical(&self, x: S, y: S, blocks: Bitset64) -> bool {
170 |         let max = blocks.max().unwrap_or(0);
171 |         let others = blocks.inverse();
172 |         for b in others.iter() {
173 |             if max <= b {
174 |                 break;
175 |             }
176 |             if x & self.masks[b] == y & self.masks[b] {
177 |                 return false;
178 |             }
179 |         }
180 |         true
181 |     }
182 | 
183 |     fn sort_sketches(&self, block_id: usize, records: &mut [Record<S>]) {
184 |         if records.len() < self.threshold_in_sort {
185 |             self.quick_sort_sketches(block_id, records);
186 |         } else {
187 |             self.radix_sort_sketches(block_id, records);
188 |         }
189 |     }
190 | 
191 |     fn quick_sort_sketches(&self, block_id: usize, records: &mut [Record<S>]) {
192 |         let mask = self.masks[block_id];
193 |         records.sort_unstable_by(|x, y| (x.sketch & mask).cmp(&(y.sketch & mask)));
194 |     }
195 | 
196 |     fn radix_sort_sketches(&self, block_id: usize, records: &mut [Record<S>]) {
197 |         let mut buckets = self.buckets.borrow_mut();
198 |         let mut sorted = self.sorted.borrow_mut();
199 |         sorted.resize(records.len(), Record::<S>::default());
200 | 
201 |         let mask = self.masks[block_id];
202 |         for j in (self.offsets[block_id]..self.offsets[block_id + 1]).step_by(SORT_SHIFT) {
203 |             buckets.fill(0);
204 |             for x in records.iter() {
205 |                 let k = ((x.sketch & mask) >> j).to_usize().unwrap() & SORT_MASK;
206 |                 buckets[k] += 1;
207 |             }
208 |             for k in 1..buckets.len() {
209 |                 buckets[k] += buckets[k - 1];
210 |             }
211 |             for x in records.iter().rev() {
212 |                 let k = ((x.sketch & mask) >> j).to_usize().unwrap() & SORT_MASK;
213 |                 buckets[k] -= 1;
214 |                 sorted[buckets[k]] = x.clone();
215 |             }
216 |             for i in 0..records.len() {
217 |                 records[i] = sorted[i].clone();
218 |             }
219 |         }
220 |     }
221 | 
222 |     fn collision_ranges(
223 |         &self,
224 |         block_id: usize,
225 |         records: &[Record<S>],
226 |         ranges: &mut Vec<Range<usize>>,
227 |     ) {
228 |         ranges.clear();
229 |         let mut i = 0;
230 |         for j in 1..records.len() {
231 |             let mask = self.masks[block_id];
232 |             let x = records[i].sketch & mask;
233 |             let y = records[j].sketch & mask;
234 |             if x == y {
235 |                 continue;
236 |             }
237 |             if 2 <= j - i {
238 |                 ranges.push(i..j);
239 |             }
240 |             i = j;
241 |         }
242 |         let j = records.len();
243 |         if 2 <= j - i {
244 |             ranges.push(i..j);
245 |         }
246 |     }
247 | 
248 |     fn debug_block_collisions(&self, x: S, y: S, blocks: Bitset64) -> bool {
249 |         for b in blocks.iter() {
250 |             let mx = x & self.masks[b];
251 |             let my = y & self.masks[b];
252 |             if mx != my {
253 |                 return false;
254 |             }
255 |         }
256 |         true
257 |     }
258 | }
259 | 
260 | #[cfg(test)]
261 | mod tests {
262 |     use super::*;
263 | 
264 |     fn example_sketches() -> Vec<u16> {
265 |         vec![
266 |             0b_1110_0011_1111_1011, // 0
267 |             0b_0001_0111_0111_1101, // 1
268 |             0b_1100_1101_1000_1100, // 2
269 |             0b_1100_1101_0001_0100, // 3
270 |             0b_1010_1110_0010_1010, // 4
271 |             0b_0111_1001_0011_1111, // 5
272 |             0b_1110_0011_0001_0000, // 6
273 |             0b_1000_0111_1001_0101, // 7
274 |             0b_1110_1101_1000_1101, // 8
275 |             0b_0111_1001_0011_1001, // 9
276 |         ]
277 |     }
278 | 
279 |     fn naive_search(sketches: &[u16], radius: usize) -> Vec<(usize, usize)> {
280 |         let mut results = vec![];
281 |         for i in 0..sketches.len() {
282 |             let x = sketches[i];
283 |             for j in i + 1..sketches.len() {
284 |                 let y = sketches[j];
285 |                 if x.hamdist(y) <= radius {
286 |                     results.push((i, j));
287 |                 }
288 |             }
289 |         }
290 |         results
291 |     }
292 | 
293 |     fn test_similar_pairs(radius: usize, num_blocks: usize) {
294 |         let sketches = example_sketches();
295 |         let expected = naive_search(&sketches, radius);
296 |         let mut results = HashSet::new();
297 |         MultiSort::new()
298 |             .num_blocks(num_blocks)
299 |             .threshold_in_sort(5)
300 |             .similar_pairs(&sketches, radius, &mut results);
301 |         let mut results: Vec<_> = results.into_iter().collect();
302 |         results.sort_unstable();
303 |         assert_eq!(results, expected);
304 |     }
305 | 
306 |     #[test]
307 |     fn test_similar_pairs_for_all() {
308 |         for radius in 0..=16 {
309 |             for num_blocks in radius..=16 {
310 |                 test_similar_pairs(radius, num_blocks);
311 |             }
312 |         }
313 |     }
314 | }
--------------------------------------------------------------------------------
/all-pairs-hamming/src/simple_join.rs:
--------------------------------------------------------------------------------
1 | //! A naive implementation of similarity self-join on binary sketches in the Hamming space.
2 | use crate::errors::{AllPairsHammingError, Result};
3 | use crate::sketch::Sketch;
4 | 
5 | /// A naive implementation of similarity self-join on binary sketches in the Hamming space,
6 | /// taking a quadratic time.
7 | /// Do NOT use this for large datasets.
8 | pub struct SimpleJoiner<S> {
9 |     sketches: Vec<Vec<S>>,
10 |     num_chunks: usize,
11 |     shows_progress: bool,
12 | }
13 | 
14 | impl<S> SimpleJoiner<S>
15 | where
16 |     S: Sketch,
17 | {
18 |     /// Creates an instance, handling sketches of `num_chunks` chunks, i.e.,
19 |     /// in `S::dim() * num_chunks` dimensions.
20 |     pub const fn new(num_chunks: usize) -> Self {
21 |         Self {
22 |             sketches: vec![],
23 |             num_chunks,
24 |             shows_progress: false,
25 |         }
26 |     }
27 | 
28 |     /// Sets whether to print progress to stderr.
29 |     pub const fn shows_progress(mut self, yes: bool) -> Self {
30 |         self.shows_progress = yes;
31 |         self
32 |     }
33 | 
34 |     /// Appends a sketch of [`Self::num_chunks()`] chunks.
35 |     /// The first [`Self::num_chunks()`] elements of the input iterator are stored.
36 |     /// If the iterator is exhausted before yielding that many elements, an error is returned.
37 |     pub fn add<I>(&mut self, sketch: I) -> Result<()>
38 |     where
39 |         I: IntoIterator<Item = S>,
40 |     {
41 |         let mut iter = sketch.into_iter();
42 |         let mut sketch = Vec::with_capacity(self.num_chunks());
43 |         for _ in 0..self.num_chunks() {
44 |             sketch.push(iter.next().ok_or_else(|| {
45 |                 let msg = format!(
46 |                     "The input sketch must include at least {} chunks.",
47 |                     self.num_chunks()
48 |                 );
49 |                 AllPairsHammingError::input(msg)
50 |             })?)
51 |         }
52 |         self.sketches.push(sketch);
53 |         Ok(())
54 |     }
55 | 
56 |     /// Finds all similar pairs whose normalized Hamming distance is within `radius`,
57 |     /// returning triplets of the left-side id, the right-side id, and their distance.
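    ///
    /// # Examples
    ///
    /// A usage sketch added for illustration, mirroring [`crate::ChunkedJoiner`]'s documentation:
    ///
    /// ```
    /// use all_pairs_hamming::simple_join::SimpleJoiner;
    ///
    /// let mut joiner = SimpleJoiner::<u8>::new(2);
    /// joiner.add([0b1111, 0b1001]).unwrap();
    /// joiner.add([0b1101, 0b1001]).unwrap();
    /// joiner.add([0b0101, 0b0001]).unwrap();
    ///
    /// let results = joiner.similar_pairs(0.15);
    /// assert_eq!(results, vec![(0, 1, 0.0625), (1, 2, 0.125)]);
    /// ```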
58 | pub fn similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)> { 59 | let dimension = S::dim() * self.num_chunks(); 60 | if self.shows_progress { 61 | eprintln!("[SimpleJoiner::similar_pairs] #dimensions={dimension}"); 62 | } 63 | 64 | let bound = (dimension as f64 * radius) as usize; 65 | let mut matched = vec![]; 66 | 67 | for i in 0..self.sketches.len() { 68 | if self.shows_progress && (i + 1) % 10000 == 0 { 69 | eprintln!( 70 | "[SimpleJoiner::similar_pairs] Processed {}/{}...", 71 | i + 1, 72 | self.sketches.len() 73 | ); 74 | } 75 | for j in i + 1..self.sketches.len() { 76 | if let Some(dist) = self.hamming_distance(i, j, bound) { 77 | let dist = dist as f64 / dimension as f64; 78 | if dist <= radius { 79 | matched.push((i, j, dist)); 80 | } 81 | } 82 | } 83 | } 84 | if self.shows_progress { 85 | eprintln!("[SimpleJoiner::similar_pairs] Done"); 86 | eprintln!("[SimpleJoiner::similar_pairs] #matched={}", matched.len()); 87 | } 88 | matched 89 | } 90 | 91 | /// Gets the number of chunks. 92 | pub const fn num_chunks(&self) -> usize { 93 | self.num_chunks 94 | } 95 | 96 | /// Gets the number of stored sketches. 97 | pub fn num_sketches(&self) -> usize { 98 | self.sketches.len() 99 | } 100 | 101 | /// Gets the memory usage in bytes. 102 | pub fn memory_in_bytes(&self) -> usize { 103 | self.num_chunks() * self.num_sketches() * std::mem::size_of::() 104 | } 105 | 106 | fn hamming_distance(&self, i: usize, j: usize, bound: usize) -> Option { 107 | let xs = &self.sketches[i]; 108 | let ys = &self.sketches[j]; 109 | let mut dist = 0; 110 | for (&x, &y) in xs.iter().zip(ys.iter()) { 111 | dist += x.hamdist(y); 112 | if bound < dist { 113 | return None; 114 | } 115 | } 116 | Some(dist) 117 | } 118 | } 119 | 120 | #[cfg(test)] 121 | mod tests { 122 | use super::*; 123 | 124 | fn example_sketches() -> Vec { 125 | vec![ 126 | 0b_1110_0011_1111_1011, // 0 127 | 0b_0001_0111_0111_1101, // 1 128 | 0b_1100_1101_1000_1100, // 2 129 | 0b_1100_1101_0001_0100, // 3 130 | 0b_1010_1110_0010_1010, // 4 131 | 0b_0111_1001_0011_1111, // 5 132 | 0b_1110_0011_0001_0000, // 6 133 | 0b_1000_0111_1001_0101, // 7 134 | 0b_1110_1101_1000_1101, // 8 135 | 0b_0111_1001_0011_1001, // 9 136 | ] 137 | } 138 | 139 | fn naive_search(sketches: &[u16], radius: f64) -> Vec<(usize, usize, f64)> { 140 | let mut results = vec![]; 141 | for i in 0..sketches.len() { 142 | let x = sketches[i]; 143 | for j in i + 1..sketches.len() { 144 | let y = sketches[j]; 145 | let dist = x.hamdist(y); 146 | let dist = dist as f64 / 16.; 147 | if dist <= radius { 148 | results.push((i, j, dist)); 149 | } 150 | } 151 | } 152 | results 153 | } 154 | 155 | fn test_similar_pairs(radius: f64) { 156 | let sketches = example_sketches(); 157 | let expected = naive_search(&sketches, radius); 158 | 159 | let mut joiner = SimpleJoiner::new(2); 160 | for s in sketches { 161 | joiner.add([(s & 0xFF) as u8, (s >> 8) as u8]).unwrap(); 162 | } 163 | let results = joiner.similar_pairs(radius); 164 | assert_eq!(results, expected); 165 | } 166 | 167 | #[test] 168 | fn test_similar_pairs_for_all() { 169 | for radius in 0..=10 { 170 | test_similar_pairs(radius as f64 / 10.); 171 | } 172 | } 173 | 174 | #[test] 175 | fn test_short_sketch() { 176 | let mut joiner = SimpleJoiner::new(2); 177 | let result = joiner.add([0u64]); 178 | assert!(result.is_err()); 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /all-pairs-hamming/src/sketch.rs: 
-------------------------------------------------------------------------------- 1 | //! Traits of binary short sketches of primitive integer types. 2 | use std::ops::Range; 3 | 4 | use num_traits::int::PrimInt; 5 | use num_traits::{FromPrimitive, ToPrimitive}; 6 | 7 | /// Trait of a binary short sketch from a primitive integer type. 8 | pub trait Sketch: Default + PrimInt + FromPrimitive + ToPrimitive { 9 | /// Gets the number of dimensions. 10 | fn dim() -> usize; 11 | /// Gets the Hamming distance to the other sketch. 12 | fn hamdist(self, rhs: Self) -> usize; 13 | /// Produces a sketch for masking a given bit-position range. 14 | fn mask(rng: Range) -> Self; 15 | } 16 | 17 | impl Sketch for u8 { 18 | #[inline(always)] 19 | fn dim() -> usize { 20 | 8 21 | } 22 | #[inline(always)] 23 | fn hamdist(self, rhs: Self) -> usize { 24 | (self ^ rhs).count_ones() as usize 25 | } 26 | #[inline(always)] 27 | fn mask(rng: Range) -> Self { 28 | debug_assert!(rng.end <= Self::dim()); 29 | if rng.len() == Self::dim() { 30 | Self::MAX 31 | } else { 32 | ((1 << rng.len()) - 1) << rng.start 33 | } 34 | } 35 | } 36 | 37 | impl Sketch for u16 { 38 | #[inline(always)] 39 | fn dim() -> usize { 40 | 16 41 | } 42 | #[inline(always)] 43 | fn hamdist(self, rhs: Self) -> usize { 44 | (self ^ rhs).count_ones() as usize 45 | } 46 | #[inline(always)] 47 | fn mask(rng: Range) -> Self { 48 | debug_assert!(rng.end <= Self::dim()); 49 | if rng.len() == Self::dim() { 50 | Self::MAX 51 | } else { 52 | ((1 << rng.len()) - 1) << rng.start 53 | } 54 | } 55 | } 56 | 57 | impl Sketch for u32 { 58 | #[inline(always)] 59 | fn dim() -> usize { 60 | 32 61 | } 62 | #[inline(always)] 63 | fn hamdist(self, rhs: Self) -> usize { 64 | (self ^ rhs).count_ones() as usize 65 | } 66 | #[inline(always)] 67 | fn mask(rng: Range) -> Self { 68 | debug_assert!(rng.end <= Self::dim()); 69 | if rng.len() == Self::dim() { 70 | Self::MAX 71 | } else { 72 | ((1 << rng.len()) - 1) << rng.start 73 | } 74 | } 75 | } 76 | 77 | impl Sketch for u64 { 78 | #[inline(always)] 79 | fn dim() -> usize { 80 | 64 81 | } 82 | #[inline(always)] 83 | fn hamdist(self, rhs: Self) -> usize { 84 | (self ^ rhs).count_ones() as usize 85 | } 86 | #[inline(always)] 87 | fn mask(rng: Range) -> Self { 88 | debug_assert!(rng.end <= Self::dim()); 89 | if rng.len() == Self::dim() { 90 | Self::MAX 91 | } else { 92 | ((1 << rng.len()) - 1) << rng.start 93 | } 94 | } 95 | } 96 | 97 | #[cfg(test)] 98 | mod tests { 99 | use super::*; 100 | 101 | #[test] 102 | fn test_mask_u8() { 103 | assert_eq!(u8::mask(0..4), 0b00001111); 104 | assert_eq!(u8::mask(3..6), 0b00111000); 105 | assert_eq!(u8::mask(4..8), 0b11110000); 106 | assert_eq!(u8::mask(0..8), 0b11111111); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /all-pairs-hamming/timeperf/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "timeperf" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | all-pairs-hamming = { path = ".." 
} # MIT or Apache-2.0
10 | criterion = { version = "0.3", features = ["html_reports"] } # Apache-2.0 or MIT
11 | rand = "0.8.5"
--------------------------------------------------------------------------------
/all-pairs-hamming/timeperf/src/main.rs:
--------------------------------------------------------------------------------
1 | use std::time::Instant;
2 | 
3 | use all_pairs_hamming::chunked_join::ChunkedJoiner;
4 | use all_pairs_hamming::simple_join::SimpleJoiner;
5 | 
6 | const TRIALS: usize = 3;
7 | const SCALES: [usize; 4] = [1_000, 10_000, 100_000, 1_000_000];
8 | const CHUNKS: [usize; 3] = [4, 16, 64];
9 | const RADII: [f64; 3] = [0.01, 0.05, 0.1];
10 | 
11 | macro_rules! timeperf_common {
12 |     ($percent:expr, $name:expr, $method:ident, $sketches:ident, $radii:ident, $chunks:ident, $scales:ident) => {
13 |         for &num_chunks in $chunks {
14 |             let mut joiner = $method::new(num_chunks).shows_progress(true);
15 |             for &num_sketches in $scales {
16 |                 while joiner.num_sketches() < num_sketches {
17 |                     let sketch = &$sketches[joiner.num_sketches()];
18 |                     joiner.add(sketch.iter().cloned()).unwrap();
19 |                 }
20 |                 for &radius in $radii {
21 |                     let mut num_results = 0;
22 |                     let elapsed_sec = measure(TRIALS, || {
23 |                         num_results += joiner.similar_pairs(radius).len();
24 |                     });
25 |                     num_results /= TRIALS;
26 |                     println!(
27 |                         "[percent={},method={},num_chunks={num_chunks},num_sketches={num_sketches},radius={radius},num_results={num_results}] {elapsed_sec} sec",
28 |                         $percent, $name
29 |                     );
30 |                 }
31 |             }
32 |         }
33 |     };
34 | }
35 | 
36 | fn main() {
37 |     main_percent(50, false);
38 |     main_percent(80, false);
39 | }
40 | 
41 | fn main_percent(percent: u64, test_simple: bool) {
42 |     let max_chunks = *CHUNKS.last().unwrap();
43 |     let max_sketches = *SCALES.last().unwrap();
44 | 
45 |     let mut sketches = Vec::with_capacity(max_sketches);
46 |     for _ in 0..max_sketches {
47 |         let mut chunks = Vec::with_capacity(max_chunks);
48 |         for _ in 0..max_chunks {
49 |             chunks.push((0..64).fold(0u64, |acc, _| {
50 |                 let x = rand::random::<u64>() % 100;
51 |                 (acc << 1) | ((x < percent) as u64)
52 |             }));
53 |         }
54 |         sketches.push(chunks);
55 |     }
56 |     {
57 |         let radii = &RADII[..];
58 |         let chunks = &CHUNKS[..];
59 |         let scales = &SCALES[..];
60 |         timeperf_common!(
61 |             percent,
62 |             "chunked_join",
63 |             ChunkedJoiner,
64 |             sketches,
65 |             radii,
66 |             chunks,
67 |             scales
68 |         );
69 |     }
70 |     if test_simple {
71 |         let radii = &RADII[..1];
72 |         let chunks = &CHUNKS[..];
73 |         let scales = &SCALES[..3];
74 |         timeperf_common!(
75 |             percent,
76 |             "simple_join",
77 |             SimpleJoiner,
78 |             sketches,
79 |             radii,
80 |             chunks,
81 |             scales
82 |         );
83 |     }
84 | }
85 | 
86 | fn measure<F>(num_trials: usize, mut func: F) -> f64
87 | where
88 |     F: FnMut(),
89 | {
90 |     // Runs func num_trials times and returns the mean elapsed seconds.
91 |     let start = Instant::now();
92 |     for _ in 0..num_trials {
93 |         func();
94 |     }
95 |     let duration = start.elapsed();
96 |     duration.as_secs_f64() / num_trials as f64
97 | }
98 | 
--------------------------------------------------------------------------------
/find-simdoc-cli/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "find-simdoc-cli"
3 | version = "0.1.0"
4 | edition = "2021"
5 | 
6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 | 
8 | [dependencies]
9 | all-pairs-hamming = { path = "../all-pairs-hamming" } # MIT or Apache-2.0
10 | clap = { version = "3.1", features = ["derive"] } # MIT or Apache-2.0
11 | find-simdoc = { path = "../find-simdoc" } # MIT
or Apache-2.0 12 | hashbrown = "0.12.3" # MIT or Apache-2.0 13 | positioned-io = "0.3.0" # MIT 14 | rand = "0.8.5" # MIT or Apache-2.0 15 | rand_xoshiro = "0.6.0" # MIT or Apache-2.0 16 | rayon = "1.5.3" # MIT or Apache-2.0 17 | 18 | [[bin]] 19 | name = "jaccard" 20 | path = "src/jaccard.rs" 21 | 22 | [[bin]] 23 | name = "cosine" 24 | path = "src/cosine.rs" 25 | 26 | [[bin]] 27 | name = "dump" 28 | path = "src/dump.rs" 29 | 30 | [[bin]] 31 | name = "minhash_acc" 32 | path = "src/minhash_acc.rs" -------------------------------------------------------------------------------- /find-simdoc-cli/src/cosine.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use std::fs::File; 3 | use std::io::{BufRead, BufReader, Read}; 4 | use std::path::PathBuf; 5 | use std::str::FromStr; 6 | use std::time::Instant; 7 | 8 | use find_simdoc::tfidf::{Idf, Tf}; 9 | use find_simdoc::CosineSearcher; 10 | 11 | use clap::Parser; 12 | 13 | #[derive(Clone, Debug, PartialEq, Eq)] 14 | enum TfWeights { 15 | Binary, 16 | Standard, 17 | Sublinear, 18 | } 19 | 20 | #[derive(Clone, Debug, PartialEq, Eq)] 21 | enum IdfWeights { 22 | Unary, 23 | Standard, 24 | Smooth, 25 | } 26 | 27 | impl FromStr for TfWeights { 28 | type Err = &'static str; 29 | fn from_str(w: &str) -> Result { 30 | match w { 31 | "binary" => Ok(Self::Binary), 32 | "standard" => Ok(Self::Standard), 33 | "sublinear" => Ok(Self::Sublinear), 34 | _ => Err("Could not parse a tf-weighting value"), 35 | } 36 | } 37 | } 38 | 39 | impl FromStr for IdfWeights { 40 | type Err = &'static str; 41 | fn from_str(w: &str) -> Result { 42 | match w { 43 | "unary" => Ok(Self::Unary), 44 | "standard" => Ok(Self::Standard), 45 | "smooth" => Ok(Self::Smooth), 46 | _ => Err("Could not parse a idf-weighting value"), 47 | } 48 | } 49 | } 50 | 51 | #[derive(Parser, Debug)] 52 | #[clap( 53 | name = "find-simdoc-cosine", 54 | about = "A program to find similar documents in the Cosine space." 55 | )] 56 | struct Args { 57 | /// File path to a document file to be searched. 58 | /// Empty lines must not be included. 59 | #[clap(short = 'i', long)] 60 | document_path: PathBuf, 61 | 62 | /// Search radius in the range of [0,1]. 63 | #[clap(short = 'r', long)] 64 | radius: f64, 65 | 66 | /// Delimiter for recognizing words as tokens in feature extraction. 67 | /// If None, characters are used for tokens. 68 | #[clap(short = 'd', long)] 69 | delimiter: Option, 70 | 71 | /// Window size for w-shingling in feature extraction (must be more than 0). 72 | #[clap(short = 'w', long, default_value = "1")] 73 | window_size: usize, 74 | 75 | /// Number of chunks in sketches, indicating that the number of dimensions in the Hamming space 76 | /// will be 64*#chunks. The larger this value, the more accurate the approximation, 77 | /// but the more time and memory it takes to search. 78 | #[clap(short = 'c', long, default_value = "8")] 79 | num_chunks: usize, 80 | 81 | /// Weighting variant of term frequency. 82 | /// "binary" is the boolean frequency. 83 | /// "standard" is the standard term frequency. 84 | /// "sublinear" is the logarithmically scaled frequency. 85 | #[clap(short = 'T', long, default_value = "standard")] 86 | tf: TfWeights, 87 | 88 | /// Weighting variant of inverse document frequency. 89 | /// "unary" is always 1. 90 | /// "standard" is the standard inverse document frequency. 91 | /// "smooth" is the smoothed inverse document frequency. 
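    /// (Typically, the standard IDF of a term is ln(N / df) for N documents and
    /// document frequency df, and the smoothed variant is ln((1 + N) / (1 + df));
    /// the exact formulas are defined in `find_simdoc::tfidf`.)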
92 | #[clap(short = 'I', long, default_value = "smooth")] 93 | idf: IdfWeights, 94 | 95 | /// Seed value for random values. 96 | #[clap(short = 's', long)] 97 | seed: Option, 98 | 99 | /// Disables parallel construction. 100 | #[clap(short = 'p', long)] 101 | disable_parallel: bool, 102 | } 103 | 104 | fn main() -> Result<(), Box> { 105 | let args = Args::parse(); 106 | 107 | let document_path = args.document_path; 108 | let radius = args.radius; 109 | let delimiter = args.delimiter; 110 | let window_size = args.window_size; 111 | let num_chunks = args.num_chunks; 112 | let tf_weight = args.tf; 113 | let idf_weight = args.idf; 114 | let seed = args.seed; 115 | let disable_parallel = args.disable_parallel; 116 | 117 | let mut searcher = CosineSearcher::new(window_size, delimiter, seed)?.shows_progress(true); 118 | 119 | let tf = match tf_weight { 120 | TfWeights::Binary => None, 121 | TfWeights::Standard | TfWeights::Sublinear => { 122 | Some(Tf::new().sublinear(tf_weight == TfWeights::Sublinear)) 123 | } 124 | }; 125 | 126 | let idf = match idf_weight { 127 | IdfWeights::Unary => None, 128 | IdfWeights::Standard | IdfWeights::Smooth => { 129 | eprintln!("Building IDF..."); 130 | let start = Instant::now(); 131 | let documents = texts_iter(File::open(&document_path)?); 132 | let idf = Idf::new() 133 | .smooth(idf_weight == IdfWeights::Smooth) 134 | .build(documents, searcher.config())?; 135 | let duration = start.elapsed(); 136 | eprintln!("Produced in {} sec", duration.as_secs_f64()); 137 | Some(idf) 138 | } 139 | }; 140 | 141 | searcher = searcher.tf(tf).idf(idf); 142 | 143 | { 144 | eprintln!("Converting documents into sketches..."); 145 | let start = Instant::now(); 146 | let documents = texts_iter(File::open(&document_path)?); 147 | searcher = if disable_parallel { 148 | searcher.build_sketches(documents, num_chunks)? 149 | } else { 150 | searcher.build_sketches_in_parallel(documents, num_chunks)? 151 | }; 152 | let duration = start.elapsed(); 153 | let memory_in_bytes = searcher.memory_in_bytes() as f64; 154 | eprintln!( 155 | "Produced {} sketches in {} sec, consuming {} MiB", 156 | searcher.len(), 157 | duration.as_secs_f64(), 158 | memory_in_bytes / (1024. * 1024.) 
159 | ); 160 | } 161 | 162 | eprintln!("Finding all similar pairs in sketches..."); 163 | let start = Instant::now(); 164 | let results = searcher.search_similar_pairs(radius); 165 | eprintln!("Done in {} sec", start.elapsed().as_secs_f64()); 166 | 167 | println!("i,j,dist"); 168 | for (i, j, dist) in results { 169 | println!("{i},{j},{dist}"); 170 | } 171 | 172 | Ok(()) 173 | } 174 | 175 | fn texts_iter(rdr: R) -> impl Iterator 176 | where 177 | R: Read, 178 | { 179 | BufReader::new(rdr).lines().map(|line| line.unwrap()) 180 | } 181 | -------------------------------------------------------------------------------- /find-simdoc-cli/src/dump.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use std::fs::File; 3 | use std::io::{BufRead, BufReader}; 4 | use std::path::PathBuf; 5 | 6 | use clap::Parser; 7 | 8 | #[derive(Parser, Debug)] 9 | #[clap(name = "find-simdoc-dump", about = "A program to dump similar texts.")] 10 | struct Args { 11 | #[clap(short = 'i', long)] 12 | text_path: PathBuf, 13 | 14 | #[clap(short = 's', long)] 15 | simpair_path: PathBuf, 16 | } 17 | 18 | fn main() -> Result<(), Box> { 19 | let args = Args::parse(); 20 | 21 | let text_path = args.text_path; 22 | let simpair_path = args.simpair_path; 23 | 24 | let texts: Vec<_> = BufReader::new(File::open(text_path)?) 25 | .lines() 26 | .map(|line| line.unwrap()) 27 | .collect(); 28 | 29 | for (i, row) in BufReader::new(File::open(simpair_path)?) 30 | .lines() 31 | .enumerate() 32 | { 33 | if i == 0 { 34 | continue; 35 | } 36 | let row = row?; 37 | let cols: Vec<_> = row.split(',').collect(); 38 | let i = cols[0].parse::()?; 39 | let j = cols[1].parse::()?; 40 | let dist = cols[2].parse::()?; 41 | println!("[i={i},j={j},dist={dist}]"); 42 | println!("{}", texts[i]); 43 | println!("{}", texts[j]); 44 | } 45 | 46 | Ok(()) 47 | } 48 | -------------------------------------------------------------------------------- /find-simdoc-cli/src/jaccard.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use std::fs::File; 3 | use std::io::{BufRead, BufReader, Read}; 4 | use std::path::PathBuf; 5 | use std::time::Instant; 6 | 7 | use clap::Parser; 8 | 9 | use find_simdoc::JaccardSearcher; 10 | 11 | #[derive(Parser, Debug)] 12 | #[clap( 13 | name = "find-simdoc-jaccard", 14 | about = "A program to find similar documents in the Jaccard space." 15 | )] 16 | struct Args { 17 | /// File path to a document file to be searched. 18 | /// Empty lines must not be included. 19 | #[clap(short = 'i', long)] 20 | document_path: PathBuf, 21 | 22 | /// Search radius in the range of [0,1]. 23 | #[clap(short = 'r', long)] 24 | radius: f64, 25 | 26 | /// Delimiter for recognizing words as tokens in feature extraction. 27 | /// If None, characters are used for tokens. 28 | #[clap(short = 'd', long)] 29 | delimiter: Option, 30 | 31 | /// Window size for w-shingling in feature extraction (must be more than 0). 32 | #[clap(short = 'w', long, default_value = "1")] 33 | window_size: usize, 34 | 35 | /// Number of chunks in sketches, indicating that the number of dimensions in the Hamming space 36 | /// will be 64*#chunks. The larger this value, the more accurate the approximation, 37 | /// but the more time and memory it takes to search. 38 | #[clap(short = 'c', long, default_value = "8")] 39 | num_chunks: usize, 40 | 41 | /// Seed value for random values. 
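    /// If not specified, a seed is drawn at random.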
42 |     #[clap(short = 's', long)]
43 |     seed: Option<u64>,
44 | 
45 |     /// Disables parallel construction.
46 |     #[clap(short = 'p', long)]
47 |     disable_parallel: bool,
48 | }
49 | 
50 | fn main() -> Result<(), Box<dyn Error>> {
51 |     let args = Args::parse();
52 | 
53 |     let document_path = args.document_path;
54 |     let radius = args.radius;
55 |     let delimiter = args.delimiter;
56 |     let window_size = args.window_size;
57 |     let num_chunks = args.num_chunks;
58 |     let seed = args.seed;
59 |     let disable_parallel = args.disable_parallel;
60 | 
61 |     let mut searcher = JaccardSearcher::new(window_size, delimiter, seed)?.shows_progress(true);
62 | 
63 |     {
64 |         eprintln!("Converting documents into sketches...");
65 |         let start = Instant::now();
66 |         let documents = texts_iter(File::open(&document_path)?);
67 |         searcher = if disable_parallel {
68 |             searcher.build_sketches(documents, num_chunks)?
69 |         } else {
70 |             searcher.build_sketches_in_parallel(documents, num_chunks)?
71 |         };
72 |         let duration = start.elapsed();
73 |         let memory_in_bytes = searcher.memory_in_bytes() as f64;
74 |         eprintln!(
75 |             "Produced {} sketches in {} sec, consuming {} MiB",
76 |             searcher.len(),
77 |             duration.as_secs_f64(),
78 |             memory_in_bytes / (1024. * 1024.)
79 |         );
80 |     }
81 | 
82 |     eprintln!("Finding all similar pairs in sketches...");
83 |     let start = Instant::now();
84 |     let results = searcher.search_similar_pairs(radius);
85 |     eprintln!("Done in {} sec", start.elapsed().as_secs_f64());
86 | 
87 |     println!("i,j,dist");
88 |     for (i, j, dist) in results {
89 |         println!("{i},{j},{dist}");
90 |     }
91 | 
92 |     Ok(())
93 | }
94 | 
95 | fn texts_iter<R>(rdr: R) -> impl Iterator<Item = String>
96 | where
97 |     R: Read,
98 | {
99 |     BufReader::new(rdr).lines().map(|line| line.unwrap())
100 | }
101 | 
--------------------------------------------------------------------------------
/find-simdoc-cli/src/minhash_acc.rs:
--------------------------------------------------------------------------------
1 | #![allow(clippy::mutex_atomic)]
2 | 
3 | use std::env;
4 | use std::error::Error;
5 | use std::fmt::Write as _;
6 | use std::fs::File;
7 | use std::io::{BufRead, BufReader, Read};
8 | use std::mem;
9 | use std::path::PathBuf;
10 | use std::sync::Mutex;
11 | use std::time::Instant;
12 | 
13 | use all_pairs_hamming::sketch::Sketch;
14 | use clap::Parser;
15 | use find_simdoc::feature::{FeatureConfig, FeatureExtractor};
16 | use find_simdoc::lsh::minhash::MinHasher;
17 | use hashbrown::HashSet;
18 | use positioned_io::WriteAt;
19 | use rand::{RngCore, SeedableRng};
20 | use rayon::prelude::*;
21 | 
22 | const MAX_CHUNKS: usize = 100;
23 | 
24 | #[derive(Parser, Debug)]
25 | #[clap(
26 |     name = "find-simdoc-minhash_acc",
27 |     about = "A program to test accuracy in 1-bit minwise hashing."
28 | )]
29 | struct Args {
30 |     /// File path to a document file to be searched.
31 |     /// Empty lines must not be included.
32 |     #[clap(short = 'i', long)]
33 |     document_path: PathBuf,
34 | 
35 |     /// Delimiter for recognizing words as tokens in feature extraction.
36 |     /// If None, characters are used for tokens.
37 |     #[clap(short = 'd', long)]
38 |     delimiter: Option<char>,
39 | 
40 |     /// Window size for w-shingling in feature extraction (must be more than 0).
41 |     #[clap(short = 'w', long, default_value = "1")]
42 |     window_size: usize,
43 | 
44 |     /// Seed value for random values.
45 |     #[clap(short = 's', long)]
46 |     seed: Option<u64>,
47 | 
48 |     /// Directory path to write a tmp file.
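    /// If not specified, the system temporary directory (`std::env::temp_dir`) is used.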
49 | #[clap(short = 't', long)] 50 | tmp_dir: Option, 51 | } 52 | 53 | fn main() -> Result<(), Box> { 54 | let args = Args::parse(); 55 | 56 | let document_path = args.document_path; 57 | let delimiter = args.delimiter; 58 | let window_size = args.window_size; 59 | let seed = args.seed; 60 | let tmp_dir = args.tmp_dir; 61 | 62 | if window_size == 0 { 63 | return Err("window_size must not be 0.".into()); 64 | } 65 | 66 | let documents = BufReader::new(File::open(document_path)?) 67 | .lines() 68 | .map(|line| line.unwrap()); 69 | 70 | let mut seeder = 71 | rand_xoshiro::SplitMix64::seed_from_u64(seed.unwrap_or_else(rand::random::)); 72 | 73 | let config = FeatureConfig::new(window_size, delimiter, seeder.next_u64())?; 74 | let extractor = FeatureExtractor::new(&config); 75 | 76 | let features = { 77 | eprintln!("Loading documents and extracting features..."); 78 | let start = Instant::now(); 79 | let mut features = vec![]; 80 | for document in documents { 81 | if document.is_empty() { 82 | return Err("Input document must not be empty.".into()); 83 | } 84 | let mut feature = vec![]; 85 | extractor.extract(document, &mut feature); 86 | features.push(feature); 87 | } 88 | let duration = start.elapsed(); 89 | let total_bytes = 90 | features.iter().fold(0, |acc, f| acc + f.len()) * std::mem::size_of::(); 91 | eprintln!( 92 | "Extracted {} features in {} sec, consuming {} MiB", 93 | features.len(), 94 | duration.as_secs_f64(), 95 | total_bytes as f64 / (1024. * 1024.) 96 | ); 97 | features 98 | }; 99 | 100 | let sketches = { 101 | eprintln!("Producing binary sketches..."); 102 | let start = Instant::now(); 103 | let hasher = MinHasher::new(seeder.next_u64()); 104 | 105 | let processed = Mutex::new(0usize); 106 | 107 | let mut sketches = vec![vec![]; features.len()]; 108 | features 109 | .par_iter() 110 | .map(|feature| { 111 | { 112 | // Mutex::lock also locks eprintln. 113 | let mut cnt = processed.lock().unwrap(); 114 | *cnt += 1; 115 | if *cnt % 1000 == 0 { 116 | eprintln!("Processed {} features...", *cnt); 117 | } 118 | } 119 | let mut iter = hasher.iter(feature); 120 | let mut sketch = Vec::with_capacity(MAX_CHUNKS); 121 | (0..MAX_CHUNKS).for_each(|_| sketch.push(iter.next().unwrap())); 122 | sketch 123 | }) 124 | .collect_into_vec(&mut sketches); 125 | 126 | let duration = start.elapsed(); 127 | let total_bytes = sketches.len() * MAX_CHUNKS * std::mem::size_of::(); 128 | eprintln!( 129 | "Produced in {} sec, consuming {} MiB", 130 | duration.as_secs_f64(), 131 | total_bytes as f64 / (1024. * 1024.) 132 | ); 133 | sketches 134 | }; 135 | 136 | let tmp_path = { 137 | let mut tmp_path = tmp_dir.unwrap_or_else(env::temp_dir); 138 | tmp_path.push("tmp.jac_dist"); 139 | tmp_path 140 | }; 141 | 142 | let possible_pairs = { 143 | let start = Instant::now(); 144 | 145 | let possible_pairs = features.len() * (features.len() - 1) / 2; 146 | eprintln!("Computing exact Jaccard distances for {possible_pairs} pairs..."); 147 | 148 | let tmp_file_size = possible_pairs * mem::size_of::(); 149 | let offsets = { 150 | let mut offset = 0; 151 | let mut offsets = Vec::with_capacity(features.len()); 152 | for i in 0..features.len() { 153 | offsets.push(offset); 154 | offset += features.len() - i - 1; 155 | } 156 | assert_eq!(offset, possible_pairs); 157 | offsets 158 | }; 159 | 160 | { 161 | let processed = Mutex::new(0usize); 162 | let writer = Mutex::new(File::create(&tmp_path)?); 163 | 164 | // Creates a file object of size tmp_file_size bytes. 
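            // (Writing a single zero byte at offset `tmp_file_size - 1` forces the
            // file to be extended to the full size up front; on most filesystems this
            // yields a sparse file, so disk space is materialized only as the distance
            // blocks are written at their offsets below.)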
165 | { 166 | let mut w = writer.lock().unwrap(); 167 | w.write_at(tmp_file_size as u64 - 1, &[0])?; 168 | } 169 | 170 | eprintln!( 171 | "Created a tmp file of {} GiB, at {:?}", 172 | tmp_file_size as f64 / (1024. * 1024. * 1024.), 173 | &tmp_path 174 | ); 175 | 176 | (0..features.len()).into_par_iter().for_each(|i| { 177 | { 178 | // Mutex::lock also locks eprintln. 179 | let mut cnt = processed.lock().unwrap(); 180 | *cnt += 1; 181 | if *cnt % 1000 == 0 { 182 | eprintln!("Processed {} features...", *cnt); 183 | } 184 | } 185 | 186 | let mut jac_dists = 187 | Vec::with_capacity((features.len() - i) * mem::size_of::()); 188 | 189 | let x = &features[i]; 190 | for y in features.iter().skip(i + 1) { 191 | let dist = 192 | find_simdoc::lsh::jaccard_distance(x.iter().clone(), y.iter().clone()); 193 | jac_dists.extend_from_slice(&dist.to_le_bytes()); 194 | } 195 | 196 | // Writes distances with random access on a file stream. 197 | let offset = offsets[i] * mem::size_of::(); 198 | { 199 | let mut w = writer.lock().unwrap(); 200 | w.write_at(offset as u64, &jac_dists).unwrap(); 201 | } 202 | }); 203 | } 204 | 205 | let duration = start.elapsed(); 206 | eprintln!("Computed in {} sec", duration.as_secs_f64()); 207 | possible_pairs 208 | }; 209 | 210 | let radii = vec![0.01, 0.02, 0.05, 0.1, 0.2, 0.5]; 211 | let mut header = "num_chunks,dimensions,mean_absolute_error".to_string(); 212 | for &r in &radii { 213 | write!(header, ",results_{r}")?; 214 | write!(header, ",precision_{r}")?; 215 | write!(header, ",recall_{r}")?; 216 | write!(header, ",f1_{r}")?; 217 | } 218 | println!("{header}"); 219 | 220 | eprintln!("Computing accuracy..."); 221 | let start = Instant::now(); 222 | 223 | let results = { 224 | let processed = Mutex::new(0usize); 225 | let mut results: Vec<_> = (1..=MAX_CHUNKS) 226 | .into_par_iter() 227 | .map(|num_chunks| { 228 | { 229 | // Mutex::lock also locks eprintln. 230 | let mut cnt = processed.lock().unwrap(); 231 | *cnt += 1; 232 | if *cnt % 10 == 0 { 233 | eprintln!("Processed {} chunks...", *cnt); 234 | } 235 | } 236 | 237 | let mut sum_error = 0.; 238 | let mut true_results: Vec<_> = (0..radii.len()).map(|_| HashSet::new()).collect(); 239 | let mut appx_results: Vec<_> = (0..radii.len()).map(|_| HashSet::new()).collect(); 240 | 241 | let mut reader = BufReader::new(File::open(&tmp_path).unwrap()); 242 | 243 | for i in 0..sketches.len() { 244 | let x = &sketches[i]; 245 | for (j, y) in sketches.iter().enumerate().skip(i + 1) { 246 | let mut buf = [0; mem::size_of::()]; 247 | reader.read_exact(&mut buf).unwrap(); 248 | 249 | let jac_dist = f64::from_le_bytes(buf); 250 | let ham_dist = hamming_distance(&x[..num_chunks], &y[..num_chunks]); 251 | sum_error += (jac_dist - ham_dist).abs(); 252 | 253 | for (k, &r) in radii.iter().enumerate() { 254 | if jac_dist <= r { 255 | true_results[k].insert((i, j)); 256 | } 257 | if ham_dist <= r { 258 | appx_results[k].insert((i, j)); 259 | } 260 | } 261 | } 262 | } 263 | 264 | let dim = num_chunks * 64; 265 | let mae = sum_error / possible_pairs as f64; 266 | 267 | let mut prf = vec![]; 268 | for (tr, ar) in true_results.iter().zip(appx_results.iter()) { 269 | let true_positive = tr.intersection(ar).count() as f64; 270 | let false_positive = ar.len() as f64 - true_positive; 271 | let false_negative = tr.len() as f64 - true_positive; 272 | let precision = true_positive / (true_positive + false_positive); 273 | let recall = true_positive / (true_positive + false_negative); 274 | let f1 = (2. 
* precision * recall) / (precision + recall);
275 |                     prf.push((tr.len(), precision, recall, f1));
276 |                 }
277 | 
278 |                 let mut body = format!("{num_chunks},{dim},{mae}");
279 |                 for (t, p, r, f) in prf {
280 |                     write!(body, ",{t},{p},{r},{f}").unwrap();
281 |                 }
282 |                 (num_chunks, body)
283 |             })
284 |             .collect();
285 |         results.sort_by_key(|r| r.0);
286 |         results
287 |     };
288 |     let duration = start.elapsed();
289 |     eprintln!("Computed in {} sec", duration.as_secs_f64());
290 | 
291 |     for (_, body) in results {
292 |         println!("{body}");
293 |     }
294 | 
295 |     Ok(())
296 | }
297 | 
298 | fn hamming_distance(xs: &[u64], ys: &[u64]) -> f64 {
299 |     assert_eq!(xs.len(), ys.len());
300 |     let mut dist = 0;
301 |     for (&x, &y) in xs.iter().zip(ys.iter()) {
302 |         dist += x.hamdist(y);
303 |     }
304 |     // In 1-bit minhash, the expected normalized Hamming distance is (1 - J) / 2,
305 |     // i.e., half the Jaccard distance, so we scale the distance by a factor of 2.
306 |     dist as f64 / (xs.len() * 64) as f64 * 2.
307 | }
308 | 
--------------------------------------------------------------------------------
/find-simdoc/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "find-simdoc"
3 | version = "0.1.1"
4 | edition = "2021"
5 | authors = ["Shunsuke Kanda "]
6 | description = "Time- and memory-efficient all pairs similarity searches in documents."
7 | license = "MIT OR Apache-2.0"
8 | homepage = "https://github.com/legalforce-research/find-simdoc"
9 | repository = "https://github.com/legalforce-research/find-simdoc"
10 | readme = "README.md"
11 | keywords = ["search", "similarity", "all-pairs", "lsh"]
12 | categories = ["text-processing", "algorithms"]
13 | 
14 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
15 | 
16 | [dependencies]
17 | ahash = "0.8.0" # MIT or Apache-2.0
18 | all-pairs-hamming = { path = "../all-pairs-hamming", version = "0.1.0" } # MIT or Apache-2.0
19 | hashbrown = "0.12.3" # MIT or Apache-2.0
20 | rand = "0.8.5" # MIT or Apache-2.0
21 | rand_xoshiro = "0.6.0" # MIT or Apache-2.0
22 | rayon = "1.5.3" # MIT or Apache-2.0
--------------------------------------------------------------------------------
/find-simdoc/README.md:
--------------------------------------------------------------------------------
1 | # find-simdoc
2 | 
3 | Time- and memory-efficient all pairs similarity searches in documents.
4 | The detailed description can be found on the [project page](https://github.com/legalforce-research/find-simdoc).
5 | 
6 | ## API documentation
7 | 
8 | https://docs.rs/find-simdoc
9 | 
--------------------------------------------------------------------------------
/find-simdoc/examples/find_cosine.rs:
--------------------------------------------------------------------------------
1 | use find_simdoc::tfidf::{Idf, Tf};
2 | use find_simdoc::CosineSearcher;
3 | 
4 | fn main() {
5 |     let documents = vec![
6 |         "Welcome to Jimbocho, the town of books and curry!",
7 |         "Welcome to Jimbocho, the city of books and curry!",
8 |         "We welcome you to Jimbocho, the town of books and curry.",
9 |         "Welcome to the town of books and curry, Jimbocho!",
10 |     ];
11 | 
12 |     // Creates a searcher for word unigrams (with random seed value 42).
13 |     let searcher = CosineSearcher::new(1, Some(' '), Some(42)).unwrap();
14 |     // Creates a term frequency (TF) weighter.
15 |     let tf = Tf::new();
16 |     // Creates an inverse document frequency (IDF) weighter.
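    // (The IDF table below is built from the same corpus that is searched
    // afterwards, so ngrams appearing in many documents receive lower weights.)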
17 | let idf = Idf::new() 18 | .build(documents.iter().clone(), searcher.config()) 19 | .unwrap(); 20 | // Builds the database of binary sketches converted from input documents, 21 | let searcher = searcher 22 | // with the TF weighter and 23 | .tf(Some(tf)) 24 | // the IDF weighter, 25 | .idf(Some(idf)) 26 | // where binary sketches are in the Hamming space of 10*64 dimensions. 27 | .build_sketches_in_parallel(documents.iter(), 10) 28 | .unwrap(); 29 | 30 | // Searches all similar pairs within radius 0.25. 31 | let results = searcher.search_similar_pairs(0.25); 32 | // A result consists of the left-side id, the right-side id, and their distance. 33 | assert_eq!(results, vec![(0, 1, 0.1671875), (0, 3, 0.246875)]); 34 | } 35 | -------------------------------------------------------------------------------- /find-simdoc/examples/find_jaccard.rs: -------------------------------------------------------------------------------- 1 | use find_simdoc::JaccardSearcher; 2 | 3 | fn main() { 4 | let documents = vec![ 5 | "Welcome to Jimbocho, the town of books and curry!", 6 | "Welcome to Jimbocho, the city of books and curry!", 7 | "We welcome you to Jimbocho, the town of books and curry.", 8 | "Welcome to the town of books and curry, Jimbocho!", 9 | ]; 10 | 11 | // Creates a searcher for character trigrams (with random seed value 42). 12 | let searcher = JaccardSearcher::new(3, None, Some(42)) 13 | .unwrap() 14 | // Builds the database of binary sketches converted from input documents, 15 | // where binary sketches are in the Hamming space of 20*64 dimensions. 16 | .build_sketches_in_parallel(documents.iter(), 20) 17 | .unwrap(); 18 | 19 | // Searches all similar pairs within radius 0.25. 20 | let results = searcher.search_similar_pairs(0.25); 21 | assert_eq!(results, vec![(0, 1, 0.1875), (0, 3, 0.2296875)]); 22 | } 23 | -------------------------------------------------------------------------------- /find-simdoc/src/cosine.rs: -------------------------------------------------------------------------------- 1 | //! Searcher for all pairs of similar documents in the Cosine space. 2 | use std::sync::Mutex; 3 | 4 | use crate::errors::{FindSimdocError, Result}; 5 | use crate::feature::{FeatureConfig, FeatureExtractor}; 6 | use crate::lsh::simhash::SimHasher; 7 | use crate::tfidf::{Idf, Tf}; 8 | 9 | use all_pairs_hamming::chunked_join::ChunkedJoiner; 10 | use rand::{RngCore, SeedableRng}; 11 | use rayon::prelude::*; 12 | 13 | /// Searcher for all pairs of similar documents in the Cosine space. 14 | /// 15 | /// # Approach 16 | /// 17 | /// The search steps consist of 18 | /// 19 | /// 1. Extracts features from documents, 20 | /// where a feature is a tfidf-weighted vector representation of character or word ngrams. 21 | /// 2. Convert the features into binary sketches through the [simplified simhash](https://dl.acm.org/doi/10.1145/1242572.1242592). 22 | /// 3. Search for similar sketches in the Hamming space using [`ChunkedJoiner`]. 23 | /// 24 | /// # Examples 25 | /// 26 | /// ``` 27 | /// use find_simdoc::tfidf::{Idf, Tf}; 28 | /// use find_simdoc::CosineSearcher; 29 | /// 30 | /// let documents = vec![ 31 | /// "Welcome to Jimbocho, the town of books and curry!", 32 | /// "Welcome to Jimbocho, the city of books and curry!", 33 | /// "We welcome you to Jimbocho, the town of books and curry.", 34 | /// "Welcome to the town of books and curry, Jimbocho!", 35 | /// ]; 36 | /// 37 | /// // Creates a searcher for word unigrams (with random seed value 42). 
38 | /// let searcher = CosineSearcher::new(1, Some(' '), Some(42)).unwrap();
39 | /// // Creates a term frequency (TF) weighter.
40 | /// let tf = Tf::new();
41 | /// // Creates an inverse document frequency (IDF) weighter.
42 | /// let idf = Idf::new()
43 | ///     .build(documents.iter().clone(), searcher.config())
44 | ///     .unwrap();
45 | /// // Builds the database of binary sketches converted from input documents,
46 | /// let searcher = searcher
47 | ///     // with the TF weighter and
48 | ///     .tf(Some(tf))
49 | ///     // the IDF weighter,
50 | ///     .idf(Some(idf))
51 | ///     // where binary sketches are in the Hamming space of 10*64 dimensions.
52 | ///     .build_sketches_in_parallel(documents.iter(), 10)
53 | ///     .unwrap();
54 | ///
55 | /// // Searches all similar pairs within radius 0.25.
56 | /// let results = searcher.search_similar_pairs(0.25);
57 | /// ```
58 | pub struct CosineSearcher {
59 |     config: FeatureConfig,
60 |     hasher: SimHasher,
61 |     tf: Option<Tf>,
62 |     idf: Option<Idf<f64>>,
63 |     joiner: Option<ChunkedJoiner<u64>>,
64 |     shows_progress: bool,
65 | }
66 | 
67 | impl CosineSearcher {
68 |     /// Creates an instance.
69 |     ///
70 |     /// # Arguments
71 |     ///
72 |     /// * `window_size` - Window size for w-shingling in feature extraction (must be more than 0).
73 |     /// * `delimiter` - Delimiter for recognizing words as tokens in feature extraction.
74 |     ///                 If `None`, characters are used for tokens.
75 |     /// * `seed` - Seed value for random values.
76 |     pub fn new(window_size: usize, delimiter: Option<char>, seed: Option<u64>) -> Result<Self> {
77 |         let seed = seed.unwrap_or_else(rand::random::<u64>);
78 |         let mut seeder = rand_xoshiro::SplitMix64::seed_from_u64(seed);
79 |         let config = FeatureConfig::new(window_size, delimiter, seeder.next_u64())?;
80 |         let hasher = SimHasher::new(seeder.next_u64());
81 |         Ok(Self {
82 |             config,
83 |             hasher,
84 |             tf: None,
85 |             idf: None,
86 |             joiner: None,
87 |             shows_progress: false,
88 |         })
89 |     }
90 | 
91 |     /// Shows the progress via the standard error output?
92 |     pub const fn shows_progress(mut self, yes: bool) -> Self {
93 |         self.shows_progress = yes;
94 |         self
95 |     }
96 | 
97 |     /// Sets the scheme of TF weighting.
98 |     #[allow(clippy::missing_const_for_fn)]
99 |     pub fn tf(mut self, tf: Option<Tf>) -> Self {
100 |         self.tf = tf;
101 |         self
102 |     }
103 | 
104 |     /// Sets the scheme of IDF weighting.
105 |     #[allow(clippy::missing_const_for_fn)]
106 |     pub fn idf(mut self, idf: Option<Idf<f64>>) -> Self {
107 |         self.idf = idf;
108 |         self
109 |     }
110 | 
111 |     /// Builds the database of sketches from input documents.
112 |     ///
113 |     /// # Arguments
114 |     ///
115 |     /// * `documents` - List of documents (must not include an empty string).
116 |     /// * `num_chunks` - Number of chunks of sketches, indicating that
117 |     ///                  the number of dimensions in the Hamming space is `num_chunks*64`.
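    ///
    /// # Errors
    ///
    /// An error is returned when an input document is an empty string.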
118 | pub fn build_sketches(mut self, documents: I, num_chunks: usize) -> Result 119 | where 120 | I: IntoIterator, 121 | D: AsRef, 122 | { 123 | let mut joiner = ChunkedJoiner::::new(num_chunks).shows_progress(self.shows_progress); 124 | let extractor = FeatureExtractor::new(&self.config); 125 | 126 | let mut feature = vec![]; 127 | for (i, doc) in documents.into_iter().enumerate() { 128 | if self.shows_progress && (i + 1) % 10000 == 0 { 129 | eprintln!("Processed {} documents...", i + 1); 130 | } 131 | let doc = doc.as_ref(); 132 | if doc.is_empty() { 133 | return Err(FindSimdocError::input("Input document must not be empty.")); 134 | } 135 | extractor.extract_with_weights(doc, &mut feature); 136 | if let Some(tf) = self.tf.as_ref() { 137 | tf.tf(&mut feature); 138 | } 139 | if let Some(idf) = self.idf.as_ref() { 140 | for (term, weight) in feature.iter_mut() { 141 | *weight *= idf.idf(*term); 142 | } 143 | } 144 | joiner.add(self.hasher.iter(&feature)).unwrap(); 145 | } 146 | self.joiner = Some(joiner); 147 | Ok(self) 148 | } 149 | 150 | /// Builds the database of sketches from input documents in parallel. 151 | /// 152 | /// # Arguments 153 | /// 154 | /// * `documents` - List of documents (must not include an empty string). 155 | /// * `num_chunks` - Number of chunks of sketches, indicating that 156 | /// the number of dimensions in the Hamming space is `num_chunks*64`. 157 | /// 158 | /// # Notes 159 | /// 160 | /// The progress is not printed even if `shows_progress = true`. 161 | pub fn build_sketches_in_parallel( 162 | mut self, 163 | documents: I, 164 | num_chunks: usize, 165 | ) -> Result 166 | where 167 | I: Iterator + Send, 168 | D: AsRef + Send, 169 | { 170 | let extractor = FeatureExtractor::new(&self.config); 171 | #[allow(clippy::mutex_atomic)] 172 | let processed = Mutex::new(0usize); 173 | let mut sketches: Vec<_> = documents 174 | .into_iter() 175 | .enumerate() 176 | .par_bridge() 177 | .map(|(i, doc)| { 178 | #[allow(clippy::mutex_atomic)] 179 | { 180 | // Mutex::lock also locks eprintln. 181 | let mut cnt = processed.lock().unwrap(); 182 | *cnt += 1; 183 | if self.shows_progress && *cnt % 10000 == 0 { 184 | eprintln!("Processed {} documents...", *cnt); 185 | } 186 | } 187 | let doc = doc.as_ref(); 188 | // TODO: Returns the error value (but I dont know the manner). 189 | assert!(!doc.is_empty(), "Input document must not be empty."); 190 | let mut feature = vec![]; 191 | extractor.extract_with_weights(doc, &mut feature); 192 | if let Some(tf) = self.tf.as_ref() { 193 | tf.tf(&mut feature); 194 | } 195 | if let Some(idf) = self.idf.as_ref() { 196 | for (term, weight) in feature.iter_mut() { 197 | *weight *= idf.idf(*term); 198 | } 199 | } 200 | let mut gen = self.hasher.iter(&feature); 201 | let sketch: Vec<_> = (0..num_chunks).map(|_| gen.next().unwrap()).collect(); 202 | (i, sketch) 203 | }) 204 | .collect(); 205 | sketches.par_sort_by_key(|&(i, _)| i); 206 | 207 | let mut joiner = ChunkedJoiner::::new(num_chunks).shows_progress(self.shows_progress); 208 | for (_, sketch) in sketches { 209 | joiner.add(sketch).unwrap(); 210 | } 211 | self.joiner = Some(joiner); 212 | Ok(self) 213 | } 214 | 215 | /// Searches for all pairs of similar documents within an input radius, returning 216 | /// triplets of the left-side id, the right-side id, and their distance. 217 | pub fn search_similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)> { 218 | self.joiner.as_ref().unwrap().similar_pairs(radius) 219 | } 220 | 221 | /// Gets the number of input documents. 
222 | pub fn len(&self) -> usize { 223 | self.joiner 224 | .as_ref() 225 | .map_or(0, |joiner| joiner.num_sketches()) 226 | } 227 | 228 | /// Checks if the database is empty. 229 | pub fn is_empty(&self) -> bool { 230 | self.len() == 0 231 | } 232 | 233 | /// Gets the memory usage in bytes. 234 | pub fn memory_in_bytes(&self) -> usize { 235 | self.joiner 236 | .as_ref() 237 | .map_or(0, |joiner| joiner.memory_in_bytes()) 238 | } 239 | 240 | /// Gets the configure of feature extraction. 241 | pub const fn config(&self) -> &FeatureConfig { 242 | &self.config 243 | } 244 | } 245 | -------------------------------------------------------------------------------- /find-simdoc/src/errors.rs: -------------------------------------------------------------------------------- 1 | //! Error definitions. 2 | use std::error::Error; 3 | use std::{fmt, result}; 4 | 5 | /// A specialized Result type for this library. 6 | pub type Result = result::Result; 7 | 8 | /// Errors in this library. 9 | #[derive(Debug)] 10 | pub enum FindSimdocError { 11 | /// Contains [`InputError`]. 12 | Input(InputError), 13 | } 14 | 15 | impl fmt::Display for FindSimdocError { 16 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 17 | match self { 18 | Self::Input(e) => e.fmt(f), 19 | } 20 | } 21 | } 22 | 23 | impl Error for FindSimdocError {} 24 | 25 | impl FindSimdocError { 26 | pub(crate) const fn input(msg: &'static str) -> Self { 27 | Self::Input(InputError { msg }) 28 | } 29 | } 30 | 31 | /// Error used when the input argument is invalid. 32 | #[derive(Debug)] 33 | pub struct InputError { 34 | msg: &'static str, 35 | } 36 | 37 | impl fmt::Display for InputError { 38 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 39 | write!(f, "InputError: {}", self.msg) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /find-simdoc/src/feature.rs: -------------------------------------------------------------------------------- 1 | //! Feature extractor. 2 | use std::hash::{BuildHasher, Hash, Hasher}; 3 | use std::ops::Range; 4 | 5 | use ahash::RandomState; 6 | use rand::{RngCore, SeedableRng}; 7 | 8 | use crate::errors::{FindSimdocError, Result}; 9 | use crate::shingling::ShingleIter; 10 | 11 | /// Configuration of feature extraction. 12 | #[derive(Clone, Debug)] 13 | pub struct FeatureConfig { 14 | window_size: usize, 15 | delimiter: Option, 16 | build_hasher: RandomState, 17 | } 18 | 19 | impl FeatureConfig { 20 | /// Creates an instance. 21 | /// 22 | /// # Arguments 23 | /// 24 | /// * `window_size` - Window size for w-shingling in feature extraction (must be more than 0). 25 | /// * `delimiter` - Delimiter for recognizing words as tokens in feature extraction. 26 | /// If `None`, characters are used for tokens. 27 | /// * `seed` - Seed value for random values. 
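    ///
    /// # Errors
    ///
    /// An error is returned when `window_size` is 0.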
28 | pub fn new(window_size: usize, delimiter: Option, seed: u64) -> Result { 29 | if window_size == 0 { 30 | return Err(FindSimdocError::input("Window size must not be 0.")); 31 | } 32 | let mut seeder = rand_xoshiro::SplitMix64::seed_from_u64(seed); 33 | let build_hasher = RandomState::with_seeds( 34 | seeder.next_u64(), 35 | seeder.next_u64(), 36 | seeder.next_u64(), 37 | seeder.next_u64(), 38 | ); 39 | Ok(Self { 40 | window_size, 41 | delimiter, 42 | build_hasher, 43 | }) 44 | } 45 | 46 | fn hash(&self, iter: I) -> u64 47 | where 48 | I: IntoIterator, 49 | T: Hash, 50 | { 51 | let mut s = self.build_hasher.build_hasher(); 52 | for t in iter { 53 | t.hash(&mut s); 54 | } 55 | s.finish() 56 | } 57 | } 58 | 59 | /// Extractor of feature vectors. 60 | pub struct FeatureExtractor<'a> { 61 | config: &'a FeatureConfig, 62 | } 63 | 64 | impl<'a> FeatureExtractor<'a> { 65 | /// Creates an instance. 66 | pub const fn new(config: &'a FeatureConfig) -> Self { 67 | Self { config } 68 | } 69 | 70 | /// Extracts a feature vector from an input text. 71 | pub fn extract(&self, text: S, feature: &mut Vec) 72 | where 73 | S: AsRef, 74 | { 75 | let text = text.as_ref(); 76 | 77 | feature.clear(); 78 | if self.config.delimiter.is_none() && self.config.window_size == 1 { 79 | // The simplest case. 80 | text.chars().for_each(|c| feature.push(c as u64)); 81 | } else { 82 | let token_ranges = self.tokenize(text); 83 | for ranges in ShingleIter::new(&token_ranges, self.config.window_size) { 84 | feature.push(self.config.hash(ranges.iter().cloned().map(|r| &text[r]))); 85 | } 86 | } 87 | } 88 | 89 | /// Extracts a feature vector from an input text with weights of 1.0. 90 | pub fn extract_with_weights(&self, text: S, feature: &mut Vec<(u64, f64)>) 91 | where 92 | S: AsRef, 93 | { 94 | let text = text.as_ref(); 95 | 96 | feature.clear(); 97 | if self.config.delimiter.is_none() && self.config.window_size == 1 { 98 | // The simplest case. 
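            // (With no delimiter and a window size of 1, each Unicode scalar value
            // is used directly as the feature id, so no hashing is needed.)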
99 | text.chars().for_each(|c| { 100 | let f = c as u64; 101 | let w = 1.; 102 | feature.push((f, w)) 103 | }); 104 | } else { 105 | let token_ranges = self.tokenize(text); 106 | for ranges in ShingleIter::new(&token_ranges, self.config.window_size) { 107 | let f = self.config.hash(ranges.iter().cloned().map(|r| &text[r])); 108 | let w = 1.; 109 | feature.push((f, w)) 110 | } 111 | } 112 | } 113 | 114 | fn tokenize(&self, text: &str) -> Vec> { 115 | let mut token_ranges = vec![]; 116 | for _ in 1..self.config.window_size { 117 | token_ranges.push(0..0); // BOS 118 | } 119 | let mut offset = 0; 120 | if let Some(delim) = self.config.delimiter { 121 | while offset < text.len() { 122 | let len = text[offset..].find(delim); 123 | if let Some(len) = len { 124 | token_ranges.push(offset..offset + len); 125 | offset += len + 1; 126 | } else { 127 | token_ranges.push(offset..text.len()); 128 | break; 129 | } 130 | } 131 | } else { 132 | for c in text.chars() { 133 | let len = c.len_utf8(); 134 | token_ranges.push(offset..offset + len); 135 | offset += len; 136 | } 137 | } 138 | for _ in 1..self.config.window_size { 139 | token_ranges.push(text.len()..text.len()); // EOS 140 | } 141 | token_ranges 142 | } 143 | } 144 | 145 | #[cfg(test)] 146 | mod tests { 147 | use super::*; 148 | 149 | #[test] 150 | fn test_char_unigram() { 151 | let config = FeatureConfig::new(1, None, 42).unwrap(); 152 | let extractor = FeatureExtractor::new(&config); 153 | 154 | let text = "abcd"; 155 | let mut feature = vec![]; 156 | 157 | extractor.extract(text, &mut feature); 158 | assert_eq!( 159 | feature, 160 | vec!['a' as u64, 'b' as u64, 'c' as u64, 'd' as u64] 161 | ) 162 | } 163 | 164 | #[test] 165 | fn test_char_bigram() { 166 | let config = FeatureConfig::new(2, None, 42).unwrap(); 167 | let extractor = FeatureExtractor::new(&config); 168 | 169 | let text = "abcd"; 170 | let mut feature = vec![]; 171 | 172 | extractor.extract(text, &mut feature); 173 | assert_eq!( 174 | feature, 175 | vec![ 176 | config.hash(&["", "a"]), 177 | config.hash(&["a", "b"]), 178 | config.hash(&["b", "c"]), 179 | config.hash(&["c", "d"]), 180 | config.hash(&["d", ""]), 181 | ] 182 | ) 183 | } 184 | 185 | #[test] 186 | fn test_char_trigram() { 187 | let config = FeatureConfig::new(3, None, 42).unwrap(); 188 | let extractor = FeatureExtractor::new(&config); 189 | 190 | let text = "abcd"; 191 | let mut feature = vec![]; 192 | 193 | extractor.extract(text, &mut feature); 194 | assert_eq!( 195 | feature, 196 | vec![ 197 | config.hash(&["", "", "a"]), 198 | config.hash(&["", "a", "b"]), 199 | config.hash(&["a", "b", "c"]), 200 | config.hash(&["b", "c", "d"]), 201 | config.hash(&["c", "d", ""]), 202 | config.hash(&["d", "", ""]), 203 | ] 204 | ) 205 | } 206 | 207 | #[test] 208 | fn test_word_unigram() { 209 | let config = FeatureConfig::new(1, Some(' '), 42).unwrap(); 210 | let extractor = FeatureExtractor::new(&config); 211 | 212 | let text = "abc de fgh"; 213 | let mut feature = vec![]; 214 | 215 | extractor.extract(text, &mut feature); 216 | assert_eq!( 217 | feature, 218 | vec![ 219 | config.hash(&["abc"]), 220 | config.hash(&["de"]), 221 | config.hash(&["fgh"]), 222 | ] 223 | ) 224 | } 225 | 226 | #[test] 227 | fn test_word_bigram() { 228 | let config = FeatureConfig::new(2, Some(' '), 42).unwrap(); 229 | let extractor = FeatureExtractor::new(&config); 230 | 231 | let text = "abc de fgh"; 232 | let mut feature = vec![]; 233 | 234 | extractor.extract(text, &mut feature); 235 | assert_eq!( 236 | feature, 237 | vec![ 238 | config.hash(&["", 
"abc"]), 239 | config.hash(&["abc", "de"]), 240 | config.hash(&["de", "fgh"]), 241 | config.hash(&["fgh", ""]), 242 | ] 243 | ) 244 | } 245 | 246 | #[test] 247 | fn test_word_trigram() { 248 | let config = FeatureConfig::new(3, Some(' '), 42).unwrap(); 249 | let extractor = FeatureExtractor::new(&config); 250 | 251 | let text = "abc de fgh"; 252 | let mut feature = vec![]; 253 | 254 | extractor.extract(text, &mut feature); 255 | assert_eq!( 256 | feature, 257 | vec![ 258 | config.hash(&["", "", "abc"]), 259 | config.hash(&["", "abc", "de"]), 260 | config.hash(&["abc", "de", "fgh"]), 261 | config.hash(&["de", "fgh", ""]), 262 | config.hash(&["fgh", "", ""]), 263 | ] 264 | ) 265 | } 266 | } 267 | -------------------------------------------------------------------------------- /find-simdoc/src/jaccard.rs: -------------------------------------------------------------------------------- 1 | //! Searcher for all pairs of similar documents in the Jaccard space. 2 | use std::sync::Mutex; 3 | 4 | use crate::errors::{FindSimdocError, Result}; 5 | use crate::feature::{FeatureConfig, FeatureExtractor}; 6 | use crate::lsh::minhash::MinHasher; 7 | 8 | use all_pairs_hamming::chunked_join::ChunkedJoiner; 9 | use rand::{RngCore, SeedableRng}; 10 | use rayon::prelude::*; 11 | 12 | /// Searcher for all pairs of similar documents in the Jaccard space. 13 | /// 14 | /// # Approach 15 | /// 16 | /// The search steps consist of 17 | /// 18 | /// 1. Extracts features from documents, 19 | /// where a feature is a set representation of character or word ngrams. 20 | /// 2. Convert the features into binary sketches through the [1-bit minwise hashing](https://dl.acm.org/doi/abs/10.1145/1772690.1772759). 21 | /// 3. Search for similar sketches in the Hamming space using [`ChunkedJoiner`]. 22 | /// 23 | /// # Examples 24 | /// 25 | /// ``` 26 | /// use find_simdoc::JaccardSearcher; 27 | /// 28 | /// let documents = vec![ 29 | /// "Welcome to Jimbocho, the town of books and curry!", 30 | /// "Welcome to Jimbocho, the city of books and curry!", 31 | /// "We welcome you to Jimbocho, the town of books and curry.", 32 | /// "Welcome to the town of books and curry, Jimbocho!", 33 | /// ]; 34 | /// 35 | /// // Creates a searcher for character trigrams (with random seed value 42). 36 | /// let searcher = JaccardSearcher::new(3, None, Some(42)) 37 | /// .unwrap() 38 | /// // Builds the database of binary sketches converted from input documents, 39 | /// // where binary sketches are in the Hamming space of 20*64 dimensions. 40 | /// .build_sketches_in_parallel(documents.iter(), 20) 41 | /// .unwrap(); 42 | /// 43 | /// // Searches all similar pairs within radius 0.25. 44 | /// let results = searcher.search_similar_pairs(0.25); 45 | /// ``` 46 | pub struct JaccardSearcher { 47 | config: FeatureConfig, 48 | hasher: MinHasher, 49 | joiner: Option>, 50 | shows_progress: bool, 51 | } 52 | 53 | impl JaccardSearcher { 54 | /// Creates an instance. 55 | /// 56 | /// # Arguments 57 | /// 58 | /// * `window_size` - Window size for w-shingling in feature extraction (must be more than 0). 59 | /// * `delimiter` - Delimiter for recognizing words as tokens in feature extraction. 60 | /// If `None`, characters are used for tokens. 61 | /// * `seed` - Seed value for random values. 
62 | pub fn new(window_size: usize, delimiter: Option, seed: Option) -> Result { 63 | let seed = seed.unwrap_or_else(rand::random::); 64 | let mut seeder = rand_xoshiro::SplitMix64::seed_from_u64(seed); 65 | let config = FeatureConfig::new(window_size, delimiter, seeder.next_u64())?; 66 | let hasher = MinHasher::new(seeder.next_u64()); 67 | Ok(Self { 68 | config, 69 | hasher, 70 | joiner: None, 71 | shows_progress: false, 72 | }) 73 | } 74 | 75 | /// Shows the progress via the standard error output? 76 | pub const fn shows_progress(mut self, yes: bool) -> Self { 77 | self.shows_progress = yes; 78 | self 79 | } 80 | 81 | /// Builds the database of sketches from input documents. 82 | /// 83 | /// # Arguments 84 | /// 85 | /// * `documents` - List of documents (must not include an empty string). 86 | /// * `num_chunks` - Number of chunks of sketches, indicating that 87 | /// the number of dimensions in the Hamming space is `num_chunks*64`. 88 | pub fn build_sketches(mut self, documents: I, num_chunks: usize) -> Result 89 | where 90 | I: IntoIterator, 91 | D: AsRef, 92 | { 93 | let mut joiner = ChunkedJoiner::::new(num_chunks).shows_progress(self.shows_progress); 94 | let extractor = FeatureExtractor::new(&self.config); 95 | 96 | let mut feature = vec![]; 97 | for (i, doc) in documents.into_iter().enumerate() { 98 | if self.shows_progress && (i + 1) % 10000 == 0 { 99 | eprintln!("Processed {} documents...", i + 1); 100 | } 101 | let doc = doc.as_ref(); 102 | if doc.is_empty() { 103 | return Err(FindSimdocError::input("Input document must not be empty.")); 104 | } 105 | extractor.extract(doc, &mut feature); 106 | joiner.add(self.hasher.iter(&feature)).unwrap(); 107 | } 108 | self.joiner = Some(joiner); 109 | Ok(self) 110 | } 111 | 112 | /// Builds the database of sketches from input documents in parallel. 113 | /// 114 | /// # Arguments 115 | /// 116 | /// * `documents` - List of documents (must not include an empty string). 117 | /// * `num_chunks` - Number of chunks of sketches, indicating that 118 | /// the number of dimensions in the Hamming space is `num_chunks*64`. 119 | /// 120 | /// # Notes 121 | /// 122 | /// The progress is not printed even if `shows_progress = true`. 123 | pub fn build_sketches_in_parallel( 124 | mut self, 125 | documents: I, 126 | num_chunks: usize, 127 | ) -> Result 128 | where 129 | I: Iterator + Send, 130 | D: AsRef + Send, 131 | { 132 | let extractor = FeatureExtractor::new(&self.config); 133 | #[allow(clippy::mutex_atomic)] 134 | let processed = Mutex::new(0usize); 135 | let mut sketches: Vec<_> = documents 136 | .into_iter() 137 | .enumerate() 138 | .par_bridge() 139 | .map(|(i, doc)| { 140 | #[allow(clippy::mutex_atomic)] 141 | { 142 | // Mutex::lock also locks eprintln. 143 | let mut cnt = processed.lock().unwrap(); 144 | *cnt += 1; 145 | if self.shows_progress && *cnt % 10000 == 0 { 146 | eprintln!("Processed {} documents...", *cnt); 147 | } 148 | } 149 | let doc = doc.as_ref(); 150 | // TODO: Returns the error value (but I dont know the manner). 
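                // (One possible manner: have the closure return a `Result` per
                // document and collect into `Result<Vec<_>, _>` after the parallel
                // bridge; rayon supports short-circuiting collection over `Result`,
                // which would replace this panic with a proper error value.)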
151 |                 assert!(!doc.is_empty(), "Input document must not be empty.");
152 |                 let mut feature = vec![];
153 |                 extractor.extract(doc, &mut feature);
154 |                 let mut gen = self.hasher.iter(&feature);
155 |                 let sketch: Vec<_> = (0..num_chunks).map(|_| gen.next().unwrap()).collect();
156 |                 (i, sketch)
157 |             })
158 |             .collect();
159 |         sketches.par_sort_by_key(|&(i, _)| i);
160 | 
161 |         let mut joiner = ChunkedJoiner::<u64>::new(num_chunks).shows_progress(self.shows_progress);
162 |         for (_, sketch) in sketches {
163 |             joiner.add(sketch).unwrap();
164 |         }
165 |         self.joiner = Some(joiner);
166 |         Ok(self)
167 |     }
168 | 
169 |     /// Searches for all pairs of similar documents within an input radius, returning
170 |     /// triplets of the left-side id, the right-side id, and their distance.
171 |     pub fn search_similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)> {
172 |         self.joiner.as_ref().map_or_else(Vec::new, |joiner| {
173 |             // In 1-bit minhash, the collision probability is twice that of the original minhash.
174 |             // Thus, we search the Hamming space with half of the actual radius.
175 |             let mut results = joiner.similar_pairs(radius / 2.);
176 |             // Maps the Hamming distances back onto Jaccard distances.
177 |             results.iter_mut().for_each(|(_, _, d)| *d *= 2.);
178 |             results
179 |         })
180 |     }
181 | 
182 |     /// Gets the number of input documents.
183 |     pub fn len(&self) -> usize {
184 |         self.joiner
185 |             .as_ref()
186 |             .map_or(0, |joiner| joiner.num_sketches())
187 |     }
188 | 
189 |     /// Checks if the database is empty.
190 |     pub fn is_empty(&self) -> bool {
191 |         self.len() == 0
192 |     }
193 | 
194 |     /// Gets the memory usage in bytes.
195 |     pub fn memory_in_bytes(&self) -> usize {
196 |         self.joiner
197 |             .as_ref()
198 |             .map_or(0, |joiner| joiner.memory_in_bytes())
199 |     }
200 | 
201 |     /// Gets the configuration of feature extraction.
202 |     pub const fn config(&self) -> &FeatureConfig {
203 |         &self.config
204 |     }
205 | }
206 | 
--------------------------------------------------------------------------------
/find-simdoc/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! Time- and memory-efficient all-pairs similarity searches in documents.
2 | //! A more detailed description can be found on the [project page](https://github.com/legalforce-research/find-simdoc).
3 | //!
4 | //! # Problem definition
5 | //!
6 | //! - Input
7 | //!   - List of documents
8 | //!   - Distance function
9 | //!   - Radius threshold
10 | //! - Output
11 | //!   - All pairs of similar document ids
12 | //!
13 | //! # Features
14 | //!
15 | //! ## Easy to use
16 | //!
17 | //! This software supports all essential steps of document similarity search,
18 | //! from feature extraction to output of similar pairs.
19 | //! Therefore, you can immediately run fast all-pairs similarity searches on your own document files.
20 | //!
21 | //! ## Flexible tokenization
22 | //!
23 | //! You can specify any delimiter when splitting words in tokenization for feature extraction.
24 | //! This can be useful in languages where multiple definitions of words exist, such as Japanese or Chinese.
25 | //!
26 | //! ## Time and memory efficiency
27 | //!
28 | //! The time and memory complexities are *linear* over the numbers of input documents and output results,
29 | //! building on the ideas behind locality-sensitive hashing (LSH) and the [sketch sorting approach](https://proceedings.mlr.press/v13/tabei10a.html).
30 | //!
31 | //! ## Tunable search performance
32 | //!
33 | //! LSH allows tuning of accuracy, time, and memory through a manual parameter that specifies the number of search dimensions.
34 | //! You can flexibly adapt searches to your dataset and machine environment:
35 | //! - Specifying lower dimensions allows for faster and rougher searches with less memory usage.
36 | //! - Specifying higher dimensions allows for more accurate searches with more memory usage.
37 | //!
38 | //! # Search steps
39 | //!
40 | //! 1. Extract features from documents
41 | //!    - Set representation of character or word ngrams
42 | //!    - Tfidf-weighted vector representation of character or word ngrams
43 | //! 2. Convert the features into binary sketches through locality sensitive hashing
44 | //!    - [1-bit minwise hashing](https://dl.acm.org/doi/abs/10.1145/1772690.1772759) for the Jaccard similarity
45 | //!    - [Simplified simhash](https://dl.acm.org/doi/10.1145/1242572.1242592) for the Cosine similarity
46 | //! 3. Search for similar sketches in the Hamming space using a modified variant of the [sketch sorting approach](https://proceedings.mlr.press/v13/tabei10a.html)
47 | #![deny(missing_docs)]
48 | 
49 | pub mod cosine;
50 | pub mod errors;
51 | pub mod feature;
52 | pub mod jaccard;
53 | pub mod lsh;
54 | pub mod tfidf;
55 | 
56 | mod shingling;
57 | 
58 | pub use cosine::CosineSearcher;
59 | pub use jaccard::JaccardSearcher;
60 | 
--------------------------------------------------------------------------------
/find-simdoc/src/lsh.rs:
--------------------------------------------------------------------------------
1 | //! Locality-sensitive hashings.
2 | pub mod minhash;
3 | pub mod simhash;
4 | 
5 | use std::hash::Hash;
6 | 
7 | use hashbrown::HashSet;
8 | use rand_xoshiro::rand_core::{RngCore, SeedableRng};
9 | 
10 | /// Generates a hash value.
11 | #[inline(always)]
12 | pub(crate) fn hash_u64(x: u64, seed: u64) -> u64 {
13 |     rand_xoshiro::SplitMix64::seed_from_u64(x ^ seed).next_u64()
14 | }
15 | 
16 | /// Computes the Jaccard distance.
17 | ///
18 | /// # Examples
19 | ///
20 | /// ```
21 | /// use find_simdoc::lsh::jaccard_distance;
22 | ///
23 | /// let x = vec![1, 2, 4];
24 | /// let y = vec![1, 2, 5, 7];
25 | /// assert_eq!(jaccard_distance(x, y), 0.6);
26 | /// ```
27 | pub fn jaccard_distance<I, T>(lhs: I, rhs: I) -> f64
28 | where
29 |     I: IntoIterator<Item = T>,
30 |     T: Hash + Eq,
31 | {
32 |     let a = HashSet::<T>::from_iter(lhs);
33 |     let b = HashSet::<T>::from_iter(rhs);
34 |     1. - (a.intersection(&b).count() as f64) / (a.union(&b).count() as f64)
35 | }
36 | 
--------------------------------------------------------------------------------
/find-simdoc/src/lsh/minhash.rs:
--------------------------------------------------------------------------------
1 | //! 1-bit minwise hashing for the Jaccard similarity.
2 | use rand_xoshiro::rand_core::{RngCore, SeedableRng};
3 | 
4 | /// [1-bit minwise hashing](https://dl.acm.org/doi/abs/10.1145/1772690.1772759) for the Jaccard similarity.
5 | pub struct MinHasher {
6 |     seed: u64,
7 | }
8 | 
9 | impl MinHasher {
10 |     /// Creates an instance.
11 |     pub const fn new(seed: u64) -> Self {
12 |         Self { seed }
13 |     }
14 | 
15 |     /// Creates an iterator to generate sketches from an input feature.
16 |     pub fn iter<'a>(&self, feature: &'a [u64]) -> MinHashIter<'a> {
17 |         MinHashIter {
18 |             feature,
19 |             seeder: rand_xoshiro::SplitMix64::seed_from_u64(self.seed),
20 |         }
21 |     }
22 | }
23 | 
24 | /// Iterator to generate sketches with the 1-bit minwise hashing.
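///
/// Each call to `next()` packs 64 independent 1-bit minhashes into one `u64`,
/// so taking `k` items yields a `64*k`-dimensional binary sketch in the Hamming
/// space; note the iterator never terminates on its own. A minimal usage
/// sketch, where the feature values are arbitrary ids chosen for illustration:
///
/// ```
/// use find_simdoc::lsh::minhash::MinHasher;
///
/// let hasher = MinHasher::new(42);
/// let feature = vec![1u64, 4, 7];
/// // Takes two 64-bit chunks, i.e., a 128-dimensional binary sketch.
/// let sketch: Vec<u64> = hasher.iter(&feature).take(2).collect();
/// assert_eq!(sketch.len(), 2);
/// ```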
25 | pub struct MinHashIter<'a> {
26 |     feature: &'a [u64],
27 |     seeder: rand_xoshiro::SplitMix64,
28 | }
29 | 
30 | impl Iterator for MinHashIter<'_> {
31 |     type Item = u64;
32 | 
33 |     fn next(&mut self) -> Option<Self::Item> {
34 |         let mut x = 0;
35 |         for _ in 0..64 {
36 |             let seed = self.seeder.next_u64();
37 |             let h = self
38 |                 .feature
39 |                 .iter()
40 |                 .map(|&i| crate::lsh::hash_u64(i, seed))
41 |                 .min()
42 |                 .unwrap();
43 |             x = (x << 1) | (h & 1);
44 |         }
45 |         Some(x)
46 |     }
47 | }
48 | 
--------------------------------------------------------------------------------
/find-simdoc/src/lsh/simhash.rs:
--------------------------------------------------------------------------------
1 | //! Simplified simhash for the Cosine similarity.
2 | use rand_xoshiro::rand_core::{RngCore, SeedableRng};
3 | 
4 | /// [Simplified simhash](https://dl.acm.org/doi/10.1145/2063576.2063737) for Cosine similarity.
5 | pub struct SimHasher {
6 |     seed: u64,
7 | }
8 | 
9 | impl SimHasher {
10 |     /// Creates an instance.
11 |     pub const fn new(seed: u64) -> Self {
12 |         Self { seed }
13 |     }
14 | 
15 |     /// Creates an iterator to generate sketches from an input feature.
16 |     pub fn iter<'a>(&self, feature: &'a [(u64, f64)]) -> SimHashIter<'a> {
17 |         SimHashIter {
18 |             feature,
19 |             seeder: rand_xoshiro::SplitMix64::seed_from_u64(self.seed),
20 |             weights: [0.; 64],
21 |         }
22 |     }
23 | }
24 | 
25 | /// Iterator to generate sketches with the simplified simhash.
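///
/// Each call to `next()` computes 64 signed random projections of the weighted
/// feature and packs their signs into one `u64`. A minimal usage sketch, where
/// the `(term id, weight)` pairs are arbitrary values chosen for illustration:
///
/// ```
/// use find_simdoc::lsh::simhash::SimHasher;
///
/// let hasher = SimHasher::new(42);
/// let feature = vec![(1u64, 0.5), (4, 0.25), (7, 0.25)];
/// let sketch: Vec<u64> = hasher.iter(&feature).take(2).collect();
/// assert_eq!(sketch.len(), 2);
/// ```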
26 | pub struct SimHashIter<'a> {
27 |     feature: &'a [(u64, f64)],
28 |     seeder: rand_xoshiro::SplitMix64,
29 |     weights: [f64; 64],
30 | }
31 | 
32 | impl Iterator for SimHashIter<'_> {
33 |     type Item = u64;
34 | 
35 |     fn next(&mut self) -> Option<Self::Item> {
36 |         self.weights.fill(0.);
37 |         let seed = self.seeder.next_u64();
38 |         for (h, x) in self
39 |             .feature
40 |             .iter()
41 |             .map(|&(i, x)| (crate::lsh::hash_u64(i, seed), x))
42 |         {
43 |             for (j, w) in self.weights.iter_mut().enumerate() {
44 |                 if (h >> j) & 1 == 0 {
45 |                     *w += x;
46 |                 } else {
47 |                     *w -= x;
48 |                 }
49 |             }
50 |         }
51 |         Some(
52 |             self.weights
53 |                 .iter()
54 |                 .fold(0, |acc, w| if *w >= 0. { (acc << 1) | 1 } else { acc << 1 }),
55 |         )
56 |     }
57 | }
58 | 
--------------------------------------------------------------------------------
/find-simdoc/src/shingling.rs:
--------------------------------------------------------------------------------
1 | pub struct ShingleIter<'a, T> {
2 |     tokens: &'a [T],
3 |     window_size: usize,
4 |     position: usize,
5 | }
6 | 
7 | impl<'a, T> ShingleIter<'a, T> {
8 |     pub fn new(tokens: &'a [T], window_size: usize) -> Self {
9 |         assert!(!tokens.is_empty());
10 |         assert!(window_size <= tokens.len());
11 |         Self {
12 |             tokens,
13 |             window_size,
14 |             position: 0,
15 |         }
16 |     }
17 | }
18 | 
19 | impl<'a, T> Iterator for ShingleIter<'a, T> {
20 |     type Item = &'a [T];
21 | 
22 |     fn next(&mut self) -> Option<Self::Item> {
23 |         if self.tokens.len() < self.position + self.window_size {
24 |             return None;
25 |         }
26 |         let window = &self.tokens[self.position..self.position + self.window_size];
27 |         self.position += 1;
28 |         Some(window)
29 |     }
30 | }
31 | 
32 | #[cfg(test)]
33 | mod tests {
34 |     use super::*;
35 | 
36 |     #[test]
37 |     fn test_q1() {
38 |         let tokens = vec!["a", "b", "c"];
39 |         let mut iter = ShingleIter::new(&tokens, 1);
40 |         assert_eq!(iter.next(), Some(&tokens[0..1]));
41 |         assert_eq!(iter.next(), Some(&tokens[1..2]));
42 |         assert_eq!(iter.next(), Some(&tokens[2..3]));
43 |         assert_eq!(iter.next(), None);
44 |     }
45 | 
46 |     #[test]
47 |     fn test_q2() {
48 |         let tokens = vec!["a", "b", "c"];
49 |         let mut iter = ShingleIter::new(&tokens, 2);
50 |         assert_eq!(iter.next(), Some(&tokens[0..2]));
51 |         assert_eq!(iter.next(), Some(&tokens[1..3]));
52 |         assert_eq!(iter.next(), None);
53 |     }
54 | 
55 |     #[test]
56 |     fn test_q3() {
57 |         let tokens = vec!["a", "b", "c"];
58 |         let mut iter = ShingleIter::new(&tokens, 3);
59 |         assert_eq!(iter.next(), Some(&tokens[0..3]));
60 |         assert_eq!(iter.next(), None);
61 |     }
62 | 
63 |     #[test]
64 |     #[should_panic]
65 |     fn test_q4() {
66 |         let tokens = vec!["a", "b", "c"];
67 |         ShingleIter::new(&tokens, 4);
68 |     }
69 | }
70 | 
--------------------------------------------------------------------------------
/find-simdoc/src/tfidf.rs:
--------------------------------------------------------------------------------
1 | //! Weighters of TF-IDF.
2 | use std::hash::Hash;
3 | 
4 | use hashbrown::{HashMap, HashSet};
5 | 
6 | use crate::errors::{FindSimdocError, Result};
7 | use crate::feature::{FeatureConfig, FeatureExtractor};
8 | 
9 | /// Weighter of inverse document frequency.
10 | #[derive(Default)]
11 | pub struct Idf<T> {
12 |     counter: HashMap<T, usize>,
13 |     dedup: HashSet<T>,
14 |     num_docs: usize,
15 |     smooth: bool,
16 | }
17 | 
18 | impl<T> Idf<T>
19 | where
20 |     T: Hash + Eq + Copy + Default,
21 | {
22 |     /// Creates an instance.
23 |     pub fn new() -> Self {
24 |         Self::default()
25 |     }
26 | 
27 |     /// Enables smoothing.
28 |     pub const fn smooth(mut self, yes: bool) -> Self {
29 |         self.smooth = yes;
30 |         self
31 |     }
32 | 
33 |     /// Trains the frequency of terms for a document.
34 |     pub fn add(&mut self, terms: &[T]) {
35 |         self.dedup.clear();
36 |         for &term in terms {
37 |             if self.dedup.insert(term) {
38 |                 self.counter
39 |                     .entry(term)
40 |                     .and_modify(|c| *c += 1)
41 |                     .or_insert(1);
42 |             }
43 |         }
44 |         self.num_docs += 1;
45 |     }
46 | 
47 |     /// Gets the number of input documents.
48 |     pub const fn num_docs(&self) -> usize {
49 |         self.num_docs
50 |     }
51 | 
52 |     /// Computes the IDF of an input term.
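    ///
    /// With smoothing enabled, this is `log10((num_docs + 1) / (df + 1)) + 1`,
    /// where `df` is the number of documents containing the term; without
    /// smoothing, the two `+ 1` terms inside the ratio are dropped. Querying a
    /// term that was never added panics on the internal `unwrap`. A minimal
    /// usage sketch with counts chosen for illustration:
    ///
    /// ```
    /// use find_simdoc::tfidf::Idf;
    ///
    /// let mut idf = Idf::new().smooth(true);
    /// idf.add(&['A', 'C']);
    /// idf.add(&['A']);
    /// idf.add(&['B']);
    /// // 'B' appears in 1 of 3 documents: log10((3+1)/(1+1)) + 1.
    /// assert_eq!(idf.idf('B'), (4f64 / 2f64).log10() + 1.);
    /// ```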
53 |     pub fn idf(&self, term: T) -> f64 {
54 |         let c = usize::from(self.smooth);
55 |         let n = (self.num_docs + c) as f64;
56 |         let m = (*self.counter.get(&term).unwrap() + c) as f64;
57 |         (n / m).log10() + 1.
58 |     }
59 | }
60 | 
61 | impl Idf<u64> {
62 |     /// Trains the term frequency of input documents.
63 |     ///
64 |     /// # Arguments
65 |     ///
66 |     /// * `documents` - List of documents.
67 |     /// * `config` - Configuration of feature extraction. Use the same configuration as that in search.
68 |     pub fn build<I, D>(mut self, documents: I, config: &FeatureConfig) -> Result<Self>
69 |     where
70 |         I: IntoIterator<Item = D>,
71 |         D: AsRef<str>,
72 |     {
73 |         let extractor = FeatureExtractor::new(config);
74 |         let mut feature = vec![];
75 |         for doc in documents {
76 |             let doc = doc.as_ref();
77 |             if doc.is_empty() {
78 |                 return Err(FindSimdocError::input("Input document must not be empty."));
79 |             }
80 |             extractor.extract(doc, &mut feature);
81 |             self.add(&feature);
82 |         }
83 |         Ok(self)
84 |     }
85 | }
86 | 
87 | /// Weighter of term frequency.
88 | #[derive(Default)]
89 | pub struct Tf {
90 |     sublinear: bool,
91 | }
92 | 
93 | impl Tf {
94 |     /// Creates an instance.
95 |     pub fn new() -> Self {
96 |         Self::default()
97 |     }
98 | 
99 |     /// Enables sublinear normalization.
100 |     pub const fn sublinear(mut self, yes: bool) -> Self {
101 |         self.sublinear = yes;
102 |         self
103 |     }
104 | 
105 |     /// Computes the TF of input terms.
106 |     pub fn tf<T>(&self, terms: &mut [(T, f64)])
107 |     where
108 |         T: Hash + Eq + Copy + Default,
109 |     {
110 |         let counter = self.count(terms);
111 |         let total = terms.len() as f64;
112 |         for (term, weight) in terms {
113 |             let cnt = *counter.get(term).unwrap() as f64;
114 |             *weight = if self.sublinear {
115 |                 cnt.log10() + 1.
116 |             } else {
117 |                 cnt / total
118 |             };
119 |         }
120 |     }
121 | 
122 |     fn count<T>(&self, terms: &[(T, f64)]) -> HashMap<T, usize>
123 |     where
124 |         T: Hash + Eq + Copy + Default,
125 |     {
126 |         let mut counter = HashMap::new();
127 |         for &(term, _) in terms.iter() {
128 |             counter.entry(term).and_modify(|c| *c += 1).or_insert(1);
129 |         }
130 |         counter
131 |     }
132 | }
133 | 
134 | #[cfg(test)]
135 | mod tests {
136 |     use std::vec;
137 | 
138 |     use super::*;
139 | 
140 |     #[test]
141 |     fn test_idf() {
142 |         let mut idf = Idf::new();
143 |         idf.add(&['A', 'A', 'C']);
144 |         idf.add(&['A', 'C']);
145 |         idf.add(&['B', 'A']);
146 | 
147 |         assert_eq!(idf.num_docs(), 3);
148 | 
149 |         idf = idf.smooth(false);
150 |         assert_eq!(idf.idf('A'), (3f64 / 3f64).log10() + 1.);
151 |         assert_eq!(idf.idf('B'), (3f64 / 1f64).log10() + 1.);
152 |         assert_eq!(idf.idf('C'), (3f64 / 2f64).log10() + 1.);
153 | 
154 |         idf = idf.smooth(true);
155 |         assert_eq!(idf.idf('A'), (4f64 / 4f64).log10() + 1.);
156 |         assert_eq!(idf.idf('B'), (4f64 / 2f64).log10() + 1.);
157 |         assert_eq!(idf.idf('C'), (4f64 / 3f64).log10() + 1.);
158 |     }
159 | 
160 |     #[test]
161 |     fn test_tf() {
162 |         let mut tf = Tf::new();
163 |         let mut terms = vec![('A', 0.), ('B', 0.), ('A', 0.)];
164 | 
165 |         tf = tf.sublinear(false);
166 |         tf.tf(&mut terms);
167 |         assert_eq!(
168 |             terms.clone(),
169 |             vec![('A', 2. / 3.), ('B', 1. / 3.), ('A', 2. / 3.)]
170 |         );
171 | 
172 |         tf = tf.sublinear(true);
173 |         tf.tf(&mut terms);
174 |         assert_eq!(
175 |             terms.clone(),
176 |             vec![
177 |                 ('A', 2f64.log10() + 1.),
178 |                 ('B', 1f64.log10() + 1.),
179 |                 ('A', 2f64.log10() + 1.)
180 |             ]
181 |         );
182 |     }
183 | }
184 | 
--------------------------------------------------------------------------------
/scripts/load_nltk_dataset.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | '''
3 | Download NLTK corpora and create a text file of sentences with lowercase letters and no duplicate lines.
4 | '''
5 | 
6 | import sys
7 | import nltk
8 | from argparse import ArgumentParser
9 | 
10 | 
11 | def download_reuters():
12 |     from nltk.corpus import reuters
13 |     nltk.download('reuters')
14 |     return reuters.sents()
15 | 
16 | 
17 | def download_gutenberg():
18 |     from nltk.corpus import gutenberg
19 |     nltk.download('gutenberg')
20 |     return gutenberg.sents()
21 | 
22 | 
23 | def download_webtext():
24 |     from nltk.corpus import webtext
25 |     nltk.download('webtext')
26 |     return webtext.sents()
27 | 
28 | 
29 | def download_brown():
30 |     from nltk.corpus import brown
31 |     nltk.download('brown')
32 |     return brown.sents()
33 | 
34 | 
35 | def download_inaugural():
36 |     from nltk.corpus import inaugural
37 |     nltk.download('inaugural')
38 |     return inaugural.sents()
39 | 
40 | 
41 | def main():
42 |     parser = ArgumentParser()
43 |     parser.add_argument('name')
44 |     args = parser.parse_args()
45 | 
46 |     nltk.download('punkt')
47 | 
48 |     if args.name == 'reuters':
49 |         sents = download_reuters()
50 |     elif args.name == 'gutenberg':
51 |         sents = download_gutenberg()
52 |     elif args.name == 'webtext':
53 |         sents = download_webtext()
54 |     elif args.name == 'brown':
55 |         sents = download_brown()
56 |     elif args.name == 'inaugural':
57 |         sents = download_inaugural()
58 |     else:
59 |         print(f'unsupported corpus name: {args.name}', file=sys.stderr)
60 |         return
61 | 
62 |     with open(f'{args.name}.txt', 'wt') as fout:
63 |         sents = [' '.join(sent).lower() for sent in sents]
64 |         for sent in set(sents):
65 |             fout.write(sent)
66 |             fout.write('\n')
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     main()
71 | 
--------------------------------------------------------------------------------