├── .github └── workflows │ └── rust.yml ├── .gitignore ├── CODEOWNERS ├── CONTRIBUTING.md ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── all-pairs-hamming ├── Cargo.toml ├── README.md ├── src │ ├── bitset64.rs │ ├── chunked_join.rs │ ├── errors.rs │ ├── lib.rs │ ├── multi_sort.rs │ ├── simple_join.rs │ └── sketch.rs └── timeperf │ ├── Cargo.toml │ └── src │ └── main.rs ├── figures ├── f1_reuters.svg ├── mae_reuters.svg └── recall_reuters.svg ├── find-simdoc-cli ├── Cargo.toml └── src │ ├── cosine.rs │ ├── dump.rs │ ├── jaccard.rs │ └── minhash_acc.rs ├── find-simdoc ├── Cargo.toml ├── README.md ├── examples │ ├── find_cosine.rs │ └── find_jaccard.rs └── src │ ├── cosine.rs │ ├── errors.rs │ ├── feature.rs │ ├── jaccard.rs │ ├── lib.rs │ ├── lsh.rs │ ├── lsh │ ├── minhash.rs │ └── simhash.rs │ ├── shingling.rs │ └── tfidf.rs └── scripts └── load_nltk_dataset.py /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | name: Check on ${{ matrix.rust }} 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | rust: 19 | - stable 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Install latest stable 23 | uses: actions-rs/toolchain@v1 24 | with: 25 | toolchain: ${{ matrix.rust }} 26 | override: true 27 | components: rustfmt, clippy 28 | 29 | - name: Run cargo check 30 | uses: actions-rs/cargo@v1 31 | with: 32 | command: check 33 | 34 | - name: Run cargo fmt 35 | uses: actions-rs/cargo@v1 36 | with: 37 | command: fmt 38 | args: --all -- --check 39 | 40 | - name: Run cargo clippy 41 | uses: actions-rs/cargo@v1 42 | with: 43 | command: clippy 44 | args: -- -D warnings -W clippy::nursery 45 | 46 | - name: Run cargo test 47 | uses: actions-rs/cargo@v1 48 | continue-on-error: ${{ matrix.rust == 'nightly' }} 49 | with: 50 | command: test 51 | args: --release --all-features 52 | 53 | - name: Run cargo doc 54 | uses: actions-rs/cargo@v1 55 | continue-on-error: ${{ matrix.rust == 'nightly' }} 56 | with: 57 | command: doc 58 | args: --no-deps 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | 12 | 13 | # Added by cargo 14 | 15 | /target 16 | 17 | # My def 18 | reuters.txt 19 | reuters.5k.txt 20 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @kampersanda 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | We'd love to accept your patches and contributions to this project. 4 | There are just a few small guidelines you need to follow. 
5 | 6 | - You need to acknowledge that your patches and contributions never conflict 7 | with any intellectual properties held by others than LegalForce. 8 | - All submissions, including submissions by project members, require review. 9 | We use GitHub pull requests for this purpose. 10 | Consult GitHub Help for more information on using pull requests. -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "all-pairs-hamming", 4 | "all-pairs-hamming/timeperf", 5 | "find-simdoc", 6 | "find-simdoc-cli", 7 | ] 8 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 legalforce-research 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Finding all pairs of similar documents 2 | 3 | [![Crates.io](https://img.shields.io/crates/v/find-simdoc)](https://crates.io/crates/find-simdoc) 4 | [![Documentation](https://docs.rs/find-simdoc/badge.svg)](https://docs.rs/find-simdoc) 5 | ![Build Status](https://github.com/legalforce-research/find-simdoc/actions/workflows/rust.yml/badge.svg) 6 | 7 | This software provides time- and memory-efficient all pairs similarity searches in documents. 8 | 9 | ## Problem definition 10 | 11 | - Input 12 | - List of documents $D = (d_1, d_2, \dots, d_n)$ 13 | - Distance function $\delta: D \times D \rightarrow [0,1]$ 14 | - Radius threshold $r \in [0,1]$ 15 | - Output 16 | - All pairs of similar document ids $R = \\{ (i,j): i < j, \delta(d_i, d_j) \leq r \\}$ 17 | 18 | ## Features 19 | 20 | ### Easy to use 21 | 22 | This software supports all essential steps of document similarity search, 23 | from feature extraction to output of similar pairs. 24 | Therefore, you can immediately try the fast all pairs similarity search using your document files. 25 | 26 | ### Flexible tokenization 27 | 28 | You can specify any delimiter when splitting words in tokenization for feature extraction. 29 | This can be useful in languages where multiple definitions of words exist, such as Japanese or Chinese. 30 | 31 | ### Time and memory efficiency 32 | 33 | The time and memory complexities are *linear* over the numbers of input documents and output results 34 | on the basis of the ideas behind the locality sensitive hashing (LSH) and [sketch sorting approach](https://proceedings.mlr.press/v13/tabei10a.html). 35 | 36 | ### Tunable search performance 37 | 38 | LSH allows tuning of performance in accuracy, time, and memory, through a manual parameter specifying search dimensions. 39 | You can flexibly perform searches depending on your dataset and machine environment. 40 | - Specifying lower dimensions allows for faster and rougher searches with less memory usage. 41 | - Specifying higher dimensions allows for more accurate searches with more memory usage. 42 | 43 | ### Pure Rust 44 | 45 | This software is implemented in Rust, achieving safe and fast performance. 46 | 47 | ## Running example 48 | 49 | Here, we describe the basic usage of this software through an example of running the CLI tool. 50 | 51 | First of all, install `rustc` and `cargo` following the [official instructions](https://www.rust-lang.org/tools/install) since this software is implemented in Rust. 52 | 53 | ### 1. Data preparation 54 | 55 | You have to prepare a text file containing documents line by line (NOT including empty lines). 56 | 57 | To produce an example file used throughout this description, you can use `scripts/load_nltk_dataset.py` that downloads the Reuters Corpus provided by NLTK. 58 | Run the following command. 59 | 60 | ``` 61 | $ ./scripts/load_nltk_dataset.py reuters 62 | ``` 63 | 64 | `reuters.txt` will be output. 65 | 66 | ``` 67 | $ head reuters.txt 68 | hre properties & lt ; hre > 1st qtr jan 31 net shr 38 cts vs 47 cts net 2 , 253 , 664 vs 2 , 806 , 820 gross income 5 , 173 , 318 vs 5 , 873 , 904 note : net includes gains on sale of real estate of 126 , 117 dlrs vs 29 , 812 dlrs . 
69 | the firm , however , is supplying temporary financing , and sources close to the transaction disputed the claim that the firm will not end up paying for its equity position .
70 | conoco , which has completed geological prospecting for the tunisian government , has transferred one third of its option rights in the region to ina , it said .
71 | " willis faber ' s stake in morgan grenfell has been a very successful investment ," it said .
72 | china reports 700 mln dlr two - month trade deficit china ' s trade deficit totalled 700 mln dlrs in the first two months of this year , according to figures released by the state statistics bureau .
73 | the treasury said baker and stoltenberg " are consulting with their g - 7 colleagues and are confident that this will enable them to foster exchange rate stability around current levels ."
74 | u . s . tariffs are due to take effect on april 17 .
75 | some dealers said there were growing signs the united states wanted the dollar to fall further .
76 | since last august smart has been leading talks to open up japan to purchases of more u . s .- made automotive parts .
77 | the resulting association will operate under the name of charter and will be based in bristol .
78 | ```
79 | 
80 | Fully duplicated documents in `reuters.txt` are removed because they add noise to the evaluation of similarity searches.
81 | In this process, the output lines are also shuffled, so your file will not be identical to the example above.
82 | 
83 | ### 2. Finding all pairs of similar documents
84 | 
85 | The workspace `find-simdoc-cli` provides CLI tools for fast all pairs similarity searches in documents.
86 | The approach consists of three steps:
87 | 
88 | 1. Extract features from documents
89 |    - Set representation of character or word ngrams
90 |    - Tfidf-weighted vector representation of character or word ngrams
91 | 2. Convert the features into binary sketches through locality sensitive hashing (LSH)
92 |    - [1-bit minwise hashing](https://dl.acm.org/doi/abs/10.1145/1772690.1772759) for the Jaccard similarity
93 |    - [Simplified simhash](https://dl.acm.org/doi/10.1145/1242572.1242592) for the Cosine similarity
94 | 3. Search for similar sketches in the Hamming space using a modified variant of the [sketch sorting approach](https://proceedings.mlr.press/v13/tabei10a.html)
95 | 
96 | #### 2.1 Jaccard space
97 | 
98 | The executable `jaccard` provides a similarity search in the [Jaccard space](https://en.wikipedia.org/wiki/Jaccard_index).
99 | You can check the arguments with the following command.
100 | 
101 | ```
102 | $ cargo run --release -p find-simdoc-cli --bin jaccard -- --help
103 | ```
104 | 
105 | Run the following command if you want to search for `reuters.txt` with
106 | 
107 | - search radius `0.1`,
108 | - tokens of character `5`-grams, and
109 | - `15*64=960` dimensions in the Hamming space.
110 | 
111 | ```
112 | $ cargo run --release -p find-simdoc-cli --bin jaccard -- -i reuters.txt -r 0.1 -w 5 -c 15 > result-jaccard.csv
113 | ```
114 | 
115 | Argument `-c` specifies the number of 64-bit chunks, i.e., the number of dimensions in the Hamming space,
116 | a trade-off parameter between approximation accuracy and search speed.
117 | The larger this value, the higher the accuracy, but the longer the search takes.
118 | [This section](#4-testing-the-accuracy-of-1-bit-minwise-hashing) describes how to examine the approximation accuracy for the number of dimensions.
119 | 
120 | Pairs of similar documents (indicated by zero-origin line numbers) and their distances are reported.
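Each record in the CSV is a plain `i,j,dist` triple, so the output is easy to post-process. As a minimal sketch (illustrative only, not part of this repository), the following standalone Rust program reads the CSV from stdin and keeps only pairs within an extra, tighter threshold:

```
use std::io::{self, BufRead};

// Reads `i,j,dist` records from stdin (skipping the header line)
// and prints only the pairs whose distance is at most 0.05.
fn main() {
    for line in io::stdin().lock().lines().skip(1) {
        let line = line.unwrap();
        let mut fields = line.split(',');
        let i: usize = fields.next().unwrap().parse().unwrap();
        let j: usize = fields.next().unwrap().parse().unwrap();
        let dist: f64 = fields.next().unwrap().parse().unwrap();
        if dist <= 0.05 {
            println!("{i},{j},{dist}");
        }
    }
}
```

The first records of `result-jaccard.csv` look like the following.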
121 | 122 | ``` 123 | $ head result-jaccard.csv 124 | i,j,dist 125 | 191,29637,0.07291666666666667 126 | 199,38690,0.0375 127 | 274,10048,0.07083333333333333 128 | 294,27675,0.04791666666666667 129 | 311,13812,0.04583333333333333 130 | 361,50938,0.08958333333333333 131 | 469,6360,0.035416666666666666 132 | 546,10804,0.0875 133 | 690,28281,0.0875 134 | ``` 135 | 136 | #### 2.2 Cosine space 137 | 138 | The executable `cosine` provides a similarity search in the [Cosine space](https://en.wikipedia.org/wiki/Cosine_similarity). 139 | You can check the arguments with the following command. 140 | 141 | ``` 142 | $ cargo run --release -p find-simdoc-cli --bin cosine -- --help 143 | ``` 144 | 145 | Run the following command if you want to search for `reuters.txt` with 146 | 147 | - search radius `0.1`, 148 | - tokens of word `3`-grams, 149 | - word delimiter `" "` (i.e., a space), 150 | - `10*64=640` dimensions in the Hamming space, and 151 | - weighting using the standard TF and the smoothed IDF. 152 | 153 | ``` 154 | $ cargo run --release -p find-simdoc-cli --bin cosine -- -i reuters.txt -r 0.1 -d " " -w 3 -c 10 -T standard -I smooth > result-cosine.csv 155 | ``` 156 | 157 | Pairs of similar documents (indicated by zero-origin line numbers) and their distances are reported. 158 | 159 | ``` 160 | $ head result-cosine.csv 161 | i,j,dist 162 | 542,49001,0.084375 163 | 964,24198,0.09375 164 | 1872,3024,0.0859375 165 | 1872,6823,0.090625 166 | 1872,8462,0.0953125 167 | 1872,11402,0.090625 168 | 1872,18511,0.0859375 169 | 1872,41491,0.0875 170 | 1872,48344,0.0859375 171 | ``` 172 | 173 | ### 3. Printing similar documents 174 | 175 | The executable `dump` prints similar documents from an output CSV file. 176 | 177 | If you want to print similar documents in `reuters.txt` with the result `result-jaccard.csv`, 178 | run the following command. 179 | 180 | ``` 181 | $ cargo run --release -p find-simdoc-cli --bin dump -- -i reuters.txt -s result-jaccard.csv 182 | [i=191,j=29637,dist=0.07291666666666667] 183 | pending its deliberations , harper and row ' s board has postponed indefinitely a special meeting of stockholders that had been scheduled for april 2 to discuss a proposal to recapitalize the company ' s stock to create two classes of shares with different voting rights . 184 | pending its deliberations , harper and row ' s board has postponed indefinitely a special meeting of stockholders that had been scheduled for april 2 to discuss a proposal to recapitalize the company ' s stock in order to create two classes of shares with different votinmg rights . 185 | [i=199,j=38690,dist=0.0375] 186 | government officials had no immediate comment on the report , which advised a reduction in the overall size of the public investment programme and greater emphasis on the preservation of peru ' s export potential . 187 | government officials had no immediate comment on the report , which advised a reduction in the overall size of the public investment program and greater emphasis on the preservation of peru ' s export potential . 188 | [i=274,j=10048,dist=0.07083333333333333] 189 | the measure was adopted as part of a wide - ranging trade bill that will be considered by the full house in april before it moves on to the senate . 190 | the measure was adopted as part of a wide - ranging trade bill that will be considered by the full house in april before it moves onto the senate . 
191 | [i=294,j=27675,dist=0.04791666666666667] 192 | the company said the start - up was necessitated by continuing strong demand for aluminum and dwindling worldwide inventories , and that the metal is needed to supply reynolds ' various fabricating businesses . 193 | the company said the start up was necessitated by continuing strong demand for aluminum and dwindling worldwide inventories , and that the metal is needed to supply reynolds ' various fabricating businesses . 194 | [i=311,j=13812,dist=0.04583333333333333] 195 | he said in an interview with reuter that after a few years it was likely south korea would drop barriers to foreign goods and move toward a more balanced trade position . 196 | he said in an interview with reuters that after a few years it was likely south korea would drop barriers to foreign goods and move toward a more balanced trade position . 197 | [i=361,j=50938,dist=0.08958333333333333] 198 | hog and cattle slaughter guesstimates chicago mercantile exchange floor traders and commission house representatives are guesstimating today ' s hog slaughter at about 295 , 000 to 305 , 000 head versus 307 , 000 week ago and 311 , 000 a year ago . 199 | hog and cattle slaughter guesstimates chicago mercantile exchange floor traders and commission house representatives are guesstimating today ' s hog slaughter at about 295 , 000 to 308 , 000 head versus 305 , 000 week ago and 308 , 000 a year ago . 200 | [i=469,j=6360,dist=0.035416666666666666] 201 | the national planning department forecast that in 1987 coffee , colombia ' s traditional major export , will account for only one - third of total exports , or about 1 . 5 billion dlrs . 202 | the national planning department forecast that in 1987 coffee , colombia ' s traditional major export , will account for only one third of total exports , or about 1 . 5 billion dlrs . 203 | ... 204 | ``` 205 | 206 | ### 4. Testing the accuracy of 1-bit minwise hashing 207 | 208 | LSH is an approximate solution, and you may want to know the accuracy. 209 | The executable `minhash_acc` allows you to examine 210 | - the mean absolute error that is the averaged gap between the normalized Hamming distance and the actual Jaccard distance; and 211 | - the number of true results, precisions, recalls, and F1-scores for search radii {0.01, 0.02, 0.05, 0.1, 0.2, 0.5}. 212 | 213 | To use this executable, we recommend extracting a small subset from your dataset 214 | because it exactly computes distances for all possible pairs (although the computation is accelerated with parallelization). 215 | 216 | ``` 217 | $ head -5000 reuters.txt > reuters.5k.txt 218 | ``` 219 | 220 | You can test the number of Hamming dimensions from 64 to 6400 221 | (i.e., the number of chunks from 1 to 100 indicated with `-c`) 222 | with the following command. 223 | The arguments for feature extraction are the same as those of `jaccard`. 224 | 225 | ``` 226 | $ cargo run --release -p find-simdoc-cli --bin minhash_acc -- -i reuters.5k.txt -w 5 > acc.csv 227 | ``` 228 | 229 | ## Approximation accuracy of 1-bit minwise hashing 230 | 231 | LSH is an approximate solution, and the number of dimensions in the Hamming space 232 | (indicated with the command line argument `-c`) is related to the approximation accuracy. 233 | As a hint for choosing a parameter of `-c`, we show experimental results obtained from `reuters.txt` of 51,535 documents when setting `-w 5`. 
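Here, following the definition in the previous section, the MAE is the average absolute gap between the estimated and exact distances over all pairs of the $n$ documents, where the estimate $\hat{\delta}(d_i, d_j)$ is the normalized Hamming distance between the two sketches:

$$ \mathrm{MAE} = \frac{2}{n(n-1)} \sum_{i < j} \left| \hat{\delta}(d_i, d_j) - \delta(d_i, d_j) \right| $$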
234 | 
235 | ### Mean absolute error (MAE)
236 | 
237 | The following figure shows MAEs while varying the number of Hamming dimensions from 64 to 6400 (i.e., the number of chunks from 1 to 100 indicated with `-c`).
238 | 
239 | ![](./figures/mae_reuters.svg)
240 | 
241 | As expected, the larger the number, the higher the accuracy. For example, when the number of dimensions is 1024 (setting the argument `-c 16`), we achieve an MAE of around 2.5%.
242 | 
243 | ### Recall
244 | 
245 | Of the precision, recall, and F1 score, the most interesting would be the recall.
246 | This is because false positives can be filtered out in post-processing.
247 | 
248 | The following figure shows recalls in search with radii 0.05, 0.1, and 0.2 (indicated with the argument `-r`).
249 | 
250 | ![](./figures/recall_reuters.svg)
251 | 
252 | For radii 0.1 and 0.2, recalls of over 90% are achieved in most cases.
253 | For the smaller radius 0.05, recalls of 75-90% are obtained because the MAE becomes larger relative to the radius.
254 | 
255 | For reference, the numbers of true results are 50, 201, and 626 for radii 0.05, 0.1, and 0.2, respectively.
256 | 
257 | ### F1 score
258 | 
259 | The following figure shows F1 scores in search with radii 0.05, 0.1, and 0.2 (indicated with the argument `-r`).
260 | 
261 | ![](./figures/f1_reuters.svg)
262 | 
263 | - For radius 0.05, scores of over 90% are achieved from 3520 dimensions (setting `-c 55`).
264 | - For radius 0.1, scores of over 90% are achieved from 704 dimensions (setting `-c 11`).
265 | - For radius 0.2, scores of over 90% are achieved from 448 dimensions (setting `-c 7`).
266 | 
267 | ## Disclaimer
268 | 
269 | This software is developed by LegalForce, Inc.,
270 | but it is not an officially supported LegalForce product.
271 | 
272 | ## License
273 | 
274 | Licensed under either of
275 | 
276 |  * Apache License, Version 2.0
277 |    ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
278 |  * MIT license
279 |    ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
280 | 
281 | at your option.
282 | 
283 | ## Contribution
284 | 
285 | Unless you explicitly state otherwise, any contribution intentionally submitted
286 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
287 | dual licensed as above, without any additional terms or conditions.
288 | 
--------------------------------------------------------------------------------
/all-pairs-hamming/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "all-pairs-hamming"
3 | version = "0.1.0"
4 | edition = "2021"
5 | authors = ["Shunsuke Kanda "]
6 | description = "All pairs similarity search on binary sketches in the Hamming space."
7 | license = "MIT OR Apache-2.0"
8 | homepage = "https://github.com/legalforce-research/find-simdoc"
9 | repository = "https://github.com/legalforce-research/find-simdoc"
10 | readme = "README.md"
11 | keywords = ["search", "similarity", "all-pairs", "lsh"]
12 | categories = ["text-processing", "algorithms"]
13 | 
14 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
15 | 
16 | [dependencies]
17 | num-traits = "0.2.15" # MIT or Apache-2.0
18 | hashbrown = "0.12.3" # MIT or Apache-2.0
--------------------------------------------------------------------------------
/all-pairs-hamming/README.md:
--------------------------------------------------------------------------------
1 | # All pairs similarity search on binary sketches in the Hamming space
2 | 
3 | This library provides a fast and compact all pairs similarity search (or *similarity self-join*)
4 | on binary sketches in the Hamming space.
5 | The algorithm employs a modified variant of the [sketch sorting approach](https://proceedings.mlr.press/v13/tabei10a.html),
6 | a combination of the [multiple sorting](https://doi.org/10.1007/s10115-009-0271-6)
7 | and the [multi-index approach](https://doi.org/10.1109/TKDE.2019.2899597).
8 | 
9 | This library is a part of [find-simdoc](https://github.com/legalforce-research/find-simdoc).
10 | 
11 | ## API documentation
12 | 
13 | https://docs.rs/all-pairs-hamming
--------------------------------------------------------------------------------
/all-pairs-hamming/src/bitset64.rs:
--------------------------------------------------------------------------------
1 | #[derive(Clone, Copy)]
2 | pub struct Bitset64(u64);
3 | 
4 | impl Bitset64 {
5 |     #[inline(always)]
6 |     pub const fn new() -> Self {
7 |         Self(0)
8 |     }
9 | 
10 |     #[inline(always)]
11 |     pub const fn add(mut self, i: usize) -> Self {
12 |         assert!(i < 64);
13 |         self.0 |= 1 << i;
14 |         self
15 |     }
16 | 
17 |     #[inline(always)]
18 |     pub const fn max(&self) -> Option<usize> {
19 |         if self.0 == 0 {
20 |             None
21 |         } else {
22 |             Some(63 - self.0.leading_zeros() as usize)
23 |         }
24 |     }
25 | 
26 |     #[inline(always)]
27 |     pub const fn inverse(mut self) -> Self {
28 |         self.0 = !self.0;
29 |         self
30 |     }
31 | 
32 |     #[inline(always)]
33 |     pub const fn iter(&self) -> Bitset64Iter {
34 |         Bitset64Iter(self.0)
35 |     }
36 | 
37 |     #[inline(always)]
38 |     pub const fn len(&self) -> usize {
39 |         self.0.count_ones() as usize
40 |     }
41 | 
42 |     #[inline(always)]
43 |     #[allow(dead_code)]
44 |     pub const fn is_empty(&self) -> bool {
45 |         self.len() == 0
46 |     }
47 | }
48 | 
49 | pub struct Bitset64Iter(u64);
50 | 
51 | impl Iterator for Bitset64Iter {
52 |     type Item = usize;
53 | 
54 |     #[inline(always)]
55 |     fn next(&mut self) -> Option<Self::Item> {
56 |         if self.0 == 0 {
57 |             return None;
58 |         }
59 |         let numtz = self.0.trailing_zeros() as usize;
60 |         let mask = 1 << numtz;
61 |         self.0 ^= mask;
62 |         Some(numtz)
63 |     }
64 | }
65 | 
66 | #[cfg(test)]
67 | mod tests {
68 |     use super::*;
69 | 
70 |     #[test]
71 |     fn test_basic() {
72 |         // {}
73 |         let mut s = Bitset64::new();
74 |         assert_eq!(s.len(), 0);
75 |         assert_eq!(s.is_empty(), true);
76 |         assert_eq!(s.max(), None);
77 |         assert_eq!(s.iter().collect::<Vec<usize>>(), vec![]);
78 | 
79 |         // {2}
80 |         s = s.add(2);
81 |         assert_eq!(s.len(), 1);
82 |         assert_eq!(s.is_empty(), false);
83 |         assert_eq!(s.max(), Some(2));
84 |         assert_eq!(s.iter().collect::<Vec<usize>>(), vec![2]);
85 | 
86 |         // {2,9}
87 |         s = s.add(9);
88 |         assert_eq!(s.len(), 2);
89 |         assert_eq!(s.is_empty(), false);
90 |         assert_eq!(s.max(), Some(9));
91 |         assert_eq!(s.iter().collect::<Vec<usize>>(), vec![2, 9]);
92 | 
93 |         // {2,5,9}
94 |         s = s.add(5);
95 |         assert_eq!(s.len(), 3);
96 |         assert_eq!(s.is_empty(), false);
97 |         assert_eq!(s.max(), Some(9));
98 |         assert_eq!(s.iter().collect::<Vec<usize>>(), vec![2, 5, 9]);
99 | 
100 |         // {2,5,9}
101 |         s = s.add(9);
102 |         assert_eq!(s.len(), 3);
103 |         assert_eq!(s.is_empty(), false);
104 |         assert_eq!(s.max(), Some(9));
105 |         assert_eq!(s.iter().collect::<Vec<usize>>(), vec![2, 5, 9]);
106 | 
107 |         // !{2,5,9}
108 |         s = s.inverse();
109 |         assert_eq!(s.len(), 61);
110 |         assert_eq!(s.is_empty(), false);
111 |         assert_eq!(s.max(), Some(63));
112 | 
113 |         let mut expected = vec![0, 1, 3, 4, 6, 7, 8];
114 |         expected.extend(10..64);
115 |         assert_eq!(s.iter().collect::<Vec<usize>>(), expected);
116 |     }
117 | }
--------------------------------------------------------------------------------
/all-pairs-hamming/src/chunked_join.rs:
--------------------------------------------------------------------------------
1 | //! A fast and compact implementation of similarity self-join on binary sketches in the Hamming space.
2 | use hashbrown::HashSet;
3 | 
4 | use crate::errors::{AllPairsHammingError, Result};
5 | use crate::multi_sort::MultiSort;
6 | use crate::sketch::Sketch;
7 | 
8 | /// A fast and compact implementation of similarity self-join on binary sketches in the Hamming space.
9 | /// The algorithm employs a modified variant of the sketch sorting approach combined with the multi-index approach.
10 | ///
11 | /// # Complexities
12 | ///
13 | /// The time and memory complexities are linear in the input and output size.
14 | ///
15 | /// # Examples
16 | ///
17 | /// ```
18 | /// use all_pairs_hamming::ChunkedJoiner;
19 | ///
20 | /// let mut joiner = ChunkedJoiner::<u8>::new(2);
21 | /// joiner.add([0b1111, 0b1001]);
22 | /// joiner.add([0b1101, 0b1001]);
23 | /// joiner.add([0b0101, 0b0001]);
24 | ///
25 | /// let results = joiner.similar_pairs(0.15);
26 | /// assert_eq!(results, vec![(0, 1, 0.0625), (1, 2, 0.125)]);
27 | /// ```
28 | ///
29 | /// # References
30 | ///
31 | /// - Tabei, Uno, Sugiyama, and Tsuda.
32 | ///   [Single versus Multiple Sorting in All Pairs Similarity Search](https://proceedings.mlr.press/v13/tabei10a.html).
33 | ///   ACML, 2010
34 | /// - J. Qin et al.
35 | ///   [Generalizing the Pigeonhole Principle for Similarity Search in Hamming Space](https://doi.org/10.1109/TKDE.2019.2899597).
36 | ///   IEEE Transactions on Knowledge and Data Engineering, 2021
37 | pub struct ChunkedJoiner<S> {
38 |     chunks: Vec<Vec<S>>,
39 |     shows_progress: bool,
40 | }
41 | 
42 | impl<S> ChunkedJoiner<S>
43 | where
44 |     S: Sketch,
45 | {
46 |     /// Creates an instance, handling sketches of `num_chunks` chunks, i.e.,
47 |     /// in `S::dim() * num_chunks` dimensions.
48 |     pub fn new(num_chunks: usize) -> Self {
49 |         Self {
50 |             chunks: vec![vec![]; num_chunks],
51 |             shows_progress: false,
52 |         }
53 |     }
54 | 
55 |     /// Sets whether to print progress to stderr.
56 |     pub const fn shows_progress(mut self, yes: bool) -> Self {
57 |         self.shows_progress = yes;
58 |         self
59 |     }
60 | 
61 |     /// Appends a sketch of [`Self::num_chunks()`] chunks.
62 |     /// The first [`Self::num_chunks()`] elements of the input iterator are stored.
63 |     /// If the iterator is exhausted before yielding that many elements, an error is returned.
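    ///
    /// # Examples
    ///
    /// A usage sketch added for illustration, mirroring the crate's own tests:
    ///
    /// ```
    /// use all_pairs_hamming::ChunkedJoiner;
    ///
    /// let mut joiner = ChunkedJoiner::<u64>::new(2);
    /// // A sketch with two chunks is accepted...
    /// assert!(joiner.add([0b01, 0b10]).is_ok());
    /// // ...but one with fewer chunks is rejected.
    /// assert!(joiner.add([0b01]).is_err());
    /// ```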
64 |     pub fn add<I>(&mut self, sketch: I) -> Result<()>
65 |     where
66 |         I: IntoIterator<Item = S>,
67 |     {
68 |         let num_chunks = self.num_chunks();
69 |         let mut iter = sketch.into_iter();
70 |         for chunk in self.chunks.iter_mut() {
71 |             chunk.push(iter.next().ok_or_else(|| {
72 |                 let msg = format!("The input sketch must include at least {num_chunks} chunks.");
73 |                 AllPairsHammingError::input(msg)
74 |             })?);
75 |         }
76 |         Ok(())
77 |     }
78 | 
79 |     /// Finds all similar pairs whose normalized Hamming distance is within `radius`,
80 |     /// returning triplets of the left-side id, the right-side id, and their distance.
81 |     pub fn similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)> {
82 |         let dimension = S::dim() * self.num_chunks();
83 |         let hamradius = (dimension as f64 * radius).ceil() as usize;
84 |         if self.shows_progress {
85 |             eprintln!(
86 |                 "[ChunkedJoiner::similar_pairs] #dimensions={dimension}, hamradius={hamradius}"
87 |             );
88 |         }
89 | 
90 |         // TODO: Threading.
91 |         let mut candidates = HashSet::new();
92 |         for (j, chunk) in self.chunks.iter().enumerate() {
93 |             // Based on the general pigeonhole principle.
94 |             // https://doi.org/10.1109/TKDE.2019.2899597
95 |             if j + hamradius + 1 < self.chunks.len() {
96 |                 continue;
97 |             }
98 |             let r = (j + hamradius + 1 - self.chunks.len()) / self.chunks.len();
99 |             MultiSort::new().similar_pairs(chunk, r, &mut candidates);
100 | 
101 |             if self.shows_progress {
102 |                 eprintln!(
103 |                     "[ChunkedJoiner::similar_pairs] Processed {}/{}...",
104 |                     j + 1,
105 |                     self.chunks.len()
106 |                 );
107 |                 eprintln!(
108 |                     "[ChunkedJoiner::similar_pairs] #candidates={}",
109 |                     candidates.len()
110 |                 );
111 |             }
112 |         }
113 |         if self.shows_progress {
114 |             eprintln!("[ChunkedJoiner::similar_pairs] Done");
115 |         }
116 | 
117 |         let mut candidates: Vec<_> = candidates.into_iter().collect();
118 |         candidates.sort_unstable();
119 | 
120 |         let bound = (dimension as f64 * radius) as usize;
121 |         let mut matched = vec![];
122 | 
123 |         for (i, j) in candidates {
124 |             if let Some(dist) = self.hamming_distance(i, j, bound) {
125 |                 let dist = dist as f64 / dimension as f64;
126 |                 if dist <= radius {
127 |                     matched.push((i, j, dist));
128 |                 }
129 |             }
130 |         }
131 |         if self.shows_progress {
132 |             eprintln!("[ChunkedJoiner::similar_pairs] #matched={}", matched.len());
133 |         }
134 |         matched
135 |     }
136 | 
137 |     /// Gets the number of chunks.
138 |     pub fn num_chunks(&self) -> usize {
139 |         self.chunks.len()
140 |     }
141 | 
142 |     /// Gets the number of stored sketches.
143 |     pub fn num_sketches(&self) -> usize {
144 |         self.chunks.first().map(|v| v.len()).unwrap_or(0)
145 |     }
146 | 
147 |     /// Gets the memory usage in bytes.
148 |     pub fn memory_in_bytes(&self) -> usize {
149 |         self.num_chunks() * self.num_sketches() * std::mem::size_of::<S>()
150 |     }
151 | 
152 |     fn hamming_distance(&self, i: usize, j: usize, bound: usize) -> Option<usize> {
153 |         let mut dist = 0;
154 |         for chunk in &self.chunks {
155 |             dist += chunk[i].hamdist(chunk[j]);
156 |             if bound < dist {
157 |                 return None;
158 |             }
159 |         }
160 |         Some(dist)
161 |     }
162 | }
163 | 
164 | #[cfg(test)]
165 | mod tests {
166 |     use super::*;
167 | 
168 |     fn example_sketches() -> Vec<u16> {
169 |         vec![
170 |             0b_1110_0011_1111_1011, // 0
171 |             0b_0001_0111_0111_1101, // 1
172 |             0b_1100_1101_1000_1100, // 2
173 |             0b_1100_1101_0001_0100, // 3
174 |             0b_1010_1110_0010_1010, // 4
175 |             0b_0111_1001_0011_1111, // 5
176 |             0b_1110_0011_0001_0000, // 6
177 |             0b_1000_0111_1001_0101, // 7
178 |             0b_1110_1101_1000_1101, // 8
179 |             0b_0111_1001_0011_1001, // 9
180 |         ]
181 |     }
182 | 
183 |     fn naive_search(sketches: &[u16], radius: f64) -> Vec<(usize, usize, f64)> {
184 |         let mut results = vec![];
185 |         for i in 0..sketches.len() {
186 |             let x = sketches[i];
187 |             for j in i + 1..sketches.len() {
188 |                 let y = sketches[j];
189 |                 let dist = x.hamdist(y);
190 |                 let dist = dist as f64 / 16.;
191 |                 if dist <= radius {
192 |                     results.push((i, j, dist));
193 |                 }
194 |             }
195 |         }
196 |         results
197 |     }
198 | 
199 |     fn test_similar_pairs(radius: f64) {
200 |         let sketches = example_sketches();
201 |         let expected = naive_search(&sketches, radius);
202 | 
203 |         let mut joiner = ChunkedJoiner::new(2);
204 |         for s in sketches {
205 |             joiner.add([(s & 0xFF) as u8, (s >> 8) as u8]).unwrap();
206 |         }
207 |         let mut results = joiner.similar_pairs(radius);
208 |         results.sort_by_key(|&(i, j, _)| (i, j));
209 |         assert_eq!(results, expected);
210 |     }
211 | 
212 |     #[test]
213 |     fn test_similar_pairs_for_all() {
214 |         for radius in 0..=10 {
215 |             test_similar_pairs(radius as f64 / 10.);
216 |         }
217 |     }
218 | 
219 |     #[test]
220 |     fn test_short_sketch() {
221 |         let mut joiner = ChunkedJoiner::new(2);
222 |         let result = joiner.add([0u64]);
223 |         assert!(result.is_err());
224 |     }
225 | }
--------------------------------------------------------------------------------
/all-pairs-hamming/src/errors.rs:
--------------------------------------------------------------------------------
1 | //! Error definitions.
2 | use std::error::Error;
3 | use std::{fmt, result};
4 | 
5 | /// A specialized Result type for this library.
6 | pub type Result<T> = result::Result<T, AllPairsHammingError>;
7 | 
8 | /// Errors in this library.
9 | #[derive(Debug)]
10 | pub enum AllPairsHammingError {
11 |     /// Contains [`InputError`].
12 |     Input(InputError),
13 | }
14 | 
15 | impl fmt::Display for AllPairsHammingError {
16 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
17 |         match self {
18 |             Self::Input(e) => e.fmt(f),
19 |         }
20 |     }
21 | }
22 | 
23 | impl Error for AllPairsHammingError {}
24 | 
25 | impl AllPairsHammingError {
26 |     pub(crate) const fn input(msg: String) -> Self {
27 |         Self::Input(InputError { msg })
28 |     }
29 | }
30 | 
31 | /// Error used when the input argument is invalid.
32 | #[derive(Debug)]
33 | pub struct InputError {
34 |     msg: String,
35 | }
36 | 
37 | impl fmt::Display for InputError {
38 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
39 |         write!(f, "InputError: {}", self.msg)
40 |     }
41 | }
--------------------------------------------------------------------------------
/all-pairs-hamming/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! This library provides a fast and compact all pairs similarity search (or *similarity self-join*)
2 | //! on binary sketches in the Hamming space.
3 | //! The algorithm employs a modified variant of the [sketch sorting approach](https://proceedings.mlr.press/v13/tabei10a.html),
4 | //! a combination of the [multiple sorting](https://doi.org/10.1007/s10115-009-0271-6)
5 | //! and the [multi-index approach](https://doi.org/10.1109/TKDE.2019.2899597).
6 | #![deny(missing_docs)]
7 | 
8 | mod bitset64;
9 | pub mod chunked_join;
10 | pub mod errors;
11 | pub mod multi_sort;
12 | pub mod simple_join;
13 | pub mod sketch;
14 | 
15 | pub use chunked_join::ChunkedJoiner;
--------------------------------------------------------------------------------
/all-pairs-hamming/src/multi_sort.rs:
--------------------------------------------------------------------------------
1 | //! The core part of [`crate::ChunkedJoiner`].
2 | use std::cell::RefCell;
3 | use std::ops::Range;
4 | 
5 | use hashbrown::HashSet;
6 | 
7 | use crate::bitset64::Bitset64;
8 | use crate::sketch::Sketch;
9 | 
10 | const SORT_SHIFT: usize = 8;
11 | const SORT_MASK: usize = (1 << SORT_SHIFT) - 1;
12 | const DEFAULT_THRESHOLD_IN_SORT: usize = 1000;
13 | 
14 | #[derive(Clone, Debug, Default)]
15 | struct Record<S> {
16 |     id: usize,
17 |     sketch: S,
18 | }
19 | 
20 | /// The core part of [`crate::ChunkedJoiner`]
21 | /// implementing the multiple sorting algorithm for short binary sketches.
22 | ///
23 | /// # Complexities
24 | ///
25 | /// The time and memory complexities are linear in the input and output size.
26 | ///
27 | /// # References
28 | ///
29 | /// - Uno.
30 | ///   [Multi-sorting algorithm for finding pairs of similar short substrings from large-scale string data](https://doi.org/10.1007/s10115-009-0271-6).
31 | ///   Knowl Inf Syst 25, 229–251 (2010).
32 | #[derive(Clone, Debug)]
33 | pub struct MultiSort<S> {
34 |     radius: usize,
35 |     num_blocks: usize,
36 |     masks: Vec<S>,
37 |     offsets: Vec<usize>,
38 |     // For radix sort
39 |     threshold_in_sort: usize,
40 |     buckets: RefCell<[usize; SORT_MASK + 1]>,
41 |     sorted: RefCell<Vec<Record<S>>>,
42 | }
43 | 
44 | impl<S> Default for MultiSort<S>
45 | where
46 |     S: Sketch,
47 | {
48 |     fn default() -> Self {
49 |         Self::new()
50 |     }
51 | }
52 | 
53 | impl<S> MultiSort<S>
54 | where
55 |     S: Sketch,
56 | {
57 |     /// Creates an instance.
58 |     pub const fn new() -> Self {
59 |         Self {
60 |             radius: 0,
61 |             num_blocks: 0,
62 |             masks: vec![],
63 |             offsets: vec![],
64 |             threshold_in_sort: DEFAULT_THRESHOLD_IN_SORT,
65 |             buckets: RefCell::new([0usize; SORT_MASK + 1]),
66 |             sorted: RefCell::new(vec![]),
67 |         }
68 |     }
69 | 
70 |     /// Sets the number of blocks.
71 |     pub fn num_blocks(mut self, num_blocks: usize) -> Self {
72 |         if num_blocks <= S::dim() {
73 |             self.num_blocks = num_blocks;
74 |         }
75 |         self
76 |     }
77 | 
78 |     /// Sets the size threshold for partial sorting.
79 |     /// If the partial size is smaller than the threshold, a quicksort is used;
80 |     /// otherwise, a radix sort is used.
81 |     pub const fn threshold_in_sort(mut self, threshold_in_sort: usize) -> Self {
82 |         self.threshold_in_sort = threshold_in_sort;
83 |         self
84 |     }
85 | 
86 |     /// Finds all similar pairs whose Hamming distance is within `radius`,
87 |     /// inserting the results in a given hash table.
88 |     pub fn similar_pairs(
89 |         mut self,
90 |         sketches: &[S],
91 |         radius: usize,
92 |         results: &mut HashSet<(usize, usize)>,
93 |     ) {
94 |         if self.num_blocks == 0 || self.num_blocks < radius {
95 |             // Following Tabei's paper.
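            // Explanatory note (added): with B blocks, two sketches within
            // Hamming distance r must agree exactly on at least B - r blocks
            // (pigeonhole), so B = r + 3 guarantees three block-level
            // collisions per surviving candidate pair.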
96 |             self.num_blocks = S::dim().min(radius + 3);
97 |         }
98 | 
99 |         self.build_masks_and_offsets();
100 |         self.radius = radius;
101 |         self.sorted = RefCell::new(Vec::with_capacity(sketches.len()));
102 | 
103 |         let mut records: Vec<_> = sketches
104 |             .iter()
105 |             .enumerate()
106 |             .map(|(id, &sketch)| Record { id, sketch })
107 |             .collect();
108 |         self.similar_pairs_recur(&mut records, Bitset64::new(), results);
109 |     }
110 | 
111 |     fn build_masks_and_offsets(&mut self) {
112 |         let mut masks = vec![S::default(); self.num_blocks];
113 |         let mut offsets = vec![0; self.num_blocks + 1];
114 |         let mut i = 0;
115 |         for (b, mask) in masks.iter_mut().enumerate().take(self.num_blocks) {
116 |             let dim = (b + S::dim()) / self.num_blocks;
117 |             *mask = S::mask(i..i + dim);
118 |             i += dim;
119 |             offsets[b + 1] = i;
120 |         }
121 |         self.masks = masks;
122 |         self.offsets = offsets;
123 |     }
124 | 
125 |     fn similar_pairs_recur(
126 |         &self,
127 |         records: &mut [Record<S>],
128 |         blocks: Bitset64,
129 |         results: &mut HashSet<(usize, usize)>,
130 |     ) {
131 |         if blocks.len() == self.num_blocks - self.radius {
132 |             self.verify_all_pairs(records, blocks, results);
133 |             return;
134 |         }
135 | 
136 |         let mut ranges = vec![];
137 |         let max_block = blocks.max().map(|x| x + 1).unwrap_or(0);
138 | 
139 |         for b in max_block..self.num_blocks {
140 |             self.sort_sketches(b, records);
141 |             self.collision_ranges(b, records, &mut ranges);
142 |             for r in ranges.iter().cloned() {
143 |                 self.similar_pairs_recur(&mut records[r], blocks.add(b), results);
144 |             }
145 |         }
146 |     }
147 | 
148 |     fn verify_all_pairs(
149 |         &self,
150 |         records: &[Record<S>],
151 |         blocks: Bitset64,
152 |         results: &mut HashSet<(usize, usize)>,
153 |     ) {
154 |         for i in 0..records.len() {
155 |             let x = &records[i];
156 |             for y in records.iter().skip(i + 1) {
157 |                 debug_assert!(self.debug_block_collisions(x.sketch, y.sketch, blocks));
158 |                 if x.sketch.hamdist(y.sketch) <= self.radius
159 |                     && self.check_canonical(x.sketch, y.sketch, blocks)
160 |                 {
161 |                     debug_assert_ne!(x.id, y.id);
162 |                     // Keeps the tuple order to ease debug.
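                    // (Added note: pairs are normalized as (smaller id, larger id);
                    // the HashSet absorbs duplicates if a pair is reached twice.)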
163 |                     results.insert((x.id.min(y.id), x.id.max(y.id)));
164 |                 }
165 |             }
166 |         }
167 |     }
168 | 
169 |     fn check_canonical(&self, x: S, y: S, blocks: Bitset64) -> bool {
170 |         let max = blocks.max().unwrap_or(0);
171 |         let others = blocks.inverse();
172 |         for b in others.iter() {
173 |             if max <= b {
174 |                 break;
175 |             }
176 |             if x & self.masks[b] == y & self.masks[b] {
177 |                 return false;
178 |             }
179 |         }
180 |         true
181 |     }
182 | 
183 |     fn sort_sketches(&self, block_id: usize, records: &mut [Record<S>]) {
184 |         if records.len() < self.threshold_in_sort {
185 |             self.quick_sort_sketches(block_id, records);
186 |         } else {
187 |             self.radix_sort_sketches(block_id, records);
188 |         }
189 |     }
190 | 
191 |     fn quick_sort_sketches(&self, block_id: usize, records: &mut [Record<S>]) {
192 |         let mask = self.masks[block_id];
193 |         records.sort_unstable_by(|x, y| (x.sketch & mask).cmp(&(y.sketch & mask)));
194 |     }
195 | 
196 |     fn radix_sort_sketches(&self, block_id: usize, records: &mut [Record<S>]) {
197 |         let mut buckets = self.buckets.borrow_mut();
198 |         let mut sorted = self.sorted.borrow_mut();
199 |         sorted.resize(records.len(), Record::<S>::default());
200 | 
201 |         let mask = self.masks[block_id];
202 |         for j in (self.offsets[block_id]..self.offsets[block_id + 1]).step_by(SORT_SHIFT) {
203 |             buckets.fill(0);
204 |             for x in records.iter() {
205 |                 let k = ((x.sketch & mask) >> j).to_usize().unwrap() & SORT_MASK;
206 |                 buckets[k] += 1;
207 |             }
208 |             for k in 1..buckets.len() {
209 |                 buckets[k] += buckets[k - 1];
210 |             }
211 |             for x in records.iter().rev() {
212 |                 let k = ((x.sketch & mask) >> j).to_usize().unwrap() & SORT_MASK;
213 |                 buckets[k] -= 1;
214 |                 sorted[buckets[k]] = x.clone();
215 |             }
216 |             for i in 0..records.len() {
217 |                 records[i] = sorted[i].clone();
218 |             }
219 |         }
220 |     }
221 | 
222 |     fn collision_ranges(
223 |         &self,
224 |         block_id: usize,
225 |         records: &[Record<S>],
226 |         ranges: &mut Vec<Range<usize>>,
227 |     ) {
228 |         ranges.clear();
229 |         let mut i = 0;
230 |         for j in 1..records.len() {
231 |             let mask = self.masks[block_id];
232 |             let x = records[i].sketch & mask;
233 |             let y = records[j].sketch & mask;
234 |             if x == y {
235 |                 continue;
236 |             }
237 |             if 2 <= j - i {
238 |                 ranges.push(i..j);
239 |             }
240 |             i = j;
241 |         }
242 |         let j = records.len();
243 |         if 2 <= j - i {
244 |             ranges.push(i..j);
245 |         }
246 |     }
247 | 
248 |     fn debug_block_collisions(&self, x: S, y: S, blocks: Bitset64) -> bool {
249 |         for b in blocks.iter() {
250 |             let mx = x & self.masks[b];
251 |             let my = y & self.masks[b];
252 |             if mx != my {
253 |                 return false;
254 |             }
255 |         }
256 |         true
257 |     }
258 | }
259 | 
260 | #[cfg(test)]
261 | mod tests {
262 |     use super::*;
263 | 
264 |     fn example_sketches() -> Vec<u16> {
265 |         vec![
266 |             0b_1110_0011_1111_1011, // 0
267 |             0b_0001_0111_0111_1101, // 1
268 |             0b_1100_1101_1000_1100, // 2
269 |             0b_1100_1101_0001_0100, // 3
270 |             0b_1010_1110_0010_1010, // 4
271 |             0b_0111_1001_0011_1111, // 5
272 |             0b_1110_0011_0001_0000, // 6
273 |             0b_1000_0111_1001_0101, // 7
274 |             0b_1110_1101_1000_1101, // 8
275 |             0b_0111_1001_0011_1001, // 9
276 |         ]
277 |     }
278 | 
279 |     fn naive_search(sketches: &[u16], radius: usize) -> Vec<(usize, usize)> {
280 |         let mut results = vec![];
281 |         for i in 0..sketches.len() {
282 |             let x = sketches[i];
283 |             for j in i + 1..sketches.len() {
284 |                 let y = sketches[j];
285 |                 if x.hamdist(y) <= radius {
286 |                     results.push((i, j));
287 |                 }
288 |             }
289 |         }
290 |         results
291 |     }
292 | 
293 |     fn test_similar_pairs(radius: usize, num_blocks: usize) {
294 |         let sketches = example_sketches();
295 |         let expected = naive_search(&sketches, radius);
296 |         let mut results = HashSet::new();
297 |         MultiSort::new()
298 |             .num_blocks(num_blocks)
299 |             .threshold_in_sort(5)
300 |             .similar_pairs(&sketches, radius, &mut results);
301 |         let mut results: Vec<_> = results.into_iter().collect();
302 |         results.sort_unstable();
303 |         assert_eq!(results, expected);
304 |     }
305 | 
306 |     #[test]
307 |     fn test_similar_pairs_for_all() {
308 |         for radius in 0..=16 {
309 |             for num_blocks in radius..=16 {
310 |                 test_similar_pairs(radius, num_blocks);
311 |             }
312 |         }
313 |     }
314 | }
--------------------------------------------------------------------------------
/all-pairs-hamming/src/simple_join.rs:
--------------------------------------------------------------------------------
1 | //! A naive implementation of similarity self-join on binary sketches in the Hamming space.
2 | use crate::errors::{AllPairsHammingError, Result};
3 | use crate::sketch::Sketch;
4 | 
5 | /// A naive implementation of similarity self-join on binary sketches in the Hamming space,
6 | /// taking a quadratic time.
7 | /// Do NOT use this for large datasets.
8 | pub struct SimpleJoiner<S> {
9 |     sketches: Vec<Vec<S>>,
10 |     num_chunks: usize,
11 |     shows_progress: bool,
12 | }
13 | 
14 | impl<S> SimpleJoiner<S>
15 | where
16 |     S: Sketch,
17 | {
18 |     /// Creates an instance, handling sketches of `num_chunks` chunks, i.e.,
19 |     /// in `S::dim() * num_chunks` dimensions.
20 |     pub const fn new(num_chunks: usize) -> Self {
21 |         Self {
22 |             sketches: vec![],
23 |             num_chunks,
24 |             shows_progress: false,
25 |         }
26 |     }
27 | 
28 |     /// Sets whether to print progress to stderr.
29 |     pub const fn shows_progress(mut self, yes: bool) -> Self {
30 |         self.shows_progress = yes;
31 |         self
32 |     }
33 | 
34 |     /// Appends a sketch of [`Self::num_chunks()`] chunks.
35 |     /// The first [`Self::num_chunks()`] elements of the input iterator are stored.
36 |     /// If the iterator is exhausted before yielding that many elements, an error is returned.
37 |     pub fn add<I>(&mut self, sketch: I) -> Result<()>
38 |     where
39 |         I: IntoIterator<Item = S>,
40 |     {
41 |         let mut iter = sketch.into_iter();
42 |         let mut sketch = Vec::with_capacity(self.num_chunks());
43 |         for _ in 0..self.num_chunks() {
44 |             sketch.push(iter.next().ok_or_else(|| {
45 |                 let msg = format!(
46 |                     "The input sketch must include at least {} chunks.",
47 |                     self.num_chunks()
48 |                 );
49 |                 AllPairsHammingError::input(msg)
50 |             })?)
51 |         }
52 |         self.sketches.push(sketch);
53 |         Ok(())
54 |     }
55 | 
56 |     /// Finds all similar pairs whose normalized Hamming distance is within `radius`,
57 |     /// returning triplets of the left-side id, the right-side id, and their distance.
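    ///
    /// # Examples
    ///
    /// A usage sketch added for illustration, mirroring [`crate::ChunkedJoiner`]'s documentation:
    ///
    /// ```
    /// use all_pairs_hamming::simple_join::SimpleJoiner;
    ///
    /// let mut joiner = SimpleJoiner::<u8>::new(2);
    /// joiner.add([0b1111, 0b1001]).unwrap();
    /// joiner.add([0b1101, 0b1001]).unwrap();
    /// joiner.add([0b0101, 0b0001]).unwrap();
    ///
    /// let results = joiner.similar_pairs(0.15);
    /// assert_eq!(results, vec![(0, 1, 0.0625), (1, 2, 0.125)]);
    /// ```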
58 | pub fn similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)> { 59 | let dimension = S::dim() * self.num_chunks(); 60 | if self.shows_progress { 61 | eprintln!("[SimpleJoiner::similar_pairs] #dimensions={dimension}"); 62 | } 63 | 64 | let bound = (dimension as f64 * radius) as usize; 65 | let mut matched = vec![]; 66 | 67 | for i in 0..self.sketches.len() { 68 | if self.shows_progress && (i + 1) % 10000 == 0 { 69 | eprintln!( 70 | "[SimpleJoiner::similar_pairs] Processed {}/{}...", 71 | i + 1, 72 | self.sketches.len() 73 | ); 74 | } 75 | for j in i + 1..self.sketches.len() { 76 | if let Some(dist) = self.hamming_distance(i, j, bound) { 77 | let dist = dist as f64 / dimension as f64; 78 | if dist <= radius { 79 | matched.push((i, j, dist)); 80 | } 81 | } 82 | } 83 | } 84 | if self.shows_progress { 85 | eprintln!("[SimpleJoiner::similar_pairs] Done"); 86 | eprintln!("[SimpleJoiner::similar_pairs] #matched={}", matched.len()); 87 | } 88 | matched 89 | } 90 | 91 | /// Gets the number of chunks. 92 | pub const fn num_chunks(&self) -> usize { 93 | self.num_chunks 94 | } 95 | 96 | /// Gets the number of stored sketches. 97 | pub fn num_sketches(&self) -> usize { 98 | self.sketches.len() 99 | } 100 | 101 | /// Gets the memory usage in bytes. 102 | pub fn memory_in_bytes(&self) -> usize { 103 | self.num_chunks() * self.num_sketches() * std::mem::size_of::() 104 | } 105 | 106 | fn hamming_distance(&self, i: usize, j: usize, bound: usize) -> Option { 107 | let xs = &self.sketches[i]; 108 | let ys = &self.sketches[j]; 109 | let mut dist = 0; 110 | for (&x, &y) in xs.iter().zip(ys.iter()) { 111 | dist += x.hamdist(y); 112 | if bound < dist { 113 | return None; 114 | } 115 | } 116 | Some(dist) 117 | } 118 | } 119 | 120 | #[cfg(test)] 121 | mod tests { 122 | use super::*; 123 | 124 | fn example_sketches() -> Vec { 125 | vec![ 126 | 0b_1110_0011_1111_1011, // 0 127 | 0b_0001_0111_0111_1101, // 1 128 | 0b_1100_1101_1000_1100, // 2 129 | 0b_1100_1101_0001_0100, // 3 130 | 0b_1010_1110_0010_1010, // 4 131 | 0b_0111_1001_0011_1111, // 5 132 | 0b_1110_0011_0001_0000, // 6 133 | 0b_1000_0111_1001_0101, // 7 134 | 0b_1110_1101_1000_1101, // 8 135 | 0b_0111_1001_0011_1001, // 9 136 | ] 137 | } 138 | 139 | fn naive_search(sketches: &[u16], radius: f64) -> Vec<(usize, usize, f64)> { 140 | let mut results = vec![]; 141 | for i in 0..sketches.len() { 142 | let x = sketches[i]; 143 | for j in i + 1..sketches.len() { 144 | let y = sketches[j]; 145 | let dist = x.hamdist(y); 146 | let dist = dist as f64 / 16.; 147 | if dist <= radius { 148 | results.push((i, j, dist)); 149 | } 150 | } 151 | } 152 | results 153 | } 154 | 155 | fn test_similar_pairs(radius: f64) { 156 | let sketches = example_sketches(); 157 | let expected = naive_search(&sketches, radius); 158 | 159 | let mut joiner = SimpleJoiner::new(2); 160 | for s in sketches { 161 | joiner.add([(s & 0xFF) as u8, (s >> 8) as u8]).unwrap(); 162 | } 163 | let results = joiner.similar_pairs(radius); 164 | assert_eq!(results, expected); 165 | } 166 | 167 | #[test] 168 | fn test_similar_pairs_for_all() { 169 | for radius in 0..=10 { 170 | test_similar_pairs(radius as f64 / 10.); 171 | } 172 | } 173 | 174 | #[test] 175 | fn test_short_sketch() { 176 | let mut joiner = SimpleJoiner::new(2); 177 | let result = joiner.add([0u64]); 178 | assert!(result.is_err()); 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /all-pairs-hamming/src/sketch.rs: 
-------------------------------------------------------------------------------- 1 | //! Traits of binary short sketches of primitive integer types. 2 | use std::ops::Range; 3 | 4 | use num_traits::int::PrimInt; 5 | use num_traits::{FromPrimitive, ToPrimitive}; 6 | 7 | /// Trait of a binary short sketch from a primitive integer type. 8 | pub trait Sketch: Default + PrimInt + FromPrimitive + ToPrimitive { 9 | /// Gets the number of dimensions. 10 | fn dim() -> usize; 11 | /// Gets the Hamming distance to the other sketch. 12 | fn hamdist(self, rhs: Self) -> usize; 13 | /// Produces a sketch for masking a given bit-position range. 14 | fn mask(rng: Range) -> Self; 15 | } 16 | 17 | impl Sketch for u8 { 18 | #[inline(always)] 19 | fn dim() -> usize { 20 | 8 21 | } 22 | #[inline(always)] 23 | fn hamdist(self, rhs: Self) -> usize { 24 | (self ^ rhs).count_ones() as usize 25 | } 26 | #[inline(always)] 27 | fn mask(rng: Range) -> Self { 28 | debug_assert!(rng.end <= Self::dim()); 29 | if rng.len() == Self::dim() { 30 | Self::MAX 31 | } else { 32 | ((1 << rng.len()) - 1) << rng.start 33 | } 34 | } 35 | } 36 | 37 | impl Sketch for u16 { 38 | #[inline(always)] 39 | fn dim() -> usize { 40 | 16 41 | } 42 | #[inline(always)] 43 | fn hamdist(self, rhs: Self) -> usize { 44 | (self ^ rhs).count_ones() as usize 45 | } 46 | #[inline(always)] 47 | fn mask(rng: Range) -> Self { 48 | debug_assert!(rng.end <= Self::dim()); 49 | if rng.len() == Self::dim() { 50 | Self::MAX 51 | } else { 52 | ((1 << rng.len()) - 1) << rng.start 53 | } 54 | } 55 | } 56 | 57 | impl Sketch for u32 { 58 | #[inline(always)] 59 | fn dim() -> usize { 60 | 32 61 | } 62 | #[inline(always)] 63 | fn hamdist(self, rhs: Self) -> usize { 64 | (self ^ rhs).count_ones() as usize 65 | } 66 | #[inline(always)] 67 | fn mask(rng: Range) -> Self { 68 | debug_assert!(rng.end <= Self::dim()); 69 | if rng.len() == Self::dim() { 70 | Self::MAX 71 | } else { 72 | ((1 << rng.len()) - 1) << rng.start 73 | } 74 | } 75 | } 76 | 77 | impl Sketch for u64 { 78 | #[inline(always)] 79 | fn dim() -> usize { 80 | 64 81 | } 82 | #[inline(always)] 83 | fn hamdist(self, rhs: Self) -> usize { 84 | (self ^ rhs).count_ones() as usize 85 | } 86 | #[inline(always)] 87 | fn mask(rng: Range) -> Self { 88 | debug_assert!(rng.end <= Self::dim()); 89 | if rng.len() == Self::dim() { 90 | Self::MAX 91 | } else { 92 | ((1 << rng.len()) - 1) << rng.start 93 | } 94 | } 95 | } 96 | 97 | #[cfg(test)] 98 | mod tests { 99 | use super::*; 100 | 101 | #[test] 102 | fn test_mask_u8() { 103 | assert_eq!(u8::mask(0..4), 0b00001111); 104 | assert_eq!(u8::mask(3..6), 0b00111000); 105 | assert_eq!(u8::mask(4..8), 0b11110000); 106 | assert_eq!(u8::mask(0..8), 0b11111111); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /all-pairs-hamming/timeperf/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "timeperf" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | all-pairs-hamming = { path = ".." 
} # MIT or Apache-2.0
10 | criterion = { version = "0.3", features = ["html_reports"] } # Apache-2.0 or MIT
11 | rand = "0.8.5"
--------------------------------------------------------------------------------
/all-pairs-hamming/timeperf/src/main.rs:
--------------------------------------------------------------------------------
1 | use std::time::Instant;
2 | 
3 | use all_pairs_hamming::chunked_join::ChunkedJoiner;
4 | use all_pairs_hamming::simple_join::SimpleJoiner;
5 | 
6 | const TRIALS: usize = 3;
7 | const SCALES: [usize; 4] = [1_000, 10_000, 100_000, 1_000_000];
8 | const CHUNKS: [usize; 3] = [4, 16, 64];
9 | const RADII: [f64; 3] = [0.01, 0.05, 0.1];
10 | 
11 | macro_rules! timeperf_common {
12 |     ($percent:expr, $name:expr, $method:ident, $sketches:ident, $radii:ident, $chunks:ident, $scales:ident) => {
13 |         for &num_chunks in $chunks {
14 |             let mut joiner = $method::new(num_chunks).shows_progress(true);
15 |             for &num_sketches in $scales {
16 |                 while joiner.num_sketches() < num_sketches {
17 |                     let sketch = &$sketches[joiner.num_sketches()];
18 |                     joiner.add(sketch.iter().cloned()).unwrap();
19 |                 }
20 |                 for &radius in $radii {
21 |                     let mut num_results = 0;
22 |                     let elapsed_sec = measure(TRIALS, || {
23 |                         num_results += joiner.similar_pairs(radius).len();
24 |                     });
25 |                     num_results /= TRIALS;
26 |                     println!(
27 |                         "[percent={},method={},num_chunks={num_chunks},num_sketches={num_sketches},radius={radius},num_results={num_results}] {elapsed_sec} sec",
28 |                         $percent, $name
29 |                     );
30 |                 }
31 |             }
32 |         }
33 |     };
34 | }
35 | 
36 | fn main() {
37 |     main_percent(50, false);
38 |     main_percent(80, false);
39 | }
40 | 
41 | fn main_percent(percent: u64, test_simple: bool) {
42 |     let max_chunks = *CHUNKS.last().unwrap();
43 |     let max_sketches = *SCALES.last().unwrap();
44 | 
45 |     let mut sketches = Vec::with_capacity(max_sketches);
46 |     for _ in 0..max_sketches {
47 |         let mut chunks = Vec::with_capacity(max_chunks);
48 |         for _ in 0..max_chunks {
49 |             chunks.push((0..64).fold(0u64, |acc, _| {
50 |                 let x = rand::random::<u64>() % 100;
51 |                 (acc << 1) | ((x < percent) as u64)
52 |             }));
53 |         }
54 |         sketches.push(chunks);
55 |     }
56 |     {
57 |         let radii = &RADII[..];
58 |         let chunks = &CHUNKS[..];
59 |         let scales = &SCALES[..];
60 |         timeperf_common!(
61 |             percent,
62 |             "chunked_join",
63 |             ChunkedJoiner,
64 |             sketches,
65 |             radii,
66 |             chunks,
67 |             scales
68 |         );
69 |     }
70 |     if test_simple {
71 |         let radii = &RADII[..1];
72 |         let chunks = &CHUNKS[..];
73 |         let scales = &SCALES[..3];
74 |         timeperf_common!(
75 |             percent,
76 |             "simple_join",
77 |             SimpleJoiner,
78 |             sketches,
79 |             radii,
80 |             chunks,
81 |             scales
82 |         );
83 |     }
84 | }
85 | 
86 | fn measure<F>(num_trials: usize, mut func: F) -> f64
87 | where
88 |     F: FnMut(),
89 | {
90 |     // Runs func num_trials times and returns the mean elapsed seconds.
91 |     let start = Instant::now();
92 |     for _ in 0..num_trials {
93 |         func();
94 |     }
95 |     let duration = start.elapsed();
96 |     duration.as_secs_f64() / num_trials as f64
97 | }
98 | 
--------------------------------------------------------------------------------
/find-simdoc-cli/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "find-simdoc-cli"
3 | version = "0.1.0"
4 | edition = "2021"
5 | 
6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 | 
8 | [dependencies]
9 | all-pairs-hamming = { path = "../all-pairs-hamming" } # MIT or Apache-2.0
10 | clap = { version = "3.1", features = ["derive"] } # MIT or Apache-2.0
11 | find-simdoc = { path = "../find-simdoc" } # MIT
or Apache-2.0 12 | hashbrown = "0.12.3" # MIT or Apache-2.0 13 | positioned-io = "0.3.0" # MIT 14 | rand = "0.8.5" # MIT or Apache-2.0 15 | rand_xoshiro = "0.6.0" # MIT or Apache-2.0 16 | rayon = "1.5.3" # MIT or Apache-2.0 17 | 18 | [[bin]] 19 | name = "jaccard" 20 | path = "src/jaccard.rs" 21 | 22 | [[bin]] 23 | name = "cosine" 24 | path = "src/cosine.rs" 25 | 26 | [[bin]] 27 | name = "dump" 28 | path = "src/dump.rs" 29 | 30 | [[bin]] 31 | name = "minhash_acc" 32 | path = "src/minhash_acc.rs" -------------------------------------------------------------------------------- /find-simdoc-cli/src/cosine.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use std::fs::File; 3 | use std::io::{BufRead, BufReader, Read}; 4 | use std::path::PathBuf; 5 | use std::str::FromStr; 6 | use std::time::Instant; 7 | 8 | use find_simdoc::tfidf::{Idf, Tf}; 9 | use find_simdoc::CosineSearcher; 10 | 11 | use clap::Parser; 12 | 13 | #[derive(Clone, Debug, PartialEq, Eq)] 14 | enum TfWeights { 15 | Binary, 16 | Standard, 17 | Sublinear, 18 | } 19 | 20 | #[derive(Clone, Debug, PartialEq, Eq)] 21 | enum IdfWeights { 22 | Unary, 23 | Standard, 24 | Smooth, 25 | } 26 | 27 | impl FromStr for TfWeights { 28 | type Err = &'static str; 29 | fn from_str(w: &str) -> Result { 30 | match w { 31 | "binary" => Ok(Self::Binary), 32 | "standard" => Ok(Self::Standard), 33 | "sublinear" => Ok(Self::Sublinear), 34 | _ => Err("Could not parse a tf-weighting value"), 35 | } 36 | } 37 | } 38 | 39 | impl FromStr for IdfWeights { 40 | type Err = &'static str; 41 | fn from_str(w: &str) -> Result { 42 | match w { 43 | "unary" => Ok(Self::Unary), 44 | "standard" => Ok(Self::Standard), 45 | "smooth" => Ok(Self::Smooth), 46 | _ => Err("Could not parse a idf-weighting value"), 47 | } 48 | } 49 | } 50 | 51 | #[derive(Parser, Debug)] 52 | #[clap( 53 | name = "find-simdoc-cosine", 54 | about = "A program to find similar documents in the Cosine space." 55 | )] 56 | struct Args { 57 | /// File path to a document file to be searched. 58 | /// Empty lines must not be included. 59 | #[clap(short = 'i', long)] 60 | document_path: PathBuf, 61 | 62 | /// Search radius in the range of [0,1]. 63 | #[clap(short = 'r', long)] 64 | radius: f64, 65 | 66 | /// Delimiter for recognizing words as tokens in feature extraction. 67 | /// If None, characters are used for tokens. 68 | #[clap(short = 'd', long)] 69 | delimiter: Option, 70 | 71 | /// Window size for w-shingling in feature extraction (must be more than 0). 72 | #[clap(short = 'w', long, default_value = "1")] 73 | window_size: usize, 74 | 75 | /// Number of chunks in sketches, indicating that the number of dimensions in the Hamming space 76 | /// will be 64*#chunks. The larger this value, the more accurate the approximation, 77 | /// but the more time and memory it takes to search. 78 | #[clap(short = 'c', long, default_value = "8")] 79 | num_chunks: usize, 80 | 81 | /// Weighting variant of term frequency. 82 | /// "binary" is the boolean frequency. 83 | /// "standard" is the standard term frequency. 84 | /// "sublinear" is the logarithmically scaled frequency. 85 | #[clap(short = 'T', long, default_value = "standard")] 86 | tf: TfWeights, 87 | 88 | /// Weighting variant of inverse document frequency. 89 | /// "unary" is always 1. 90 | /// "standard" is the standard inverse document frequency. 91 | /// "smooth" is the smoothed inverse document frequency. 
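    /// (Typically, the standard IDF of a term is ln(N / df) for N documents and
    /// document frequency df, and the smoothed variant is ln((1 + N) / (1 + df));
    /// the exact formulas are defined in `find_simdoc::tfidf`.)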
92 | #[clap(short = 'I', long, default_value = "smooth")] 93 | idf: IdfWeights, 94 | 95 | /// Seed value for random values. 96 | #[clap(short = 's', long)] 97 | seed: Option, 98 | 99 | /// Disables parallel construction. 100 | #[clap(short = 'p', long)] 101 | disable_parallel: bool, 102 | } 103 | 104 | fn main() -> Result<(), Box> { 105 | let args = Args::parse(); 106 | 107 | let document_path = args.document_path; 108 | let radius = args.radius; 109 | let delimiter = args.delimiter; 110 | let window_size = args.window_size; 111 | let num_chunks = args.num_chunks; 112 | let tf_weight = args.tf; 113 | let idf_weight = args.idf; 114 | let seed = args.seed; 115 | let disable_parallel = args.disable_parallel; 116 | 117 | let mut searcher = CosineSearcher::new(window_size, delimiter, seed)?.shows_progress(true); 118 | 119 | let tf = match tf_weight { 120 | TfWeights::Binary => None, 121 | TfWeights::Standard | TfWeights::Sublinear => { 122 | Some(Tf::new().sublinear(tf_weight == TfWeights::Sublinear)) 123 | } 124 | }; 125 | 126 | let idf = match idf_weight { 127 | IdfWeights::Unary => None, 128 | IdfWeights::Standard | IdfWeights::Smooth => { 129 | eprintln!("Building IDF..."); 130 | let start = Instant::now(); 131 | let documents = texts_iter(File::open(&document_path)?); 132 | let idf = Idf::new() 133 | .smooth(idf_weight == IdfWeights::Smooth) 134 | .build(documents, searcher.config())?; 135 | let duration = start.elapsed(); 136 | eprintln!("Produced in {} sec", duration.as_secs_f64()); 137 | Some(idf) 138 | } 139 | }; 140 | 141 | searcher = searcher.tf(tf).idf(idf); 142 | 143 | { 144 | eprintln!("Converting documents into sketches..."); 145 | let start = Instant::now(); 146 | let documents = texts_iter(File::open(&document_path)?); 147 | searcher = if disable_parallel { 148 | searcher.build_sketches(documents, num_chunks)? 149 | } else { 150 | searcher.build_sketches_in_parallel(documents, num_chunks)? 151 | }; 152 | let duration = start.elapsed(); 153 | let memory_in_bytes = searcher.memory_in_bytes() as f64; 154 | eprintln!( 155 | "Produced {} sketches in {} sec, consuming {} MiB", 156 | searcher.len(), 157 | duration.as_secs_f64(), 158 | memory_in_bytes / (1024. * 1024.) 
159 | ); 160 | } 161 | 162 | eprintln!("Finding all similar pairs in sketches..."); 163 | let start = Instant::now(); 164 | let results = searcher.search_similar_pairs(radius); 165 | eprintln!("Done in {} sec", start.elapsed().as_secs_f64()); 166 | 167 | println!("i,j,dist"); 168 | for (i, j, dist) in results { 169 | println!("{i},{j},{dist}"); 170 | } 171 | 172 | Ok(()) 173 | } 174 | 175 | fn texts_iter(rdr: R) -> impl Iterator 176 | where 177 | R: Read, 178 | { 179 | BufReader::new(rdr).lines().map(|line| line.unwrap()) 180 | } 181 | -------------------------------------------------------------------------------- /find-simdoc-cli/src/dump.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use std::fs::File; 3 | use std::io::{BufRead, BufReader}; 4 | use std::path::PathBuf; 5 | 6 | use clap::Parser; 7 | 8 | #[derive(Parser, Debug)] 9 | #[clap(name = "find-simdoc-dump", about = "A program to dump similar texts.")] 10 | struct Args { 11 | #[clap(short = 'i', long)] 12 | text_path: PathBuf, 13 | 14 | #[clap(short = 's', long)] 15 | simpair_path: PathBuf, 16 | } 17 | 18 | fn main() -> Result<(), Box> { 19 | let args = Args::parse(); 20 | 21 | let text_path = args.text_path; 22 | let simpair_path = args.simpair_path; 23 | 24 | let texts: Vec<_> = BufReader::new(File::open(text_path)?) 25 | .lines() 26 | .map(|line| line.unwrap()) 27 | .collect(); 28 | 29 | for (i, row) in BufReader::new(File::open(simpair_path)?) 30 | .lines() 31 | .enumerate() 32 | { 33 | if i == 0 { 34 | continue; 35 | } 36 | let row = row?; 37 | let cols: Vec<_> = row.split(',').collect(); 38 | let i = cols[0].parse::()?; 39 | let j = cols[1].parse::()?; 40 | let dist = cols[2].parse::()?; 41 | println!("[i={i},j={j},dist={dist}]"); 42 | println!("{}", texts[i]); 43 | println!("{}", texts[j]); 44 | } 45 | 46 | Ok(()) 47 | } 48 | -------------------------------------------------------------------------------- /find-simdoc-cli/src/jaccard.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use std::fs::File; 3 | use std::io::{BufRead, BufReader, Read}; 4 | use std::path::PathBuf; 5 | use std::time::Instant; 6 | 7 | use clap::Parser; 8 | 9 | use find_simdoc::JaccardSearcher; 10 | 11 | #[derive(Parser, Debug)] 12 | #[clap( 13 | name = "find-simdoc-jaccard", 14 | about = "A program to find similar documents in the Jaccard space." 15 | )] 16 | struct Args { 17 | /// File path to a document file to be searched. 18 | /// Empty lines must not be included. 19 | #[clap(short = 'i', long)] 20 | document_path: PathBuf, 21 | 22 | /// Search radius in the range of [0,1]. 23 | #[clap(short = 'r', long)] 24 | radius: f64, 25 | 26 | /// Delimiter for recognizing words as tokens in feature extraction. 27 | /// If None, characters are used for tokens. 28 | #[clap(short = 'd', long)] 29 | delimiter: Option, 30 | 31 | /// Window size for w-shingling in feature extraction (must be more than 0). 32 | #[clap(short = 'w', long, default_value = "1")] 33 | window_size: usize, 34 | 35 | /// Number of chunks in sketches, indicating that the number of dimensions in the Hamming space 36 | /// will be 64*#chunks. The larger this value, the more accurate the approximation, 37 | /// but the more time and memory it takes to search. 38 | #[clap(short = 'c', long, default_value = "8")] 39 | num_chunks: usize, 40 | 41 | /// Seed value for random values. 
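    /// If not specified, a seed is drawn at random.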
42 |     #[clap(short = 's', long)]
43 |     seed: Option<u64>,
44 | 
45 |     /// Disables parallel construction.
46 |     #[clap(short = 'p', long)]
47 |     disable_parallel: bool,
48 | }
49 | 
50 | fn main() -> Result<(), Box<dyn Error>> {
51 |     let args = Args::parse();
52 | 
53 |     let document_path = args.document_path;
54 |     let radius = args.radius;
55 |     let delimiter = args.delimiter;
56 |     let window_size = args.window_size;
57 |     let num_chunks = args.num_chunks;
58 |     let seed = args.seed;
59 |     let disable_parallel = args.disable_parallel;
60 | 
61 |     let mut searcher = JaccardSearcher::new(window_size, delimiter, seed)?.shows_progress(true);
62 | 
63 |     {
64 |         eprintln!("Converting documents into sketches...");
65 |         let start = Instant::now();
66 |         let documents = texts_iter(File::open(&document_path)?);
67 |         searcher = if disable_parallel {
68 |             searcher.build_sketches(documents, num_chunks)?
69 |         } else {
70 |             searcher.build_sketches_in_parallel(documents, num_chunks)?
71 |         };
72 |         let duration = start.elapsed();
73 |         let memory_in_bytes = searcher.memory_in_bytes() as f64;
74 |         eprintln!(
75 |             "Produced {} sketches in {} sec, consuming {} MiB",
76 |             searcher.len(),
77 |             duration.as_secs_f64(),
78 |             memory_in_bytes / (1024. * 1024.)
79 |         );
80 |     }
81 | 
82 |     eprintln!("Finding all similar pairs in sketches...");
83 |     let start = Instant::now();
84 |     let results = searcher.search_similar_pairs(radius);
85 |     eprintln!("Done in {} sec", start.elapsed().as_secs_f64());
86 | 
87 |     println!("i,j,dist");
88 |     for (i, j, dist) in results {
89 |         println!("{i},{j},{dist}");
90 |     }
91 | 
92 |     Ok(())
93 | }
94 | 
95 | fn texts_iter<R>(rdr: R) -> impl Iterator<Item = String>
96 | where
97 |     R: Read,
98 | {
99 |     BufReader::new(rdr).lines().map(|line| line.unwrap())
100 | }
101 | 
--------------------------------------------------------------------------------
/find-simdoc-cli/src/minhash_acc.rs:
--------------------------------------------------------------------------------
1 | #![allow(clippy::mutex_atomic)]
2 | 
3 | use std::env;
4 | use std::error::Error;
5 | use std::fmt::Write as _;
6 | use std::fs::File;
7 | use std::io::{BufRead, BufReader, Read};
8 | use std::mem;
9 | use std::path::PathBuf;
10 | use std::sync::Mutex;
11 | use std::time::Instant;
12 | 
13 | use all_pairs_hamming::sketch::Sketch;
14 | use clap::Parser;
15 | use find_simdoc::feature::{FeatureConfig, FeatureExtractor};
16 | use find_simdoc::lsh::minhash::MinHasher;
17 | use hashbrown::HashSet;
18 | use positioned_io::WriteAt;
19 | use rand::{RngCore, SeedableRng};
20 | use rayon::prelude::*;
21 | 
22 | const MAX_CHUNKS: usize = 100;
23 | 
24 | #[derive(Parser, Debug)]
25 | #[clap(
26 |     name = "find-simdoc-minhash_acc",
27 |     about = "A program to test accuracy in 1-bit minwise hashing."
28 | )]
29 | struct Args {
30 |     /// File path to a document file to be searched.
31 |     /// Empty lines must not be included.
32 |     #[clap(short = 'i', long)]
33 |     document_path: PathBuf,
34 | 
35 |     /// Delimiter for recognizing words as tokens in feature extraction.
36 |     /// If None, characters are used for tokens.
37 |     #[clap(short = 'd', long)]
38 |     delimiter: Option<char>,
39 | 
40 |     /// Window size for w-shingling in feature extraction (must be more than 0).
41 |     #[clap(short = 'w', long, default_value = "1")]
42 |     window_size: usize,
43 | 
44 |     /// Seed value for random values.
45 |     #[clap(short = 's', long)]
46 |     seed: Option<u64>,
47 | 
48 |     /// Directory path to write a tmp file.
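    /// If not specified, the system temporary directory (`std::env::temp_dir`) is used.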
49 | #[clap(short = 't', long)] 50 | tmp_dir: Option, 51 | } 52 | 53 | fn main() -> Result<(), Box> { 54 | let args = Args::parse(); 55 | 56 | let document_path = args.document_path; 57 | let delimiter = args.delimiter; 58 | let window_size = args.window_size; 59 | let seed = args.seed; 60 | let tmp_dir = args.tmp_dir; 61 | 62 | if window_size == 0 { 63 | return Err("window_size must not be 0.".into()); 64 | } 65 | 66 | let documents = BufReader::new(File::open(document_path)?) 67 | .lines() 68 | .map(|line| line.unwrap()); 69 | 70 | let mut seeder = 71 | rand_xoshiro::SplitMix64::seed_from_u64(seed.unwrap_or_else(rand::random::)); 72 | 73 | let config = FeatureConfig::new(window_size, delimiter, seeder.next_u64())?; 74 | let extractor = FeatureExtractor::new(&config); 75 | 76 | let features = { 77 | eprintln!("Loading documents and extracting features..."); 78 | let start = Instant::now(); 79 | let mut features = vec![]; 80 | for document in documents { 81 | if document.is_empty() { 82 | return Err("Input document must not be empty.".into()); 83 | } 84 | let mut feature = vec![]; 85 | extractor.extract(document, &mut feature); 86 | features.push(feature); 87 | } 88 | let duration = start.elapsed(); 89 | let total_bytes = 90 | features.iter().fold(0, |acc, f| acc + f.len()) * std::mem::size_of::(); 91 | eprintln!( 92 | "Extracted {} features in {} sec, consuming {} MiB", 93 | features.len(), 94 | duration.as_secs_f64(), 95 | total_bytes as f64 / (1024. * 1024.) 96 | ); 97 | features 98 | }; 99 | 100 | let sketches = { 101 | eprintln!("Producing binary sketches..."); 102 | let start = Instant::now(); 103 | let hasher = MinHasher::new(seeder.next_u64()); 104 | 105 | let processed = Mutex::new(0usize); 106 | 107 | let mut sketches = vec![vec![]; features.len()]; 108 | features 109 | .par_iter() 110 | .map(|feature| { 111 | { 112 | // Mutex::lock also locks eprintln. 113 | let mut cnt = processed.lock().unwrap(); 114 | *cnt += 1; 115 | if *cnt % 1000 == 0 { 116 | eprintln!("Processed {} features...", *cnt); 117 | } 118 | } 119 | let mut iter = hasher.iter(feature); 120 | let mut sketch = Vec::with_capacity(MAX_CHUNKS); 121 | (0..MAX_CHUNKS).for_each(|_| sketch.push(iter.next().unwrap())); 122 | sketch 123 | }) 124 | .collect_into_vec(&mut sketches); 125 | 126 | let duration = start.elapsed(); 127 | let total_bytes = sketches.len() * MAX_CHUNKS * std::mem::size_of::(); 128 | eprintln!( 129 | "Produced in {} sec, consuming {} MiB", 130 | duration.as_secs_f64(), 131 | total_bytes as f64 / (1024. * 1024.) 132 | ); 133 | sketches 134 | }; 135 | 136 | let tmp_path = { 137 | let mut tmp_path = tmp_dir.unwrap_or_else(env::temp_dir); 138 | tmp_path.push("tmp.jac_dist"); 139 | tmp_path 140 | }; 141 | 142 | let possible_pairs = { 143 | let start = Instant::now(); 144 | 145 | let possible_pairs = features.len() * (features.len() - 1) / 2; 146 | eprintln!("Computing exact Jaccard distances for {possible_pairs} pairs..."); 147 | 148 | let tmp_file_size = possible_pairs * mem::size_of::(); 149 | let offsets = { 150 | let mut offset = 0; 151 | let mut offsets = Vec::with_capacity(features.len()); 152 | for i in 0..features.len() { 153 | offsets.push(offset); 154 | offset += features.len() - i - 1; 155 | } 156 | assert_eq!(offset, possible_pairs); 157 | offsets 158 | }; 159 | 160 | { 161 | let processed = Mutex::new(0usize); 162 | let writer = Mutex::new(File::create(&tmp_path)?); 163 | 164 | // Creates a file object of size tmp_file_size bytes. 
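            // (Writing a single zero byte at offset `tmp_file_size - 1` forces the
            // file to be extended to the full size up front; on most filesystems this
            // yields a sparse file, so disk space is materialized only as the distance
            // blocks are written at their offsets below.)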
165 | { 166 | let mut w = writer.lock().unwrap(); 167 | w.write_at(tmp_file_size as u64 - 1, &[0])?; 168 | } 169 | 170 | eprintln!( 171 | "Created a tmp file of {} GiB, at {:?}", 172 | tmp_file_size as f64 / (1024. * 1024. * 1024.), 173 | &tmp_path 174 | ); 175 | 176 | (0..features.len()).into_par_iter().for_each(|i| { 177 | { 178 | // Mutex::lock also locks eprintln. 179 | let mut cnt = processed.lock().unwrap(); 180 | *cnt += 1; 181 | if *cnt % 1000 == 0 { 182 | eprintln!("Processed {} features...", *cnt); 183 | } 184 | } 185 | 186 | let mut jac_dists = 187 | Vec::with_capacity((features.len() - i) * mem::size_of::()); 188 | 189 | let x = &features[i]; 190 | for y in features.iter().skip(i + 1) { 191 | let dist = 192 | find_simdoc::lsh::jaccard_distance(x.iter().clone(), y.iter().clone()); 193 | jac_dists.extend_from_slice(&dist.to_le_bytes()); 194 | } 195 | 196 | // Writes distances with random access on a file stream. 197 | let offset = offsets[i] * mem::size_of::(); 198 | { 199 | let mut w = writer.lock().unwrap(); 200 | w.write_at(offset as u64, &jac_dists).unwrap(); 201 | } 202 | }); 203 | } 204 | 205 | let duration = start.elapsed(); 206 | eprintln!("Computed in {} sec", duration.as_secs_f64()); 207 | possible_pairs 208 | }; 209 | 210 | let radii = vec![0.01, 0.02, 0.05, 0.1, 0.2, 0.5]; 211 | let mut header = "num_chunks,dimensions,mean_absolute_error".to_string(); 212 | for &r in &radii { 213 | write!(header, ",results_{r}")?; 214 | write!(header, ",precision_{r}")?; 215 | write!(header, ",recall_{r}")?; 216 | write!(header, ",f1_{r}")?; 217 | } 218 | println!("{header}"); 219 | 220 | eprintln!("Computing accuracy..."); 221 | let start = Instant::now(); 222 | 223 | let results = { 224 | let processed = Mutex::new(0usize); 225 | let mut results: Vec<_> = (1..=MAX_CHUNKS) 226 | .into_par_iter() 227 | .map(|num_chunks| { 228 | { 229 | // Mutex::lock also locks eprintln. 230 | let mut cnt = processed.lock().unwrap(); 231 | *cnt += 1; 232 | if *cnt % 10 == 0 { 233 | eprintln!("Processed {} chunks...", *cnt); 234 | } 235 | } 236 | 237 | let mut sum_error = 0.; 238 | let mut true_results: Vec<_> = (0..radii.len()).map(|_| HashSet::new()).collect(); 239 | let mut appx_results: Vec<_> = (0..radii.len()).map(|_| HashSet::new()).collect(); 240 | 241 | let mut reader = BufReader::new(File::open(&tmp_path).unwrap()); 242 | 243 | for i in 0..sketches.len() { 244 | let x = &sketches[i]; 245 | for (j, y) in sketches.iter().enumerate().skip(i + 1) { 246 | let mut buf = [0; mem::size_of::()]; 247 | reader.read_exact(&mut buf).unwrap(); 248 | 249 | let jac_dist = f64::from_le_bytes(buf); 250 | let ham_dist = hamming_distance(&x[..num_chunks], &y[..num_chunks]); 251 | sum_error += (jac_dist - ham_dist).abs(); 252 | 253 | for (k, &r) in radii.iter().enumerate() { 254 | if jac_dist <= r { 255 | true_results[k].insert((i, j)); 256 | } 257 | if ham_dist <= r { 258 | appx_results[k].insert((i, j)); 259 | } 260 | } 261 | } 262 | } 263 | 264 | let dim = num_chunks * 64; 265 | let mae = sum_error / possible_pairs as f64; 266 | 267 | let mut prf = vec![]; 268 | for (tr, ar) in true_results.iter().zip(appx_results.iter()) { 269 | let true_positive = tr.intersection(ar).count() as f64; 270 | let false_positive = ar.len() as f64 - true_positive; 271 | let false_negative = tr.len() as f64 - true_positive; 272 | let precision = true_positive / (true_positive + false_positive); 273 | let recall = true_positive / (true_positive + false_negative); 274 | let f1 = (2. 
* precision * recall) / (precision + recall);
275 |                     prf.push((tr.len(), precision, recall, f1));
276 |                 }
277 | 
278 |                 let mut body = format!("{num_chunks},{dim},{mae}");
279 |                 for (t, p, r, f) in prf {
280 |                     write!(body, ",{t},{p},{r},{f}").unwrap();
281 |                 }
282 |                 (num_chunks, body)
283 |             })
284 |             .collect();
285 |         results.sort_by_key(|r| r.0);
286 |         results
287 |     };
288 |     let duration = start.elapsed();
289 |     eprintln!("Computed in {} sec", duration.as_secs_f64());
290 | 
291 |     for (_, body) in results {
292 |         println!("{body}");
293 |     }
294 | 
295 |     Ok(())
296 | }
297 | 
298 | fn hamming_distance(xs: &[u64], ys: &[u64]) -> f64 {
299 |     assert_eq!(xs.len(), ys.len());
300 |     let mut dist = 0;
301 |     for (&x, &y) in xs.iter().zip(ys.iter()) {
302 |         dist += x.hamdist(y);
303 |     }
304 |     // In 1-bit minhash, the expected normalized Hamming distance is (1 - J) / 2,
305 |     // i.e., half the Jaccard distance, so we scale the distance by a factor of 2.
306 |     dist as f64 / (xs.len() * 64) as f64 * 2.
307 | }
308 | 
--------------------------------------------------------------------------------
/find-simdoc/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "find-simdoc"
3 | version = "0.1.1"
4 | edition = "2021"
5 | authors = ["Shunsuke Kanda "]
6 | description = "Time- and memory-efficient all pairs similarity searches in documents."
7 | license = "MIT OR Apache-2.0"
8 | homepage = "https://github.com/legalforce-research/find-simdoc"
9 | repository = "https://github.com/legalforce-research/find-simdoc"
10 | readme = "README.md"
11 | keywords = ["search", "similarity", "all-pairs", "lsh"]
12 | categories = ["text-processing", "algorithms"]
13 | 
14 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
15 | 
16 | [dependencies]
17 | ahash = "0.8.0" # MIT or Apache-2.0
18 | all-pairs-hamming = { path = "../all-pairs-hamming", version = "0.1.0" } # MIT or Apache-2.0
19 | hashbrown = "0.12.3" # MIT or Apache-2.0
20 | rand = "0.8.5" # MIT or Apache-2.0
21 | rand_xoshiro = "0.6.0" # MIT or Apache-2.0
22 | rayon = "1.5.3" # MIT or Apache-2.0
--------------------------------------------------------------------------------
/find-simdoc/README.md:
--------------------------------------------------------------------------------
1 | # find-simdoc
2 | 
3 | Time- and memory-efficient all pairs similarity searches in documents.
4 | The detailed description can be found on the [project page](https://github.com/legalforce-research/find-simdoc).
5 | 
6 | ## API documentation
7 | 
8 | https://docs.rs/find-simdoc
9 | 
--------------------------------------------------------------------------------
/find-simdoc/examples/find_cosine.rs:
--------------------------------------------------------------------------------
1 | use find_simdoc::tfidf::{Idf, Tf};
2 | use find_simdoc::CosineSearcher;
3 | 
4 | fn main() {
5 |     let documents = vec![
6 |         "Welcome to Jimbocho, the town of books and curry!",
7 |         "Welcome to Jimbocho, the city of books and curry!",
8 |         "We welcome you to Jimbocho, the town of books and curry.",
9 |         "Welcome to the town of books and curry, Jimbocho!",
10 |     ];
11 | 
12 |     // Creates a searcher for word unigrams (with random seed value 42).
13 |     let searcher = CosineSearcher::new(1, Some(' '), Some(42)).unwrap();
14 |     // Creates a term frequency (TF) weighter.
15 |     let tf = Tf::new();
16 |     // Creates an inverse document frequency (IDF) weighter.
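    // (The IDF table below is built from the same corpus that is searched
    // afterwards, so ngrams appearing in many documents receive lower weights.)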
17 | let idf = Idf::new() 18 | .build(documents.iter().clone(), searcher.config()) 19 | .unwrap(); 20 | // Builds the database of binary sketches converted from input documents, 21 | let searcher = searcher 22 | // with the TF weighter and 23 | .tf(Some(tf)) 24 | // the IDF weighter, 25 | .idf(Some(idf)) 26 | // where binary sketches are in the Hamming space of 10*64 dimensions. 27 | .build_sketches_in_parallel(documents.iter(), 10) 28 | .unwrap(); 29 | 30 | // Searches all similar pairs within radius 0.25. 31 | let results = searcher.search_similar_pairs(0.25); 32 | // A result consists of the left-side id, the right-side id, and their distance. 33 | assert_eq!(results, vec![(0, 1, 0.1671875), (0, 3, 0.246875)]); 34 | } 35 | -------------------------------------------------------------------------------- /find-simdoc/examples/find_jaccard.rs: -------------------------------------------------------------------------------- 1 | use find_simdoc::JaccardSearcher; 2 | 3 | fn main() { 4 | let documents = vec![ 5 | "Welcome to Jimbocho, the town of books and curry!", 6 | "Welcome to Jimbocho, the city of books and curry!", 7 | "We welcome you to Jimbocho, the town of books and curry.", 8 | "Welcome to the town of books and curry, Jimbocho!", 9 | ]; 10 | 11 | // Creates a searcher for character trigrams (with random seed value 42). 12 | let searcher = JaccardSearcher::new(3, None, Some(42)) 13 | .unwrap() 14 | // Builds the database of binary sketches converted from input documents, 15 | // where binary sketches are in the Hamming space of 20*64 dimensions. 16 | .build_sketches_in_parallel(documents.iter(), 20) 17 | .unwrap(); 18 | 19 | // Searches all similar pairs within radius 0.25. 20 | let results = searcher.search_similar_pairs(0.25); 21 | assert_eq!(results, vec![(0, 1, 0.1875), (0, 3, 0.2296875)]); 22 | } 23 | -------------------------------------------------------------------------------- /find-simdoc/src/cosine.rs: -------------------------------------------------------------------------------- 1 | //! Searcher for all pairs of similar documents in the Cosine space. 2 | use std::sync::Mutex; 3 | 4 | use crate::errors::{FindSimdocError, Result}; 5 | use crate::feature::{FeatureConfig, FeatureExtractor}; 6 | use crate::lsh::simhash::SimHasher; 7 | use crate::tfidf::{Idf, Tf}; 8 | 9 | use all_pairs_hamming::chunked_join::ChunkedJoiner; 10 | use rand::{RngCore, SeedableRng}; 11 | use rayon::prelude::*; 12 | 13 | /// Searcher for all pairs of similar documents in the Cosine space. 14 | /// 15 | /// # Approach 16 | /// 17 | /// The search steps consist of 18 | /// 19 | /// 1. Extracts features from documents, 20 | /// where a feature is a tfidf-weighted vector representation of character or word ngrams. 21 | /// 2. Convert the features into binary sketches through the [simplified simhash](https://dl.acm.org/doi/10.1145/1242572.1242592). 22 | /// 3. Search for similar sketches in the Hamming space using [`ChunkedJoiner`]. 23 | /// 24 | /// # Examples 25 | /// 26 | /// ``` 27 | /// use find_simdoc::tfidf::{Idf, Tf}; 28 | /// use find_simdoc::CosineSearcher; 29 | /// 30 | /// let documents = vec![ 31 | /// "Welcome to Jimbocho, the town of books and curry!", 32 | /// "Welcome to Jimbocho, the city of books and curry!", 33 | /// "We welcome you to Jimbocho, the town of books and curry.", 34 | /// "Welcome to the town of books and curry, Jimbocho!", 35 | /// ]; 36 | /// 37 | /// // Creates a searcher for word unigrams (with random seed value 42). 
38 | /// let searcher = CosineSearcher::new(1, Some(' '), Some(42)).unwrap();
39 | /// // Creates a term frequency (TF) weighter.
40 | /// let tf = Tf::new();
41 | /// // Creates an inverse document frequency (IDF) weighter.
42 | /// let idf = Idf::new()
43 | ///     .build(documents.iter().clone(), searcher.config())
44 | ///     .unwrap();
45 | /// // Builds the database of binary sketches converted from input documents,
46 | /// let searcher = searcher
47 | ///     // with the TF weighter and
48 | ///     .tf(Some(tf))
49 | ///     // the IDF weighter,
50 | ///     .idf(Some(idf))
51 | ///     // where binary sketches are in the Hamming space of 10*64 dimensions.
52 | ///     .build_sketches_in_parallel(documents.iter(), 10)
53 | ///     .unwrap();
54 | ///
55 | /// // Searches all similar pairs within radius 0.25.
56 | /// let results = searcher.search_similar_pairs(0.25);
57 | /// ```
58 | pub struct CosineSearcher {
59 |     config: FeatureConfig,
60 |     hasher: SimHasher,
61 |     tf: Option<Tf>,
62 |     idf: Option<Idf<f64>>,
63 |     joiner: Option<ChunkedJoiner<u64>>,
64 |     shows_progress: bool,
65 | }
66 | 
67 | impl CosineSearcher {
68 |     /// Creates an instance.
69 |     ///
70 |     /// # Arguments
71 |     ///
72 |     /// * `window_size` - Window size for w-shingling in feature extraction (must be more than 0).
73 |     /// * `delimiter` - Delimiter for recognizing words as tokens in feature extraction.
74 |     ///                 If `None`, characters are used for tokens.
75 |     /// * `seed` - Seed value for random values.
76 |     pub fn new(window_size: usize, delimiter: Option<char>, seed: Option<u64>) -> Result<Self> {
77 |         let seed = seed.unwrap_or_else(rand::random::<u64>);
78 |         let mut seeder = rand_xoshiro::SplitMix64::seed_from_u64(seed);
79 |         let config = FeatureConfig::new(window_size, delimiter, seeder.next_u64())?;
80 |         let hasher = SimHasher::new(seeder.next_u64());
81 |         Ok(Self {
82 |             config,
83 |             hasher,
84 |             tf: None,
85 |             idf: None,
86 |             joiner: None,
87 |             shows_progress: false,
88 |         })
89 |     }
90 | 
91 |     /// Shows the progress via the standard error output?
92 |     pub const fn shows_progress(mut self, yes: bool) -> Self {
93 |         self.shows_progress = yes;
94 |         self
95 |     }
96 | 
97 |     /// Sets the scheme of TF weighting.
98 |     #[allow(clippy::missing_const_for_fn)]
99 |     pub fn tf(mut self, tf: Option<Tf>) -> Self {
100 |         self.tf = tf;
101 |         self
102 |     }
103 | 
104 |     /// Sets the scheme of IDF weighting.
105 |     #[allow(clippy::missing_const_for_fn)]
106 |     pub fn idf(mut self, idf: Option<Idf<f64>>) -> Self {
107 |         self.idf = idf;
108 |         self
109 |     }
110 | 
111 |     /// Builds the database of sketches from input documents.
112 |     ///
113 |     /// # Arguments
114 |     ///
115 |     /// * `documents` - List of documents (must not include an empty string).
116 |     /// * `num_chunks` - Number of chunks of sketches, indicating that
117 |     ///                  the number of dimensions in the Hamming space is `num_chunks*64`.
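    ///
    /// # Errors
    ///
    /// An error is returned when an input document is an empty string.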
118 | pub fn build_sketches(mut self, documents: I, num_chunks: usize) -> Result 119 | where 120 | I: IntoIterator, 121 | D: AsRef, 122 | { 123 | let mut joiner = ChunkedJoiner::::new(num_chunks).shows_progress(self.shows_progress); 124 | let extractor = FeatureExtractor::new(&self.config); 125 | 126 | let mut feature = vec![]; 127 | for (i, doc) in documents.into_iter().enumerate() { 128 | if self.shows_progress && (i + 1) % 10000 == 0 { 129 | eprintln!("Processed {} documents...", i + 1); 130 | } 131 | let doc = doc.as_ref(); 132 | if doc.is_empty() { 133 | return Err(FindSimdocError::input("Input document must not be empty.")); 134 | } 135 | extractor.extract_with_weights(doc, &mut feature); 136 | if let Some(tf) = self.tf.as_ref() { 137 | tf.tf(&mut feature); 138 | } 139 | if let Some(idf) = self.idf.as_ref() { 140 | for (term, weight) in feature.iter_mut() { 141 | *weight *= idf.idf(*term); 142 | } 143 | } 144 | joiner.add(self.hasher.iter(&feature)).unwrap(); 145 | } 146 | self.joiner = Some(joiner); 147 | Ok(self) 148 | } 149 | 150 | /// Builds the database of sketches from input documents in parallel. 151 | /// 152 | /// # Arguments 153 | /// 154 | /// * `documents` - List of documents (must not include an empty string). 155 | /// * `num_chunks` - Number of chunks of sketches, indicating that 156 | /// the number of dimensions in the Hamming space is `num_chunks*64`. 157 | /// 158 | /// # Notes 159 | /// 160 | /// The progress is not printed even if `shows_progress = true`. 161 | pub fn build_sketches_in_parallel( 162 | mut self, 163 | documents: I, 164 | num_chunks: usize, 165 | ) -> Result 166 | where 167 | I: Iterator + Send, 168 | D: AsRef + Send, 169 | { 170 | let extractor = FeatureExtractor::new(&self.config); 171 | #[allow(clippy::mutex_atomic)] 172 | let processed = Mutex::new(0usize); 173 | let mut sketches: Vec<_> = documents 174 | .into_iter() 175 | .enumerate() 176 | .par_bridge() 177 | .map(|(i, doc)| { 178 | #[allow(clippy::mutex_atomic)] 179 | { 180 | // Mutex::lock also locks eprintln. 181 | let mut cnt = processed.lock().unwrap(); 182 | *cnt += 1; 183 | if self.shows_progress && *cnt % 10000 == 0 { 184 | eprintln!("Processed {} documents...", *cnt); 185 | } 186 | } 187 | let doc = doc.as_ref(); 188 | // TODO: Returns the error value (but I dont know the manner). 189 | assert!(!doc.is_empty(), "Input document must not be empty."); 190 | let mut feature = vec![]; 191 | extractor.extract_with_weights(doc, &mut feature); 192 | if let Some(tf) = self.tf.as_ref() { 193 | tf.tf(&mut feature); 194 | } 195 | if let Some(idf) = self.idf.as_ref() { 196 | for (term, weight) in feature.iter_mut() { 197 | *weight *= idf.idf(*term); 198 | } 199 | } 200 | let mut gen = self.hasher.iter(&feature); 201 | let sketch: Vec<_> = (0..num_chunks).map(|_| gen.next().unwrap()).collect(); 202 | (i, sketch) 203 | }) 204 | .collect(); 205 | sketches.par_sort_by_key(|&(i, _)| i); 206 | 207 | let mut joiner = ChunkedJoiner::::new(num_chunks).shows_progress(self.shows_progress); 208 | for (_, sketch) in sketches { 209 | joiner.add(sketch).unwrap(); 210 | } 211 | self.joiner = Some(joiner); 212 | Ok(self) 213 | } 214 | 215 | /// Searches for all pairs of similar documents within an input radius, returning 216 | /// triplets of the left-side id, the right-side id, and their distance. 217 | pub fn search_similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)> { 218 | self.joiner.as_ref().unwrap().similar_pairs(radius) 219 | } 220 | 221 | /// Gets the number of input documents. 
222 | pub fn len(&self) -> usize { 223 | self.joiner 224 | .as_ref() 225 | .map_or(0, |joiner| joiner.num_sketches()) 226 | } 227 | 228 | /// Checks if the database is empty. 229 | pub fn is_empty(&self) -> bool { 230 | self.len() == 0 231 | } 232 | 233 | /// Gets the memory usage in bytes. 234 | pub fn memory_in_bytes(&self) -> usize { 235 | self.joiner 236 | .as_ref() 237 | .map_or(0, |joiner| joiner.memory_in_bytes()) 238 | } 239 | 240 | /// Gets the configure of feature extraction. 241 | pub const fn config(&self) -> &FeatureConfig { 242 | &self.config 243 | } 244 | } 245 | -------------------------------------------------------------------------------- /find-simdoc/src/errors.rs: -------------------------------------------------------------------------------- 1 | //! Error definitions. 2 | use std::error::Error; 3 | use std::{fmt, result}; 4 | 5 | /// A specialized Result type for this library. 6 | pub type Result = result::Result; 7 | 8 | /// Errors in this library. 9 | #[derive(Debug)] 10 | pub enum FindSimdocError { 11 | /// Contains [`InputError`]. 12 | Input(InputError), 13 | } 14 | 15 | impl fmt::Display for FindSimdocError { 16 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 17 | match self { 18 | Self::Input(e) => e.fmt(f), 19 | } 20 | } 21 | } 22 | 23 | impl Error for FindSimdocError {} 24 | 25 | impl FindSimdocError { 26 | pub(crate) const fn input(msg: &'static str) -> Self { 27 | Self::Input(InputError { msg }) 28 | } 29 | } 30 | 31 | /// Error used when the input argument is invalid. 32 | #[derive(Debug)] 33 | pub struct InputError { 34 | msg: &'static str, 35 | } 36 | 37 | impl fmt::Display for InputError { 38 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 39 | write!(f, "InputError: {}", self.msg) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /find-simdoc/src/feature.rs: -------------------------------------------------------------------------------- 1 | //! Feature extractor. 2 | use std::hash::{BuildHasher, Hash, Hasher}; 3 | use std::ops::Range; 4 | 5 | use ahash::RandomState; 6 | use rand::{RngCore, SeedableRng}; 7 | 8 | use crate::errors::{FindSimdocError, Result}; 9 | use crate::shingling::ShingleIter; 10 | 11 | /// Configuration of feature extraction. 12 | #[derive(Clone, Debug)] 13 | pub struct FeatureConfig { 14 | window_size: usize, 15 | delimiter: Option, 16 | build_hasher: RandomState, 17 | } 18 | 19 | impl FeatureConfig { 20 | /// Creates an instance. 21 | /// 22 | /// # Arguments 23 | /// 24 | /// * `window_size` - Window size for w-shingling in feature extraction (must be more than 0). 25 | /// * `delimiter` - Delimiter for recognizing words as tokens in feature extraction. 26 | /// If `None`, characters are used for tokens. 27 | /// * `seed` - Seed value for random values. 
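    ///
    /// # Errors
    ///
    /// An error is returned when `window_size` is 0.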
28 | pub fn new(window_size: usize, delimiter: Option, seed: u64) -> Result { 29 | if window_size == 0 { 30 | return Err(FindSimdocError::input("Window size must not be 0.")); 31 | } 32 | let mut seeder = rand_xoshiro::SplitMix64::seed_from_u64(seed); 33 | let build_hasher = RandomState::with_seeds( 34 | seeder.next_u64(), 35 | seeder.next_u64(), 36 | seeder.next_u64(), 37 | seeder.next_u64(), 38 | ); 39 | Ok(Self { 40 | window_size, 41 | delimiter, 42 | build_hasher, 43 | }) 44 | } 45 | 46 | fn hash(&self, iter: I) -> u64 47 | where 48 | I: IntoIterator, 49 | T: Hash, 50 | { 51 | let mut s = self.build_hasher.build_hasher(); 52 | for t in iter { 53 | t.hash(&mut s); 54 | } 55 | s.finish() 56 | } 57 | } 58 | 59 | /// Extractor of feature vectors. 60 | pub struct FeatureExtractor<'a> { 61 | config: &'a FeatureConfig, 62 | } 63 | 64 | impl<'a> FeatureExtractor<'a> { 65 | /// Creates an instance. 66 | pub const fn new(config: &'a FeatureConfig) -> Self { 67 | Self { config } 68 | } 69 | 70 | /// Extracts a feature vector from an input text. 71 | pub fn extract(&self, text: S, feature: &mut Vec) 72 | where 73 | S: AsRef, 74 | { 75 | let text = text.as_ref(); 76 | 77 | feature.clear(); 78 | if self.config.delimiter.is_none() && self.config.window_size == 1 { 79 | // The simplest case. 80 | text.chars().for_each(|c| feature.push(c as u64)); 81 | } else { 82 | let token_ranges = self.tokenize(text); 83 | for ranges in ShingleIter::new(&token_ranges, self.config.window_size) { 84 | feature.push(self.config.hash(ranges.iter().cloned().map(|r| &text[r]))); 85 | } 86 | } 87 | } 88 | 89 | /// Extracts a feature vector from an input text with weights of 1.0. 90 | pub fn extract_with_weights(&self, text: S, feature: &mut Vec<(u64, f64)>) 91 | where 92 | S: AsRef, 93 | { 94 | let text = text.as_ref(); 95 | 96 | feature.clear(); 97 | if self.config.delimiter.is_none() && self.config.window_size == 1 { 98 | // The simplest case. 
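            // (With no delimiter and a window size of 1, each Unicode scalar value
            // is used directly as the feature id, so no hashing is needed.)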
99 | text.chars().for_each(|c| { 100 | let f = c as u64; 101 | let w = 1.; 102 | feature.push((f, w)) 103 | }); 104 | } else { 105 | let token_ranges = self.tokenize(text); 106 | for ranges in ShingleIter::new(&token_ranges, self.config.window_size) { 107 | let f = self.config.hash(ranges.iter().cloned().map(|r| &text[r])); 108 | let w = 1.; 109 | feature.push((f, w)) 110 | } 111 | } 112 | } 113 | 114 | fn tokenize(&self, text: &str) -> Vec> { 115 | let mut token_ranges = vec![]; 116 | for _ in 1..self.config.window_size { 117 | token_ranges.push(0..0); // BOS 118 | } 119 | let mut offset = 0; 120 | if let Some(delim) = self.config.delimiter { 121 | while offset < text.len() { 122 | let len = text[offset..].find(delim); 123 | if let Some(len) = len { 124 | token_ranges.push(offset..offset + len); 125 | offset += len + 1; 126 | } else { 127 | token_ranges.push(offset..text.len()); 128 | break; 129 | } 130 | } 131 | } else { 132 | for c in text.chars() { 133 | let len = c.len_utf8(); 134 | token_ranges.push(offset..offset + len); 135 | offset += len; 136 | } 137 | } 138 | for _ in 1..self.config.window_size { 139 | token_ranges.push(text.len()..text.len()); // EOS 140 | } 141 | token_ranges 142 | } 143 | } 144 | 145 | #[cfg(test)] 146 | mod tests { 147 | use super::*; 148 | 149 | #[test] 150 | fn test_char_unigram() { 151 | let config = FeatureConfig::new(1, None, 42).unwrap(); 152 | let extractor = FeatureExtractor::new(&config); 153 | 154 | let text = "abcd"; 155 | let mut feature = vec![]; 156 | 157 | extractor.extract(text, &mut feature); 158 | assert_eq!( 159 | feature, 160 | vec!['a' as u64, 'b' as u64, 'c' as u64, 'd' as u64] 161 | ) 162 | } 163 | 164 | #[test] 165 | fn test_char_bigram() { 166 | let config = FeatureConfig::new(2, None, 42).unwrap(); 167 | let extractor = FeatureExtractor::new(&config); 168 | 169 | let text = "abcd"; 170 | let mut feature = vec![]; 171 | 172 | extractor.extract(text, &mut feature); 173 | assert_eq!( 174 | feature, 175 | vec![ 176 | config.hash(&["", "a"]), 177 | config.hash(&["a", "b"]), 178 | config.hash(&["b", "c"]), 179 | config.hash(&["c", "d"]), 180 | config.hash(&["d", ""]), 181 | ] 182 | ) 183 | } 184 | 185 | #[test] 186 | fn test_char_trigram() { 187 | let config = FeatureConfig::new(3, None, 42).unwrap(); 188 | let extractor = FeatureExtractor::new(&config); 189 | 190 | let text = "abcd"; 191 | let mut feature = vec![]; 192 | 193 | extractor.extract(text, &mut feature); 194 | assert_eq!( 195 | feature, 196 | vec![ 197 | config.hash(&["", "", "a"]), 198 | config.hash(&["", "a", "b"]), 199 | config.hash(&["a", "b", "c"]), 200 | config.hash(&["b", "c", "d"]), 201 | config.hash(&["c", "d", ""]), 202 | config.hash(&["d", "", ""]), 203 | ] 204 | ) 205 | } 206 | 207 | #[test] 208 | fn test_word_unigram() { 209 | let config = FeatureConfig::new(1, Some(' '), 42).unwrap(); 210 | let extractor = FeatureExtractor::new(&config); 211 | 212 | let text = "abc de fgh"; 213 | let mut feature = vec![]; 214 | 215 | extractor.extract(text, &mut feature); 216 | assert_eq!( 217 | feature, 218 | vec![ 219 | config.hash(&["abc"]), 220 | config.hash(&["de"]), 221 | config.hash(&["fgh"]), 222 | ] 223 | ) 224 | } 225 | 226 | #[test] 227 | fn test_word_bigram() { 228 | let config = FeatureConfig::new(2, Some(' '), 42).unwrap(); 229 | let extractor = FeatureExtractor::new(&config); 230 | 231 | let text = "abc de fgh"; 232 | let mut feature = vec![]; 233 | 234 | extractor.extract(text, &mut feature); 235 | assert_eq!( 236 | feature, 237 | vec![ 238 | config.hash(&["", 
"abc"]), 239 | config.hash(&["abc", "de"]), 240 | config.hash(&["de", "fgh"]), 241 | config.hash(&["fgh", ""]), 242 | ] 243 | ) 244 | } 245 | 246 | #[test] 247 | fn test_word_trigram() { 248 | let config = FeatureConfig::new(3, Some(' '), 42).unwrap(); 249 | let extractor = FeatureExtractor::new(&config); 250 | 251 | let text = "abc de fgh"; 252 | let mut feature = vec![]; 253 | 254 | extractor.extract(text, &mut feature); 255 | assert_eq!( 256 | feature, 257 | vec![ 258 | config.hash(&["", "", "abc"]), 259 | config.hash(&["", "abc", "de"]), 260 | config.hash(&["abc", "de", "fgh"]), 261 | config.hash(&["de", "fgh", ""]), 262 | config.hash(&["fgh", "", ""]), 263 | ] 264 | ) 265 | } 266 | } 267 | -------------------------------------------------------------------------------- /find-simdoc/src/jaccard.rs: -------------------------------------------------------------------------------- 1 | //! Searcher for all pairs of similar documents in the Jaccard space. 2 | use std::sync::Mutex; 3 | 4 | use crate::errors::{FindSimdocError, Result}; 5 | use crate::feature::{FeatureConfig, FeatureExtractor}; 6 | use crate::lsh::minhash::MinHasher; 7 | 8 | use all_pairs_hamming::chunked_join::ChunkedJoiner; 9 | use rand::{RngCore, SeedableRng}; 10 | use rayon::prelude::*; 11 | 12 | /// Searcher for all pairs of similar documents in the Jaccard space. 13 | /// 14 | /// # Approach 15 | /// 16 | /// The search steps consist of 17 | /// 18 | /// 1. Extracts features from documents, 19 | /// where a feature is a set representation of character or word ngrams. 20 | /// 2. Convert the features into binary sketches through the [1-bit minwise hashing](https://dl.acm.org/doi/abs/10.1145/1772690.1772759). 21 | /// 3. Search for similar sketches in the Hamming space using [`ChunkedJoiner`]. 22 | /// 23 | /// # Examples 24 | /// 25 | /// ``` 26 | /// use find_simdoc::JaccardSearcher; 27 | /// 28 | /// let documents = vec![ 29 | /// "Welcome to Jimbocho, the town of books and curry!", 30 | /// "Welcome to Jimbocho, the city of books and curry!", 31 | /// "We welcome you to Jimbocho, the town of books and curry.", 32 | /// "Welcome to the town of books and curry, Jimbocho!", 33 | /// ]; 34 | /// 35 | /// // Creates a searcher for character trigrams (with random seed value 42). 36 | /// let searcher = JaccardSearcher::new(3, None, Some(42)) 37 | /// .unwrap() 38 | /// // Builds the database of binary sketches converted from input documents, 39 | /// // where binary sketches are in the Hamming space of 20*64 dimensions. 40 | /// .build_sketches_in_parallel(documents.iter(), 20) 41 | /// .unwrap(); 42 | /// 43 | /// // Searches all similar pairs within radius 0.25. 44 | /// let results = searcher.search_similar_pairs(0.25); 45 | /// ``` 46 | pub struct JaccardSearcher { 47 | config: FeatureConfig, 48 | hasher: MinHasher, 49 | joiner: Option>, 50 | shows_progress: bool, 51 | } 52 | 53 | impl JaccardSearcher { 54 | /// Creates an instance. 55 | /// 56 | /// # Arguments 57 | /// 58 | /// * `window_size` - Window size for w-shingling in feature extraction (must be more than 0). 59 | /// * `delimiter` - Delimiter for recognizing words as tokens in feature extraction. 60 | /// If `None`, characters are used for tokens. 61 | /// * `seed` - Seed value for random values. 
62 | pub fn new(window_size: usize, delimiter: Option, seed: Option) -> Result { 63 | let seed = seed.unwrap_or_else(rand::random::); 64 | let mut seeder = rand_xoshiro::SplitMix64::seed_from_u64(seed); 65 | let config = FeatureConfig::new(window_size, delimiter, seeder.next_u64())?; 66 | let hasher = MinHasher::new(seeder.next_u64()); 67 | Ok(Self { 68 | config, 69 | hasher, 70 | joiner: None, 71 | shows_progress: false, 72 | }) 73 | } 74 | 75 | /// Shows the progress via the standard error output? 76 | pub const fn shows_progress(mut self, yes: bool) -> Self { 77 | self.shows_progress = yes; 78 | self 79 | } 80 | 81 | /// Builds the database of sketches from input documents. 82 | /// 83 | /// # Arguments 84 | /// 85 | /// * `documents` - List of documents (must not include an empty string). 86 | /// * `num_chunks` - Number of chunks of sketches, indicating that 87 | /// the number of dimensions in the Hamming space is `num_chunks*64`. 88 | pub fn build_sketches(mut self, documents: I, num_chunks: usize) -> Result 89 | where 90 | I: IntoIterator, 91 | D: AsRef, 92 | { 93 | let mut joiner = ChunkedJoiner::::new(num_chunks).shows_progress(self.shows_progress); 94 | let extractor = FeatureExtractor::new(&self.config); 95 | 96 | let mut feature = vec![]; 97 | for (i, doc) in documents.into_iter().enumerate() { 98 | if self.shows_progress && (i + 1) % 10000 == 0 { 99 | eprintln!("Processed {} documents...", i + 1); 100 | } 101 | let doc = doc.as_ref(); 102 | if doc.is_empty() { 103 | return Err(FindSimdocError::input("Input document must not be empty.")); 104 | } 105 | extractor.extract(doc, &mut feature); 106 | joiner.add(self.hasher.iter(&feature)).unwrap(); 107 | } 108 | self.joiner = Some(joiner); 109 | Ok(self) 110 | } 111 | 112 | /// Builds the database of sketches from input documents in parallel. 113 | /// 114 | /// # Arguments 115 | /// 116 | /// * `documents` - List of documents (must not include an empty string). 117 | /// * `num_chunks` - Number of chunks of sketches, indicating that 118 | /// the number of dimensions in the Hamming space is `num_chunks*64`. 119 | /// 120 | /// # Notes 121 | /// 122 | /// The progress is not printed even if `shows_progress = true`. 123 | pub fn build_sketches_in_parallel( 124 | mut self, 125 | documents: I, 126 | num_chunks: usize, 127 | ) -> Result 128 | where 129 | I: Iterator + Send, 130 | D: AsRef + Send, 131 | { 132 | let extractor = FeatureExtractor::new(&self.config); 133 | #[allow(clippy::mutex_atomic)] 134 | let processed = Mutex::new(0usize); 135 | let mut sketches: Vec<_> = documents 136 | .into_iter() 137 | .enumerate() 138 | .par_bridge() 139 | .map(|(i, doc)| { 140 | #[allow(clippy::mutex_atomic)] 141 | { 142 | // Mutex::lock also locks eprintln. 143 | let mut cnt = processed.lock().unwrap(); 144 | *cnt += 1; 145 | if self.shows_progress && *cnt % 10000 == 0 { 146 | eprintln!("Processed {} documents...", *cnt); 147 | } 148 | } 149 | let doc = doc.as_ref(); 150 | // TODO: Returns the error value (but I dont know the manner). 
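                // (One possible manner: have the closure return a `Result` per
                // document and collect into `Result<Vec<_>, _>` after the parallel
                // bridge; rayon supports short-circuiting collection over `Result`,
                // which would replace this panic with a proper error value.)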
151 |                 assert!(!doc.is_empty(), "Input document must not be empty.");
152 |                 let mut feature = vec![];
153 |                 extractor.extract(doc, &mut feature);
154 |                 let mut gen = self.hasher.iter(&feature);
155 |                 let sketch: Vec<_> = (0..num_chunks).map(|_| gen.next().unwrap()).collect();
156 |                 (i, sketch)
157 |             })
158 |             .collect();
159 |         sketches.par_sort_by_key(|&(i, _)| i);
160 | 
161 |         let mut joiner = ChunkedJoiner::<u64>::new(num_chunks).shows_progress(self.shows_progress);
162 |         for (_, sketch) in sketches {
163 |             joiner.add(sketch).unwrap();
164 |         }
165 |         self.joiner = Some(joiner);
166 |         Ok(self)
167 |     }
168 | 
169 |     /// Searches for all pairs of similar documents within an input radius, returning
170 |     /// triplets of the left-side id, the right-side id, and their distance.
171 |     pub fn search_similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)> {
172 |         self.joiner.as_ref().map_or_else(Vec::new, |joiner| {
173 |             // In 1-bit minhash, the collision probability is twice that of the original minhash.
174 |             // Thus, we search the Hamming space with half of the actual radius.
175 |             let mut results = joiner.similar_pairs(radius / 2.);
176 |             // Maps the Hamming distances back onto Jaccard distances.
177 |             results.iter_mut().for_each(|(_, _, d)| *d *= 2.);
178 |             results
179 |         })
180 |     }
181 | 
182 |     /// Gets the number of input documents.
183 |     pub fn len(&self) -> usize {
184 |         self.joiner
185 |             .as_ref()
186 |             .map_or(0, |joiner| joiner.num_sketches())
187 |     }
188 | 
189 |     /// Checks if the database is empty.
190 |     pub fn is_empty(&self) -> bool {
191 |         self.len() == 0
192 |     }
193 | 
194 |     /// Gets the memory usage in bytes.
195 |     pub fn memory_in_bytes(&self) -> usize {
196 |         self.joiner
197 |             .as_ref()
198 |             .map_or(0, |joiner| joiner.memory_in_bytes())
199 |     }
200 | 
201 |     /// Gets the configuration of feature extraction.
202 |     pub const fn config(&self) -> &FeatureConfig {
203 |         &self.config
204 |     }
205 | }
206 | 
--------------------------------------------------------------------------------
/find-simdoc/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! Time- and memory-efficient all-pairs similarity searches in documents.
2 | //! A more detailed description can be found on the [project page](https://github.com/legalforce-research/find-simdoc).
3 | //!
4 | //! # Problem definition
5 | //!
6 | //! - Input
7 | //!   - List of documents
8 | //!   - Distance function
9 | //!   - Radius threshold
10 | //! - Output
11 | //!   - All pairs of similar document ids
12 | //!
13 | //! # Features
14 | //!
15 | //! ## Easy to use
16 | //!
17 | //! This software supports all essential steps of document similarity search,
18 | //! from feature extraction to output of similar pairs.
19 | //! Therefore, you can immediately run fast all-pairs similarity searches on your own document files.
20 | //!
21 | //! ## Flexible tokenization
22 | //!
23 | //! You can specify any delimiter when splitting words in tokenization for feature extraction.
24 | //! This can be useful in languages where multiple definitions of words exist, such as Japanese or Chinese.
25 | //!
26 | //! ## Time and memory efficiency
27 | //!
28 | //! The time and memory complexities are *linear* over the numbers of input documents and output results,
29 | //! building on the ideas behind locality-sensitive hashing (LSH) and the [sketch sorting approach](https://proceedings.mlr.press/v13/tabei10a.html).
30 | //!
31 | //! ## Tunable search performance
32 | //!
33 | //! LSH allows tuning of accuracy, time, and memory through a manual parameter that specifies the number of search dimensions.
34 | //! You can flexibly adapt searches to your dataset and machine environment:
35 | //! - Specifying lower dimensions allows for faster and rougher searches with less memory usage.
36 | //! - Specifying higher dimensions allows for more accurate searches with more memory usage.
37 | //!
38 | //! # Search steps
39 | //!
40 | //! 1. Extract features from documents
41 | //!    - Set representation of character or word ngrams
42 | //!    - Tfidf-weighted vector representation of character or word ngrams
43 | //! 2. Convert the features into binary sketches through locality sensitive hashing
44 | //!    - [1-bit minwise hashing](https://dl.acm.org/doi/abs/10.1145/1772690.1772759) for the Jaccard similarity
45 | //!    - [Simplified simhash](https://dl.acm.org/doi/10.1145/1242572.1242592) for the Cosine similarity
46 | //! 3. Search for similar sketches in the Hamming space using a modified variant of the [sketch sorting approach](https://proceedings.mlr.press/v13/tabei10a.html)
47 | #![deny(missing_docs)]
48 | 
49 | pub mod cosine;
50 | pub mod errors;
51 | pub mod feature;
52 | pub mod jaccard;
53 | pub mod lsh;
54 | pub mod tfidf;
55 | 
56 | mod shingling;
57 | 
58 | pub use cosine::CosineSearcher;
59 | pub use jaccard::JaccardSearcher;
60 | 
--------------------------------------------------------------------------------
/find-simdoc/src/lsh.rs:
--------------------------------------------------------------------------------
1 | //! Locality-sensitive hashings.
2 | pub mod minhash;
3 | pub mod simhash;
4 | 
5 | use std::hash::Hash;
6 | 
7 | use hashbrown::HashSet;
8 | use rand_xoshiro::rand_core::{RngCore, SeedableRng};
9 | 
10 | /// Generates a hash value.
11 | #[inline(always)]
12 | pub(crate) fn hash_u64(x: u64, seed: u64) -> u64 {
13 |     rand_xoshiro::SplitMix64::seed_from_u64(x ^ seed).next_u64()
14 | }
15 | 
16 | /// Computes the Jaccard distance.
17 | ///
18 | /// # Examples
19 | ///
20 | /// ```
21 | /// use find_simdoc::lsh::jaccard_distance;
22 | ///
23 | /// let x = vec![1, 2, 4];
24 | /// let y = vec![1, 2, 5, 7];
25 | /// assert_eq!(jaccard_distance(x, y), 0.6);
26 | /// ```
27 | pub fn jaccard_distance<I, T>(lhs: I, rhs: I) -> f64
28 | where
29 |     I: IntoIterator<Item = T>,
30 |     T: Hash + Eq,
31 | {
32 |     let a = HashSet::<T>::from_iter(lhs);
33 |     let b = HashSet::<T>::from_iter(rhs);
34 |     1. - (a.intersection(&b).count() as f64) / (a.union(&b).count() as f64)
35 | }
36 | 
--------------------------------------------------------------------------------
/find-simdoc/src/lsh/minhash.rs:
--------------------------------------------------------------------------------
1 | //! 1-bit minwise hashing for the Jaccard similarity.
2 | use rand_xoshiro::rand_core::{RngCore, SeedableRng};
3 | 
4 | /// [1-bit minwise hashing](https://dl.acm.org/doi/abs/10.1145/1772690.1772759) for the Jaccard similarity.
5 | pub struct MinHasher {
6 |     seed: u64,
7 | }
8 | 
9 | impl MinHasher {
10 |     /// Creates an instance.
11 |     pub const fn new(seed: u64) -> Self {
12 |         Self { seed }
13 |     }
14 | 
15 |     /// Creates an iterator to generate sketches from an input feature.
16 |     pub fn iter<'a>(&self, feature: &'a [u64]) -> MinHashIter<'a> {
17 |         MinHashIter {
18 |             feature,
19 |             seeder: rand_xoshiro::SplitMix64::seed_from_u64(self.seed),
20 |         }
21 |     }
22 | }
23 | 
24 | /// Iterator to generate sketches with the 1-bit minwise hashing.
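///
/// Each call to `next()` packs 64 independent 1-bit minhashes into one `u64`,
/// so taking `k` items yields a `64*k`-dimensional binary sketch in the Hamming
/// space; note the iterator never terminates on its own. A minimal usage
/// sketch, where the feature values are arbitrary ids chosen for illustration:
///
/// ```
/// use find_simdoc::lsh::minhash::MinHasher;
///
/// let hasher = MinHasher::new(42);
/// let feature = vec![1u64, 4, 7];
/// // Takes two 64-bit chunks, i.e., a 128-dimensional binary sketch.
/// let sketch: Vec<u64> = hasher.iter(&feature).take(2).collect();
/// assert_eq!(sketch.len(), 2);
/// ```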
25 | pub struct MinHashIter<'a> {
26 |     feature: &'a [u64],
27 |     seeder: rand_xoshiro::SplitMix64,
28 | }
29 | 
30 | impl Iterator for MinHashIter<'_> {
31 |     type Item = u64;
32 | 
33 |     fn next(&mut self) -> Option<Self::Item> {
34 |         let mut x = 0;
35 |         for _ in 0..64 {
36 |             let seed = self.seeder.next_u64();
37 |             let h = self
38 |                 .feature
39 |                 .iter()
40 |                 .map(|&i| crate::lsh::hash_u64(i, seed))
41 |                 .min()
42 |                 .unwrap();
43 |             x = (x << 1) | (h & 1);
44 |         }
45 |         Some(x)
46 |     }
47 | }
48 | 
--------------------------------------------------------------------------------
/find-simdoc/src/lsh/simhash.rs:
--------------------------------------------------------------------------------
1 | //! Simplified simhash for the Cosine similarity.
2 | use rand_xoshiro::rand_core::{RngCore, SeedableRng};
3 | 
4 | /// [Simplified simhash](https://dl.acm.org/doi/10.1145/2063576.2063737) for Cosine similarity.
5 | pub struct SimHasher {
6 |     seed: u64,
7 | }
8 | 
9 | impl SimHasher {
10 |     /// Creates an instance.
11 |     pub const fn new(seed: u64) -> Self {
12 |         Self { seed }
13 |     }
14 | 
15 |     /// Creates an iterator to generate sketches from an input feature.
16 |     pub fn iter<'a>(&self, feature: &'a [(u64, f64)]) -> SimHashIter<'a> {
17 |         SimHashIter {
18 |             feature,
19 |             seeder: rand_xoshiro::SplitMix64::seed_from_u64(self.seed),
20 |             weights: [0.; 64],
21 |         }
22 |     }
23 | }
24 | 
25 | /// Iterator to generate sketches with the simplified simhash.
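///
/// Each call to `next()` computes 64 signed random projections of the weighted
/// feature and packs their signs into one `u64`. A minimal usage sketch, where
/// the `(term id, weight)` pairs are arbitrary values chosen for illustration:
///
/// ```
/// use find_simdoc::lsh::simhash::SimHasher;
///
/// let hasher = SimHasher::new(42);
/// let feature = vec![(1u64, 0.5), (4, 0.25), (7, 0.25)];
/// let sketch: Vec<u64> = hasher.iter(&feature).take(2).collect();
/// assert_eq!(sketch.len(), 2);
/// ```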
26 | pub struct SimHashIter<'a> {
27 |     feature: &'a [(u64, f64)],
28 |     seeder: rand_xoshiro::SplitMix64,
29 |     weights: [f64; 64],
30 | }
31 | 
32 | impl Iterator for SimHashIter<'_> {
33 |     type Item = u64;
34 | 
35 |     fn next(&mut self) -> Option<Self::Item> {
36 |         self.weights.fill(0.);
37 |         let seed = self.seeder.next_u64();
38 |         for (h, x) in self
39 |             .feature
40 |             .iter()
41 |             .map(|&(i, x)| (crate::lsh::hash_u64(i, seed), x))
42 |         {
43 |             for (j, w) in self.weights.iter_mut().enumerate() {
44 |                 if (h >> j) & 1 == 0 {
45 |                     *w += x;
46 |                 } else {
47 |                     *w -= x;
48 |                 }
49 |             }
50 |         }
51 |         Some(
52 |             self.weights
53 |                 .iter()
54 |                 .fold(0, |acc, w| if *w >= 0. { (acc << 1) | 1 } else { acc << 1 }),
55 |         )
56 |     }
57 | }
58 | 
--------------------------------------------------------------------------------
/find-simdoc/src/shingling.rs:
--------------------------------------------------------------------------------
1 | pub struct ShingleIter<'a, T> {
2 |     tokens: &'a [T],
3 |     window_size: usize,
4 |     position: usize,
5 | }
6 | 
7 | impl<'a, T> ShingleIter<'a, T> {
8 |     pub fn new(tokens: &'a [T], window_size: usize) -> Self {
9 |         assert!(!tokens.is_empty());
10 |         assert!(window_size <= tokens.len());
11 |         Self {
12 |             tokens,
13 |             window_size,
14 |             position: 0,
15 |         }
16 |     }
17 | }
18 | 
19 | impl<'a, T> Iterator for ShingleIter<'a, T> {
20 |     type Item = &'a [T];
21 | 
22 |     fn next(&mut self) -> Option<Self::Item> {
23 |         if self.tokens.len() < self.position + self.window_size {
24 |             return None;
25 |         }
26 |         let window = &self.tokens[self.position..self.position + self.window_size];
27 |         self.position += 1;
28 |         Some(window)
29 |     }
30 | }
31 | 
32 | #[cfg(test)]
33 | mod tests {
34 |     use super::*;
35 | 
36 |     #[test]
37 |     fn test_q1() {
38 |         let tokens = vec!["a", "b", "c"];
39 |         let mut iter = ShingleIter::new(&tokens, 1);
40 |         assert_eq!(iter.next(), Some(&tokens[0..1]));
41 |         assert_eq!(iter.next(), Some(&tokens[1..2]));
42 |         assert_eq!(iter.next(), Some(&tokens[2..3]));
43 |         assert_eq!(iter.next(), None);
44 |     }
45 | 
46 |     #[test]
47 |     fn test_q2() {
48 |         let tokens = vec!["a", "b", "c"];
49 |         let mut iter = ShingleIter::new(&tokens, 2);
50 |         assert_eq!(iter.next(), Some(&tokens[0..2]));
51 |         assert_eq!(iter.next(), Some(&tokens[1..3]));
52 |         assert_eq!(iter.next(), None);
53 |     }
54 | 
55 |     #[test]
56 |     fn test_q3() {
57 |         let tokens = vec!["a", "b", "c"];
58 |         let mut iter = ShingleIter::new(&tokens, 3);
59 |         assert_eq!(iter.next(), Some(&tokens[0..3]));
60 |         assert_eq!(iter.next(), None);
61 |     }
62 | 
63 |     #[test]
64 |     #[should_panic]
65 |     fn test_q4() {
66 |         let tokens = vec!["a", "b", "c"];
67 |         ShingleIter::new(&tokens, 4);
68 |     }
69 | }
70 | 
--------------------------------------------------------------------------------
/find-simdoc/src/tfidf.rs:
--------------------------------------------------------------------------------
1 | //! Weighters of TF-IDF.
2 | use std::hash::Hash;
3 | 
4 | use hashbrown::{HashMap, HashSet};
5 | 
6 | use crate::errors::{FindSimdocError, Result};
7 | use crate::feature::{FeatureConfig, FeatureExtractor};
8 | 
9 | /// Weighter of inverse document frequency.
10 | #[derive(Default)]
11 | pub struct Idf<T> {
12 |     counter: HashMap<T, usize>,
13 |     dedup: HashSet<T>,
14 |     num_docs: usize,
15 |     smooth: bool,
16 | }
17 | 
18 | impl<T> Idf<T>
19 | where
20 |     T: Hash + Eq + Copy + Default,
21 | {
22 |     /// Creates an instance.
23 |     pub fn new() -> Self {
24 |         Self::default()
25 |     }
26 | 
27 |     /// Enables smoothing.
28 |     pub const fn smooth(mut self, yes: bool) -> Self {
29 |         self.smooth = yes;
30 |         self
31 |     }
32 | 
33 |     /// Trains the frequency of terms for a document.
34 |     pub fn add(&mut self, terms: &[T]) {
35 |         self.dedup.clear();
36 |         for &term in terms {
37 |             if self.dedup.insert(term) {
38 |                 self.counter
39 |                     .entry(term)
40 |                     .and_modify(|c| *c += 1)
41 |                     .or_insert(1);
42 |             }
43 |         }
44 |         self.num_docs += 1;
45 |     }
46 | 
47 |     /// Gets the number of input documents.
48 |     pub const fn num_docs(&self) -> usize {
49 |         self.num_docs
50 |     }
51 | 
52 |     /// Computes the IDF of an input term.
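    ///
    /// With smoothing enabled, this is `log10((num_docs + 1) / (df + 1)) + 1`,
    /// where `df` is the number of documents containing the term; without
    /// smoothing, the two `+ 1` terms inside the ratio are dropped. Querying a
    /// term that was never added panics on the internal `unwrap`. A minimal
    /// usage sketch with counts chosen for illustration:
    ///
    /// ```
    /// use find_simdoc::tfidf::Idf;
    ///
    /// let mut idf = Idf::new().smooth(true);
    /// idf.add(&['A', 'C']);
    /// idf.add(&['A']);
    /// idf.add(&['B']);
    /// // 'B' appears in 1 of 3 documents: log10((3+1)/(1+1)) + 1.
    /// assert_eq!(idf.idf('B'), (4f64 / 2f64).log10() + 1.);
    /// ```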
53 |     pub fn idf(&self, term: T) -> f64 {
54 |         let c = usize::from(self.smooth);
55 |         let n = (self.num_docs + c) as f64;
56 |         let m = (*self.counter.get(&term).unwrap() + c) as f64;
57 |         (n / m).log10() + 1.
58 |     }
59 | }
60 | 
61 | impl Idf<u64> {
62 |     /// Trains the term frequency of input documents.
63 |     ///
64 |     /// # Arguments
65 |     ///
66 |     /// * `documents` - List of documents.
67 |     /// * `config` - Configuration of feature extraction. Use the same configuration as that in search.
68 |     pub fn build<I, D>(mut self, documents: I, config: &FeatureConfig) -> Result<Self>
69 |     where
70 |         I: IntoIterator<Item = D>,
71 |         D: AsRef<str>,
72 |     {
73 |         let extractor = FeatureExtractor::new(config);
74 |         let mut feature = vec![];
75 |         for doc in documents {
76 |             let doc = doc.as_ref();
77 |             if doc.is_empty() {
78 |                 return Err(FindSimdocError::input("Input document must not be empty."));
79 |             }
80 |             extractor.extract(doc, &mut feature);
81 |             self.add(&feature);
82 |         }
83 |         Ok(self)
84 |     }
85 | }
86 | 
87 | /// Weighter of term frequency.
88 | #[derive(Default)]
89 | pub struct Tf {
90 |     sublinear: bool,
91 | }
92 | 
93 | impl Tf {
94 |     /// Creates an instance.
95 |     pub fn new() -> Self {
96 |         Self::default()
97 |     }
98 | 
99 |     /// Enables sublinear normalization.
100 |     pub const fn sublinear(mut self, yes: bool) -> Self {
101 |         self.sublinear = yes;
102 |         self
103 |     }
104 | 
105 |     /// Computes the TF of input terms.
106 |     pub fn tf<T>(&self, terms: &mut [(T, f64)])
107 |     where
108 |         T: Hash + Eq + Copy + Default,
109 |     {
110 |         let counter = self.count(terms);
111 |         let total = terms.len() as f64;
112 |         for (term, weight) in terms {
113 |             let cnt = *counter.get(term).unwrap() as f64;
114 |             *weight = if self.sublinear {
115 |                 cnt.log10() + 1.
116 |             } else {
117 |                 cnt / total
118 |             };
119 |         }
120 |     }
121 | 
122 |     fn count<T>(&self, terms: &[(T, f64)]) -> HashMap<T, usize>
123 |     where
124 |         T: Hash + Eq + Copy + Default,
125 |     {
126 |         let mut counter = HashMap::new();
127 |         for &(term, _) in terms.iter() {
128 |             counter.entry(term).and_modify(|c| *c += 1).or_insert(1);
129 |         }
130 |         counter
131 |     }
132 | }
133 | 
134 | #[cfg(test)]
135 | mod tests {
136 |     use std::vec;
137 | 
138 |     use super::*;
139 | 
140 |     #[test]
141 |     fn test_idf() {
142 |         let mut idf = Idf::new();
143 |         idf.add(&['A', 'A', 'C']);
144 |         idf.add(&['A', 'C']);
145 |         idf.add(&['B', 'A']);
146 | 
147 |         assert_eq!(idf.num_docs(), 3);
148 | 
149 |         idf = idf.smooth(false);
150 |         assert_eq!(idf.idf('A'), (3f64 / 3f64).log10() + 1.);
151 |         assert_eq!(idf.idf('B'), (3f64 / 1f64).log10() + 1.);
152 |         assert_eq!(idf.idf('C'), (3f64 / 2f64).log10() + 1.);
153 | 
154 |         idf = idf.smooth(true);
155 |         assert_eq!(idf.idf('A'), (4f64 / 4f64).log10() + 1.);
156 |         assert_eq!(idf.idf('B'), (4f64 / 2f64).log10() + 1.);
157 |         assert_eq!(idf.idf('C'), (4f64 / 3f64).log10() + 1.);
158 |     }
159 | 
160 |     #[test]
161 |     fn test_tf() {
162 |         let mut tf = Tf::new();
163 |         let mut terms = vec![('A', 0.), ('B', 0.), ('A', 0.)];
164 | 
165 |         tf = tf.sublinear(false);
166 |         tf.tf(&mut terms);
167 |         assert_eq!(
168 |             terms.clone(),
169 |             vec![('A', 2. / 3.), ('B', 1. / 3.), ('A', 2. / 3.)]
170 |         );
171 | 
172 |         tf = tf.sublinear(true);
173 |         tf.tf(&mut terms);
174 |         assert_eq!(
175 |             terms.clone(),
176 |             vec![
177 |                 ('A', 2f64.log10() + 1.),
178 |                 ('B', 1f64.log10() + 1.),
179 |                 ('A', 2f64.log10() + 1.)
180 |             ]
181 |         );
182 |     }
183 | }
184 | 
--------------------------------------------------------------------------------
/scripts/load_nltk_dataset.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | '''
3 | Download NLTK corpora and create a text file of sentences with lowercase letters and no duplicate lines.
4 | '''
5 | 
6 | import sys
7 | import nltk
8 | from argparse import ArgumentParser
9 | 
10 | 
11 | def download_reuters():
12 |     from nltk.corpus import reuters
13 |     nltk.download('reuters')
14 |     return reuters.sents()
15 | 
16 | 
17 | def download_gutenberg():
18 |     from nltk.corpus import gutenberg
19 |     nltk.download('gutenberg')
20 |     return gutenberg.sents()
21 | 
22 | 
23 | def download_webtext():
24 |     from nltk.corpus import webtext
25 |     nltk.download('webtext')
26 |     return webtext.sents()
27 | 
28 | 
29 | def download_brown():
30 |     from nltk.corpus import brown
31 |     nltk.download('brown')
32 |     return brown.sents()
33 | 
34 | 
35 | def download_inaugural():
36 |     from nltk.corpus import inaugural
37 |     nltk.download('inaugural')
38 |     return inaugural.sents()
39 | 
40 | 
41 | def main():
42 |     parser = ArgumentParser()
43 |     parser.add_argument('name')
44 |     args = parser.parse_args()
45 | 
46 |     nltk.download('punkt')
47 | 
48 |     if args.name == 'reuters':
49 |         sents = download_reuters()
50 |     elif args.name == 'gutenberg':
51 |         sents = download_gutenberg()
52 |     elif args.name == 'webtext':
53 |         sents = download_webtext()
54 |     elif args.name == 'brown':
55 |         sents = download_brown()
56 |     elif args.name == 'inaugural':
57 |         sents = download_inaugural()
58 |     else:
59 |         print(f'unsupported corpus name: {args.name}', file=sys.stderr)
60 |         return
61 | 
62 |     with open(f'{args.name}.txt', 'wt') as fout:
63 |         sents = [' '.join(sent).lower() for sent in sents]
64 |         for sent in set(sents):
65 |             fout.write(sent)
66 |             fout.write('\n')
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     main()
71 | 
--------------------------------------------------------------------------------