├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── doc └── intro.md ├── project.clj ├── resources ├── config.edn ├── en-token.bin ├── logback.xml ├── stopwords.txt ├── test.txt ├── test1.txt └── testforest ├── src └── consimilo │ ├── config.clj │ ├── core.clj │ ├── lsh_forest.clj │ ├── lsh_query.clj │ ├── lsh_state.clj │ ├── lsh_util.clj │ ├── minhash.clj │ ├── minhash_util.clj │ ├── random_seed.clj │ ├── sha1.clj │ └── text_processing.clj └── test └── consimilo ├── core_test.clj ├── lsh_forest_test.clj ├── lsh_util_test.clj ├── minhash_test.clj ├── minhash_util_test.clj ├── random_seed_test.clj ├── sha1_test.clj └── text_processing_test.clj /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | pom.xml 5 | pom.xml.asc 6 | *.jar 7 | *.class 8 | /.lein-* 9 | /.nrepl-port 10 | .hgignore 11 | .hg/ 12 | .idea/ 13 | *.iml 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: clojure 2 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## v0.1.0 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # consimilo 2 | 3 | [![Build Status](https://travis-ci.org/andrewmcloud/consimilo.svg?branch=master)](https://travis-ci.org/andrewmcloud/consimilo) 4 | [![Clojars Project](https://img.shields.io/clojars/v/consimilo.svg)](https://clojars.org/consimilo) 5 | 6 | ## A Clojure library for querying large data-sets on similarity 7 | 8 | consimilo is a library that utilizes locality sensitive hashing (implemented as lsh-forest) and minhashing, to support 9 | *top-k* similar item queries. Finding similar items across expansive data-sets is a common problem that presents itself 10 | in many real world applications (e.g. finding articles from the same source, plagiarism detection, collaborative 11 | filtering, context filtering, document similarity, etc...). Searching a corpus for *top-k* similar items quickly grows 12 | to an unwieldy complexity at relatively small corpus sizes *(n choose 2)*. 
LSH reduces the search space by "hashing" 13 | items in such a way that collisions occur as a result of similarity. Once the items are hashed and indexed the 14 | lsh-forest supports a *top-k* most similar items query of ~*O(log n)*. There is an accuracy trade-off that comes with 15 | the enormous increase in query speed. More information can be found in chapter 3 of 16 | [Mining Massive Datasets](http://infolab.stanford.edu/~ullman/mmds/ch3.pdf). 17 | 18 | ## Getting Started 19 | 20 | Add consimilo as a dependency in your project.clj: 21 | 22 | ```clojure 23 | [consimilo "0.1.1"] 24 | ``` 25 | 26 | The main methods you are likely to need are all located in [`core.clj`](./src/consimilo/core.clj). 27 | Import it with something like: 28 | 29 | ```clojure 30 | (ns my-ns (:require [consimilo.core :as consimilo])) 31 | ``` 32 | 33 | ## Building a forest 34 | 35 | First you need to load the candidates vector into a forest. This vector can represent any arbitrary information 36 | (e.g. tokens in a document, ngrams, metadata about users, content interactions, context surrounding 37 | interactions). The candidates vector must be a collection of maps, each representing an item. The map will have an 38 | `:id` key which is used to reference the minhash vector in the forest and a `:features` key which is a vector 39 | containing the individual features. `[{:id id1 :features [feature1 feature2 ... featuren]} ... ]` 40 | 41 | ### Adding feature vectors to a forest 42 | 43 | Once your candidates vector is in the correct form, you can add the items to the forest: 44 | 45 | ```clojure 46 | (def my-forest (consimilo/add-all-to-forest candidates-vector)) ;;creates new forest, my-forest 47 | ``` 48 | 49 | You can continue to add to this forest by passing it as the first argument to `add-all-to-forest`. The forest data 50 | structure is stored in an atom, so the existing forest is modified in place. 
51 | 52 | Note: upon every call to `add-all-to-forest` an expensive sort function is called to enable *O(log n)* queries. It is 53 | better to add all items to the forest at once or in the case of a live system, add new items to the forest in batches 54 | offline and replace the production forest. 55 | 56 | ```clojure 57 | (consimilo/add-all-to-forest my-forest new-candidates-vector) ;;updates my-forest in place 58 | ``` 59 | 60 | ### Adding strings and files to a forest (helper functions) 61 | 62 | consimilo provides helper functions for constructing feature vectors from strings and files. By default, a new forest 63 | is created and stopwords are removed. You may add to an existing forest and/or include stopwords via optional 64 | parameters `:forest` and `:remove-stopwords?`. The optional parameters are defaulted to `:forest (new-forest)` `:remove-stopwords? true`. 65 | 66 | Add a collection of strings to a **new** forest and **remove** stopwords: 67 | 68 | ```clojure 69 | (def my-forest (consimilo/add-strings-to-forest 70 | [{:id id1 :features "my sample string 1"} 71 | {:id id2 :features "my sample string 2"}])) 72 | ``` 73 | 74 | Add a collection of strings to an **existing** forest and **do not remove** stopwords: 75 | 76 | ```clojure 77 | (consimilo/add-strings-to-forest [{:id id1 :features "my sample string 1"} 78 | {:id id2 :features "my sample string 2"}] 79 | :forest my-forest) ;;updates my-forest in place 80 | ``` 81 | 82 | Add a collection of files to a **new** forest and **remove** stopwords: 83 | 84 | ```clojure 85 | (def my-forest (consimilo/add-files-to-forest 86 | [FileObj-1 FileObj-2 FileObj-3 FileObj-n])) ;;creates new forest, my-forest 87 | ``` 88 | 89 | Note: when calling `add-files-to-forest` `:id` is auto-generated from the file name and `:features` are generated from 90 | the tokenized, extracted text. The same optional parameters available for `add-strings-to-forest` are also available for 91 | `add-files-to-forest`.
92 | 93 | ## Querying a forest 94 | 95 | Once you have your forest built, you can query for `k` most similar items to feature-vector `v` by running: 96 | 97 | ```clojure 98 | (def results (consimilo/query-forest my-forest k v)) 99 | 100 | (println (:top-k results)) ;;returns a list of keys ordered by similarity 101 | (println (:query-hash results)) ;;returns the minhash of the query. Utilized to calculate similarity. 102 | ``` 103 | 104 | ### Querying a forest with strings and files (helper functions) 105 | 106 | consimilo provides helper functions for querying the forest with strings and files. The helper functions `query-string` 107 | and `query-file` have an optional parameter `:remove-stopwords?` which is defaulted `true`, removing stopwords. Queries 108 | against strings and files should be made using the same tokenization scheme used to input items in the forest 109 | (stopwords present or removed). 110 | 111 | Querying a forest with a string: 112 | 113 | ```clojure 114 | (def results (consimilo/query-string my-forest k "my query string")) 115 | 116 | (println (:top-k results)) ;;returns a list of keys ordered by similarity 117 | (println (:query-hash results)) ;;returns the minhash of the query. Utilized to calculate similarity. 118 | ``` 119 | 120 | Querying a forest with a file: 121 | 122 | ```clojure 123 | (def results (consimilo/query-file my-forest k Fileobj)) 124 | 125 | (println (:top-k results)) ;;returns a list of keys ordered by similarity 126 | (println (:query-hash results)) ;;returns the minhash of the query. Utilized to calculate similarity. 127 | ``` 128 | ## Calculating similarity 129 | 130 | consimilo provides functions for calculating approximate distance / similarity between the query and *top-k* results. 131 | The function `similarity-k` accepts optional parameters to specify which distance / similarity function should be used.
132 | For calculating Jaccard similarity, use: `:sim-fn :jaccard`, for calculating Hamming distance, use: `:sim-fn :hamming`, 133 | and for calculating cosine distance, use: `:sim-fn :cosine`. `similarity-k` returns a hash-map, `keys` are the *top-k* ids 134 | and `vals` are the similarity / distance. As with the other query functions, queries against strings and files 135 | should be made using the same tokenization scheme used to input the items in the forest (stopwords present or removed). 136 | 137 | ### Querying a forest with strings, files, or feature-vectors and calculating similarity 138 | 139 | consimilo will dispatch to the correct query function based on query type (string, file, collection of features). There are 3 similarity functions available for use: `:cosine`, `:jaccard`, & `:hamming`. 140 | 141 | ```clojure 142 | (def similar-items (consimilo/similarity-k 143 | my-forest 144 | k 145 | query 146 | :sim-fn :cosine)) 147 | 148 | (println similar-items) ;;{id1 (cosine-distance(query id1)) ... idk (cosine-distance (query idk))} 149 | ``` 150 | 151 | ## Saving and loading forests 152 | 153 | consimilo uses [Nippy](https://github.com/ptaoussanis/nippy) to provide simple, robust, serialization / deserialization 154 | of your forests. 155 | 156 | Serialize and save a forest to a file: 157 | ```clojure 158 | (consimilo/freeze-forest my-forest "/tmp/my-saved-forest") 159 | ``` 160 | 161 | Load a forest from a file: 162 | ```clojure 163 | (def my-forest (consimilo/thaw-forest "/tmp/my-saved-forest")) 164 | ``` 165 | 166 | ## Configuration 167 | 168 | consimilo uses [config](https://github.com/yogthos/config) to manage configuration.
consimilo has three configurable 169 | options: 170 | * Number of trees in the forest (default 8): `:trees` 171 | * Number of permutation functions used to build the minhash (default 128): `:perms` 172 | * Random number seed used to generate minhash functions (default 1) `:seed` 173 | 174 | The defaults should work well in most cases, however they may be overridden by placing a config.edn file in the 175 | resources directory of your project. See [`config.edn`](./resources/config.edn). 176 | 177 | ## Contributions / Issues 178 | 179 | Please use the project's GitHub issues page for questions, ideas, etc. Pull requests are welcome. 180 | 181 | ## License 182 | 183 | Copyright 2018 Andrew McLoud 184 | 185 | Licensed under the Apache License, Version 2.0 (the "License"); 186 | you may not use this file except in compliance with the License. 187 | You may obtain a copy of the License at 188 | 189 | [http://www.apache.org/licenses/LICENSE-2.0](http://www.apache.org/licenses/LICENSE-2.0) 190 | 191 | Unless required by applicable law or agreed to in writing, software 192 | distributed under the License is distributed on an "AS IS" BASIS, 193 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 194 | See the License for the specific language governing permissions and 195 | limitations under the License. 
196 | -------------------------------------------------------------------------------- /doc/intro.md: -------------------------------------------------------------------------------- 1 | # Introduction to consimilo 2 | 3 | TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) 4 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject consimilo "0.1.1" 2 | :description "A Clojure library for querying large data-sets on similarity" 3 | :url "http://github.com/andrewmcloud/consimilo" 4 | :license {:name "Apache License 2.0" 5 | :url "http://www.apache.org/licenses"} 6 | :dependencies [[org.clojure/clojure "1.10.0"] 7 | [yogthos/config "1.1.1"] 8 | [clojure-opennlp "0.4.0"] 9 | [com.novemberain/pantomime "2.10.0"] 10 | [org.clojure/tools.logging "0.3.1"] 11 | [ch.qos.logback/logback-classic "1.1.3"] 12 | [com.taoensso/nippy "2.13.0"]] 13 | :target-path "target/%s" 14 | :profiles {:uberjar {:aot :all}}) 15 | -------------------------------------------------------------------------------- /resources/config.edn: -------------------------------------------------------------------------------- 1 | {:trees 8 ;; number of trees in the lshforest 2 | :perms 128 ;; dimensions of minhash.
used to calculate lshforest minhash slices 3 | :seed 1} ;; random number generator seed used to generate minhash functions -------------------------------------------------------------------------------- /resources/en-token.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrewmcloud/consimilo/db96c1695248c3486e1d23de5589b39f0e0bd49f/resources/en-token.bin -------------------------------------------------------------------------------- /resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | WARN 6 | 7 | 8 | UTF-8 9 | %d %-4relative [%thread] %-5level %logger{35} - %msg%n 10 | 11 | 12 | 13 | 14 | logs/consimilo.log 15 | 16 | 18 | logs/consimilo_%d{yyyy-MM-dd}.%i.log 19 | 20 | 21 | 5MB 22 | 23 | 24 | 30 25 | 26 | 27 | 28 | UTF-8 29 | %d %-4relative [%thread] %-5level %logger{35} - %msg%n 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /resources/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | about 3 | above 4 | after 5 | again 6 | against 7 | all 8 | am 9 | an 10 | and 11 | any 12 | are 13 | aren't 14 | as 15 | at 16 | be 17 | because 18 | been 19 | before 20 | being 21 | below 22 | between 23 | both 24 | but 25 | by 26 | can't 27 | cannot 28 | could 29 | couldn't 30 | did 31 | didn't 32 | do 33 | does 34 | doesn't 35 | doing 36 | don't 37 | down 38 | during 39 | each 40 | few 41 | for 42 | from 43 | further 44 | had 45 | hadn't 46 | has 47 | hasn't 48 | have 49 | haven't 50 | having 51 | he 52 | he'd 53 | he'll 54 | he's 55 | her 56 | here 57 | here's 58 | hers 59 | herself 60 | him 61 | himself 62 | his 63 | how 64 | how's 65 | i 66 | i'd 67 | i'll 68 | i'm 69 | i've 70 | if 71 | in 72 | into 73 | is 74 | isn't 75 | it 76 | it's 77 | its 78 | itself 79 | let's 80 | me 81 | more 82 | most 83 | 
mustn't 84 | my 85 | myself 86 | no 87 | nor 88 | not 89 | of 90 | off 91 | on 92 | once 93 | only 94 | or 95 | other 96 | ought 97 | our 98 | ours ourselves 99 | out 100 | over 101 | own 102 | same 103 | shan't 104 | she 105 | she'd 106 | she'll 107 | she's 108 | should 109 | shouldn't 110 | so 111 | some 112 | such 113 | than 114 | that 115 | that's 116 | the 117 | their 118 | theirs 119 | them 120 | themselves 121 | then 122 | there 123 | there's 124 | these 125 | they 126 | they'd 127 | they'll 128 | they're 129 | they've 130 | this 131 | those 132 | through 133 | to 134 | too 135 | under 136 | until 137 | up 138 | very 139 | was 140 | wasn't 141 | we 142 | we'd 143 | we'll 144 | we're 145 | we've 146 | were 147 | weren't 148 | what 149 | what's 150 | when 151 | when's 152 | where 153 | where's 154 | which 155 | while 156 | who 157 | who's 158 | whom 159 | why 160 | why's 161 | with 162 | won't 163 | would 164 | wouldn't 165 | you 166 | you'd 167 | you'll 168 | you're 169 | you've 170 | your 171 | yours 172 | yourself 173 | yourselves -------------------------------------------------------------------------------- /resources/test.txt: -------------------------------------------------------------------------------- 1 | My name is Bonnie and I live in Charleston SC. I am staying home for Christmas this year. -------------------------------------------------------------------------------- /resources/test1.txt: -------------------------------------------------------------------------------- 1 | My name is Andrew and I live in Charleston SC. I am the author of this codebase. 
-------------------------------------------------------------------------------- /resources/testforest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrewmcloud/consimilo/db96c1695248c3486e1d23de5589b39f0e0bd49f/resources/testforest -------------------------------------------------------------------------------- /src/consimilo/config.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.config 2 | (:require [config.core :refer [env]] 3 | [clojure.tools.logging :as log])) 4 | 5 | (defonce trees 6 | (if-let [trees-env (:trees env)] 7 | trees-env 8 | (do 9 | (log/info "Number of trees (:trees) cannot be read from config; Defaulting to 8.") 10 | 8))) 11 | 12 | (defonce perms 13 | (if-let [perms-env (:perms env)] 14 | perms-env 15 | (do 16 | (log/info "Number of permutations (:perms) cannot be read from config; Defaulting to 128.") 17 | 128))) 18 | 19 | (defonce seed 20 | (if-let [seed-env (:seed env)] 21 | seed-env 22 | (do 23 | (log/info "Random number seed (:seed) cannot be read from config; Defaulting to 1.") 24 | 1))) -------------------------------------------------------------------------------- /src/consimilo/core.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.core 2 | (:require [consimilo.lsh-forest :as f] 3 | [consimilo.lsh-util :as util] 4 | [consimilo.minhash :as mh] 5 | [consimilo.minhash-util :as mhu] 6 | [consimilo.lsh-query :as q] 7 | [consimilo.text-processing :as text] 8 | [taoensso.nippy :as nippy] 9 | [clojure.tools.logging :as log])) 10 | 11 | (defn add-all-to-forest 12 | "Adds each vector in `feature-coll` to an lsh forest and returns the forest. 13 | If you want to add the `feature-coll` to an existing `forest` pass the forest as the first argument. 14 | Each item of `feature-coll` should be a map with :id and :features entries. 
15 | The :id is the identifier for the minhash vector that will be returned upon query of the forest. 16 | This id can be utilized to lookup the minhash vector in the :keys hashmap of the forest. 17 | The :features is a collection of strings which will be utilized to create the minhash vector 18 | (e.g. in the case of a document, the :features could be tokens or n-grams). 19 | 20 | Note: items should be loaded into the forest as few times as possible in large chunks. An expensive 21 | sort called after items are added to the forest to enable ~log(n) queries." 22 | ([feature-coll] 23 | (add-all-to-forest (f/new-forest) feature-coll)) 24 | ([forest feature-coll] 25 | (if (util/valid-input? feature-coll coll?) 26 | (do 27 | (dorun (pmap #(f/add-lsh! forest (:id %) (mh/build-minhash (:features %))) feature-coll)) 28 | (f/index! forest) 29 | forest) 30 | (log/warn "invalid input, feature-coll must be a collection of maps, each having keys :id and :features; 31 | :features must be a collection")))) 32 | 33 | (defn add-strings-to-forest 34 | "Convenience method for processing documents. Each item of feature-coll should be a map with 35 | :id and :features entries. The :id is the identifier for the minhash vector stored in the forest. 36 | The :features is a string which will be tokenized into features per the optional 37 | parameters. The feature vector will be minhashed and inserted into the lsh-forest. 38 | 39 | Optional Keyword Arguments: :forest - add to an existing forest; default: create new forest 40 | 41 | Note: items should be loaded into the forest as few times as possible in large chunks. An expensive 42 | sort called after items are added to the forest to enable ~log(n) queries." 43 | 44 | [feature-coll & {:keys [forest] :or {forest (f/new-forest)}}] 45 | (if (util/valid-input? feature-coll string?) 
46 | (add-all-to-forest forest 47 | (map #(assoc % :features (text/tokenize-text (:features %))) feature-coll)) 48 | (log/warn "invalid input, feature-coll must be a collection of maps, each having keys :id and :features; 49 | :features must be a string"))) 50 | 51 | (defn add-files-to-forest 52 | "Convenience method for processing files. Files should be a collection of File objects. 53 | The :id used for entry into the forest will be generated from the file name. The :features will 54 | be generated by extracting the text from each file and tokenizing and/or shingling per the optional 55 | parameters. The feature vector is minhashed and inserted into the lsh-forest. 56 | 57 | Optional Keyword Arguments: :forest - add to an existing forest; default: create new forest 58 | 59 | Note: items should be loaded into the forest as few times as possible in large chunks. An expensive 60 | sort called after items are added to the forest to enable ~log(n) queries." 61 | [files & {:keys [forest] :or {forest (f/new-forest)}}] 62 | (if (util/valid-input-add-files? files) 63 | (add-strings-to-forest (map (fn [f] {:id (.getName f) 64 | :features (text/extract-text f)}) 65 | files) 66 | :forest forest) 67 | (log/warn "invalid input, files must be a collection of file objects"))) 68 | 69 | (defn query-forest 70 | "Finds the closest `k` vectors to vector `v` stored in the `forest`." 71 | [forest k v] 72 | (let [minhash (mh/build-minhash v)] 73 | {:top-k (q/query forest k minhash) :query-hash minhash})) 74 | 75 | (defn query-string 76 | "Convenience method for querying the forest for top-k similar strings. forest is the forest to be 77 | queried. string will be converted to a feature vector through tokenization / shingling per the optional 78 | parameters. The feature vector is minhashed and used to query the forest. K is the number of results 79 | (top-k most similar items)." 
80 | [forest k string] 81 | (query-forest forest k (text/tokenize-text string))) 82 | 83 | (defn query-file 84 | "Convenience method for querying the forest for top-k similar files. forest is the forest to be 85 | queried. file is converted to a feature vector by extracting its text and tokenizing it 86 | through query-string. The feature vector is minhashed and used to query the forest. k is the number 87 | of results (top-k most similar items)." 88 | [forest k file] 89 | (query-string forest k (text/extract-text file))) 90 | 91 | (defn get-sim-fn "Returns the similarity function in mhu selected by key (:jaccard, :cosine or :hamming); any other key makes condp throw, since there is no default clause." 92 | [key] 93 | (condp = key 94 | :jaccard mhu/jaccard-similarity 95 | :cosine mhu/cosine-distance 96 | :hamming mhu/hamming-distance)) 97 | 98 | (defmulti similarity-k 99 | "Query forest for top-k items, returns a hashmap: {item-key1 sim-fn-result1 item-key-k sim-fn-result-k}. Available 100 | similarity functions are Jaccard similarity, cosine distance, and Hamming distance. sim-fn is defaulted to :jaccard, 101 | but can be overridden by passing the optional :sim-fn key and :jaccard, :cosine, or :hamming. similarity-k Dispatches 102 | based on input: string, file, or feature-vector." 103 | (fn [forest k input & {:keys [sim-fn] :or {sim-fn :jaccard}}] 104 | (condp #(%1 %2) input 105 | coll? :feature-vec 106 | string? 
:string 107 | :file))) 108 | 109 | (defmethod similarity-k :string 110 | [forest k string & {:keys [sim-fn] :or {sim-fn :jaccard}}] 111 | (let [return (query-string forest k string) 112 | f (get-sim-fn sim-fn)] 113 | (mhu/zip-similarity forest return f))) 114 | 115 | (defmethod similarity-k :file 116 | [forest k file & {:keys [sim-fn] :or {sim-fn :jaccard}}] 117 | (let [return (query-file forest k file) 118 | f (get-sim-fn sim-fn)] 119 | (mhu/zip-similarity forest return f))) 120 | 121 | (defmethod similarity-k :feature-vec 122 | [forest k feature-vector & {:keys [sim-fn] :or {sim-fn :jaccard}}] 123 | (let [return (query-forest forest k feature-vector) 124 | f (get-sim-fn sim-fn)] 125 | (mhu/zip-similarity forest return f))) 126 | 127 | (defn freeze-forest 128 | "Serializes forest and saves to a file. Forest should be created using one of the add-*-to-forest functions. 129 | file-path should be a string representing the filepath. Returns the byte-array representation of the serialize 130 | object and creates a file containing the byte-string representation of the serialized object." 131 | [forest file-path] 132 | (nippy/freeze-to-file file-path @forest)) 133 | 134 | (defn thaw-forest 135 | "Deserializes forest from file. file-path should be a string representing the filepath of the serialized object. 136 | Returns an lsh-forest." 137 | [file-path] 138 | (atom (nippy/thaw-from-file file-path))) -------------------------------------------------------------------------------- /src/consimilo/lsh_forest.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.lsh-forest 2 | (:require [consimilo.lsh-util :as util] 3 | [consimilo.lsh-state :as state] 4 | [consimilo.lsh-query :as q] 5 | [consimilo.config :as config] 6 | [clojure.tools.logging :as log])) 7 | 8 | (defn new-forest 9 | "Create new empty initialized forest structure." 
10 | [] 11 | (atom {:keys {} 12 | :hashtables (util/build-hashtables config/trees) 13 | :sorted-hash (util/build-sorted-hashtables config/trees)})) 14 | 15 | (defn add-lsh! 16 | "add minhash to lsh-forest. key must be a string, will be converted to keyword" 17 | [forest key minhash] 18 | (cond 19 | (get-in @forest [:keys (util/keywordize key)]) (log/warn "key already added to hash") 20 | (< (count minhash) state/hashrange) (log/warn "minhash is not correct permutation size") 21 | :else (state/plant-trees! forest key (util/slice-minhash minhash state/hashranges)))) 22 | 23 | (defn index! 24 | "builds sorted-hash, must be called in order to query. " 25 | [forest] 26 | (swap! forest 27 | assoc 28 | :sorted-hash 29 | (into {} (doall (pmap (partial state/sort-tree forest) (util/tree-keys config/trees)))))) 30 | 31 | (defn query-forest 32 | "search lsh-forest for the top k-items keys most similar to minhash, utilizes binary search. Arguments are passed 33 | to q/query in the order (forest k-items minhash). index! must be called prior to build the sorted hashes." 34 | [forest k-items minhash] 35 | (q/query forest k-items minhash)) -------------------------------------------------------------------------------- /src/consimilo/lsh_query.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.lsh-query 2 | (:require [consimilo.lsh-state :as state] 3 | [consimilo.lsh-util :as util] 4 | [consimilo.config :as config] 5 | [clojure.tools.logging :as log])) 6 | 7 | (defn- hashtable-lookup 8 | "returns collection of values for key in nested hashtable {:tree {:key value}....}" 9 | [hashtable key] 10 | (map #(get-in hashtable [% key]) (util/tree-keys config/trees))) 11 | 12 | (defn- hashtables-lookup 13 | "returns collection of values for keys in nested hashtable" 14 | [hashtable keys] 15 | (map #(hashtable-lookup hashtable %) keys)) 16 | 17 | (defn- pred-search 18 | "Finds the first index less than `j` for which `pred` is satisfied; returns `j` when there is none." 
19 | ([pred j] 20 | (pred-search pred j 0)) 21 | ([pred j i] 22 | (if (>= i j) 23 | i 24 | (let [h (int (+ i (/ (- j i) 2)))] 25 | (if-not (pred h) 26 | (recur pred j (inc h)) 27 | (recur pred h i)))))) 28 | 29 | (defn- query-fn 30 | "performs a binary search to find the r-length prefix over the sorted hashtables" 31 | [forest min-slice tk r] 32 | (let [sorted (get-in @forest [:sorted-hash tk]) 33 | hashtable (get-in @forest [:hashtables tk]) 34 | min-prefix (util/coll-prefix min-slice r) 35 | sorted-range (dec (count sorted)) 36 | i (pred-search (fn [x] 37 | (util/v>=v 38 | (util/coll-prefix (get sorted x) r) 39 | min-prefix)) 40 | sorted-range)] 41 | (if (util/v=v (util/coll-prefix (get sorted i) r) min-prefix) 42 | (take-while #(util/v=v (util/coll-prefix % r) min-prefix) (drop i sorted))))) 43 | 44 | (defn- query-k-prefix 45 | "queries for the r-length prefix of each minhash slice in the forest" 46 | [forest minhash r] 47 | (mapcat #(query-fn forest %1 %2 r) 48 | (util/slice-minhash minhash state/hashranges) 49 | (util/tree-keys config/trees))) 50 | 51 | (defn query 52 | "returns a list of the keys of the top k-items most similar to minhash" 53 | [forest k-items minhash] 54 | (cond 55 | (<= k-items 0) (log/warn "k must be greater than zero") 56 | (< (count minhash) (* state/k config/trees)) (log/warn "the perm of Minhash out of range") 57 | :else (->> (range state/k) 58 | reverse 59 | (mapcat #(query-k-prefix forest minhash %)) 60 | (hashtables-lookup (get @forest :hashtables)) 61 | flatten 62 | (filter some?) 
63 | (distinct) 64 | (take k-items)))) -------------------------------------------------------------------------------- /src/consimilo/lsh_state.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.lsh-state 2 | (:require [config.core :refer [env]] 3 | [consimilo.lsh-util :as util] 4 | [consimilo.config :as config] 5 | [clojure.tools.logging :as log])) 6 | 7 | ;; length of each minhash slice: perms / trees, truncated to an int 8 | (def k (int (/ config/perms config/trees))) 9 | 10 | ;; number of minhash values the forest uses: k * trees 11 | (def hashrange (util/get-range k config/trees)) 12 | 13 | ;; sequence of [start stop] index pairs, one per tree, used to slice a minhash 14 | (def hashranges (util/get-hashranges k config/trees)) 15 | 16 | (defn- populate-hastables! 17 | "adds each slice of the minhash to a different hashtable (one per tree), mapping the slice to the keywordized key" 18 | [forest key minhash] 19 | (dorun 20 | (map (fn [index min-slice] 21 | (let [kw (util/keywordize index)] 22 | (swap! forest assoc-in [:hashtables kw min-slice] (util/keywordize key)))) 23 | (range config/trees) 24 | minhash))) 25 | 26 | (defn- populate-keys! 27 | "associates the keywordized key with the flattened list of minhash slices" 28 | [forest key sliced-minhashes] 29 | (swap! forest assoc-in [:keys (util/keywordize key)] (flatten sliced-minhashes))) 30 | 31 | (defn plant-trees! 32 | "populates :hashtables and :keys with the minhash slices" 33 | [forest key sliced-minhashes] 34 | (populate-hastables! forest key sliced-minhashes) 35 | (populate-keys! 
forest key sliced-minhashes)) 36 | 37 | (defn sort-tree 38 | "sorts the minhash-slice keys of the tree-key hashtable (each turned into a vector) and returns them as the one-entry map {tree-key sorted-vector}" 39 | [forest tree-key] 40 | (->> (get-in @forest [:hashtables tree-key]) 41 | keys 42 | (map vec) 43 | sort 44 | (into []) 45 | (assoc {} tree-key))) 46 | -------------------------------------------------------------------------------- /src/consimilo/lsh_util.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.lsh-util) 2 | 3 | (defn- slice 4 | "Returns the elements of `coll` from index `start` (inclusive) up to `end` (exclusive)." 5 | [start end coll] 6 | (drop start (take end coll))) 7 | 8 | (defn get-hashranges 9 | "Sequence of [start stop] index vectors, one per tree, each spanning `k` consecutive positions." 10 | [k trees] 11 | (map #(vector (* % k) (* (inc %) k)) (range trees))) 12 | 13 | (defn get-range 14 | "Total number of positions covered by `trees` slices of length `k`, i.e. k * trees." 15 | [k trees] 16 | (* k trees)) 17 | 18 | (defn keywordize 19 | "if not keyword? i, converts i to keyword" 20 | [i] 21 | (if (keyword? i) 22 | i 23 | (keyword (str i)))) 24 | 25 | (defn tree-keys 26 | "Vector of keywords for each integer from 0 (inclusive) to `trees` (exclusive)." 27 | [trees] 28 | (mapv keywordize (range trees))) 29 | 30 | (defn v=v 31 | "predicate: vector1 = vector2" 32 | [v1 v2] 33 | (zero? (compare v1 v2))) 34 | 35 | (defn v>=v 36 | "predicate: vector1 >= vector2" 37 | [v1 v2] 38 | (>= (compare v1 v2) 0)) 39 | 40 | (defn build-hashtables 41 | "Creates map from keywords for 0 to `trees` to {}." 42 | [trees] 43 | (zipmap (map keywordize (range trees)) (repeat {}))) 44 | 45 | (defn build-sorted-hashtables 46 | "Creates map from keywords for 0 to `trees` to []." 47 | [trees] 48 | (zipmap (map keywordize (range trees)) (repeat []))) 49 | 50 | (defn coll-prefix 51 | "returns vector of first k items in coll" 52 | [coll k] 53 | (vec (take k coll))) 54 | 55 | (defn slice-minhash 56 | "Slices `minhash` at `hashranges` boundaries. 
57 | `hashranges` is sequence of sequences each with 2 elements, 58 | the first is the start of the bucket range and the second 59 | is the end of that bucket." 60 | [minhash hashranges] 61 | (mapv #(slice (first %) (last %) minhash) hashranges)) 62 | 63 | (defn valid-input? 64 | "validates the input of add-*-to-forest functions" 65 | [feature-coll pred] 66 | (->> feature-coll 67 | (map #(and (contains? % :id) (contains? % :features) (pred (:features %)))) 68 | (every? true?))) 69 | 70 | (defn valid-input-add-files? 71 | "validates the input of add-*-to-forest functions" 72 | [files] 73 | (and (coll? files) 74 | (->> files 75 | (map #(instance? java.io.File %)) 76 | (every? true?)))) -------------------------------------------------------------------------------- /src/consimilo/minhash.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.minhash 2 | (:require [consimilo.random-seed :as rseed] 3 | [consimilo.sha1 :as sha] 4 | [consimilo.config :as config] 5 | [consimilo.minhash-util :as util] 6 | [clojure.core :exclude [rand-int]] 7 | [clojure.tools.logging :as log])) 8 | 9 | ;; prime number larger than sha1 hash 10 | (def large-prime 3064991081731777716716694054300618367237478244367416721N) 11 | 12 | (defn- init-hashvalues 13 | "initializes minhash signature to infinity" 14 | [] 15 | (vec (repeat config/perms large-prime))) 16 | 17 | (defn- build-permutations 18 | "builds seeded random number populated vectors to simulate 19 | the vector permutations a and b" 20 | [] 21 | (rseed/set-random-seed! config/seed) 22 | {:a (rseed/rand-vec config/perms large-prime) 23 | :b (rseed/rand-vec config/perms large-prime)}) 24 | 25 | ;; build seeded vector permutations once. They are the same for every minhash 26 | ;; which allows incremental minhashing a single vector at a time. 
27 | (defonce permutations (build-permutations)) 28 | 29 | (defn update-minhash 30 | "updates minhash with one document feature (token, shingle, n-gram, etc...). 31 | The feature is stringified and hashed with sha1; the hash is run through the seeded 32 | permutations as (a * hash + b) mod large-prime and the elementwise minimum with 33 | hashvalues is returned. One minhash should be created for each document" 34 | [hashvalues feature] 35 | (let [hv (sha/get-hash-bigint (str feature)) 36 | a (:a permutations) 37 | b (:b permutations)] 38 | (-> (util/scalar-mul a hv) 39 | (util/elementwise-add b) 40 | (util/scalar-mod large-prime) 41 | (util/elementwise-min hashvalues)))) 42 | 43 | (defn build-minhash 44 | "iterates through a document feature collection: ['token-1' 'token-2' ... 'token-n'], 45 | updating the minhash with each non-nil feature. Complete minhash is returned as a vector." 46 | ([feature-coll] 47 | (build-minhash feature-coll (init-hashvalues))) 48 | 49 | ([features hashvalues] 50 | ;; reduce visits every element; the previous (nil? feature) stop test on the destructured 51 | ;; head silently truncated the signature at the first nil feature, so nils are skipped instead 52 | (vec (reduce update-minhash hashvalues (remove nil? features))))) 53 | 54 | (defn merge-minhash 55 | "merges two minhashes together by taking the elementwise minimum between the two 56 | minhash vectors" 57 | [minhash1 minhash2] 58 | (util/elementwise-min minhash1 minhash2)) 59 | 60 | -------------------------------------------------------------------------------- /src/consimilo/minhash_util.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.minhash-util 2 | (:require [clojure.set :as set] 3 | [clojure.tools.logging :as log])) 4 | 5 | (defn scalar-mod 6 | "performs a scalar modulus on each element of vec and k" 7 | [v k] 8 | (mapv #(mod % (bigint k)) v)) 9 | 10 | (defn scalar-mul 11 | "performs a scalar multiply on each element of vec and k" 12 | [v k] 13 | (mapv #(* % (bigint k)) v)) 14 | 15 | (defn elementwise-add 16 | "performs elementwise addition between vectors v1 and v2" 17 | [v1 v2] 18 | (if (= (count v1) (count v2)) 19 | (mapv + v1 v2) 20 | (log/error "cannot
compute elementwise-add on 2 vectors of different length"))) 21 | 22 | (defn elementwise-min 23 | "performs elementwise minimum between vectors v1 and v2" 24 | [v1 v2] 25 | (if (= (count v1) (count v2)) 26 | (mapv min v1 v2) 27 | (log/error "cannot compute elementwise-min on 2 vectors of different length"))) 28 | 29 | (defn dot 30 | "computes the dot product of two vectors v1 and v2" 31 | [v1 v2] 32 | (if (= (count v1) (count v2)) 33 | (reduce + (map * v1 v2)) 34 | (log/error "cannot compute dot product on 2 vectors of different length"))) 35 | 36 | (defn l2nrm 37 | "computes the l2 norm of vector v" 38 | [v] 39 | (Math/sqrt (reduce + (map #(* % %) v)))) 40 | 41 | (defn- similarity 42 | "helper function for computing cosine distance, computes similarity" 43 | [v1 v2] 44 | (/ (dot v1 v2) 45 | (* (l2nrm v1) (l2nrm v2)))) 46 | 47 | (defn cosine-distance 48 | "computes the angle in degrees between two positive vectors of equal length, v1 and v2; the cosine is clamped to [-1, 1] so rounding error cannot make acos return NaN" 49 | [v1 v2] 50 | (if (= (count v1) (count v2)) 51 | (/ (* 180 (Math/acos (max -1.0 (min 1.0 (similarity v1 v2))))) 52 | (Math/PI)) 53 | (log/error "cannot compute cosine-distance between 2 vectors of different length"))) 54 | 55 | (defn hamming-distance "counts the positions at which v1 and v2 hold different elements; logs an error when the lengths differ" 56 | [v1 v2] 57 | (if (= (count v1) (count v2)) 58 | (->> (map = v1 v2) (filter false?) 
count) 59 | (log/error "cannot compute hamming-distance between 2 vectors of different length"))) 60 | 61 | (defn jaccard-similarity 62 | "performs jaccard on vectors self and other" 63 | [v1 v2] 64 | (if (= (count v1) (count v2)) 65 | (try 66 | (/ (count (set/intersection (set v1) (set v2))) 67 | (count (set/union (set v1) (set v2)))) 68 | (catch ArithmeticException e 69 | 0)) 70 | (do 71 | (log/error "cannot compute jaccard-similarity between 2 vectors of different length") 72 | 0))) 73 | 74 | (defn zip-similarity 75 | "returns key value pairs {minhash-key, jaccard}" 76 | [forest query sim-f] 77 | (zipmap (:top-k query) 78 | (map #(sim-f (:query-hash query) (get-in @forest [:keys %])) 79 | (:top-k query)))) -------------------------------------------------------------------------------- /src/consimilo/random_seed.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.random-seed 2 | (:refer-clojure :exclude [rand-int 3 | rand]) 4 | (:import (java.util Random))) 5 | 6 | (defonce r (Random.)) 7 | 8 | (defn set-random-seed! 9 | "Sets the random number generator seed" 10 | [seed] 11 | (.setSeed r seed)) 12 | 13 | (defn rand 14 | "Overloads the clojure rand function to with a seeded implementation 15 | utilizing java.util.Random" 16 | ([] (.nextFloat r)) 17 | ([n] (* n (rand)))) 18 | 19 | (defn rand-bigint 20 | "returns a random number of type bigint, may be seeded by calling set-random-seed! prior 21 | to calling rand-bigint" 22 | [n] 23 | (bigint (rand n))) 24 | 25 | (defn rand-vec 26 | "returns a vector of length n random numbers range [0 - max-range], may be seeded by calling 27 | set-random-seed! 
prior to calling rand-vec" 28 | [n max-range] 29 | (repeatedly n #(rand-bigint max-range))) 30 | -------------------------------------------------------------------------------- /src/consimilo/sha1.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.sha1 2 | (:import (java.security MessageDigest))) 3 | 4 | (defn- get-hash 5 | "Returns 'type' hash (ex: sha1) of the UTF-8 bytes of the string data; the charset is fixed so the hash does not depend on the platform default encoding" 6 | [type data] 7 | (.digest (MessageDigest/getInstance type) (.getBytes data "UTF-8"))) 8 | 9 | (defn- sha1-hash 10 | "Returns the sha1 hash of data" 11 | [data] 12 | (get-hash "sha1" data)) 13 | 14 | (defn get-hash-bigint 15 | "Returns the sha1 hash of the string data converted to a bigint" 16 | [data] 17 | (bigint (sha1-hash data))) -------------------------------------------------------------------------------- /src/consimilo/text_processing.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.text-processing 2 | (:require [opennlp.nlp :as nlp] 3 | [pantomime.extract :as extract] 4 | [clojure.java.io :as io] 5 | [clojure.string :as s] 6 | [clojure.tools.logging :as log])) 7 | 8 | (def ^:private tokenize (nlp/make-tokenizer (io/resource "en-token.bin"))) 9 | (def ^:private stopwords (set (s/split-lines (slurp (io/resource "stopwords.txt"))))) 10 | 11 | (defn- remove-stopwords 12 | "If remove-stopwords?: returns tokenized-text with stopwords removed, else: returns tokenized-text unaltered" 13 | [remove-stopwords? tokenized-text] 14 | (if remove-stopwords? 15 | (remove stopwords tokenized-text) 16 | tokenized-text)) 17 | 18 | (defn tokenize-text 19 | "Tokenizes a string of text. If remove-stopwords?: removes stopwords from token collection" 20 | [text & {:keys [remove-stopwords?] :or {remove-stopwords? 
true}}] 21 | (->> (s/lower-case text) 22 | tokenize 23 | (remove-stopwords remove-stopwords?))) 24 | 25 | ;;Not currently used 26 | (defn shingle 27 | "Generates contiguous sequences of tokens of length n, may be a better gauge of similarity when using consimilo 28 | to query a text corpus for similarity. Generate tokenized-text via consimilo.text-processing/tokenize-text" 29 | ([tokenized-text n] 30 | (if (and (> n 1) (<= n (count tokenized-text))) 31 | (shingle tokenized-text n []) 32 | (do 33 | (log/warn "Invalid shingle size. Shingle size must be (1 < n <= tokenized-text) returning tokenized-text") 34 | tokenized-text))) 35 | ([[first & rest] n coll] 36 | (let [k (dec n)] 37 | (if (not= k (count (take k rest))) 38 | coll 39 | (recur rest n (conj coll (->> rest 40 | (take k) 41 | (concat [first]) 42 | (apply str)))))))) 43 | 44 | (defn- parse-file-to-text 45 | "Parse pdf calls extract/parse and catches an IndexOutOfBounds exception that is thrown by tika on rare occasion." 46 | [file] 47 | (try 48 | (extract/parse file) 49 | (catch IndexOutOfBoundsException e 50 | (log/warn "Unable to extract text from pdf - filename: " (.getName file))))) 51 | 52 | (defn extract-text 53 | "Return extracted text by file content (as `java.io.File`)." 
54 | [file_obj] 55 | (:text (parse-file-to-text file_obj))) 56 | -------------------------------------------------------------------------------- /test/consimilo/core_test.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.core-test 2 | (:require [clojure.test :refer :all] 3 | [consimilo.core :refer :all] 4 | [clojure.java.io :as io])) 5 | 6 | ;; TODO: add more extensive testing of core 7 | 8 | (def minhash1 {:id "1" :features ["1" "2" "3"]}) 9 | (def minhash2 {:id "2" :features ["1" "3" "10"]}) 10 | (def minhash3 {:id "3" :features ["32" "64" "128"]}) 11 | 12 | (def forest-from-hash (add-all-to-forest [minhash1 minhash2 minhash3])) 13 | (def forest-from-strings (add-strings-to-forest [{:id "1" :features "My name is Andrew and I live in Charleston SC. I am staying home for Christmas this year."} 14 | {:id "2" :features "My name is Christina and I live in West Ashley SC. I am not staying home for Christmas this year."} 15 | {:id "3" :features "My name is David and I reside in Summerville, SC. I am going to go Florida for Christmas this year."}])) 16 | (def forest-from-one-file (add-files-to-forest [(io/as-file (io/resource "test.txt"))])) 17 | (def forest-from-files (add-files-to-forest [(io/as-file (io/resource "test1.txt"))] :forest forest-from-one-file)) 18 | 19 | (deftest core-add-all-test 20 | (testing "core add all returns indexed forest" 21 | (is (> (count (get-in @forest-from-hash 22 | [:sorted-hash :0])) 23 | 0)))) 24 | 25 | (deftest core-query-test 26 | (testing "query api returns best results" 27 | (is (= '(:1 :2) 28 | (:top-k (query-forest forest-from-hash 29 | 2 30 | ["1" "2" "4"]))))) 31 | (testing "query-string" 32 | (is (= '(:1 :2) 33 | (:top-k (query-string forest-from-strings 34 | 2 35 | "My name is Bonnie and I live in Charleston SC. 
I am staying home for Christmas this year."))))) 36 | 37 | (testing "query-string - forest built form files incrementally" 38 | (is (= '(:test1.txt) 39 | (:top-k (query-string forest-from-files 40 | 1 41 | "My name is Andrew and I am the author of this codebase. I live in Charleston, SC."))))) 42 | 43 | (testing "query-file" 44 | (is (= '(:1 :2) 45 | (:top-k (query-file forest-from-strings 46 | 2 47 | (io/as-file (io/resource "test.txt")))))))) 48 | 49 | 50 | (deftest core-add-strings-test 51 | (testing "adding several strings to forest" 52 | (is (> (count (get-in @forest-from-strings 53 | [:sorted-hash :0])) 54 | 0)))) 55 | 56 | (deftest core-jaccard-k 57 | (testing "calculate jaccard on top-k results, string input" 58 | (is (>= (:1 (similarity-k forest-from-strings 59 | 1 60 | "My name is Bonnie and I live in Charleston SC. I am staying home for Christmas this year." 61 | :sim-fn :jaccard)) 62 | 3/5))) 63 | (testing "calculate jaccard on top-k results, file input" 64 | (is (>= (:1 (similarity-k forest-from-strings 65 | 1 66 | (io/resource "test.txt") 67 | :sim-fn :jaccard)) 68 | 3/5))) 69 | (testing "calculate jaccard on top-k results, feature-vector input" 70 | (is (>= (:1 (similarity-k forest-from-hash 71 | 1 72 | ["1" "2" "3"] 73 | :sim-fn :jaccard)) 74 | 1)))) 75 | 76 | (deftest core-cosine 77 | (testing "calculate cosine distance on top-k results with a string input" 78 | (is (= (:1 (similarity-k forest-from-strings 79 | 1 80 | "My name is Andrew and I live in Charleston SC. I am staying home for Christmas this year." 
81 | :sim-fn :cosine)) 82 | 0.0))) 83 | (testing "calculate cosine distance on top-k results with a file input" 84 | (is (>= (:1 (similarity-k forest-from-strings 85 | 1 86 | (io/as-file (io/resource "test.txt")) 87 | :sim-fn :cosine)) 88 | 19.91))) 89 | (testing "calculate cosine distance on top-k results with a feature-vector input" 90 | (is (>= (:2 (similarity-k forest-from-hash 91 | 1 92 | ["1" "3" "128"] 93 | :sim-fn :cosine)) 94 | 34.97)))) 95 | 96 | (deftest core-hamming 97 | (testing "calculate cosine distance on top-k results with a string input" 98 | (is (= (:1 (similarity-k forest-from-strings 99 | 1 100 | "My name is Anabelle and I live in Charleston SC. I am staying home for Christmas this year." 101 | :sim-fn :hamming)) 102 | 27))) 103 | (testing "calculate cosine distance on top-k results with a file input" 104 | (is (= (:1 (similarity-k forest-from-strings 105 | 1 106 | (io/as-file (io/resource "test.txt")) 107 | :sim-fn :hamming)) 108 | 26))) 109 | (testing "calculate cosine distance on top-k results with a feature-vector input" 110 | (is (= (:2 (similarity-k forest-from-hash 111 | 1 112 | ["1" "3" "128"] 113 | :sim-fn :hamming)) 114 | 66)))) 115 | 116 | (deftest serialize-test 117 | (testing "save forest to file, load forest, query" 118 | (let [loaded-forest (thaw-forest (io/resource "testforest"))] 119 | (is (= '(:2 :3) 120 | (:top-k (query-string loaded-forest 121 | 2 122 | "My name is Bonnie and I live in Charleston SC. 
I am staying home for Christmas this year."))))))) -------------------------------------------------------------------------------- /test/consimilo/lsh_forest_test.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.lsh-forest-test 2 | (:require [clojure.test :refer :all] 3 | [consimilo.lsh-forest :refer :all] 4 | [consimilo.minhash :refer [build-minhash]])) 5 | 6 | (def minhash1 (build-minhash ["1" "2" "3"])) 7 | (def minhash2 (build-minhash ["1" "2" "10"])) 8 | (def minhash3 (build-minhash ["32" "64" "128"])) 9 | 10 | (deftest populate-hashtables-test 11 | (testing "updates might-atom :hashtables entry" 12 | (let [private-populate-hashtables #'consimilo.lsh-state/populate-hastables! 13 | forest (atom {})] 14 | (private-populate-hashtables forest "a" minhash1) 15 | (is (not (empty? (get-in @forest [:hashtables :0]))))))) 16 | 17 | (deftest populate-keys-test 18 | (testing "updates might-atom :keys entry" 19 | (let [private-populate-keys #'consimilo.lsh-state/populate-keys! 20 | forest (atom {})] 21 | (private-populate-keys forest "a" minhash1) 22 | (is (not (empty? (get-in @forest [:keys :a]))))))) 23 | 24 | (deftest lsh-forest-integration-test 25 | (testing "lsh forest returns best match on query" 26 | (let [forest (new-forest)] 27 | (dorun (map-indexed #(add-lsh! forest (str (inc %)) %2) [minhash1 minhash2 minhash3])) 28 | (index! forest) 29 | (is (= :1 (keyword (first (query-forest forest 1 minhash1)))))))) 30 | -------------------------------------------------------------------------------- /test/consimilo/lsh_util_test.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.lsh-util-test 2 | (:require [clojure.test :refer :all] 3 | [consimilo.lsh-util :refer :all] 4 | [consimilo.lsh-query :refer :all])) 5 | 6 | (deftest get-hashranges-test 7 | (testing "all ranges less than k * num trees" 8 | (is (every? 
#(>= (* 5 8) %) (map #(apply max %) (get-hashranges 5 8))))) 9 | (testing "buckets are the same size" 10 | (is (apply = (map #(- (second %) (first %)) (get-hashranges 5 8)))))) 11 | 12 | (deftest get-range-test 13 | (testing "returns correct number of buckets" 14 | (is (= 40 (get-range 5 8))))) 15 | 16 | (deftest keyword-int-test 17 | (testing "returns int turned into keyword" 18 | (is (= :0 (keywordize 0))))) 19 | 20 | (deftest build-hashtables-test 21 | (testing "initializes empty hashtables structure" 22 | (is (= {:0 {} :1 {} :2 {}} 23 | (build-hashtables 3))))) 24 | 25 | (deftest build-sorted-hashtables-test 26 | (testing "initializes empty sorted-hashtables structure" 27 | (is (= {:0 [] :1 [] :2 []} 28 | (build-sorted-hashtables 3))))) 29 | 30 | (deftest v=v-test 31 | (testing "vector1 = vector2" 32 | (is (true? (v=v [1 2 3] [1 2 3]))) 33 | (is (false? (v=v [1 2 2] [0 2 2]))) 34 | (is (false? (v=v [1 2 2] [1 0 2]))) 35 | (is (false? (v=v [1 2 2] [1 2 3]))))) 36 | 37 | (deftest v>=v-test 38 | (testing "vector1 >= vector2" 39 | (is (true? (v>=v [2 3 3] [1 2 3]))) 40 | (is (true? (v>=v [2 3 3] [2 2 3]))) 41 | (is (true? (v>=v [2 3 3] [2 3 2]))) 42 | (is (true? (v>=v [2 3 3] [2 3 3]))) 43 | (is (false? (v>=v [1 3 3] [2 3 3]))) 44 | (is (false? (v>=v [2 2 3] [2 3 3]))) 45 | (is (false? 
(v>=v [2 3 2] [2 3 3]))))) 46 | 47 | (deftest coll-prefix-test 48 | (testing "get first k elements of collection" 49 | (is (= [1 2 3] (coll-prefix [1 2 3 4 5] 3)))) 50 | (testing "get first k elements of empty collection" 51 | (is (= [] (coll-prefix [] 3))))) 52 | 53 | (deftest slice-test 54 | (let [private-slice #'consimilo.lsh-util/slice] 55 | (testing "slice of empty coll" 56 | (is (= () 57 | (private-slice 0 10 [])))) 58 | (testing "slice at begining" 59 | (is (= '(1 2 3) 60 | (private-slice 0 3 [1 2 3 4 5])))) 61 | (testing "slice in middle" 62 | (is (= '(2 3 4) 63 | (private-slice 1 4 [1 2 3 4 5])))))) 64 | 65 | (deftest slice-minhash-test 66 | (testing "returns sequence of slices" 67 | (is (= '((1 2 3) (4 5 6)) 68 | (slice-minhash [1 2 3 4 5 6] [[0 3] [3 6]]))))) 69 | 70 | (deftest tree-keys-test 71 | (testing "correct keywords" 72 | (is (= [:0 :1 :2] 73 | (tree-keys 3))))) 74 | 75 | (deftest pred-search-test 76 | (let [private-pred-search #'consimilo.lsh-query/pred-search 77 | sorted-vec [[0 1 2] [1 2 3] [2 3 4] [3 4 5] [4 5 6] [5 6 7] [6 7 8] [7 8 9] [8 9 0]]] 78 | (testing "search for min" 79 | (is (= 2 80 | (private-pred-search #(>= (compare (get sorted-vec %) [2 3 4]) 0) (count sorted-vec))))))) 81 | 82 | (deftest valid-input?-test 83 | (testing "valid input, correct keys and :features is a collection" 84 | (is (= true (try (valid-input? [{:id 1 :features [1]} {:id 2 :features [2]}] coll?) (catch AssertionError e false))))) 85 | (testing "invalid input, incorrect keys and :features is a collection" 86 | (is (= false (try (valid-input? [{:id 1 :feat [1]} {:id 2 :features [2]}] coll?) (catch AssertionError e false))))) 87 | (testing "invalid input, correct keys but :features is not a collection" 88 | (is (= false (try (valid-input? [{:id 1 :features [1]} {:id 2 :features 2}] coll?) 
(catch AssertionError e false)))))) 89 | 90 | (deftest valid-input-add-strings?-test 91 | (testing "valid input, correct keys and :features is a collection" 92 | (is (= true (try (valid-input? [{:id 1 :features "my name is andrew"} {:id 2 :features "i like clojure"}] string?) (catch AssertionError e false))))) 93 | (testing "invalid input, incorrect keys and :features is a collection" 94 | (is (= false (try (valid-input? [{:id 1 :feat "my name is andrew"} {:id 2 :features "i like clojure"}] string?) (catch AssertionError e false))))) 95 | (testing "invalid input, correct keys but :features is a collection instead of string" 96 | (is (= false (try (valid-input? [{:id 1 :features "my name is andrew"} {:id 2 :features [2]}] string?) (catch AssertionError e false)))))) 97 | 98 | (deftest valid-input-add-files?-test 99 | (testing "valid input, multiple files in collection" 100 | (is (= true (try (valid-input-add-files? [(clojure.java.io/as-file "t1") 101 | (clojure.java.io/as-file "t1") 102 | (clojure.java.io/as-file "t2")]) 103 | (catch AssertionError e false))))) 104 | (testing "valid input, single file in collection" 105 | (is (= true (try (valid-input-add-files? [(clojure.java.io/file "t1")]) (catch AssertionError e false))))) 106 | (testing "invalid input, no files in collection" 107 | (is (= false (try (valid-input-add-files? 
[{:id 1 :features [1]} {:id 2 :features 2}]) (catch AssertionError e false)))))) -------------------------------------------------------------------------------- /test/consimilo/minhash_test.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.minhash-test 2 | (:require [clojure.test :refer :all] 3 | [clojure.set :refer [intersection]] 4 | [consimilo.minhash :refer :all] 5 | [consimilo.config :refer :all]) 6 | (:import (clojure.lang BigInt))) 7 | 8 | (defn build-bigint-coll 9 | [coll] 10 | (map bigint coll)) 11 | 12 | (def minhash-1 (build-bigint-coll '(1 2 3 4 5 6 7 8 9))) 13 | (def minhash-2 (build-bigint-coll '(9 8 7 6 5 4 3 2 1))) 14 | (def minhash-3 (build-bigint-coll '(1 2 3 4 5 4 3 2 1))) 15 | 16 | (deftest init-hashvalues-test 17 | (let [private-init-hashvals #'consimilo.minhash/init-hashvalues 18 | hashvals (private-init-hashvals)] 19 | (testing "init-hahsvalues returns a collection of mersenne primes" 20 | (is (every? #(= large-prime %) hashvals))) 21 | (testing "init-hashvalues returns a collection of type bigint" 22 | (is (every? #(instance? BigInt %) hashvals))) 23 | (testing "init-hashvalues returns a collection of length perms" 24 | (is (= perms (count hashvals)))))) 25 | 26 | (deftest build-permutations-test 27 | (let [private-build-permutations #'consimilo.minhash/build-permutations 28 | p (private-build-permutations)] 29 | (testing "keys :a and :b are not nil in permutations map" 30 | (is (not (nil? (:a p)))) 31 | (is (not (nil? 
(:b p))))) 32 | (testing "keys :a and :b are collections of length perms" 33 | (is (= perms (count (:a p)))) 34 | (is (= perms (count (:b p))))) 35 | (testing "keys :a and :b are unique collections" 36 | (is (not= perms (count (intersection (set (:a p)) (set (:b p))))))))) 37 | 38 | (deftest build-minhash-test 39 | (let [minhash (build-minhash ["my" "name" "is" "andrew"])] 40 | (testing "resulting minhash is length perms" 41 | (is (= perms (count minhash)))) 42 | (testing "elements in minhash collection are of type bigint" 43 | (is (instance? BigInt (first minhash))) 44 | (is (instance? BigInt (last minhash)))))) 45 | 46 | (deftest merge-minhash-test 47 | (testing "testing merging two minhash vectors together." 48 | (is (= minhash-3 (merge-minhash minhash-1 minhash-2))))) -------------------------------------------------------------------------------- /test/consimilo/minhash_util_test.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.minhash-util-test 2 | (:require [clojure.test :refer :all] 3 | [consimilo.minhash-util :refer :all]) 4 | (:import (clojure.lang BigInt))) 5 | 6 | (defn bigint_vec "Returns a sequence of n copies of val coerced to BigInt." 7 | [n val] 8 | (repeat n (bigint val))) 9 | 10 | (def big-coll-1 (bigint_vec 3 3)) 11 | (def big-coll-2 (bigint_vec 3 10)) 12 | (def big-coll-3 [(bigint 3) (bigint 3) (bigint 2)]) 13 | 14 | (deftest scalar-mod-test 15 | (let [result (scalar-mod big-coll-1 2)] 16 | (testing "functionality of scalar-mod" 17 | (is (= '(1 1 1) result))) 18 | (testing "scalar-mod returns collection of type bigint" 19 | (is (= true (instance? BigInt (first result)))) 20 | (is (= true (instance? BigInt (last result))))))) 21 | 22 | (deftest scalar-mul-test 23 | (let [result (scalar-mul big-coll-1 3)] 24 | (testing "functionality of scalar-mul" 25 | (is (= '(9 9 9) result))) 26 | (testing "scalar-mul returns collection of type bigint" 27 | (is (= true (instance? BigInt (first result)))) 28 | (is (= true (instance?
BigInt (last result))))))) 29 | 30 | (deftest elementwise-min-test 31 | (let [result (elementwise-min big-coll-1 big-coll-2)] 32 | (testing "functionality of elementwise-min" 33 | (is (= '(3 3 3) result))) 34 | (testing "elementwise-min returns collection of type bigint" 35 | (is (= true (instance? BigInt (first result)))) 36 | (is (= true (instance? BigInt (last result))))))) 37 | 38 | (deftest elementwise-add-test 39 | (let [result (elementwise-add big-coll-1 big-coll-2)] 40 | (testing "functionality of elementwise-add" 41 | (is (= '(13 13 13) result))) 42 | (testing "elementwise-add returns collection of type bigint" 43 | (is (= true (instance? BigInt (first result)))) 44 | (is (= true (instance? BigInt (last result))))))) 45 | 46 | (deftest dot-test 47 | (testing "dot product between two vectors of equal length" 48 | (is (= 70 (dot [1 2 3 4] [5 6 7 8]))))) 49 | 50 | (deftest hamming-distance-test 51 | (testing "hamming-distance, number of differing elements between two collections" 52 | (is (= 3 (hamming-distance [2 1 7 3 8 9 6] [2 2 3 3 7 9 6]))) 53 | (is (= 2 (hamming-distance [1 0 1 1 1 0 1] [1 0 0 1 0 0 1]))) 54 | (is (= 0 (hamming-distance [1 2 3 4 5 6 7] [1 2 3 4 5 6 7]))) 55 | (is (= 4 (hamming-distance [0 1 2 5] [1 4 6 7]))))) 56 | 57 | (deftest cosine-distance-test 58 | (testing "cosine distance between two vectors with same direction" 59 | (is (= 0.0 (cosine-distance [1 2 3 4 5] [1 2 3 4 5])))) 60 | (testing "cosine distance between two perpendicular vectors" 61 | (is (= 90.0 (cosine-distance [0 5] [5 0])))) 62 | (testing "cosine distance between two different vectors" 63 | (is (> 10e-12 (Math/abs ^float (- 45 (cosine-distance [1 1] [5 0]))))))) 64 | 65 | (deftest jaccard-test 66 | (testing "jaccard functionality" 67 | (is (= 0 (jaccard-similarity big-coll-1 big-coll-2))) 68 | (is (= 2/3 (jaccard-similarity big-coll-1 big-coll-3))))) -------------------------------------------------------------------------------- /test/consimilo/random_seed_test.clj:
-------------------------------------------------------------------------------- 1 | (ns consimilo.random-seed-test 2 | (:refer-clojure :exclude [rand rand-bigint]) 3 | (:require [clojure.test :refer :all] 4 | [consimilo.random-seed :refer :all]) 5 | (:import (clojure.lang BigInt))) 6 | 7 | (defn- get-seeded-random 8 | "Test helper: calls set-random-seed! with seed, then returns (rand-bigint max-range)" 9 | [seed max-range] 10 | (do (set-random-seed! seed) 11 | (rand-bigint max-range))) 12 | 13 | (defn- get-seeded-random-vec 14 | "Test helper: calls set-random-seed! with seed, then returns (rand-vec n max-range)" 15 | [seed n max-range] 16 | (do (set-random-seed! seed) 17 | (rand-vec n max-range))) 18 | 19 | (deftest rand-bigint-test 20 | (testing "ensure seeded rand-bigint returns the same value after seeding" 21 | (let [draw #(get-seeded-random 3 1024)] 22 | (is (= (draw) (draw))))) 23 | (testing "testing rand-bigint returns type bigint" 24 | (is (true? (instance? BigInt (rand-bigint 5)))))) 25 | 26 | (deftest rand-vec-test 27 | (testing "seeded rand-vec returns the same random collection each time" 28 | (let [draw #(doall (get-seeded-random-vec 1 10 4096))] 29 | (is (= (draw) (draw))))) 30 | (testing "rand-vec returns a collection of type bigint" 31 | (is (true? (instance? BigInt (first (rand-vec 4 1024))))) 32 | (is (true? (instance? BigInt (last (rand-vec 4 1024))))))) -------------------------------------------------------------------------------- /test/consimilo/sha1_test.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.sha1-test 2 | (:require [clojure.test :refer :all] 3 | [consimilo.sha1 :refer :all]) 4 | (:import (clojure.lang BigInt))) 5 | 6 | (deftest get-hash-bigint-test 7 | (testing "get-hash-bigint returns type bigint" 8 | (is (= true (instance?
BigInt (get-hash-bigint "andrew")))))) -------------------------------------------------------------------------------- /test/consimilo/text_processing_test.clj: -------------------------------------------------------------------------------- 1 | (ns consimilo.text-processing-test 2 | (:require [clojure.test :refer :all] 3 | [consimilo.text-processing :refer :all] 4 | [clojure.java.io :as io])) 5 | 6 | (deftest remove-stopwords-test 7 | (let [private-remove-stopwords #'consimilo.text-processing/remove-stopwords] 8 | (testing "remove-stopwords, remove-stopwords? true" 9 | (is (= ["name" "andrew" "live" "charleston"] 10 | (private-remove-stopwords true ["my" "name" "is" "andrew" "and" "i" "live" "in" "charleston"])))) 11 | (testing "remove-stopwords, remove-stopwords? false" 12 | (is (= ["my" "name" "is" "andrew" "and" "i" "live" "in" "charleston"] 13 | (private-remove-stopwords false ["my" "name" "is" "andrew" "and" "i" "live" "in" "charleston"])))))) 14 | 15 | (deftest tokenize-text-test 16 | (testing "tokenize text, remove-stopwords? true" 17 | (is (= ["name" "andrew" "live" "charleston"] 18 | (tokenize-text "My name is Andrew and I live in Charleston")))) 19 | (testing "tokenize text, remove-stopwords? false" 20 | (is (= ["my" "name" "is" "andrew" "and" "i" "live" "in" "charleston"] 21 | (tokenize-text "My name is Andrew and I live in Charleston" 22 | :remove-stopwords? false))))) 23 | 24 | (deftest shingle-test 25 | (testing "shingle, n = 3" 26 | (is (= ["mynameis" "nameisandrew" "isandrewand" "andrewandi" "andilive" "ilivein" "liveincharleston"] 27 | (shingle (tokenize-text "My name is Andrew and I live in Charleston" 28 | :remove-stopwords? false) 29 | 3)))) 30 | (testing "shingle, n = 1" 31 | (is (= ["my" "name" "is" "andrew" "and" "i" "live" "in" "charleston"] 32 | (shingle (tokenize-text "My name is Andrew and I live in Charleston" 33 | :remove-stopwords?
false) 34 | 1)))) 35 | (testing "shingle, n < 1" 36 | (is (= ["my" "name" "is" "andrew" "and" "i" "live" "in" "charleston"] 37 | (shingle (tokenize-text "My name is Andrew and I live in Charleston" 38 | :remove-stopwords? false) 39 | -1)))) 40 | (testing "shingle, n > (count tokenize-text)" 41 | (is (= ["my" "name" "is" "andrew" "and" "i" "live" "in" "charleston"] 42 | (shingle (tokenize-text "My name is Andrew and I live in Charleston" 43 | :remove-stopwords? false) 44 | 20))))) 45 | 46 | (deftest extract-text-test 47 | (testing "extracting text from file" 48 | (is (= "My name is Bonnie and I live in Charleston SC. I am staying home for Christmas this year.\n" 49 | (extract-text (io/resource "test.txt")))))) --------------------------------------------------------------------------------