├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── doc └── intro.md ├── project.clj ├── src ├── clj │ └── clj_similar │ │ └── core.clj └── java │ └── edu │ └── wlu │ └── cs │ └── levy │ └── CG │ ├── Checker.java │ ├── DistanceMetric.java │ ├── Editor.java │ ├── EuclideanDistance.java │ ├── HPoint.java │ ├── HRect.java │ ├── HammingDistance.java │ ├── KDException.java │ ├── KDNode.java │ ├── KDTree.java │ ├── KeyDuplicateException.java │ ├── KeyMissingException.java │ ├── KeySizeException.java │ └── NearestNeighborList.java └── test └── clj_similar ├── benchmark.clj └── core_test.clj /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | pom.xml 5 | pom.xml.asc 6 | *.jar 7 | *.class 8 | /.lein-* 9 | /.nrepl-port 10 | .hgignore 11 | .hg/ 12 | 13 | .DS_Store 14 | .classpath 15 | .project 16 | .settings/org.eclipse.jdt.core.prefs 17 | .settings/org.eclipse.m2e.core.prefs -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: clojure 3 | lein: lein2 4 | jdk: 5 | - oraclejdk8 6 | script: lein2 install && lein2 test 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. 
DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 
45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. 
REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. 
While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor tocontrol, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone. 
Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. 
GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. 
The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of New York and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. 215 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # clj-similar 2 | [![Clojars Project](https://img.shields.io/clojars/v/clj-similar.svg)](https://clojars.org/clj-similar) [![Build Status](https://travis-ci.org/vortext/clj-similar.png?branch=develop)](https://travis-ci.org/vortext/clj-similar) 3 | 4 | 5 | Experimental library for (fast) approximate similar set lookup. 
6 | Under the hood it uses [MinHash](https://en.wikipedia.org/wiki/MinHash) to compute [locality sensitive hashes](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) of a collection of sets, and loads them into a [k-d tree](https://en.wikipedia.org/wiki/K-d_tree). 7 | The constructed `similar` probabilistic data structure can be used to retrieve [nearest neighbors](https://en.wikipedia.org/wiki/Nearest_neighbor_search) of a given target set. 8 | While the construction of the data structure can be expensive, lookups should be fast. 9 | The results are non-deterministic and depend on the bucket and banding sizes. 10 | 11 | Note that it will always return some set that is considered nearest. 12 | The resulting sets can optionally be filtered by their [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index), which lets you omit sets that are too dissimilar. 13 | 14 | ## Caveats 15 | - Collections of sets with more distinct values than the maximum integer value are currently unsupported. 16 | - The data structure is read-only; support for modifications is not currently planned (but pull requests are welcome). 17 | - Read the [binning caveats](https://github.com/tdebatty/java-LSH#binning) before using! 18 | - The performance is *very* sensitive to the `buckets` and `stages` parameters; [LSH forests](http://www.cs.princeton.edu/courses/archive/spr06/cos592/bib/LSHForest-bawa05.pdf) might provide a solution, but are currently not implemented. 19 | 20 | ## Usage 21 | Note: LSH using MinHash is very sensitive to the average Jaccard similarity in your dataset! If most vectors in your dataset have a Jaccard similarity above or below 0.5, they might all fall in the same bucket. The example below might thus give different results for each run.
22 | 23 | ```clojure 24 | (require '[clj-similar.core :refer [similar nearest]]) 25 | (def coll [#{"a" "b" "c"} #{"d" "e" "c"} #{"f" "e" "a" "b"}]) 26 | ;; Creates the data structure 27 | 28 | ;; the number of buckets should be chosen such that we have at least 100 items per bucket 29 | (def buckets 10) 30 | ;; the number of stages is also sometimes called the number of bands 31 | (def stages 3) 32 | (def s (similar coll)) ;; default buckets and stages are (10, 3) 33 | (def s (similar coll buckets stages)) 34 | 35 | ;; A single nearest neighbor 36 | (nearest s #{"f" "e" "a" "b"}) 37 | ;=> #{"f" "e" "a" "b"} 38 | 39 | (nearest s #{"f" "e" "a" "b" "x"}) 40 | ;=> #{"f" "e" "a" "b"} 41 | 42 | (nearest s #{"f" "e" "a"}) 43 | ;=> #{"f" "e" "a" "b"} 44 | 45 | (nearest s #{"a"}) 46 | ;=> #{"a" "b" "c"} 47 | 48 | ;; Two nearest neighbors 49 | (nearest s #{"a" "b"} 2) 50 | ;=> (#{"a" "b" "c"} #{"f" "e" "a" "b"}) 51 | 52 | ;; To access the distance metrics use the associated metadata 53 | (:jaccard-index (meta (nearest s #{"a" "b"}))) 54 | 55 | ;; Or you can optionally filter values below a certain jaccard-index threshold 56 | (nearest s #{"a" "b"} 2 :threshold 0.6) 57 | ;=> (#{"a" "b" "c"}) 58 | 59 | ;; The values of the sets can be any Clojure data structure, even other collections 60 | (def coll [#{["a"] ["a" "b"]} #{["c" "d"] ["a" "c"]}]) 61 | (def s (similar coll)) 62 | (nearest s #{["a" "b"]}) 63 | ;=> #{["a" "b"] ["a"]} 64 | 65 | ``` 66 | 67 | ## Benchmark 68 | ``` 69 | Generating 1000000 random sets with max-size 10 70 | Generating similar data structure 71 | "Elapsed time: 44071.000835 msecs" 72 | Testing speed of nearest neighbor retrieval 73 | Sample output for random target sets 74 | in #{Y R a g l W c} out (#{q Y R t g X l W} #{Y Z R a g N}) approximate ({:jaccard-index 0.5} {:jaccard-index 0.4444444444444444}) 75 | in #{d w Z X N D} out (#{Z X N D} #{d Z a X D}) approximate ({:jaccard-index 0.6666666666666666} {:jaccard-index 0.5714285714285714}) 76 | in #{w q Z 
H B i A I} out (#{e q M H R B i I} #{q x M H B a i A}) approximate ({:jaccard-index 0.45454545454545453} {:jaccard-index 0.45454545454545453}) 77 | in #{w p U b N o} out (#{w U b N o} #{U b N}) approximate ({:jaccard-index 0.8333333333333334} {:jaccard-index 0.5}) 78 | in #{B D} out (#{B D} #{E B D}) approximate ({:jaccard-index 1.0} {:jaccard-index 0.6666666666666666}) 79 | in #{B V O c} out (#{T K B V h c} #{v B V i m c}) approximate ({:jaccard-index 0.42857142857142855} {:jaccard-index 0.42857142857142855}) 80 | in #{F a V l} out (#{w F a V l} #{F a V l N}) approximate ({:jaccard-index 0.8} {:jaccard-index 0.8}) 81 | in #{d f x J S k l u D} out (#{x J S E l u D} #{d L x J v S C y u D}) approximate ({:jaccard-index 0.6} {:jaccard-index 0.46153846153846156}) 82 | in #{w s q v a P O i} out (#{v a P i} #{q v O i D}) approximate ({:jaccard-index 0.5} {:jaccard-index 0.4444444444444444}) 83 | in #{d j Z R O k D} out (#{j Z a O D} #{d e j x Z B O D}) approximate ({:jaccard-index 0.5} {:jaccard-index 0.5}) 84 | Sample output for existing sets 85 | in #{E U O i g l A D} original #{L E U O i g l A I D} out (#{L E U O i g l A I D} #{e t U i g l A D}) exact ({:jaccard-index 0.8} {:jaccard-index 0.6}) 86 | in #{n i h} original #{n U i k h} out (#{n i h} #{n i l h}) exact ({:jaccard-index 1.0} {:jaccard-index 0.75}) 87 | in #{p v R y D} original #{p v R B y X D} out (#{q p v R y N D} #{p R D}) exact ({:jaccard-index 0.7142857142857143} {:jaccard-index 0.6}) 88 | in #{n q x G U} original #{n q x G C U W} out (#{n x G U} #{d n q p x G U}) exact ({:jaccard-index 0.8} {:jaccard-index 0.7142857142857143}) 89 | in #{v k b y m} original #{w v B k b y m} out (#{w v B k b y m} #{v k y A m D}) exact ({:jaccard-index 0.7142857142857143} {:jaccard-index 0.5714285714285714}) 90 | in #{l} original #{f l D} out (#{l} #{X l}) exact ({:jaccard-index 1.0} {:jaccard-index 0.5}) 91 | in #{P b} original #{f S P b} out (#{P b} #{q P b}) exact ({:jaccard-index 1.0} {:jaccard-index 
0.6666666666666666}) 92 | in #{f} original #{f W D} out (#{f} #{f N}) exact ({:jaccard-index 1.0} {:jaccard-index 0.5}) 93 | in #{U b} original #{U O b l} out (#{U b} #{U b o}) exact ({:jaccard-index 1.0} {:jaccard-index 0.6666666666666666}) 94 | in #{T E V l D} original #{T E B V U l D} out (#{T E a V D} #{V O l D}) exact ({:jaccard-index 0.6666666666666666} {:jaccard-index 0.5}) 95 | Evaluation count : 8700 in 60 samples of 145 calls. 96 | Execution time mean : 6.829506 ms 97 | Execution time std-deviation : 323.983327 µs 98 | Execution time lower quantile : 6.341004 ms ( 2.5%) 99 | Execution time upper quantile : 7.483967 ms (97.5%) 100 | Overhead used : 9.201927 ns 101 | 102 | Found 1 outliers in 60 samples (1.6667 %) 103 | low-severe 1 (1.6667 %) 104 | Variance from outliers : 33.5825 % Variance is moderately inflated by outliers 105 | 106 | ``` 107 | 108 | Benchmarks were run on OS X El Capitan (Intel Xeon E3-1240V2 @ 3.4Ghz, 32 GB DDR3 RAM) 109 | 110 | ## Dependencies 111 | 112 | * [java-lsh](https://github.com/tdebatty/java-LSH) 113 | * [K-d tree by Levy](http://home.wlu.edu/~levys/software/kd/) 114 | 115 | ## License 116 | 117 | Copyright © 2016 Joël Kuiper 118 | 119 | Distributed under the Eclipse Public License either version 1.0 or (at 120 | your option) any later version. 
121 | -------------------------------------------------------------------------------- /doc/intro.md: -------------------------------------------------------------------------------- 1 | # Introduction to clj-similar 2 | 3 | TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) 4 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject clj-similar "0.1.4" 2 | :description "Fast similar set lookup using MinHash and K-d trees" 3 | :url "https://github.com/vortext/clj-similar" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :profiles {:test {:dependencies [[criterium "0.4.4"]]}} 7 | :source-paths ["src/clj"] 8 | :java-source-paths ["src/java"] 9 | :dependencies [[org.clojure/clojure "1.8.0"] 10 | [org.apache.commons/commons-lang3 "3.4"] 11 | [info.debatty/java-lsh "0.9"]]) 12 | -------------------------------------------------------------------------------- /src/clj/clj_similar/core.clj: -------------------------------------------------------------------------------- 1 | (ns clj-similar.core 2 | (:require [clojure.set :as set]) 3 | (:import [info.debatty.java.lsh MinHash LSHMinHash] 4 | [edu.wlu.cs.levy.CG KDTree] 5 | [org.apache.commons.lang3 ArrayUtils] 6 | [java.util TreeSet Set Collection])) 7 | ;; Encodes set `s` as a boolean array of length `size`: the slot at each element's `dict` index is set to true, and elements missing from `dict` fall back to index 0. 8 | (defn index-array 9 | [dict size s] 10 | (let [lst (boolean-array size) 11 | idx (map (fn [e] (int (get dict e 0))) s)] 12 | (doseq [i idx] 13 | (aset-boolean lst i true)) 14 | lst)) 15 | 16 | (defn index-sets 17 | "Given a mapping of values to indexes and a collection of sets, 18 | returns a map whose keys are the boolean index arrays built by 19 | `index-array` and whose values are the original sets" 20 | [dict size coll] 21 | (reduce (fn [mem s] (assoc mem (index-array dict size s) s)) {} coll)) 22 | ;; Maps every item of `v` to a 1-based index (its position in `v` plus one). 23 | (defn value-index 24 | [v] 25 | (into {} (map-indexed
(fn [idx itm] [itm (inc idx)]) v))) 26 | ;; Groups the sets of `indexed-sets` (index-array -> set entries) by the vector form of their hash, giving {hash-vec [set ...]}. 27 | (defn points 28 | [hash-fn indexed-sets] 29 | (let [rf (fn [mem [ia k]] 30 | (let [hash (hash-fn ia) 31 | hash-vec (vec hash) 32 | ;; Hash bucket 33 | bucket (get mem hash-vec [])] 34 | (assoc mem hash-vec (conj bucket k))))] 35 | (reduce rf {} indexed-sets))) 36 | 37 | 38 | (defrecord Similar [dict size tree hash-fn mh]) 39 | ;; Inserts every [key value] entry of `points` into a new KDTree with `size` dimensions and returns the tree. 40 | (defn build-tree 41 | [points size] 42 | (let [^KDTree tree (KDTree. size)] 43 | (doseq [point points] 44 | (.insert tree 45 | ^doubles (double-array (first point)) 46 | ^Object (second point))) 47 | tree)) 48 | 49 | (defn get-field 50 | "Returns obj's private or public field with given field-name, 51 | defined in klass. Pass nil into obj for static fields." 52 | [klass field-name obj] 53 | (-> klass (.getDeclaredField (name field-name)) 54 | (doto (.setAccessible true)) 55 | (.get obj))) 56 | ;; Builds the value dictionary, the LSH hash function and the k-d tree of hash buckets, and wraps them in a Similar record. 57 | (defn- similar-internal 58 | [coll buckets stages] 59 | (let [;; `size` is one larger than the dictionary because `value-index` numbers values from 1 60 | dict (value-index (reduce set/union #{} coll)) 61 | size (inc (count dict)) 62 | sets (index-sets dict size coll) 63 | lsh (LSHMinHash. ^int (int stages) ^int (int buckets) ^int (int size)) 64 | mh (get-field LSHMinHash "mh" lsh) ;; why oh why is this private. 65 | hash-fn #(.hash ^LSHMinHash lsh %) 66 | tree (build-tree (points hash-fn sets) stages)] 67 | (Similar. dict size tree hash-fn mh))) 68 | 69 | 70 | ;;; Public API 71 | (defn similar 72 | "Constructs a new similar set from a collection of sets. 73 | Can be used for lookup of nearest sets using `nearest`. 74 | Optionally takes `buckets` and `stages` (also known as bands) as arguments; they default to 10 and 3." 75 | ([coll] 76 | (similar coll 10 3)) 77 | ([coll buckets] 78 | (similar coll buckets 3)) 79 | ([coll buckets stages] 80 | {:pre [(every? set?
coll)]} 81 | (similar-internal coll buckets stages))) 82 | 83 | ;; Exact Jaccard index of two sets, delegated to MinHash/JaccardIndex. 84 | (defn jaccard-index 85 | [^Set s1 ^Set s2] 86 | (MinHash/JaccardIndex s1 s2)) 87 | ;; Estimated Jaccard index: the similarity of the MinHash signatures of two boolean index arrays. 88 | (defn approximate-jaccard-index 89 | [^MinHash mh ^booleans ia1 ^booleans ia2] 90 | (.similarity mh (.signature mh ia1) (.signature mh ia2))) 91 | ;; Returns the exact index of `s1`/`s2` when `exact?` is truthy, otherwise the estimate for `ia1`/`ia2`. 92 | (defn similarity 93 | [exact? minhash s1 s2 ia1 ia2] 94 | (if exact? 95 | (jaccard-index s1 s2) 96 | (approximate-jaccard-index minhash ia1 ia2))) 97 | 98 | 99 | (defn nearest 100 | "Given a `similar` data structure and a target set `s`, finds the 101 | nearest matching set. Optionally takes a parameter `n` for the `n` 102 | nearest sets. Pass `:threshold` to keep only sets whose jaccard index is 103 | strictly greater than it (default 0.0); pass `:exact? false` to use the 104 | approximate index. Returned sets carry `:jaccard-index` as metadata." 105 | ([similar s] 106 | {:pre [(set? s)]} 107 | (first (nearest similar s 1))) 108 | ([similar s n & {:keys [threshold exact?] :or {threshold 0.0 exact? true}}] 109 | {:pre [(set? s)]} 110 | (let [{dict :dict 111 | size :size 112 | hash-fn :hash-fn 113 | mh :mh} similar 114 | index-array* (partial index-array dict size) 115 | ia (index-array* s) 116 | hash (hash-fn ia) 117 | ff #(> (:jaccard-index (meta %)) threshold) 118 | mf (fn [e] 119 | (let [ji (similarity exact? mh s e ia (when-not exact? (index-array* e)))] 120 | (with-meta e {:jaccard-index ji}))) 121 | nearest (.nearest ^KDTree (:tree similar) ^doubles (double-array hash) ^int n) 122 | sf #(:jaccard-index (meta %))] 123 | (take n (filter ff (reverse (sort-by sf (distinct (map mf (flatten (vec nearest))))))))))) 124 | -------------------------------------------------------------------------------- /src/java/edu/wlu/cs/levy/CG/Checker.java: -------------------------------------------------------------------------------- 1 | // Checker.java : class for filtering KD-Tree matches by usability 2 | // 3 | // Copyright (C) Michael Lorton and Simon D.
Levy 2014 4 | // 5 | // This code is free software: you can redistribute it and/or modify 6 | // it under the terms of the GNU Lesser General Public License as 7 | // published by the Free Software Foundation, either version 3 of the 8 | // License, or (at your option) any later version. 9 | // 10 | // This code is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU Lesser General Public License 16 | // along with this code. If not, see . 17 | // You should also have received a copy of the Parrot Parrot AR.Drone 18 | // Development License and Parrot AR.Drone copyright notice and disclaimer 19 | // and If not, see 20 | // 21 | // and 22 | // . 23 | 24 | package edu.wlu.cs.levy.CG; 25 | // Predicate over values of type T: usable(v) reports whether v is acceptable as a match. 26 | public interface Checker<T> { 27 | public boolean usable(T v); 28 | } 29 | -------------------------------------------------------------------------------- /src/java/edu/wlu/cs/levy/CG/DistanceMetric.java: -------------------------------------------------------------------------------- 1 | // DistanceMetric.java : Abstract distance metric class 2 | // 3 | // Copyright (C) Simon D. Levy 2014 4 | // 5 | // This code is free software: you can redistribute it and/or modify 6 | // it under the terms of the GNU Lesser General Public License as 7 | // published by the Free Software Foundation, either version 3 of the 8 | // License, or (at your option) any later version. 9 | // 10 | // This code is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details.
14 | // 15 | // You should have received a copy of the GNU Lesser General Public License 16 | // along with this code. If not, see . 17 | // You should also have received a copy of the Parrot Parrot AR.Drone 18 | // Development License and Parrot AR.Drone copyright notice and disclaimer 19 | // and If not, see 20 | // 21 | // and 22 | // . 23 | // 24 | 25 | package edu.wlu.cs.levy.CG; 26 | // Package-private abstract base for distance metrics over double[] coordinate arrays. 27 | abstract class DistanceMetric { 28 | // Returns the distance between a and b as defined by the concrete subclass. 29 | protected abstract double distance(double[] a, double[] b); 30 | } 31 | -------------------------------------------------------------------------------- /src/java/edu/wlu/cs/levy/CG/Editor.java: -------------------------------------------------------------------------------- 1 | // Editor.java : class for adding/removing nodes from KD-Tree 2 | // 3 | // Copyright (C) Michael Lorton and Simon D. Levy 2014 4 | // 5 | // This code is free software: you can redistribute it and/or modify 6 | // it under the terms of the GNU Lesser General Public License as 7 | // published by the Free Software Foundation, either version 3 of the 8 | // License, or (at your option) any later version. 9 | // 10 | // This code is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU Lesser General Public License 16 | // along with this code. If not, see . 17 | // You should also have received a copy of the Parrot Parrot AR.Drone 18 | // Development License and Parrot AR.Drone copyright notice and disclaimer 19 | // and If not, see 20 | // 21 | // and 22 | // .
23 | 24 | package edu.wlu.cs.levy.CG; 25 | // Strategy deciding which value to keep: edit() receives the current value (null when absent) and returns the value to store. 26 | public interface Editor<T> { 27 | public T edit(T current) throws KeyDuplicateException; 28 | // Shared base for the editors below; holds the candidate value `val`. 29 | public static abstract class BaseEditor<T> implements Editor<T> { 30 | final T val; 31 | public BaseEditor(final T val) { 32 | this.val = val; 33 | } 34 | public abstract T edit(T current) throws KeyDuplicateException; 35 | } 36 | public static class Inserter<T> extends BaseEditor<T> { // returns val when current is null, otherwise throws KeyDuplicateException 37 | public Inserter(final T val) { 38 | super(val); 39 | } 40 | public T edit(final T current) throws KeyDuplicateException { 41 | if (current == null) { 42 | return this.val; 43 | } 44 | throw new KeyDuplicateException(); 45 | } 46 | } 47 | public static class OptionalInserter<T> extends BaseEditor<T> { // returns val when current is null, otherwise keeps current 48 | public OptionalInserter(final T val) { 49 | super(val); 50 | } 51 | public T edit(final T current) { 52 | return (current == null) ? this.val : current; 53 | } 54 | } 55 | public static class Replacer<T> extends BaseEditor<T> { // always returns val, ignoring current 56 | public Replacer(final T val) { 57 | super(val); 58 | } 59 | public T edit(final T current) { 60 | return this.val; 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/java/edu/wlu/cs/levy/CG/EuclideanDistance.java: -------------------------------------------------------------------------------- 1 | // EuclideanDistance.java : Class for Euclidean distance metric 2 | // 3 | // Copyright (C) Simon D. Levy 2014 4 | // 5 | // This code is free software: you can redistribute it and/or modify 6 | // it under the terms of the GNU Lesser General Public License as 7 | // published by the Free Software Foundation, either version 3 of the 8 | // License, or (at your option) any later version. 9 | // 10 | // This code is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details.
14 | // 15 | // You should have received a copy of the GNU Lesser General Public License 16 | // along with this code. If not, see . 17 | // You should also have received a copy of the Parrot Parrot AR.Drone 18 | // Development License and Parrot AR.Drone copyright notice and disclaimer 19 | // and If not, see 20 | // 21 | // and 22 | // . 23 | 24 | package edu.wlu.cs.levy.CG; 25 | 26 | class EuclideanDistance extends DistanceMetric { 27 | 28 | protected double distance(final double[] a, final double[] b) { 29 | 30 | return Math.sqrt(sqrdist(a, b)); 31 | 32 | } 33 | 34 | protected static double sqrdist(final double[] a, final double[] b) { 35 | 36 | double dist = 0; 37 | 38 | for (int i = 0; i < a.length; ++i) { 39 | final double diff = (a[i] - b[i]); 40 | dist += diff * diff; 41 | } 42 | 43 | return dist; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/java/edu/wlu/cs/levy/CG/HPoint.java: -------------------------------------------------------------------------------- 1 | // HPoint.java : Hyper-Point class supporting KDTree class 2 | // 3 | // Copyright (C) Simon D. Levy 2014 4 | // 5 | // This code is free software: you can redistribute it and/or modify 6 | // it under the terms of the GNU Lesser General Public License as 7 | // published by the Free Software Foundation, either version 3 of the 8 | // License, or (at your option) any later version. 9 | // 10 | // This code is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU Lesser General Public License 16 | // along with this code. If not, see . 
17 | // You should also have received a copy of the Parrot Parrot AR.Drone 18 | // Development License and Parrot AR.Drone copyright notice and disclaimer 19 | // and If not, see 20 | // 21 | // and 22 | // . 23 | 24 | package edu.wlu.cs.levy.CG; 25 | 26 | import java.io.Serializable; 27 | 28 | class HPoint implements Serializable { 29 | 30 | protected double[] coord; 31 | 32 | protected HPoint(final int n) { 33 | coord = new double[n]; 34 | } 35 | 36 | protected HPoint(final double[] x) { 37 | 38 | coord = new double[x.length]; 39 | for (int i = 0; i < x.length; ++i) 40 | coord[i] = x[i]; 41 | } 42 | 43 | protected Object clone() { 44 | 45 | return new HPoint(coord); 46 | } 47 | 48 | protected boolean equals(final HPoint p) { 49 | 50 | // seems faster than java.util.Arrays.equals(), which is not 51 | // currently supported by Matlab anyway 52 | for (int i = 0; i < coord.length; ++i) 53 | if (coord[i] != p.coord[i]) 54 | return false; 55 | 56 | return true; 57 | } 58 | 59 | protected static double sqrdist(final HPoint x, final HPoint y) { 60 | 61 | return EuclideanDistance.sqrdist(x.coord, y.coord); 62 | } 63 | 64 | public String toString() { 65 | String s = ""; 66 | for (int i = 0; i < coord.length; ++i) { 67 | s = s + coord[i] + " "; 68 | } 69 | return s; 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /src/java/edu/wlu/cs/levy/CG/HRect.java: -------------------------------------------------------------------------------- 1 | // HRect.java : Hyper-Rectangle class supporting KDTree class 2 | // 3 | // Copyright (C) Simon D. Levy 2014 4 | // 5 | // This code is free software: you can redistribute it and/or modify 6 | // it under the terms of the GNU Lesser General Public License as 7 | // published by the Free Software Foundation, either version 3 of the 8 | // License, or (at your option) any later version. 
9 | // 10 | // This code is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU Lesser General Public License 16 | // along with this code. If not, see . 17 | // You should also have received a copy of the Parrot Parrot AR.Drone 18 | // Development License and Parrot AR.Drone copyright notice and disclaimer 19 | // and If not, see 20 | // 21 | // and 22 | // . 23 | 24 | package edu.wlu.cs.levy.CG; 25 | 26 | import java.io.Serializable; 27 | 28 | class HRect implements Serializable { 29 | 30 | protected HPoint min; 31 | protected HPoint max; 32 | 33 | protected HRect(final int ndims) { 34 | min = new HPoint(ndims); 35 | max = new HPoint(ndims); 36 | } 37 | 38 | protected HRect(final HPoint vmin, final HPoint vmax) { 39 | 40 | min = (HPoint) vmin.clone(); 41 | max = (HPoint) vmax.clone(); 42 | } 43 | 44 | protected Object clone() { 45 | 46 | return new HRect(min, max); 47 | } 48 | 49 | // from Moore's eqn. 
6.6 50 | protected HPoint closest(final HPoint t) { 51 | 52 | final HPoint p = new HPoint(t.coord.length); 53 | 54 | for (int i = 0; i < t.coord.length; ++i) { 55 | if (t.coord[i] <= min.coord[i]) { 56 | p.coord[i] = min.coord[i]; 57 | } else if (t.coord[i] >= max.coord[i]) { 58 | p.coord[i] = max.coord[i]; 59 | } else { 60 | p.coord[i] = t.coord[i]; 61 | } 62 | } 63 | 64 | return p; 65 | } 66 | 67 | // used in initial conditions of KDTree.nearest() 68 | protected static HRect infiniteHRect(final int d) { 69 | 70 | final HPoint vmin = new HPoint(d); 71 | final HPoint vmax = new HPoint(d); 72 | 73 | for (int i = 0; i < d; ++i) { 74 | vmin.coord[i] = Double.NEGATIVE_INFINITY; 75 | vmax.coord[i] = Double.POSITIVE_INFINITY; 76 | } 77 | 78 | return new HRect(vmin, vmax); 79 | } 80 | 81 | // currently unused 82 | protected HRect intersection(final HRect r) { 83 | 84 | final HPoint newmin = new HPoint(min.coord.length); 85 | final HPoint newmax = new HPoint(min.coord.length); 86 | 87 | for (int i = 0; i < min.coord.length; ++i) { 88 | newmin.coord[i] = Math.max(min.coord[i], r.min.coord[i]); 89 | newmax.coord[i] = Math.min(max.coord[i], r.max.coord[i]); 90 | if (newmin.coord[i] >= newmax.coord[i]) 91 | return null; 92 | } 93 | 94 | return new HRect(newmin, newmax); 95 | } 96 | 97 | // currently unused 98 | protected double area() { 99 | 100 | double a = 1; 101 | 102 | for (int i = 0; i < min.coord.length; ++i) { 103 | a *= (max.coord[i] - min.coord[i]); 104 | } 105 | 106 | return a; 107 | } 108 | 109 | public String toString() { 110 | return min + "\n" + max + "\n"; 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/java/edu/wlu/cs/levy/CG/HammingDistance.java: -------------------------------------------------------------------------------- 1 | // HammingDistance.java : Class for Hamming distance 2 | // 3 | // Copyright (C) Simon D. 
Levy 2014 4 | // 5 | // This code is free software: you can redistribute it and/or modify 6 | // it under the terms of the GNU Lesser General Public License as 7 | // published by the Free Software Foundation, either version 3 of the 8 | // License, or (at your option) any later version. 9 | // 10 | // This code is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU Lesser General Public License 16 | // along with this code. If not, see . 17 | // You should also have received a copy of the Parrot Parrot AR.Drone 18 | // Development License and Parrot AR.Drone copyright notice and disclaimer 19 | // and If not, see 20 | // 21 | // and 22 | // . 23 | // 24 | 25 | package edu.wlu.cs.levy.CG; 26 | 27 | class HammingDistance extends DistanceMetric { 28 | 29 | protected double distance(final double[] a, final double[] b) { 30 | 31 | double dist = 0; 32 | 33 | for (int i = 0; i < a.length; ++i) { 34 | final double diff = (a[i] - b[i]); 35 | dist += Math.abs(diff); 36 | } 37 | 38 | return dist; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/java/edu/wlu/cs/levy/CG/KDException.java: -------------------------------------------------------------------------------- 1 | // KDException.java : general exception class for KD-Tree library 2 | // 3 | // Copyright (C) Simon D. Levy 2014 4 | // 5 | // This code is free software: you can redistribute it and/or modify 6 | // it under the terms of the GNU Lesser General Public License as 7 | // published by the Free Software Foundation, either version 3 of the 8 | // License, or (at your option) any later version. 
/**
 * General checked exception for the KD-tree library; the more specific
 * key exceptions extend it.
 */
public class KDException extends Exception {

    public static final long serialVersionUID = 1L;

    /**
     * @param s detail message handed to {@link Exception}
     */
    protected KDException(final String s) {
        super(s);
    }
}
17 | // You should also have received a copy of the Parrot Parrot AR.Drone 18 | // Development License and Parrot AR.Drone copyright notice and disclaimer 19 | // and If not, see 20 | // 21 | // and 22 | // . 23 | 24 | package edu.wlu.cs.levy.CG; 25 | 26 | import java.io.Serializable; 27 | import java.util.List; 28 | 29 | class KDNode implements Serializable { 30 | 31 | // these are seen by KDTree 32 | protected HPoint k; 33 | T v; 34 | protected KDNode left, right; 35 | protected boolean deleted; 36 | 37 | // Method ins translated from 352.ins.c of Gonnet & Baeza-Yates 38 | protected static int edit(final HPoint key, final Editor editor, final KDNode t, final int lev, final int K) throws KeyDuplicateException { 39 | KDNode next_node = null; 40 | final int next_lev = (lev + 1) % K; 41 | synchronized (t) { 42 | if (key.equals(t.k)) { 43 | final boolean was_deleted = t.deleted; 44 | t.v = editor.edit(t.deleted ? null : t.v); 45 | t.deleted = (t.v == null); 46 | 47 | if (t.deleted == was_deleted) { 48 | // if I was and still am deleted or was and still am alive 49 | return 0; 50 | } else if (was_deleted) { 51 | // if I was deleted => I am now undeleted 52 | return 1; 53 | } 54 | // I was not deleted, but I am now deleted 55 | return -1; 56 | } else if (key.coord[lev] > t.k.coord[lev]) { 57 | next_node = t.right; 58 | if (next_node == null) { 59 | t.right = create(key, editor); 60 | return t.right.deleted ? 0 : 1; 61 | } 62 | } else { 63 | next_node = t.left; 64 | if (next_node == null) { 65 | t.left = create(key, editor); 66 | return t.left.deleted ? 
0 : 1; 67 | } 68 | } 69 | } 70 | 71 | return edit(key, editor, next_node, next_lev, K); 72 | } 73 | 74 | protected static KDNode create(final HPoint key, final Editor editor) throws KeyDuplicateException { 75 | final KDNode t = new KDNode(key, editor.edit(null)); 76 | if (t.v == null) { 77 | t.deleted = true; 78 | } 79 | return t; 80 | } 81 | 82 | protected static boolean del(final KDNode t) { 83 | synchronized (t) { 84 | if (!t.deleted) { 85 | t.deleted = true; 86 | return true; 87 | } 88 | } 89 | return false; 90 | } 91 | 92 | // Method srch translated from 352.srch.c of Gonnet & Baeza-Yates 93 | protected static KDNode srch(final HPoint key, KDNode t, final int K) { 94 | 95 | for (int lev = 0; t != null; lev = (lev + 1) % K) { 96 | 97 | if (!t.deleted && key.equals(t.k)) { 98 | return t; 99 | } else if (key.coord[lev] > t.k.coord[lev]) { 100 | t = t.right; 101 | } else { 102 | t = t.left; 103 | } 104 | } 105 | 106 | return null; 107 | } 108 | 109 | // Method rsearch translated from 352.range.c of Gonnet & Baeza-Yates 110 | protected static void rsearch(final HPoint lowk, final HPoint uppk, final KDNode t, final int lev, final int K, final List> v) { 111 | 112 | if (t == null) 113 | return; 114 | if (lowk.coord[lev] <= t.k.coord[lev]) { 115 | rsearch(lowk, uppk, t.left, (lev + 1) % K, K, v); 116 | } 117 | if (!t.deleted) { 118 | int j = 0; 119 | while (j < K && lowk.coord[j] <= t.k.coord[j] && uppk.coord[j] >= t.k.coord[j]) { 120 | j++; 121 | } 122 | if (j == K) 123 | v.add(t); 124 | } 125 | if (uppk.coord[lev] > t.k.coord[lev]) { 126 | rsearch(lowk, uppk, t.right, (lev + 1) % K, K, v); 127 | } 128 | } 129 | 130 | // Method Nearest Neighbor from Andrew Moore's thesis. Numbered 131 | // comments are direct quotes from there. NearestNeighborList solution 132 | // courtesy of Bjoern Heckel. 
133 | protected static void nnbr(final KDNode kd, final HPoint target, final HRect hr, double max_dist_sqd, final int lev, final int K, 134 | final NearestNeighborList> nnl, final Checker checker, final long timeout) { 135 | 136 | // 1. if kd is empty then set dist-sqd to infinity and exit. 137 | if (kd == null) { 138 | return; 139 | } 140 | 141 | if ((timeout > 0) && (timeout < System.currentTimeMillis())) { 142 | return; 143 | } 144 | // 2. s := split field of kd 145 | final int s = lev % K; 146 | 147 | // 3. pivot := dom-elt field of kd 148 | final HPoint pivot = kd.k; 149 | final double pivot_to_target = HPoint.sqrdist(pivot, target); 150 | 151 | // 4. Cut hr into to sub-hyperrectangles left-hr and right-hr. 152 | // The cut plane is through pivot and perpendicular to the s 153 | // dimension. 154 | final HRect left_hr = hr; // optimize by not cloning 155 | final HRect right_hr = (HRect) hr.clone(); 156 | left_hr.max.coord[s] = pivot.coord[s]; 157 | right_hr.min.coord[s] = pivot.coord[s]; 158 | 159 | // 5. target-in-left := target_s <= pivot_s 160 | final boolean target_in_left = target.coord[s] < pivot.coord[s]; 161 | 162 | KDNode nearer_kd; 163 | HRect nearer_hr; 164 | KDNode further_kd; 165 | HRect further_hr; 166 | 167 | // 6. if target-in-left then 168 | // 6.1. nearer-kd := left field of kd and nearer-hr := left-hr 169 | // 6.2. further-kd := right field of kd and further-hr := right-hr 170 | if (target_in_left) { 171 | nearer_kd = kd.left; 172 | nearer_hr = left_hr; 173 | further_kd = kd.right; 174 | further_hr = right_hr; 175 | } 176 | // 177 | // 7. if not target-in-left then 178 | // 7.1. nearer-kd := right field of kd and nearer-hr := right-hr 179 | // 7.2. further-kd := left field of kd and further-hr := left-hr 180 | else { 181 | nearer_kd = kd.right; 182 | nearer_hr = right_hr; 183 | further_kd = kd.left; 184 | further_hr = left_hr; 185 | } 186 | 187 | // 8. 
Recursively call Nearest Neighbor with paramters 188 | // (nearer-kd, target, nearer-hr, max-dist-sqd), storing the 189 | // results in nearest and dist-sqd 190 | nnbr(nearer_kd, target, nearer_hr, max_dist_sqd, lev + 1, K, nnl, checker, timeout); 191 | 192 | KDNode nearest = nnl.getHighest(); 193 | double dist_sqd; 194 | 195 | if (!nnl.isCapacityReached()) { 196 | dist_sqd = Double.MAX_VALUE; 197 | } else { 198 | dist_sqd = nnl.getMaxPriority(); 199 | } 200 | 201 | // 9. max-dist-sqd := minimum of max-dist-sqd and dist-sqd 202 | max_dist_sqd = Math.min(max_dist_sqd, dist_sqd); 203 | 204 | // 10. A nearer point could only lie in further-kd if there were some 205 | // part of further-hr within distance max-dist-sqd of 206 | // target. 207 | final HPoint closest = further_hr.closest(target); 208 | if (HPoint.sqrdist(closest, target) < max_dist_sqd) { 209 | 210 | // 10.1 if (pivot-target)^2 < dist-sqd then 211 | if (pivot_to_target < dist_sqd) { 212 | 213 | // 10.1.1 nearest := (pivot, range-elt field of kd) 214 | nearest = kd; 215 | 216 | // 10.1.2 dist-sqd = (pivot-target)^2 217 | dist_sqd = pivot_to_target; 218 | 219 | // add to nnl 220 | if (!kd.deleted && ((checker == null) || checker.usable(kd.v))) { 221 | nnl.insert(kd, dist_sqd); 222 | } 223 | 224 | // 10.1.3 max-dist-sqd = dist-sqd 225 | // max_dist_sqd = dist_sqd; 226 | if (nnl.isCapacityReached()) { 227 | max_dist_sqd = nnl.getMaxPriority(); 228 | } else { 229 | max_dist_sqd = Double.MAX_VALUE; 230 | } 231 | } 232 | 233 | // 10.2 Recursively call Nearest Neighbor with parameters 234 | // (further-kd, target, further-hr, max-dist_sqd), 235 | // storing results in temp-nearest and temp-dist-sqd 236 | nnbr(further_kd, target, further_hr, max_dist_sqd, lev + 1, K, nnl, checker, timeout); 237 | } 238 | } 239 | 240 | // constructor is used only by class; other methods are static 241 | private KDNode(final HPoint key, final T val) { 242 | 243 | k = key; 244 | v = val; 245 | left = null; 246 | right = null; 247 | 
deleted = false; 248 | } 249 | 250 | protected String toString(final int depth) { 251 | String s = k + " " + v + (deleted ? "*" : ""); 252 | if (left != null) { 253 | s = s + "\n" + pad(depth) + "L " + left.toString(depth + 1); 254 | } 255 | if (right != null) { 256 | s = s + "\n" + pad(depth) + "R " + right.toString(depth + 1); 257 | } 258 | return s; 259 | } 260 | 261 | private static String pad(final int n) { 262 | String s = ""; 263 | for (int i = 0; i < n; ++i) { 264 | s += " "; 265 | } 266 | return s; 267 | } 268 | 269 | private static void hrcopy(final HRect hr_src, final HRect hr_dst) { 270 | hpcopy(hr_src.min, hr_dst.min); 271 | hpcopy(hr_src.max, hr_dst.max); 272 | } 273 | 274 | private static void hpcopy(final HPoint hp_src, final HPoint hp_dst) { 275 | for (int i = 0; i < hp_dst.coord.length; ++i) { 276 | hp_dst.coord[i] = hp_src.coord[i]; 277 | } 278 | } 279 | } 280 | -------------------------------------------------------------------------------- /src/java/edu/wlu/cs/levy/CG/KDTree.java: -------------------------------------------------------------------------------- 1 | package edu.wlu.cs.levy.CG; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | import java.util.LinkedList; 6 | import java.util.Stack; 7 | 8 | /** 9 | * KDTree is a class supporting KD-tree insertion, deletion, equality search, 10 | * range search, and nearest neighbor(s) using double-precision floating-point 11 | * keys. Splitting dimension is chosen naively, by depth modulo K. Semantics are 12 | * as follows: 13 | * 14 | *
    15 | *
  • Two different keys containing identical numbers should retrieve the same 16 | * value from a given KD-tree. Therefore keys are cloned when a node is 17 | * inserted.
    18 | *
    19 | *
  • As with Hashtables, values inserted into a KD-tree are not cloned. 20 | * Modifying a value between insertion and retrieval will therefore modify the 21 | * value stored in the tree. 22 | *
23 | * 24 | * Implements the Nearest Neighbor algorithm (Table 6.4) of 25 | * 26 | *
 27 |  * &*064;techreport{AndrewMooreNearestNeighbor,
 28 |  *   author  = {Andrew Moore},
 29 |  *   title   = {An introductory tutorial on kd-trees},
 30 |  *   institution = {Robotics Institute, Carnegie Mellon University},
 31 |  *   year    = {1991},
 32 |  *   number  = {Technical Report No. 209, Computer Laboratory, 
 33 |  *              University of Cambridge},
 34 |  *   address = {Pittsburgh, PA}
 35 |  * }
 36 |  * 
37 | * 38 | * Copyright (C) Simon D. Levy and Bjoern Heckel 2014 39 | * 40 | * This code is free software: you can redistribute it and/or modify it under 41 | * the terms of the GNU Lesser General Public License as published by the Free 42 | * Software Foundation, either version 3 of the License, or (at your option) any 43 | * later version. 44 | * 45 | * This code is distributed in the hope that it will be useful, but WITHOUT ANY 46 | * WARRANTY without even the implied warranty of MERCHANTABILITY or FITNESS FOR 47 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 48 | * 49 | * You should have received a copy of the GNU Lesser General Public License 50 | * along with this code. If not, see . You should 51 | * also have received a copy of the Parrot Parrot AR.Drone Development License 52 | * and Parrot AR.Drone copyright notice and disclaimer and If not, see 53 | * and 54 | * . 56 | */ 57 | public class KDTree implements Serializable { 58 | // number of milliseconds 59 | final long m_timeout; 60 | 61 | // K = number of dimensions 62 | final private int m_K; 63 | 64 | // root of KD-tree 65 | private KDNode m_root; 66 | 67 | // count of nodes 68 | private int m_count; 69 | 70 | /** 71 | * Creates a KD-tree with specified number of dimensions. 72 | * 73 | * @param k 74 | * number of dimensions 75 | */ 76 | 77 | public KDTree(final int k) { 78 | this(k, 0); 79 | } 80 | public KDTree(final int k, final long timeout) { 81 | this.m_timeout = timeout; 82 | m_K = k; 83 | m_root = null; 84 | } 85 | 86 | /** 87 | * Insert a node in a KD-tree. Uses algorithm translated from 352.ins.c of 88 | * 89 | *
 90 |    *   &*064;Book{GonnetBaezaYates1991,                                   
 91 |    *     author =    {G.H. Gonnet and R. Baeza-Yates},
 92 |    *     title =     {Handbook of Algorithms and Data Structures},
 93 |    *     publisher = {Addison-Wesley},
 94 |    *     year =      {1991}
 95 |    *   }
 96 |    * 
97 | * 98 | * @param key 99 | * key for KD-tree node 100 | * @param value 101 | * value at that key 102 | * 103 | * @throws KeySizeException 104 | * if key.length mismatches K 105 | * @throws KeyDuplicateException 106 | * if key already in tree 107 | */ 108 | public void insert(final double[] key, final T value) throws KeySizeException, KeyDuplicateException { 109 | this.edit(key, new Editor.Inserter(value)); 110 | } 111 | 112 | /** 113 | * Edit a node in a KD-tree 114 | * 115 | * @param key 116 | * key for KD-tree node 117 | * @param editor 118 | * object to edit the value at that key 119 | * 120 | * @throws KeySizeException 121 | * if key.length mismatches K 122 | * @throws KeyDuplicateException 123 | * if key already in tree 124 | */ 125 | 126 | public void edit(final double[] key, final Editor editor) throws KeySizeException, KeyDuplicateException { 127 | 128 | if (key.length != m_K) { 129 | throw new KeySizeException(); 130 | } 131 | 132 | synchronized (this) { 133 | // the first insert has to be synchronized 134 | if (null == m_root) { 135 | m_root = KDNode.create(new HPoint(key), editor); 136 | m_count = m_root.deleted ? 0 : 1; 137 | return; 138 | } 139 | } 140 | 141 | m_count += KDNode.edit(new HPoint(key), editor, m_root, 0, m_K); 142 | } 143 | 144 | /** 145 | * Find KD-tree node whose key is identical to key. Uses algorithm translated 146 | * from 352.srch.c of Gonnet & Baeza-Yates. 147 | * 148 | * @param key 149 | * key for KD-tree node 150 | * 151 | * @return object at key, or null if not found 152 | * 153 | * @throws KeySizeException 154 | * if key.length mismatches K 155 | */ 156 | public T search(final double[] key) throws KeySizeException { 157 | 158 | if (key.length != m_K) { 159 | throw new KeySizeException(); 160 | } 161 | 162 | final KDNode kd = KDNode.srch(new HPoint(key), m_root, m_K); 163 | 164 | return (kd == null ? 
null : kd.v); 165 | } 166 | 167 | public void delete(final double[] key) throws KeySizeException, KeyMissingException { 168 | delete(key, false); 169 | } 170 | /** 171 | * Delete a node from a KD-tree. Instead of actually deleting node and 172 | * rebuilding tree, marks node as deleted. Hence, it is up to the caller to 173 | * rebuild the tree as needed for efficiency. 174 | * 175 | * @param key 176 | * key for KD-tree node 177 | * @param optional 178 | * if false and node not found, throw an exception 179 | * 180 | * @throws KeySizeException 181 | * if key.length mismatches K 182 | * @throws KeyMissingException 183 | * if no node in tree has key 184 | */ 185 | public void delete(final double[] key, final boolean optional) throws KeySizeException, KeyMissingException { 186 | 187 | if (key.length != m_K) { 188 | throw new KeySizeException(); 189 | } 190 | final KDNode t = KDNode.srch(new HPoint(key), m_root, m_K); 191 | if (t == null) { 192 | if (optional == false) { 193 | throw new KeyMissingException(); 194 | } 195 | } else { 196 | if (KDNode.del(t)) { 197 | m_count--; 198 | } 199 | } 200 | } 201 | 202 | /** 203 | * Find KD-tree node whose key is nearest neighbor to key. 204 | * 205 | * @param key 206 | * key for KD-tree node 207 | * 208 | * @return object at node nearest to key, or null on failure 209 | * 210 | * @throws KeySizeException 211 | * if key.length mismatches K 212 | */ 213 | public T nearest(final double[] key) throws KeySizeException { 214 | 215 | final List nbrs = nearest(key, 1, null); 216 | return nbrs.get(0); 217 | } 218 | 219 | /** 220 | * Find KD-tree nodes whose keys are n nearest neighbors to key. 
221 | * 222 | * @param key 223 | * key for KD-tree node 224 | * @param n 225 | * number of nodes to return 226 | * 227 | * @return objects at nodes nearest to key, or null on failure 228 | * 229 | * @throws KeySizeException 230 | * if key.length mismatches K 231 | */ 232 | public List nearest(final double[] key, final int n) throws KeySizeException, IllegalArgumentException { 233 | return nearest(key, n, null); 234 | } 235 | 236 | /** 237 | * Find KD-tree nodes whose keys are within a given Euclidean distance of a 238 | * given key. 239 | * 240 | * @param key 241 | * key for KD-tree node 242 | * @param d 243 | * Euclidean distance 244 | * 245 | * @return objects at nodes with distance of key, or null on failure 246 | * 247 | * @throws KeySizeException 248 | * if key.length mismatches K 249 | */ 250 | public List nearestEuclidean(final double[] key, final double dist) throws KeySizeException { 251 | return nearestDistance(key, dist, new EuclideanDistance()); 252 | } 253 | 254 | /** 255 | * Find KD-tree nodes whose keys are within a given Hamming distance of a 256 | * given key. 257 | * 258 | * @param key 259 | * key for KD-tree node 260 | * @param d 261 | * Hamming distance 262 | * 263 | * @return objects at nodes with distance of key, or null on failure 264 | * 265 | * @throws KeySizeException 266 | * if key.length mismatches K 267 | */ 268 | public List nearestHamming(final double[] key, final double dist) throws KeySizeException { 269 | 270 | return nearestDistance(key, dist, new HammingDistance()); 271 | } 272 | 273 | /** 274 | * Find KD-tree nodes whose keys are n nearest neighbors to key. Uses 275 | * algorithm above. Neighbors are returned in ascending order of distance to 276 | * key. 
277 | * 278 | * @param key 279 | * key for KD-tree node 280 | * @param n 281 | * how many neighbors to find 282 | * @param checker 283 | * an optional object to filter matches 284 | * 285 | * @return objects at node nearest to key, or null on failure 286 | * 287 | * @throws KeySizeException 288 | * if key.length mismatches K 289 | * @throws IllegalArgumentException 290 | * if n is negative or exceeds tree size 291 | */ 292 | public List nearest(final double[] key, int n, final Checker checker) throws KeySizeException, IllegalArgumentException { 293 | 294 | if (n <= 0) { 295 | return new LinkedList(); 296 | } 297 | 298 | final NearestNeighborList> nnl = getnbrs(key, n, checker); 299 | 300 | n = nnl.getSize(); 301 | final Stack nbrs = new Stack(); 302 | 303 | for (int i = 0; i < n; ++i) { 304 | final KDNode kd = nnl.removeHighest(); 305 | nbrs.push(kd.v); 306 | } 307 | 308 | return nbrs; 309 | } 310 | 311 | /** 312 | * Range search in a KD-tree. Uses algorithm translated from 352.range.c of 313 | * Gonnet & Baeza-Yates. 
314 | * 315 | * @param lowk 316 | * lower-bounds for key 317 | * @param uppk 318 | * upper-bounds for key 319 | * 320 | * @return array of Objects whose keys fall in range [lowk,uppk] 321 | * 322 | * @throws KeySizeException 323 | * on mismatch among lowk.length, uppk.length, or K 324 | */ 325 | public List range(final double[] lowk, final double[] uppk) throws KeySizeException { 326 | 327 | if (lowk.length != uppk.length) { 328 | throw new KeySizeException(); 329 | } 330 | 331 | else if (lowk.length != m_K) { 332 | throw new KeySizeException(); 333 | } 334 | 335 | else { 336 | final List> found = new LinkedList>(); 337 | KDNode.rsearch(new HPoint(lowk), new HPoint(uppk), m_root, 0, m_K, found); 338 | final List o = new LinkedList(); 339 | for (final KDNode node : found) { 340 | o.add(node.v); 341 | } 342 | return o; 343 | } 344 | } 345 | 346 | public int size() { /* added by MSL */ 347 | return m_count; 348 | } 349 | 350 | public String toString() { 351 | return m_root != null ? m_root.toString(0) : ""; 352 | } 353 | 354 | private NearestNeighborList> getnbrs(final double[] key) throws KeySizeException { 355 | return getnbrs(key, m_count, null); 356 | } 357 | 358 | private NearestNeighborList> getnbrs(final double[] key, final int n, final Checker checker) throws KeySizeException { 359 | 360 | if (key.length != m_K) { 361 | throw new KeySizeException(); 362 | } 363 | 364 | final NearestNeighborList> nnl = new NearestNeighborList>(n); 365 | 366 | // initial call is with infinite hyper-rectangle and max distance 367 | final HRect hr = HRect.infiniteHRect(key.length); 368 | final double max_dist_sqd = Double.MAX_VALUE; 369 | final HPoint keyp = new HPoint(key); 370 | 371 | if (m_count > 0) { 372 | final long timeout = (this.m_timeout > 0) ? 
(System.currentTimeMillis() + this.m_timeout) : 0; 373 | KDNode.nnbr(m_root, keyp, hr, max_dist_sqd, 0, m_K, nnl, checker, timeout); 374 | } 375 | 376 | return nnl; 377 | 378 | } 379 | 380 | private List nearestDistance(final double[] key, final double dist, final DistanceMetric metric) throws KeySizeException { 381 | 382 | final NearestNeighborList> nnl = getnbrs(key); 383 | final int n = nnl.getSize(); 384 | final Stack nbrs = new Stack(); 385 | 386 | for (int i = 0; i < n; ++i) { 387 | final KDNode kd = nnl.removeHighest(); 388 | final HPoint p = kd.k; 389 | if (metric.distance(kd.k.coord, key) < dist) { 390 | nbrs.push(kd.v); 391 | } 392 | } 393 | 394 | return nbrs; 395 | } 396 | 397 | } 398 | -------------------------------------------------------------------------------- /src/java/edu/wlu/cs/levy/CG/KeyDuplicateException.java: -------------------------------------------------------------------------------- 1 | package edu.wlu.cs.levy.CG; 2 | 3 | /** 4 | * KeyDuplicateException is thrown when the KDTree.insert method is 5 | * invoked on a key already in the KDTree. 6 | * 7 | * 8 | * Copyright (C) Simon D. Levy 2014 9 | * 10 | * This code is free software: you can redistribute it and/or modify it under 11 | * the terms of the GNU Lesser General Public License as published by the Free 12 | * Software Foundation, either version 3 of the License, or (at your option) any 13 | * later version. 14 | * 15 | * This code is distributed in the hope that it will be useful, but WITHOUT ANY 16 | * WARRANTY without even the implied warranty of MERCHANTABILITY or FITNESS FOR 17 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 18 | * 19 | * You should have received a copy of the GNU Lesser General Public License 20 | * along with this code. If not, see . You should 21 | * also have received a copy of the Parrot Parrot AR.Drone Development License 22 | * and Parrot AR.Drone copyright notice and disclaimer and If not, see 23 | * and 24 | * . 
26 |  */
27 | public class KeyDuplicateException extends KDException {
28 |
29 |     // serialization version id; the value itself is arbitrary
30 |     public static final long serialVersionUID = 1L;
31 |
32 |     /**
33 |      * Builds the exception with its fixed message, "Key already in tree".
34 |      */
35 |     protected KeyDuplicateException() {
36 |         super("Key already in tree");
37 |     }
38 | }
39 |
--------------------------------------------------------------------------------
/src/java/edu/wlu/cs/levy/CG/KeyMissingException.java:
--------------------------------------------------------------------------------
1 | // KeyMissingException.java : Key-missing exception supporting KDTree class
2 | //
3 | // Copyright (C) Simon D. Levy 2014
4 | //
5 | // This code is free software: you can redistribute it and/or modify
6 | // it under the terms of the GNU Lesser General Public License as
7 | // published by the Free Software Foundation, either version 3 of the
8 | // License, or (at your option) any later version.
9 | //
10 | // This code is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | // GNU General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU Lesser General Public License
16 | // along with this code. If not, see .
17 | // You should also have received a copy of the Parrot Parrot AR.Drone
18 | // Development License and Parrot AR.Drone copyright notice and disclaimer
19 | // and If not, see
20 | //
21 | // and
22 | // .
23 | 24 | package edu.wlu.cs.levy.CG; 25 | 26 | public class KeyMissingException extends KDException { /* made public by MSL */ 27 | 28 | public KeyMissingException() { 29 | super("Key not found"); 30 | } 31 | 32 | // arbitrary; every serializable class has to have one of these 33 | public static final long serialVersionUID = 3L; 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/java/edu/wlu/cs/levy/CG/KeySizeException.java: -------------------------------------------------------------------------------- 1 | package edu.wlu.cs.levy.CG; 2 | 3 | /** 4 | * KeySizeException is thrown when a KDTree method is invoked on a key whose 5 | * size (array length) mismatches the one used in the that KDTree's constructor. 6 | * 7 | * Copyright (C) Simon D. Levy 2014 8 | * 9 | * This code is free software: you can redistribute it and/or modify it under 10 | * the terms of the GNU Lesser General Public License as published by the Free 11 | * Software Foundation, either version 3 of the License, or (at your option) any 12 | * later version. 13 | * 14 | * This code is distributed in the hope that it will be useful, but WITHOUT ANY 15 | * WARRANTY without even the implied warranty of MERCHANTABILITY or FITNESS FOR 16 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU Lesser General Public License 19 | * along with this code. If not, see . You should 20 | * also have received a copy of the Parrot Parrot AR.Drone Development License 21 | * and Parrot AR.Drone copyright notice and disclaimer and If not, see 22 | * and 23 | * . 
25 |  *
26 |  */
27 | public class KeySizeException extends KDException {
28 |
29 |     protected KeySizeException() {
30 |         super("Key size mismatch");
31 |     }
32 |
33 |     // arbitrary; every serializable class has to have one of these
34 |     public static final long serialVersionUID = 2L;
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/src/java/edu/wlu/cs/levy/CG/NearestNeighborList.java:
--------------------------------------------------------------------------------
1 | // NearestNeighborList.java : A solution to the KD-Tree n-nearest-neighbor problem
2 | //
3 | // Copyright (C) Bjoern Heckel and Simon D. Levy 2014
4 | //
5 | // This code is free software: you can redistribute it and/or modify
6 | // it under the terms of the GNU Lesser General Public License as
7 | // published by the Free Software Foundation, either version 3 of the
8 | // License, or (at your option) any later version.
9 | //
10 | // This code is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | // GNU General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU Lesser General Public License
16 | // along with this code. If not, see .
17 | // You should also have received a copy of the Parrot Parrot AR.Drone
18 | // Development License and Parrot AR.Drone copyright notice and disclaimer
19 | // and If not, see
20 | //
21 | // and
22 | // .
23 | //
24 |
25 | package edu.wlu.cs.levy.CG;
26 |
27 | import java.util.*;
28 |
29 | /**
30 |  * Bounded collection of (object, priority) pairs that retains the entries
31 |  * with the smallest priority values inserted so far.
32 |  *
33 |  * The backing PriorityQueue is ordered by descending priority value, so its
34 |  * head is always the retained entry with the largest value.
35 |  */
36 | class NearestNeighborList<T> {
37 |
38 |     /** Pairs a payload with the priority value it was inserted under. */
39 |     static class NeighborEntry<T> implements Comparable<NeighborEntry<T>> {
40 |         final T data;
41 |         final double value;
42 |
43 |         public NeighborEntry(final T data, final double value) {
44 |             this.data = data;
45 |             this.value = value;
46 |         }
47 |
48 |         public int compareTo(final NeighborEntry<T> t) {
49 |             // note that the positions are reversed: larger values sort
50 |             // first, which puts the largest value at the head of the queue
51 |             return Double.compare(t.value, this.value);
52 |         }
53 |     }
54 |
55 |     java.util.PriorityQueue<NeighborEntry<T>> m_Queue;
56 |     int m_Capacity = 0;
57 |
58 |     /**
59 |      * Creates an empty list.
60 |      *
61 |      * @param capacity
62 |      *            maximum number of entries retained; 0 gives a list that
63 |      *            always stays empty
64 |      *
65 |      * @throws IllegalArgumentException
66 |      *             if capacity is negative
67 |      */
68 |     public NearestNeighborList(final int capacity) {
69 |         if (capacity < 0) {
70 |             throw new IllegalArgumentException("capacity must not be negative: " + capacity);
71 |         }
72 |         m_Capacity = capacity;
73 |         // PriorityQueue rejects an initial capacity below 1, so a zero-capacity
74 |         // list (e.g. one sized from an empty tree) still allocates one slot;
75 |         // m_Capacity, not the queue's allocation, is what bounds the list
76 |         m_Queue = new java.util.PriorityQueue<NeighborEntry<T>>(Math.max(1, m_Capacity));
77 |     }
78 |
79 |     /**
80 |      * @return the largest priority value currently retained, or
81 |      *         Double.POSITIVE_INFINITY when the list is empty
82 |      */
83 |     public double getMaxPriority() {
84 |         final NeighborEntry<T> p = m_Queue.peek();
85 |         return (p == null) ? Double.POSITIVE_INFINITY : p.value;
86 |     }
87 |
88 |     /**
89 |      * Offers an object to the list.
90 |      *
91 |      * @param object
92 |      *            payload to store
93 |      * @param priority
94 |      *            value the payload is ranked by; smaller values are kept
95 |      *
96 |      * @return false if the object was rejected (zero capacity, or the list is
97 |      *         full and priority exceeds every retained value), true otherwise
98 |      */
99 |     public boolean insert(final T object, final double priority) {
100 |         if (m_Capacity <= 0) {
101 |             // nothing can ever be retained, so do not report success
102 |             return false;
103 |         }
104 |         if (isCapacityReached()) {
105 |             if (priority > getMaxPriority()) {
106 |                 // do not insert - all elements in queue have lower priority
107 |                 return false;
108 |             }
109 |             m_Queue.add(new NeighborEntry<T>(object, priority));
110 |             // remove object with highest priority to get back to capacity
111 |             m_Queue.poll();
112 |         } else {
113 |             m_Queue.add(new NeighborEntry<T>(object, priority));
114 |         }
115 |         return true;
116 |     }
117 |
118 |     /**
119 |      * @return true once the number of retained entries has reached the
120 |      *         capacity given to the constructor
121 |      */
122 |     public boolean isCapacityReached() {
123 |         return m_Queue.size() >= m_Capacity;
124 |     }
125 |
126 |     /**
127 |      * @return the payload with the largest priority value, without removing
128 |      *         it, or null when the list is empty
129 |      */
130 |     public T getHighest() {
131 |         final NeighborEntry<T> p = m_Queue.peek();
132 |         return (p == null) ? null : p.data;
133 |     }
134 |
135 |     /**
136 |      * @return true when no entries are retained
137 |      */
138 |     public boolean isEmpty() {
139 |         return m_Queue.size() == 0;
140 |     }
141 |
142 |     /**
143 |      * @return the number of entries currently retained
144 |      */
145 |     public int getSize() {
146 |         return m_Queue.size();
147 |     }
148 |
149 |     /**
150 |      * @return the payload with the largest priority value after removing it
151 |      *         from the list, or null when the list is empty
152 |      */
153 |     public T removeHighest() {
154 |         // remove object with highest priority
155 |         final NeighborEntry<T> p = m_Queue.poll();
156 |         return (p == null) ? null : p.data;
157 |     }
158 | }
159 |
--------------------------------------------------------------------------------
/test/clj_similar/benchmark.clj:
--------------------------------------------------------------------------------
1 | (ns clj-similar.benchmark
2 |   (:require [clojure.test :refer :all]
3 |             [criterium.core :refer [quick-bench bench with-progress-reporting]]
4 |             [clj-similar.core :refer :all]))
5 |
6 |
7 | (def dict
8 |   ;; Upper case + lower case ASCII letters, as one-character strings.
9 |   ;; Held in a vector so that rand-nth gets indexed access.
10 |   (mapv (comp str char) (concat (range 65 91) (range 97 123))))
11 |
12 | (defn random-set
13 |   "Returns a set built from 1 to max-size random draws from dict. Duplicate
14 |   draws collapse, so the set may hold fewer elements than were drawn."
15 |   [max-size]
16 |   (let [size (+ 1 (rand-int max-size))]
17 |     (set (repeatedly size #(rand-nth dict)))))
18 |
19 | (defn generate-random
20 |   "Returns a lazy sequence of count random sets, each made by
21 |   (random-set max-size)."
22 |   [count max-size]
23 |   (for [_ (range count)]
24 |     (random-set max-size)))
25 |
26 | (defn omit-random
27 |   "Returns set s without up to n randomly chosen elements."
28 |   [s n]
29 |   (let [omit (set (take n (shuffle s)))]
30 |     (apply disj s omit)))
31 |
32 | (deftest benchmark
33 |   (let [count 1E6
34 |         max-size 10
35 |         coll (do
36 |                (println "Generating" (long count) "random sets with max-size" max-size)
37 |                (generate-random count max-size))
38 |         s (do
39 |             (println "Generating similar data structure")
40 |             (time (similar coll 10 3)))]
41 |     (println "Testing speed of nearest neighbor retrieval")
42 |     (println "Sample output for random target sets")
43 |     (doseq [_ (range 10)]
44 |       (let [in (random-set max-size)
45 |             out (nearest s in 2 :exact? true)]
46 |         (println "in" in "out" out "exact" (map meta out))))
47 |
48 |     (println "Sample output for existing sets")
49 |     (doseq [in (take 10 (random-sample 0.25 coll))]
50 |       (let [part (omit-random in 2)
51 |             out (nearest s part 2 :exact? true)]
52 |         (println "in" part "original" in "out" out "exact" (map meta out))))
53 |     #_(bench (nearest s (random-set max-size)))
54 |     ))
55 |
--------------------------------------------------------------------------------
/test/clj_similar/core_test.clj:
--------------------------------------------------------------------------------
1 | (ns clj-similar.core-test
2 |   (:require [clojure.test :refer :all]
3 |             [clj-similar.core :refer :all]))
4 |
5 | (defn in?
6 |   "true if coll contains el"
7 |   [coll el]
8 |   (some #(= el %) coll))
9 |
10 | (defn all-in?
11 |   "true if every element of els is contained in coll"
12 |   [coll els]
13 |   (every? (partial in? coll) els))
14 |
15 | (deftest simple-test
16 |   (let [coll [#{"a" "b" "c"} #{"d" "e" "c"} #{"f" "e" "a" "b"}]
17 |         s (similar coll)]
18 |     (testing "Return the nearest set when exact match"
19 |       (is (all-in? (nearest s #{"f" "e" "a" "b"}) #{"f" "e" "a" "b"})))
20 |     #_(testing "Return nil if element is unseen"
21 |         (is (= (nearest s #{"x"}) nil)))
22 |     #_(testing "Return the nearest set when fuzzy match with extra element"
23 |         (is (all-in? (nearest s #{"f" "e" "a" "b" "x"}) #{"f" "e" "a" "b"})))
24 |     #_(testing "Return the nearest set when fuzzy match with omitted element"
25 |         (is (all-in? (nearest s #{"a"}) #{"a" "b" "c"})))
26 |     #_(testing "Return the nearest two sets"
27 |         (is (all-in? (nearest s #{"a" "b"} 2) '(#{"a" "b" "c"} #{"f" "e" "a" "b"}))))))
28 |
29 | (deftest threshold-test
30 |   (let [coll [#{"a" "b" "c"} #{"d" "e" "c"} #{"f" "e" "a" "b"}]
31 |         s (similar coll)]
32 |     ;; NOTE(review): all-in? is vacuously true for an empty expected list, so
33 |     ;; the assertion below passes whatever nearest returns - confirm intent.
34 |     (testing "omit too values with a too low jaccard-index"
35 |       (is (all-in? (nearest s #{"x"} 1 :threshold 0.8) '())))
36 |     #_(testing "omit too values with a too low jaccard-index"
37 |         (is (all-in? (nearest s #{"a" "b"} 3 :threshold 0.4) '(#{"a" "b" "c"} #{"f" "e" "a" "b"}))))
38 |     ))
39 |
--------------------------------------------------------------------------------