├── LICENSE ├── README.md └── EMSimilarity.swift /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Evan Moss 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SwiftSim 2 | Vector similarity utility in Swift. Currently, only arrays of Double are supported as input. In the next update, this will be changed to be more generic. The hope is that the numerical methods can operate on any arbitrary numerical input, and the set-based metrics can operate on any Hashable type. Future versions may have a more functional object Class called SwiftSim. Future versions will also support arbitrary objects to be used and their data paths configurable (i.e., use objects). 
Here, the input vectors will be lazily constructed via closures. Another feature will be to pass in arrays of objects or arrays to find the top or bottom k entries according to some specified element. 3 | 4 | ## Basic Usage 5 | ```swift 6 | let Similarity = EMSimilarity() 7 | let A = [0.0, 1.5, 3.0, 4.5, 6.0] 8 | let B = [2.0, 4.0, 6.0, 8.0, 10.0] 9 | 10 | // compute the cosine similarity of A and B 11 | Similarity.compute(A, B: B) 12 | // 0.984731927834662 13 | ``` 14 | 15 | ## Computation Modes 16 | SwiftSim uses a stack to store its modes. To specify a new mode, you simply push a new computation mode to the stack. You can also pop modes off. 17 | 18 | ```swift 19 | enum EMSimilarityMode { 20 | case Cosine 21 | case Tanimoto 22 | case Ochiai 23 | case JaccardIndex 24 | case JaccardDistance 25 | case Dice 26 | case Hamming 27 | } 28 | ``` 29 | 30 | ```swift 31 | // push a new computation mode to the stack 32 | Similarity.pushSimMode(.Hamming) 33 | // compute the Hamming distance of A and B 34 | Similarity.compute(A, B: B) 35 | // 5.0 36 | // go back to previous mode 37 | Similarity.popSimMode() 38 | Similarity.compute(A, B: B) 39 | // 0.984731927834662 40 | ``` 41 | 42 | ## Vector Size Mismatch Mode 43 | By default, if A and B have different lengths, and the mode is Cosine, Tanimoto, or Hamming, it will trigger a bail and return -1.0. However, you can specify a different mismatch mode the same way you specify a similarity mode. 44 | 45 | ```swift 46 | enum EMVectorSizeMismatchMode { 47 | case Bail 48 | case Truncate 49 | } 50 | 51 | // truncate the larger vector 52 | Similarity.pushMismatchMode(.Truncate) 53 | ``` 54 | 55 | ## Other Bail Conditions 56 | Currently, another condition that will cause -1.0 to be returned is if both input arrays are empty and the similarity mode is not Hamming, or one of them is empty and the current mode is Cosine, Tanimoto, or Ochiai. This is to mitigate a divide by zero error, or to adhere to metric prerequisites. 
57 | -------------------------------------------------------------------------------- /EMSimilarity.swift: -------------------------------------------------------------------------------- 1 | // 2 | // EMSimilarity.swift 3 | // SwiftSim 4 | // 5 | // Created by Evan Moss on 8/1/16. 6 | // Copyright © 2016 Enterprising Technologies LLC. All rights reserved. 7 | // 8 | // The MIT License (MIT) 9 | // 10 | // Copyright (c) 2016 Evan Moss 11 | // 12 | // Permission is hereby granted, free of charge, to any person obtaining a copy 13 | // of this software and associated documentation files (the "Software"), to deal 14 | // in the Software without restriction, including without limitation the rights 15 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | // copies of the Software, and to permit persons to whom the Software is 17 | // furnished to do so, subject to the following conditions: 18 | // 19 | // The above copyright notice and this permission notice shall be included in all 20 | // copies or substantial portions of the Software. 21 | // 22 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 27 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 28 | // SOFTWARE. 
import Foundation

/** Similarity / distance metrics that `EMSimilarity.compute(A:B:)` can evaluate. */
enum EMSimilarityMode {
    case Cosine
    case Tanimoto
    case Ochiai
    case JaccardIndex
    case JaccardDistance
    case Dice
    case Hamming
}

/** Policy applied when the two input vectors differ in length and the metric needs equal lengths. */
enum EMVectorSizeMismatchMode {
    /** Give up and return -1.0. */
    case Bail
    /** Cut the longer vector down to the length of the shorter one. */
    case Truncate
}

/**
 * Vector similarity calculator for arrays of `Double`.
 *
 * The active similarity mode and the active size-mismatch mode are each kept
 * on a stack: push to switch modes, pop to return to the previous one. The
 * bottom entry of each stack (`.Cosine` and `.Bail`) is never popped.
 */
class EMSimilarity {

    // MARK: - Similarity mode stack

    /** Similarity mode stack; the last element is the active mode. */
    private var currentSimMode = [EMSimilarityMode.Cosine]

    /** Makes `mode` the active similarity mode by pushing it onto the stack. */
    func pushSimMode(mode: EMSimilarityMode) {
        currentSimMode.append(mode)
    }

    /** Restores the previous similarity mode; does nothing if only one mode is left. */
    func popSimMode() {
        guard currentSimMode.count > 1 else {
            return
        }
        currentSimMode.removeLast()
    }

    /** Returns the active similarity mode (top of the stack). */
    func getCurrentSimMode() -> EMSimilarityMode? {
        return currentSimMode.last
    }

    // MARK: - Mismatch mode stack

    /** Mismatch mode stack; the last element is the active mode. */
    private var currentMismatchMode = [EMVectorSizeMismatchMode.Bail]

    /** Makes `mode` the active mismatch mode by pushing it onto the stack. */
    func pushMismatchMode(mode: EMVectorSizeMismatchMode) {
        currentMismatchMode.append(mode)
    }

    /** Restores the previous mismatch mode; does nothing if only one mode is left. */
    func popMismatchMode() {
        guard currentMismatchMode.count > 1 else {
            return
        }
        currentMismatchMode.removeLast()
    }

    /** Returns the active mismatch mode (top of the stack). */
    func getCurrentMismatchMode() -> EMVectorSizeMismatchMode? {
        return currentMismatchMode.last
    }

    // MARK: - Numeric helpers

    /**
     * Dot product of `A` and `B`.
     * Pairs elements with `zip`, so empty input yields 0 instead of trapping
     * on an invalid `0...-1` range.
     */
    private func dot(A: [Double], B: [Double]) -> Double {
        var sum: Double = 0
        for (a, b) in zip(A, B) {
            sum += a * b
        }
        return sum
    }

    /** Euclidean length of `A`. */
    private func magnitude(A: [Double]) -> Double {
        var sumOfSquares: Double = 0
        for element in A {
            sumOfSquares += element * element
        }
        return sqrt(sumOfSquares)
    }

    // MARK: - Metrics

    /**
     * Cosine similarity: A·B / (|A| |B|).
     * Returns -1.0 when the denominator is zero (at least one vector has zero
     * magnitude), where the original division produced NaN.
     */
    private func cosineSim(A: [Double], B: [Double]) -> Double {
        let denominator = magnitude(A: A) * magnitude(A: B)
        guard denominator != 0 else {
            return -1
        }
        return dot(A: A, B: B) / denominator
    }

    /**
     * Tanimoto similarity: A·B / (|A|² + |B|² − A·B).
     * Returns -1.0 when the denominator is zero (both vectors are all zeros),
     * where the original division produced NaN.
     */
    private func tanimotoSim(A: [Double], B: [Double]) -> Double {
        let magA = magnitude(A: A)
        let magB = magnitude(A: B)
        let product = dot(A: A, B: B)
        let denominator = magA * magA + magB * magB - product
        guard denominator != 0 else {
            return -1
        }
        return product / denominator
    }

    /** Ochiai similarity over the sets of distinct values: |a ∩ b| / sqrt(|a| |b|). */
    private func ochiaiSim(A: [Double], B: [Double]) -> Double {
        let a = Set(A)
        let b = Set(B)
        return Double(a.intersection(b).count) / sqrt(Double(a.count) * Double(b.count))
    }

    /** Jaccard index over the sets of distinct values: |a ∩ b| / |a ∪ b|. */
    private func jaccardIndex(A: [Double], B: [Double]) -> Double {
        let a = Set(A)
        let b = Set(B)
        return Double(a.intersection(b).count) / Double(a.union(b).count)
    }

    /** Jaccard distance: 1 − Jaccard index. */
    private func jaccardDist(A: [Double], B: [Double]) -> Double {
        return 1.0 - jaccardIndex(A: A, B: B)
    }

    /** Dice coefficient over the sets of distinct values: 2 |a ∩ b| / (|a| + |b|). */
    private func diceCoef(A: [Double], B: [Double]) -> Double {
        let a = Set(A)
        let b = Set(B)
        return 2.0 * Double(a.intersection(b).count) / (Double(a.count) + Double(b.count))
    }

    /** Hamming distance: the number of positions at which `A` and `B` differ. */
    private func hammingDist(A: [Double], B: [Double]) -> Double {
        var differences: Double = 0
        for (a, b) in zip(A, B) where a != b {
            differences += 1
        }
        return differences
    }

    // MARK: - Mode tables

    /** Modes that compare element by element and therefore need equal-length inputs. */
    private let enforceEqualVectorSizes: Set<EMSimilarityMode> = [.Cosine, .Tanimoto, .Hamming]
    /** Modes for which a single empty input already makes the result undefined. */
    private let bailOnEmptyInput: Set<EMSimilarityMode> = [.Cosine, .Tanimoto, .Ochiai]
    /** Modes that are still defined when both inputs are empty. */
    private let allowEmptyInputs: Set<EMSimilarityMode> = [.Hamming]

    // MARK: - Entry point

    /**
     * Computes the active similarity metric for `A` and `B`.
     *
     * Returns -1.0 instead of a metric value when:
     *  - both inputs are empty and the mode is not Hamming;
     *  - either input is empty and the mode is Cosine, Tanimoto or Ochiai;
     *  - the lengths differ, the mode is Cosine, Tanimoto or Hamming, and the
     *    mismatch mode is `.Bail`;
     *  - the mode is Cosine or Tanimoto and the metric's denominator is zero.
     *
     * With mismatch mode `.Truncate`, the longer input is cut to the length of
     * the shorter one before the metric is evaluated.
     *
     * Note: -1.0 is also a legitimate cosine similarity (e.g. `[1]` vs `[-1]`),
     * so the return value alone cannot tell that result from a bail.
     *
     * - Parameters:
     *   - A: first vector.
     *   - B: second vector.
     * - Returns: the metric value, or -1.0 on any bail condition listed above.
     */
    func compute(A: [Double], B: [Double]) -> Double {
        guard let mode = getCurrentSimMode() else {
            return -1
        }

        // Both inputs empty: every metric except Hamming would divide by zero.
        if A.isEmpty && B.isEmpty && !allowEmptyInputs.contains(mode) {
            return -1
        }

        // One empty input is enough to make these metrics divide by zero.
        if bailOnEmptyInput.contains(mode) && (A.isEmpty || B.isEmpty) {
            return -1
        }

        // Element-wise metrics need equal lengths; apply the mismatch policy.
        if enforceEqualVectorSizes.contains(mode) && A.count != B.count {
            guard let mismatchMode = getCurrentMismatchMode() else {
                return -1
            }
            switch mismatchMode {
            case .Bail:
                return -1
            case .Truncate:
                let commonLength = min(A.count, B.count)
                return compute(A: Array(A.prefix(commonLength)),
                               B: Array(B.prefix(commonLength)))
            }
        }

        switch mode {
        case .Cosine:
            return cosineSim(A: A, B: B)
        case .Tanimoto:
            return tanimotoSim(A: A, B: B)
        case .Ochiai:
            return ochiaiSim(A: A, B: B)
        case .JaccardIndex:
            return jaccardIndex(A: A, B: B)
        case .JaccardDistance:
            return jaccardDist(A: A, B: B)
        case .Dice:
            return diceCoef(A: A, B: B)
        case .Hamming:
            return hammingDist(A: A, B: B)
        }
    }
}