├── .gitignore ├── LICENSE ├── README.md ├── Word2Vec.scala └── vectors.bin /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.class 3 | *.log 4 | .idea 5 | .idea/* 6 | *.iml 7 | out 8 | 9 | # Scala-IDE specific 10 | .scala_dependencies 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 
32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 
62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 
85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 
111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 
145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. 
We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [yyyy] [name of copyright owner] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | word2vec-scala 2 | ============== 3 | 4 | 5 | This is a Scala implementation of the [word2vec](https://code.google.com/p/word2vec/) 6 | toolkit's model representation. 7 | 8 | This Scala interface allows the user to access the vector representation output 9 | by the word2vec toolkit. It also implements example operations that can be done 10 | on the vectors (e.g., word-distance, word-analogy). 11 | 12 | Note that it does **NOT** implement the actual training algorithms. You will 13 | still need to download and compile the original word2vec tool if you wish to 14 | train new models. 15 | 16 | 17 | ## Includes 18 | 19 | The included model (vectors.bin) was trained on the [text8](http://mattmahoney.net/dc/text8.zip) corpus, which contains 20 | the first 100 MB of the "clean" English Wikipedia corpus. 
The following training parameters 21 | were used: 22 | 23 | ```bash 24 | ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 25 | ``` 26 | 27 | 28 | ## Usage 29 | 30 | #### Load model 31 | ```scala 32 | val model = new Word2Vec() 33 | model.load("vectors.bin") 34 | ``` 35 | 36 | #### Distance - Find N best matches 37 | ```scala 38 | val results = model.distance(List("france"), N = 10) 39 | model.pprint(results) 40 | ``` 41 | ``` 42 | Word Cosine distance 43 | ------------------------------------------------------------------------ 44 | belgium 0.706633 45 | spain 0.672767 46 | netherlands 0.668178 47 | italy 0.616545 48 | switzerland 0.595572 49 | luxembourg 0.591839 50 | portugal 0.564891 51 | germany 0.549196 52 | russia 0.543569 53 | hungary 0.519036 54 | ``` 55 | 56 | ```scala 57 | model.pprint( model.distance(List("france", "usa")) ) 58 | ``` 59 | ``` 60 | Word Cosine distance 61 | ------------------------------------------------------------------------ 62 | netherlands 0.691459 63 | switzerland 0.672526 64 | belgium 0.656425 65 | canada 0.641793 66 | russia 0.612469 67 | . . 68 | . . 69 | . . 70 | croatia 0.451900 71 | vantaa 0.450767 72 | roissy 0.448256 73 | norway 0.447392 74 | cuba 0.446168 75 | ``` 76 | 77 | ```scala 78 | model.pprint( model.distance(List("france", "usa", "usa")) ) 79 | ``` 80 | ``` 81 | Word Cosine distance 82 | ------------------------------------------------------------------------ 83 | canada 0.631119 84 | switzerland 0.626366 85 | netherlands 0.621275 86 | russia 0.569951 87 | belgium 0.560368 88 | . . 89 | . . 90 | . . 91 | osaka 0.418143 92 | eas 0.417097 93 | antholz 0.415458 94 | fukuoka 0.414105 95 | zealand 0.413075 96 | ``` 97 | 98 | #### Analogy - King is to Queen, as Man is to ??? 
99 | ```scala 100 | model.pprint( model.analogy("king", "queen", "man", N = 10) ) 101 | ``` 102 | ``` 103 | Word Cosine distance 104 | ------------------------------------------------------------------------ 105 | woman 0.547376 106 | girl 0.509787 107 | baby 0.473137 108 | spider 0.450589 109 | love 0.433065 110 | prostitute 0.433034 111 | loves 0.422127 112 | beauty 0.421060 113 | bride 0.413417 114 | lady 0.406856 115 | ``` 116 | 117 | #### Ranking - Rank a set of words by their respective distance to search term 118 | ```scala 119 | model.pprint( model.rank("apple", Set("orange", "soda", "lettuce")) ) 120 | ``` 121 | ``` 122 | Word Cosine distance 123 | ------------------------------------------------------------------------ 124 | orange 0.203808 125 | lettuce 0.132007 126 | soda 0.075649 127 | ``` 128 | 129 | 130 | ## Compatibility 131 | 132 | - **[09/2013]** The code was tested to work with models trained using revision 133 | [r33](http://word2vec.googlecode.com/svn/trunk/?p=33) of the word2vec toolkit. 134 | It should also work with future revisions, assuming that the output format does 135 | not change. 136 | -------------------------------------------------------------------------------- /Word2Vec.scala: -------------------------------------------------------------------------------- 1 | // Copyright 2013 trananh 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
import java.io._
import scala.Array
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer


/** A simple binary file reader.
  * @constructor Create a binary file reader.
  * @param file The binary file to be read.
  *
  * @author trananh
  */
class VecBinaryReader(val file: File) {

  /** Overloaded constructor */
  def this(filename: String) = this(new File(filename))

  /** ASCII values for common delimiter characters */
  private val SPACE = 32
  private val LF = 10

  /** Open input streams: buffered for throughput, data-stream for typed reads. */
  private val fis = new FileInputStream(file)
  private val bis = new BufferedInputStream(fis)
  private val dis = new DataInputStream(bis)

  /** Close the stream (closing the outermost stream would suffice, but all three
    * are closed explicitly for safety). */
  def close(): Unit = { dis.close(); bis.close(); fis.close() }

  /** Read the next byte.
    * @return The next byte from the file.
    */
  def read(): Byte = dis.readByte()

  /** Read the next token as a string, using the provided delimiters as breaking points.
    * @param delimiters ASCII code of delimiter characters (default to SPACE and LINE-FEED).
    * @return String representation of the next token.
    */
  def readToken(delimiters: Set[Int] = Set(SPACE, LF)): String = {
    val bytes = new ArrayBuffer[Byte]()
    var byte = dis.readByte()
    while (!delimiters.contains(byte)) {
      bytes.append(byte)
      byte = dis.readByte()
    }
    // FIX: the original also built a StringBuilder here only to discard it.
    // NOTE: decodes with the platform default charset, matching the original behavior.
    new String(bytes.toArray[Byte])
  }

  /** Read next 4 bytes as a floating-point number.
    * @return The floating-point value of the next 4 bytes.
    */
  def readFloat(): Float = {
    // word2vec writes floats in little-endian order while DataInputStream reads
    // big-endian, so reverse the byte order for endian-compatibility.
    java.lang.Float.intBitsToFloat(java.lang.Integer.reverseBytes(dis.readInt()))
  }

}


/** A Scala port of the word2vec model.  This interface allows the user to access the vector
  * representations output by the word2vec tool, as well as perform some common operations on
  * those vectors.  It does NOT implement the actual continuous bag-of-words and skip-gram
  * architectures for computing the vectors.
  *
  * More information on word2vec can be found here: https://code.google.com/p/word2vec/
  *
  * Example usage:
  * {{{
  * val model = new Word2Vec()
  * model.load("vectors.bin")
  * val results = model.distance(List("france"), N = 10)
  *
  * model.pprint(results)
  * }}}
  *
  * @constructor Create a word2vec model.
  *
  * @author trananh
  */
class Word2Vec {

  /** Map of words and their associated (optionally normalized) vector representations */
  private val vocab = new mutable.HashMap[String, Array[Float]]()

  /** Number of words actually loaded into the vocab */
  private var numWords = 0

  /** Number of floating-point values associated with each word (i.e., length of the vectors) */
  private var vecSize = 0

  /** Load data from a binary file.
    * @param filename Path to file containing word projections in the BINARY FORMAT.
    * @param limit Maximum number of words to load from file (a.k.a. max vocab size).
    * @param normalize Normalize the loaded vectors if true (default to true).
    * @throws FileNotFoundException If the binary vector file does not exist.
    */
  def load(filename: String, limit: Integer = Int.MaxValue, normalize: Boolean = true): Unit = {
    // Check edge case
    val file = new File(filename)
    if (!file.exists()) {
      throw new FileNotFoundException("Binary vector file not found <" + file.toString + ">")
    }

    // Create new reader to read data
    val reader = new VecBinaryReader(file)
    try {
      // Read header info: "<numWords> <vecSize>\n"
      val fileWords = Integer.parseInt(reader.readToken())
      vecSize = Integer.parseInt(reader.readToken())
      println("\nFile contains " + fileWords + " words with vector size " + vecSize)

      // Read the vocab words and their associated vector representations
      val count = math.min(fileWords, limit.intValue)
      val vector = new Array[Float](vecSize)  // scratch buffer, reused for every word
      for (_ <- 0 until count) {
        // Read the word
        val word = reader.readToken()

        // Read the vector representation (each vector contains vecSize number of floats)
        for (i <- vector.indices) vector(i) = reader.readFloat()

        // Store the (optionally normalized) vector representation, keyed by the word.
        // FIX: guard against a zero-magnitude vector, which previously produced NaNs.
        val mag = if (normalize) magnitude(vector).toFloat else 1f
        val normFactor = if (mag != 0f) mag else 1f
        vocab.put(word, vector.map(_ / normFactor))

        // Eat up the next delimiter character
        reader.read()
      }

      // FIX: track the number of words actually loaded (the original kept the file's
      // header count even when `limit` truncated the load).
      numWords = vocab.size
      println("Loaded " + count + " words.\n")
    } finally {
      // FIX: always release the underlying file handle, even if parsing fails.
      reader.close()
    }
  }

  /** Return the number of words in the vocab.
    * @return Number of words loaded into the vocab.
    */
  def wordsCount: Int = numWords

  /** Size of the vectors.
    * @return Size of the vectors.
    */
  def vectorSize: Int = vecSize

  /** Clear internal data. */
  def clear(): Unit = {
    vocab.clear()
    numWords = 0
    vecSize = 0
  }

  /** Check if the word is present in the vocab map.
    * @param word Word to be checked.
    * @return True if the word is in the vocab map.
    */
  def contains(word: String): Boolean = vocab.contains(word)

  /** Get the vector representation for the word.
    * @param word Word to retrieve vector for.
    * @return The vector representation of the word, or an empty array if the word is unknown.
    */
  def vector(word: String): Array[Float] = vocab.getOrElse(word, Array.empty[Float])

  /** Compute the Euclidean distance between two vectors.
    * @param vec1 The first vector.
    * @param vec2 The other vector.
    * @return The Euclidean distance between the two vectors.
    */
  def euclidean(vec1: Array[Float], vec2: Array[Float]): Double = {
    assert(vec1.length == vec2.length, "Uneven vectors!")
    var sum = 0.0
    // FIX: plain multiplication instead of math.pow(x, 2) in the inner loop.
    for (i <- vec1.indices) {
      val d = vec1(i) - vec2(i)
      sum += d * d
    }
    math.sqrt(sum)
  }

  /** Compute the Euclidean distance between the vector representations of the words.
    * @param word1 The first word.
    * @param word2 The other word.
    * @return The Euclidean distance between the vector representations of the words.
    */
  def euclidean(word1: String, word2: String): Double = {
    assert(contains(word1) && contains(word2), "Out of dictionary word! " + word1 + " or " + word2)
    euclidean(vocab(word1), vocab(word2))
  }

  /** Compute the cosine similarity score between two vectors.
    * @param vec1 The first vector.
    * @param vec2 The other vector.
    * @return The cosine similarity score of the two vectors.
    */
  def cosine(vec1: Array[Float], vec2: Array[Float]): Double = {
    assert(vec1.length == vec2.length, "Uneven vectors!")
    var dot, sum1, sum2 = 0.0
    for (i <- vec1.indices) {
      dot += (vec1(i) * vec2(i))
      sum1 += (vec1(i) * vec1(i))
      sum2 += (vec2(i) * vec2(i))
    }
    dot / (math.sqrt(sum1) * math.sqrt(sum2))
  }

  /** Compute the cosine similarity score between the vector representations of the words.
    * @param word1 The first word.
    * @param word2 The other word.
    * @return The cosine similarity score between the vector representations of the words.
    */
  def cosine(word1: String, word2: String): Double = {
    assert(contains(word1) && contains(word2), "Out of dictionary word! " + word1 + " or " + word2)
    cosine(vocab(word1), vocab(word2))
  }

  /** Compute the magnitude of the vector.
    * @param vec The vector.
    * @return The magnitude of the vector.
    */
  def magnitude(vec: Array[Float]): Double = {
    math.sqrt(vec.foldLeft(0.0) { (sum, x) => sum + (x * x) })
  }

  /** Normalize the vector.
    * @param vec The vector.
    * @return A normalized copy of the vector.
    */
  def normalize(vec: Array[Float]): Array[Float] = {
    val mag = magnitude(vec).toFloat
    // FIX: guard against the zero vector (the original produced an array of NaNs here).
    if (mag == 0f) vec.clone() else vec.map(_ / mag)
  }

  /** Find the vector representation for the given list of word(s) by aggregating (summing) the
    * vector for each word.
    * @param input The input word(s).
    * @return The sum vector (aggregated from the input vectors).
    */
  def sumVector(input: List[String]): Array[Float] = {
    // Every input word must be in the vocab before we aggregate.
    input.foreach(w => assert(contains(w), "Out of dictionary word! " + w))
    val vector = new Array[Float](vecSize)
    input.foreach { w =>
      val v = vocab(w)
      for (j <- vector.indices) vector(j) += v(j)
    }
    vector
  }

  /** Find N closest terms in the vocab to the given vector, using only words from the in-set (if defined)
    * and excluding all words from the out-set (if non-empty). Although you can, it doesn't make much
    * sense to define both in and out sets.
    * @param vector The vector.
    * @param inSet Set of words to consider. Specify None to use all words in the vocab (default behavior).
    * @param outSet Set of words to exclude (default to empty).
    * @param N The maximum number of terms to return (default to 40).
    * @return The N closest terms in the vocab to the given vector and their associated cosine similarity scores.
    */
  def nearestNeighbors(vector: Array[Float], inSet: Option[Set[String]] = None,
                       outSet: Set[String] = Set[String](), N: Integer = 40)
  : List[(String, Float)] = {
    // For performance efficiency, we maintain the top/closest terms using a priority queue.
    // Note: We invert the distance here because a priority queue will dequeue the highest
    // priority element, but we would like it to dequeue the lowest scoring element instead.
    val top = new mutable.PriorityQueue[(String, Float)]()(Ordering.by(-_._2))

    // Iterate over each candidate in the vocab and compute its cosine score to the input.
    val candidates = inSet match {
      case Some(keep) => vocab.filterKeys(k => keep.contains(k)).iterator
      case None       => vocab.iterator
    }
    candidates.foreach { case (word, vec) =>
      // Skip tokens in the out set
      if (!outSet.contains(word)) {
        val dist = cosine(vector, vec).toFloat
        if (top.size < N || top.head._2 < dist) {
          top.enqueue((word, dist))
          if (top.length > N) {
            // The queue exceeded N elements: dequeue the highest priority element
            // (which will be the element with the lowest cosine score).
            top.dequeue()
          }
        }
      }
    }

    // Return the top N results as a list sorted by descending score.
    assert(top.length <= N)
    top.toList.sortWith(_._2 > _._2)
  }

  /** Find the N closest terms in the vocab to the input word(s).
    * @param input The input word(s).
    * @param N The maximum number of terms to return (default to 40).
    * @return The N closest terms in the vocab to the input word(s) and their associated cosine similarity scores.
    */
  def distance(input: List[String], N: Integer = 40): List[(String, Float)] = {
    // Edge cases: empty query, or a query term missing from the vocab.
    // FIX: avoid the original's nonlocal `return` inside a foreach lambda.
    if (input.isEmpty) List[(String, Float)]()
    else input.find(w => !contains(w)) match {
      case Some(unknown) =>
        println("Out of dictionary word! " + unknown)
        List[(String, Float)]()
      case None =>
        // Aggregate (sum) the vectors of all input words, then search around the result.
        val vector = sumVector(input)
        nearestNeighbors(normalize(vector), outSet = input.toSet, N = N)
    }
  }

  /** Find the N closest terms in the vocab to the analogy:
    * - [word1] is to [word2] as [word3] is to ???
    *
    * The algorithm operates as follow:
    * - Find a vector approximation of the missing word = vec([word2]) - vec([word1]) + vec([word3]).
    * - Return words closest to the approximated vector.
    *
    * @param word1 First word in the analogy [word1] is to [word2] as [word3] is to ???.
    * @param word2 Second word in the analogy [word1] is to [word2] as [word3] is to ???
    * @param word3 Third word in the analogy [word1] is to [word2] as [word3] is to ???.
    * @param N The maximum number of terms to return (default to 40).
    *
    * @return The N closest terms in the vocab to the analogy and their associated cosine similarity scores.
    */
  def analogy(word1: String, word2: String, word3: String, N: Integer = 40): List[(String, Float)] = {
    // Check for edge cases
    if (!contains(word1) || !contains(word2) || !contains(word3)) {
      println("Out of dictionary word! " + Array(word1, word2, word3).mkString(" or "))
      List[(String, Float)]()
    } else {
      // Vector approximation for the missing analogy term.
      val (v1, v2, v3) = (vocab(word1), vocab(word2), vocab(word3))
      val vector = new Array[Float](vecSize)
      for (j <- vector.indices)
        vector(j) = v2(j) - v1(j) + v3(j)

      nearestNeighbors(normalize(vector), outSet = Set(word1, word2, word3), N = N)
    }
  }

  /** Rank a set of words by their respective distance to some central term.
    * @param word The central word.
    * @param set Set of words to rank.
    * @return Ordered list of words and their associated scores.
    */
  def rank(word: String, set: Set[String]): List[(String, Float)] = {
    // Edge cases: empty set, or any involved word missing from the vocab.
    // FIX: avoid the original's nonlocal `return` inside a foreach lambda.
    if (set.isEmpty) List[(String, Float)]()
    else (set + word).find(w => !contains(w)) match {
      case Some(unknown) =>
        println("Out of dictionary word! " + unknown)
        List[(String, Float)]()
      case None =>
        nearestNeighbors(vocab(word), inSet = Option(set), N = set.size)
    }
  }

  /** Pretty print the list of words and their associated scores.
    * @param words List of (word, score) pairs to be printed.
    */
  def pprint(words: List[(String, Float)]): Unit = {
    println("\n%50s".format("Word") + (" " * 7) + "Cosine distance\n" + ("-" * 72))
    println(words.map(s => "%50s".format(s._1) + (" " * 7) + "%15f".format(s._2)).mkString("\n"))
  }

}


/** ********************************************************************************
  * Demo of the Scala ported word2vec model.
  * ********************************************************************************
  */
object RunWord2Vec {

  /** Demo. */
  def main(args: Array[String]): Unit = {
    // Load word2vec model from binary file.
    val model = new Word2Vec()
    model.load("../word2vec-scala/vectors.bin")

    // distance: Find N closest words
    model.pprint(model.distance(List("france"), N = 10))
    model.pprint(model.distance(List("france", "usa")))
    model.pprint(model.distance(List("france", "usa", "usa")))

    // analogy: "king" is to "queen", as "man" is to ?
    model.pprint(model.analogy("king", "queen", "man", N = 10))

    // rank: Rank a set of words by their respective distance to the central term
    model.pprint(model.rank("apple", Set("orange", "soda", "lettuce")))
  }

}

// Included model (vectors.bin), trained on the text8 corpus:
// https://raw.githubusercontent.com/trananh/word2vec-scala/998ee706d61cb6b05cba3ec2c8e38b064f7c5049/vectors.bin