├── .github └── workflows │ └── ruby.yml ├── Gemfile ├── Gemfile.lock ├── LICENSE ├── README.markdown ├── Rakefile ├── bin ├── bayes.rb └── summarize.rb ├── classifier.gemspec ├── cloving.json ├── install.rb ├── lib ├── classifier.rb └── classifier │ ├── bayes.rb │ ├── extensions │ ├── string.rb │ ├── vector.rb │ ├── vector_serialize.rb │ └── word_hash.rb │ ├── lsi.rb │ └── lsi │ ├── content_node.rb │ ├── summary.rb │ └── word_list.rb └── test ├── bayes └── bayesian_test.rb ├── extensions └── word_hash_test.rb ├── lsi └── lsi_test.rb └── test_helper.rb /.github/workflows/ruby.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake 6 | # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby 7 | 8 | name: Ruby 9 | 10 | on: 11 | push: 12 | branches: [ "master" ] 13 | pull_request: 14 | branches: [ "master" ] 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | test: 21 | 22 | runs-on: ubuntu-latest 23 | strategy: 24 | matrix: 25 | ruby-version: ['2.7', 'head'] 26 | 27 | steps: 28 | - uses: actions/checkout@v4 29 | - name: Set up Ruby 30 | # To automatically get bug fixes and new Ruby versions for ruby/setup-ruby, 31 | # change this to (see https://github.com/ruby/setup-ruby#versioning): 32 | # uses: ruby/setup-ruby@v1 33 | uses: ruby/setup-ruby@55283cc23133118229fd3f97f9336ee23a179fcf # v1.146.0 34 | with: 35 | ruby-version: ${{ matrix.ruby-version }} 36 | bundler-cache: true # runs 'bundle install' and caches installed gems automatically 37 | - name: Run tests 38 | run: bundle exec rake test 39 | -------------------------------------------------------------------------------- 
/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | gemspec 3 | 4 | gem 'fast-stemmer' 5 | gem 'matrix' 6 | gem 'mutex_m' 7 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | classifier (1.4.4) 5 | fast-stemmer (~> 1.0) 6 | mutex_m (~> 0.2) 7 | rake 8 | 9 | GEM 10 | remote: https://rubygems.org/ 11 | specs: 12 | fast-stemmer (1.0.2) 13 | matrix (0.4.2) 14 | minitest (5.18.1) 15 | mutex_m (0.2.0) 16 | psych (5.1.2) 17 | stringio 18 | rake (13.0.6) 19 | rdoc (6.5.1.1) 20 | psych (>= 4.0.0) 21 | stringio (3.1.0) 22 | 23 | PLATFORMS 24 | arm64-darwin-22 25 | arm64-darwin-23 26 | x86_64-linux 27 | 28 | DEPENDENCIES 29 | classifier! 30 | fast-stemmer 31 | matrix 32 | minitest 33 | mutex_m 34 | rdoc 35 | 36 | BUNDLED WITH 37 | 2.4.17 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 2.1, February 1999 3 | 4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 5 | 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | [This is the first released version of the Lesser GPL. It also counts 10 | as the successor of the GNU Library Public License, version 2, hence 11 | the version number 2.1.] 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 
19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price. Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 
51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. 
These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. 
This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. 
You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. 
Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) 
Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 
251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. 
Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 
317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. 
Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. 
For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. 
The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. For software which is 429 | copyrighted by 430 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | ## Welcome to Classifier 2 | 3 | Classifier is a general module to allow Bayesian and other types of classifications. 
4 | 5 | ## Download 6 | 7 | * https://github.com/cardmagic/classifier 8 | * gem install classifier 9 | * git clone https://github.com/cardmagic/classifier.git 10 | 11 | ## Dependencies 12 | 13 | If you install Classifier from source, you'll need to install Roman Shterenzon's fast-stemmer gem with RubyGems as follows: 14 | 15 | gem install fast-stemmer 16 | 17 | If you would like to speed up LSI classification by at least 10x, please install the following libraries: 18 | * GNU GSL: http://www.gnu.org/software/gsl 19 | * rb-gsl: https://github.com/SciRuby/rb-gsl 20 | 21 | Notice that LSI will work without these libraries, but as soon as they are installed, Classifier will make use of them. No configuration changes are needed; we like to keep things ridiculously easy for you. 22 | 23 | ## Bayes 24 | 25 | A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast, and have modest memory requirements. 26 | 27 | ### Usage 28 | 29 | require 'classifier' 30 | b = Classifier::Bayes.new 'Interesting', 'Uninteresting' 31 | b.train_interesting "here are some good words. I hope you love them" 32 | b.train_uninteresting "here are some bad words, I hate you" 33 | b.classify "I hate bad words and you" # returns 'Uninteresting' 34 | 35 | require 'madeleine' 36 | m = SnapshotMadeleine.new("bayes_data") { 37 | Classifier::Bayes.new 'Interesting', 'Uninteresting' 38 | } 39 | m.system.train_interesting "here are some good words. I hope you love them" 40 | m.system.train_uninteresting "here are some bad words, I hate you" 41 | m.take_snapshot 42 | m.system.classify "I love you" # returns 'Interesting' 43 | 44 | Using Madeleine, your application can persist the learned data over time. 45 | 46 | ### Bayesian Classification 47 | 48 | * http://www.process.com/precisemail/bayesian_filtering.htm 49 | * http://en.wikipedia.org/wiki/Bayesian_filtering 50 | * http://www.paulgraham.com/spam.html 51 | 52 | ## LSI 53 | 54 | A Latent Semantic Indexer by David Fayram.
Latent Semantic Indexing engines 55 | are not as fast or as small as Bayesian classifiers, but are more flexible, providing 56 | fast search and clustering detection as well as semantic analysis of the text that 57 | theoretically simulates human learning. 58 | 59 | ### Usage 60 | 61 | require 'classifier' 62 | lsi = Classifier::LSI.new 63 | strings = [ ["This text deals with dogs. Dogs.", :dog], 64 | ["This text involves dogs too. Dogs! ", :dog], 65 | ["This text revolves around cats. Cats.", :cat], 66 | ["This text also involves cats. Cats!", :cat], 67 | ["This text involves birds. Birds.",:bird ]] 68 | strings.each {|x| lsi.add_item x.first, x.last} 69 | 70 | lsi.search("dog", 3) 71 | # returns => ["This text deals with dogs. Dogs.", "This text involves dogs too. Dogs! ", 72 | # "This text also involves cats. Cats!"] 73 | 74 | lsi.find_related(strings[2], 2) 75 | # returns => ["This text revolves around cats. Cats.", "This text also involves cats. Cats!"] 76 | 77 | lsi.classify "This text is also about dogs!" 78 | # returns => :dog 79 | 80 | lsi.classify_with_confidence "This text is also about dogs!" 81 | # returns => [:dog, 1.0] 82 | 83 | Please see the Classifier::LSI documentation for more information. It is possible to index, search and classify 84 | with more than just simple strings. 85 | 86 | ### Latent Semantic Indexing 87 | 88 | * http://www.c2.com/cgi/wiki?LatentSemanticIndexing 89 | * http://www.chadfowler.com/index.cgi/Computing/LatentSemanticIndexing.rdoc 90 | * http://en.wikipedia.org/wiki/Latent_semantic_analysis 91 | 92 | ## Authors 93 | 94 | * Lucas Carlson (lucas@rufy.com) 95 | * David Fayram II (dfayram@gmail.com) 96 | * Cameron McBride (cameron.mcbride@gmail.com) 97 | * Ivan Acosta-Rubio (ivan@softwarecriollo.com) 98 | 99 | This library is released under the terms of the GNU LGPL. See LICENSE for more details. 
require 'rake'
require 'rake/testtask'
require 'rdoc/task'

# Running `rake` without a task name runs the test suite.
desc 'Default Task'
task default: :test

# Unit tests: every test/<group>/*_test.rb file, run with lib/ on the load path.
desc 'Run all unit tests'
Rake::TestTask.new('test') do |test_task|
  test_task.libs.push('lib')
  test_task.pattern = 'test/*/*_test.rb'
  test_task.verbose = true
end

# Interactive console with the library preloaded; useful when working on tests.
desc 'Generate a test console'
task :console do
  verbose(false) do
    sh "irb -I lib/ -r 'classifier'"
  end
end

# Generate the RDoc documentation for the README and everything under lib/
# into the html/ directory.
desc 'Create documentation'
Rake::RDocTask.new('doc') do |rdoc_task|
  rdoc_task.title = 'Ruby Classifier - Bayesian and LSI classification library'
  rdoc_task.rdoc_dir = 'html'
  rdoc_task.rdoc_files.include('README.markdown', 'lib/**/*.rb')
end

# NOTE(review): 'code_statistics' is loaded lazily and is not otherwise
# referenced in this file -- confirm it is still available.
desc 'Report code statistics (KLOCs, etc) from the application'
task :stats do
  require 'code_statistics'
  library_pair = %w[Library lib]
  test_pair = %w[Units test]
  CodeStatistics.new(library_pair, test_pair).to_s
end

# NOTE(review): nothing required in this file defines Rake::RubyForgePublisher;
# confirm this task still works before relying on it.
desc 'Publish new documentation'
task :publish do
  `ssh rufy update-classifier-doc`
  publisher = Rake::RubyForgePublisher.new('classifier', 'cardmagic')
  publisher.upload
end
#!/usr/bin/env ruby

# Prints an LSI-based summary of a document to standard output.
#
# Usage: summarize.rb SOURCE [SENTENCES]
#   SOURCE    - a local file path or an http(s)/ftp URL
#   SENTENCES - number of sentences to keep; values below 1 fall back to 10

# A bare `rescue` cannot catch LoadError, so the old begin/rescue fallback
# around these requires never ran; plain requires behave the same way.
require 'rubygems'
require 'classifier'
require 'open-uri'

source = ARGV.first
if source.nil?
  warn 'Usage: summarize.rb SOURCE [SENTENCES]'
  exit(1)
end

num = ARGV[1].to_i
num = 10 if num < 1

# Kernel#open stopped opening URLs on Ruby 3.0 and also executes "|command"
# strings, so dispatch explicitly: URI.open for URLs, File.read for paths.
text = if source.match?(%r{\A(?:https?|ftp)://}i)
         URI.open(source, &:read)
       else
         File.read(source)
       end

# Strip markup tags and collapse whitespace before summarizing.
puts text.gsub(/<[^>]+>/, '').gsub(/[\s]+/, ' ').summary(num)
# Copies the Classifier library from lib/ into Ruby's site library directory.
# This was adapted from rdoc's install.rb by way of Log4r.

require 'rbconfig'
require 'fileutils'

# RbConfig::CONFIG replaces the top-level Config module this script used to
# include; FileUtils replaces the 'ftools' extensions to File.
config = RbConfig::CONFIG

$sitedir = config['sitelibdir']
unless $sitedir
  version = "#{config['MAJOR']}.#{config['MINOR']}"
  $libdir = File.join(config['libdir'], 'ruby', version)
  $sitedir = $:.find { |path| path =~ /site_ruby/ }
  if !$sitedir
    $sitedir = File.join($libdir, 'site_ruby')
  elsif !$sitedir.include?(version)
    # Plain substring test: matching a String against a String with !~ raises.
    $sitedir = File.join($sitedir, version)
  end
end

# Every target directory must exist before files are copied. The list used to
# be assigned three times in a row, so only classifier/lsi was ever created.
makedirs = %w[classifier classifier/extensions classifier/lsi]
makedirs.each { |dir| FileUtils.mkdir_p(File.join($sitedir, *dir.split('/'))) }

Dir.chdir('lib')
begin
  require 'rubygems'
  require 'rake'
rescue LoadError
  puts
  puts "Please install Gem and Rake from http://rubyforge.org/projects/rubygems and http://rubyforge.org/projects/rake"
  puts
  exit(-1)
end

# The glob also matches the directories created above; only regular files can
# be installed.
files = FileList['**/*'].select { |path| File.file?(path) }

files.each do |path|
  FileUtils.install(path, File.join($sitedir, *path.split('/')), mode: 0o644, verbose: true)
end

# The library itself requires 'fast_stemmer', so that is the dependency to
# check for here.
begin
  require 'fast_stemmer'
rescue LoadError
  puts
  puts "Please install fast-stemmer via 'gem install fast-stemmer'"
  puts
end
all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 19 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 20 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 21 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | #++ 23 | # Author:: Lucas Carlson (mailto:lucas@rufy.com) 24 | # Copyright:: Copyright (c) 2005 Lucas Carlson 25 | # License:: LGPL 26 | 27 | require 'rubygems' 28 | require 'classifier/extensions/string' 29 | require 'classifier/extensions/vector' 30 | require 'classifier/bayes' 31 | require 'classifier/lsi' 32 | -------------------------------------------------------------------------------- /lib/classifier/bayes.rb: -------------------------------------------------------------------------------- 1 | # Author:: Lucas Carlson (mailto:lucas@rufy.com) 2 | # Copyright:: Copyright (c) 2005 Lucas Carlson 3 | # License:: LGPL 4 | 5 | module Classifier 6 | class Bayes 7 | # The class can be created with one or more categories, each of which will be 8 | # initialized and given a training method. 
# The class can be created with one or more categories, each of which will be
# initialized and given a training method. E.g.,
#      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
def initialize(*categories)
  @categories = {}
  categories.each { |name| @categories[name.prepare_category_name] = {} }
  @total_words = 0
  @category_counts = Hash.new(0)
  @category_word_count = Hash.new(0)
end

#
# Provides a general training method for all categories specified in Bayes#new
# For example:
#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
#     b.train :this, "This text"
#     b.train "that", "That text"
#     b.train "The other", "The other text"
#
# Raises StandardError ("No such category: ...") when +category+ is unknown.
def train(category, text)
  category = category.prepare_category_name
  # Validate before touching any counter: otherwise an unknown category bumps
  # @category_counts and only then fails on the missing word table, leaving a
  # phantom category behind that distorts later priors.
  raise StandardError, "No such category: #{category}" unless @categories.key?(category)

  words = @categories[category]
  @category_counts[category] += 1
  text.word_hash.each do |word, count|
    words[word] = (words[word] || 0) + count
    @total_words += count
    @category_word_count[category] += count
  end
end

#
# Provides a untraining method for all categories specified in Bayes#new
# Be very careful with this method.
#
# For example:
#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
#     b.train :this, "This text"
#     b.untrain :this, "This text"
#
# Raises StandardError ("No such category: ...") when +category+ is unknown.
def untrain(category, text)
  category = category.prepare_category_name
  raise StandardError, "No such category: #{category}" unless @categories.key?(category)

  words = @categories[category]
  @category_counts[category] -= 1
  text.word_hash.each do |word, count|
    next unless @total_words >= 0

    original_count = words[word] || 0
    remaining = original_count - count
    if remaining <= 0
      # Never remove more occurrences than were actually recorded.
      words.delete(word)
      count = original_count
    else
      words[word] = remaining
    end
    @category_word_count[category] -= count if @category_word_count[category] >= count
    @total_words -= count
  end
end
#
# Returns the scores in each category for the provided +text+. E.g.,
#    b.classifications "I hate bad words and you"
#    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
# The largest of these scores (the one closest to 0) is the one picked out by #classify
#
# Each score is the sum of log(word count / words in category) over the
# distinct words of +text+ (0.1 stands in for unseen words), plus the log of
# the category's share of all training calls.
def classifications(text)
  words = text.word_hash.keys
  training_count = @category_counts.values.sum.to_f
  # With no training at all, avoid dividing the priors by zero.
  training_count = 1.0 if training_count.zero?

  @categories.each_with_object({}) do |(category, category_words), scores|
    # An untrained category has a word count of 0. Dividing by that yields
    # Infinity, which would make the empty category win every classification,
    # so fall back to 1 as the original `|| 1` intended.
    word_total = @category_word_count[category].to_f
    word_total = 1.0 unless word_total.positive?

    score = 0
    words.each do |word|
      occurrences = category_words.key?(word) ? category_words[word] : 0.1
      score += Math.log(occurrences / word_total)
    end

    # now add prior probability for the category
    prior = @category_counts.key?(category) ? @category_counts[category] : 0.1
    score += Math.log(prior / training_count)

    scores[category.to_s] = score
  end
end
#
# Returns the classification of the provided +text+, which is one of the
# categories given in the initializer. E.g.,
#    b.classify "I hate bad words and you"
#    =>  'Uninteresting'
def classify(text)
  classifications(text).max_by { |_category, score| score }.first
end

#
# Provides training and untraining methods for the categories specified in Bayes#new
# For example:
#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
#     b.train_this "This text"
#     b.train_that "That text"
#     b.untrain_that "That text"
#     b.train_the_other "The other text"
#
# Only names of the exact form train_<category> / untrain_<category> are
# handled; every argument is trained (or untrained) in turn. A matching name
# whose category is unknown raises StandardError; any other name is passed on
# to the default handler instead of being treated as a category name.
def method_missing(name, *args)
  match = /\A(un)?train_(\w+)\z/.match(name.to_s)
  return super unless match

  category = match[2].prepare_category_name
  raise StandardError, "No such category: #{category}" unless @categories.key?(category)

  args.each do |text|
    if match[1]
      untrain(category, text)
    else
      train(category, text)
    end
  end
end

# Keeps respond_to? in step with method_missing: true for train_<category>
# and untrain_<category> when the category exists.
def respond_to_missing?(name, include_private = false)
  match = /\A(?:un)?train_(\w+)\z/.match(name.to_s)
  return true if match && @categories.key?(match[1].prepare_category_name)

  super
end

#
# Provides a list of category names
# For example:
#     b.categories
#     =>   ['This', 'That', 'the_other']
def categories # :nodoc:
  @categories.keys.map(&:to_s)
end

#
# Allows you to add categories to the classifier.
# For example:
#     b.add_category "Not spam"
#
# WARNING: Adding categories to a trained classifier will
# result in an undertrained category that will tend to match
# more criteria than the trained selective categories. In short,
# try to initialize your categories at initialization.
def add_category(category)
  @categories[category.prepare_category_name] = {}
end

alias append_category add_category
class Object
  # Normalizes any object into the Symbol used as a category key: its string
  # form with underscores turned into spaces, the first letter upper-cased and
  # the rest lower-cased (e.g. :the_other becomes :"The other").
  def prepare_category_name
    label = to_s.tr('_', ' ')
    label.capitalize.to_sym
  end
end
# Length and unit-vector helpers for anything that responds to #size and #[].
module VectorExtensions
  # Euclidean length: the square root of the sum of the squared elements.
  def magnitude
    squares = (0...size).inject(0.to_r) { |total, index| total + self[index]**2.to_r }
    Math.sqrt(squares.to_f)
  end

  # Returns a new Vector holding each element divided by #magnitude.
  def normalize
    scale = magnitude.to_r
    Vector[*(0...size).map { |index| self[index] / scale }]
  end
end

# NOTE(review): if Vector already defines methods with these names, the
# included versions are shadowed by the class's own -- confirm which ones
# are actually in effect.
class Vector
  include VectorExtensions
end
# Marshal support for GSL vectors plus a `diag` alias on GSL::Matrix.
module GSL
  class Vector
    class << self
      # Rebuilds a vector from the payload written by #_dump.
      # NOTE(review): Marshal.load must only be given trusted data -- confirm
      # every source of serialized vectors is trusted.
      def _load(arr)
        elements = Marshal.load(arr)
        GSL::Vector.alloc(elements)
      end
    end

    # Serializes the vector as a marshalled plain Array of its elements.
    def _dump(_v)
      Marshal.dump(to_a)
    end
  end

  class Matrix
    class << self
      # `diag` is a second name for the class-level `diagonal` constructor.
      alias_method :diag, :diagonal
    end
  end
end
# Returns a copy of the string with common punctuation symbols replaced by
# spaces and with apostrophes and hyphens removed. E.g.,
#   "it's re-do, ok".without_punctuation
#   => "its redo  ok"
def without_punctuation
  spaced = tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', ' ')
  spaced.tr("'\-", '')
end

# Return a Hash of symbols => ints: the stemmed words of #clean_word_hash
# merged with counts for the runs of non-word characters in the string.
def word_hash
  stemmed_counts = clean_word_hash
  symbol_runs = gsub(/\w/, ' ').split
  stemmed_counts.merge(word_hash_for_symbols(symbol_runs))
end

# Return a word hash without extra punctuation or short symbols, just stemmed words
def clean_word_hash
  bare_words = gsub(/[^\w\s]/, '').split
  word_hash_for_words(bare_words)
end

private

# Counts stems of the given words, lower-casing each word in place and
# ignoring skip words and words of two characters or fewer.
def word_hash_for_words(words)
  words.each_with_object(Hash.new(0)) do |word, counts|
    word.downcase!
    next if CORPUS_SKIP_WORDS.include?(word) || word.length <= 2

    counts[word.stem.intern] += 1
  end
end

# Counts each given token as-is, keyed by its Symbol.
def word_hash_for_symbols(words)
  words.each_with_object(Hash.new(0)) { |word, counts| counts[word.intern] += 1 }
end

# Words left out of the stemmed word counts.
CORPUS_SKIP_WORDS = ::Set.new(%w[
                                a again all along are also an and as at
                                but by came can cant couldnt did didn didnt do
                                doesnt dont ever first from have her here him how
                                i if in into is isnt it itll just last
                                least like most my new no not now of on
                                or should sinc so some th than this that the
                                their then those to told too true try until url
                                us were when whether while with within yes you youll
                              ])
# Create a fresh index.
# If you want to call #build_index manually, use
#      Classifier::LSI.new auto_rebuild: false
#
# +options+ is a Hash; only the :auto_rebuild key is read. Automatic
# rebuilding stays on unless that key is exactly +false+.
def initialize(options = {})
  # Store a real boolean: the previous `true unless ...` form left the flag
  # as nil when auto rebuilding was switched off.
  @auto_rebuild = options[:auto_rebuild] != false
  @word_list = WordList.new
  @items = {}
  @version = 0
  # -1 can never equal @version, so a new index always counts as unbuilt.
  @built_at_version = -1
end
# Returns true if the index needs to be rebuilt: there is more than one item
# and items were added or removed since the last build. The index needs
# to be built after all information is added, but before you start
# using it for search, classification and cluster detection.
def needs_rebuild?
  return false if @items.size <= 1

  @version != @built_at_version
end

# Adds an item to the index. item is assumed to be a string, but
# any item may be indexed so long as it responds to #to_s or if
# you provide an optional block explaining how the indexer can
# fetch fresh string data. This optional block is passed the item,
# so the item may only be a reference to a URL or file name.
#
# For example:
#   lsi = Classifier::LSI.new
#   lsi.add_item "This is just plain text"
#   lsi.add_item("/home/me/filename.txt") { |x| File.read x }
#   ar = ActiveRecordObject.find( :all )
#   lsi.add_item(ar, *ar.categories) { |x| ar.content }
#
def add_item(item, *categories, &block)
  source_text = block ? block.call(item) : item.to_s
  @items[item] = ContentNode.new(source_text.clean_word_hash, *categories)
  @version += 1
  build_index if @auto_rebuild
end

# A less flexible shorthand for add_item that assumes
# you are passing in a string with no categories. item
# will be duck typed via to_s .
#
def <<(item)
  add_item(item)
end

# Returns the categories for a given indexed item, or an empty array when the
# item is not indexed. You are free to add and remove entries from this as you
# see fit. It does not invalidate an index to change its categories.
def categories_for(item)
  node = @items[item]
  node ? node.categories : []
end

# Removes an item from the database, if it is indexed.
#
def remove_item(item)
  if @items.key?(item)
    @items.delete(item)
    @version += 1
  end
end
# Returns an array of items that are indexed.
def items
  @items.keys
end

# This function rebuilds the index if needs_rebuild? returns true.
# For very large document spaces, this indexing operation may take some
# time to complete, so it may be wise to place the operation in another
# thread.
#
# As a rule, indexing will be fairly swift on modern machines until
# you have well over 500 documents indexed, or have an incredibly diverse
# vocabulary for your documents.
#
# The optional parameter "cutoff" is a tuning parameter. When the index is
# built, a certain number of s-values are discarded from the system. The
# cutoff parameter tells the indexer how many of these values to keep.
# A value of 1 for cutoff means that no semantic analysis will take place,
# turning the LSI class into a simple vector search engine.
def build_index(cutoff = 0.75)
  return unless needs_rebuild?

  make_word_list

  doc_list = @items.values
  tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }

  if self.class.gsl_available
    tdm = GSL::Matrix.alloc(*tda).trans
    ntdm = build_reduced_matrix(tdm, cutoff)

    ntdm.size[1].times do |col|
      vec = GSL::Vector.alloc(ntdm.column(col)).row
      doc_list[col].lsi_vector = vec
      doc_list[col].lsi_norm = vec.normalize
    end
  else
    # Rows are terms, columns are documents.
    tdm = Matrix.rows(tda).transpose
    ntdm = build_reduced_matrix(tdm, cutoff)

    # With fewer terms than documents the pure-Ruby decomposition yields the
    # reduced matrix with documents as rows. Put documents back in the
    # columns so every document receives a vector of the right length.
    ntdm = ntdm.transpose if ntdm.column_size != doc_list.size && ntdm.row_size == doc_list.size

    # One column per document (the old loop counted rows, i.e. terms).
    ntdm.column_size.times do |col|
      node = doc_list[col]
      next unless node

      column = ntdm.column(col)
      node.lsi_vector = column
      node.lsi_norm = column.normalize
    end
  end

  @built_at_version = @version
end
# This method returns max_chunks entries, ordered by their average semantic rating.
# Essentially, the average distance of each entry from all other entries is calculated,
# the highest are returned.
#
# This can be used to build a summary service, or to provide more information about
# your dataset's general content. For example, if you were to use categorize on the
# results of this data, you could gather information on what your dataset is generally
# about.
#
# Always returns an Array (empty while the index needs a rebuild).
def highest_relative_content(max_chunks = 10)
  return [] if needs_rebuild?

  density = {}
  @items.each_key do |item|
    density[item] = proximity_array_for_content(item).inject(0.0) { |total, pair| total + pair[1] }
  end

  # A trailing block-less `.map` used to turn this into an Enumerator.
  density.keys.sort_by { |item| density[item] }.reverse[0..max_chunks - 1]
end

# This function is the primitive that find_related and classify
# build upon. It returns an array of 2-element arrays. The first element
# of this array is a document, and the second is its "score", defining
# how "close" it is to other indexed items.
#
# These values are somewhat arbitrary, having to do with the vector space
# created by your content, so the magnitude is interpretable but not always
# meaningful between indexes.
#
# The parameter doc is the content to compare. If that content is not
# indexed, you can pass an optional block to define how to create the
# text data. See add_item for examples of how this works.
#
# The result is sorted from highest to lowest score and is empty while the
# index needs a rebuild.
def proximity_array_for_content(doc, &block)
  return [] if needs_rebuild?

  content_node = node_for_content(doc, &block)
  scored = @items.map do |item, node|
    score = if self.class.gsl_available
              content_node.search_vector * node.search_vector.col
            else
              (Matrix[content_node.search_vector] * node.search_vector)[0]
            end
    [item, score]
  end
  scored.sort_by { |pair| pair[1] }.reverse
end
# Similar to proximity_array_for_content, this function takes similar
# arguments and returns a similar array. However, it uses the normalized
# calculated vectors instead of their full versions. This is useful when
# you're trying to perform operations on content that is much smaller than
# the text you're working with. search uses this primitive.
def proximity_norms_for_content(doc, &block)
  return [] if needs_rebuild?

  query_node = node_for_content(doc, &block)
  use_gsl = self.class.gsl_available
  scored = @items.map do |item, node|
    score = if use_gsl
              query_node.search_norm * node.search_norm.col
            else
              (Matrix[query_node.search_norm] * node.search_norm)[0]
            end
    [item, score]
  end
  scored.sort_by { |pair| pair[1] }.reverse
end

# This function allows for text-based search of your index. Unlike other functions
# like find_related and classify, search only takes short strings. It will also ignore
# factors like repeated words. It is best for short, google-like search terms.
# A search will first prioritize lexical relationships, then semantic ones.
#
# While this may seem backwards compared to the other functions that LSI supports,
# it is actually the same algorithm, just applied on a smaller document.
#
# Returns up to max_nearest indexed items, best match first.
def search(string, max_nearest = 3)
  return [] if needs_rebuild?

  ranked_items = proximity_norms_for_content(string).map { |pair| pair[0] }
  ranked_items[0..max_nearest - 1]
end
# This function takes content and finds other documents
# that are semantically "close", returning an array of documents sorted
# from most to least relevant.
# max_nearest specifies the number of documents to return. A value of
# 0 means that it returns all the indexed documents, sorted by relevance.
#
# This is particularly useful for identifying clusters in your document space.
# For example you may want to identify several "What's Related" items for weblog
# articles, or find paragraphs that relate to each other in an essay.
def find_related(doc, max_nearest = 3, &block)
  neighbours = proximity_array_for_content(doc, &block).reject { |item, _score| item == doc }
  neighbours.map { |item, _score| item }[0..max_nearest - 1]
end

# This function uses a voting system to categorize documents, based on
# the categories of other documents. It uses the same logic as the
# find_related function to find related documents, then returns the
# most obvious category from this list.
#
# cutoff signifies the number of documents to consider when classifying
# text. A cutoff of 1 means that every document in the index votes on
# what category the document is in. This may not always make sense.
#
def classify(doc, cutoff = 0.30, &block)
  tally = vote(doc, cutoff, &block)
  tally.keys.sort_by { |category| tally[category] }.last
end

# Returns a Hash mapping each category to the summed proximity scores of the
# closest documents carrying it. The number of documents that vote is the
# index size multiplied by cutoff, rounded.
def vote(doc, cutoff = 0.30, &block)
  voter_count = (@items.size * cutoff).round
  voters = proximity_array_for_content(doc, &block)[0..voter_count - 1]

  voters.each_with_object({}) do |(item, score), tally|
    @items[item].categories.each do |category|
      tally[category] = tally.fetch(category, 0.0) + score
    end
  end
end
# Returns the same category as classify() but also returns
# a confidence value derived from the vote share that the
# winning category got.
#
# e.g.
# category,confidence = classify_with_confidence(doc)
# if confidence < 0.3
#   category = nil
# end
#
# Returns [nil, nil] when the votes add up to zero.
#
# See classify() for argument docs
def classify_with_confidence(doc, cutoff = 0.30, &block)
  votes = vote(doc, cutoff, &block)
  votes_sum = votes.values.inject(0.0) { |sum, v| sum + v }
  return [nil, nil] if votes_sum.zero?

  ranking = votes.keys.sort_by { |x| votes[x] }
  winner = ranking[-1]
  vote_share = votes[winner] / votes_sum.to_f
  [winner, vote_share]
end

# Prototype, only works on indexed documents.
# I have no clue if this is going to work, but in theory
# it's supposed to.
#
# Returns the words behind the +count+ largest entries of the document's LSI
# vector, largest first. Raises RuntimeError when +doc+ is not indexed.
def highest_ranked_stems(doc, count = 3)
  raise 'Requested stem ranking on non-indexed content!' unless @items[doc]

  weights = node_for_content(doc).lsi_vector.to_a
  # Rank positions rather than values: looking each value up again with
  # Array#index returned the same word repeatedly whenever weights tied.
  ranked = weights.each_index.sort_by { |index| [-weights[index], index] }
  ranked[0..count - 1].collect { |index| @word_list.word_for_index(index) }
end

private

# Decomposes +matrix+, zeroes the singular values below the one at the
# +cutoff+ fraction of the sorted list, and multiplies the factors back
# together into a matrix with the same dimensions as +matrix+.
def build_reduced_matrix(matrix, cutoff = 0.75)
  # TODO: Check that M>=N on these dimensions! Transpose helps assure this
  u, v, s = matrix.SV_decomp

  # TODO: Better than 75% term, please. :\
  s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
  s.size.times do |ord|
    s[ord] = 0.0 if s[ord] < s_cutoff
  end

  # Reconstruct the term document matrix, only with reduced rank
  return u * GSL::Matrix.diag(s) * v.trans if self.class.gsl_available

  reduced = u * ::Matrix.diagonal(*s) * v.transpose
  # When the input has fewer rows than columns, the pure-Ruby decomposition
  # swaps the roles of u and v, so the product comes out transposed.
  reduced.row_size == matrix.row_size ? reduced : reduced.transpose
end
339 | cn.raw_vector_with(@word_list) # make the lsi raw and norm vectors 340 | end 341 | 342 | cn 343 | end 344 | 345 | def make_word_list 346 | @word_list = WordList.new 347 | @items.each_value do |node| 348 | node.word_hash.each_key { |key| @word_list.add_word key } 349 | end 350 | end 351 | end 352 | end 353 | -------------------------------------------------------------------------------- /lib/classifier/lsi/content_node.rb: -------------------------------------------------------------------------------- 1 | # Author:: David Fayram (mailto:dfayram@lensmen.net) 2 | # Copyright:: Copyright (c) 2005 David Fayram II 3 | # License:: LGPL 4 | 5 | module Classifier 6 | # This is an internal data structure class for the LSI node. Save for 7 | # raw_vector_with, it should be fairly straightforward to understand. 8 | # You should never have to use it directly. 9 | class ContentNode 10 | attr_accessor :raw_vector, :raw_norm, 11 | :lsi_vector, :lsi_norm, 12 | :categories 13 | 14 | attr_reader :word_hash 15 | 16 | # If text_proc is not specified, the source will be duck-typed 17 | # via source.to_s 18 | def initialize(word_frequencies, *categories) 19 | @categories = categories || [] 20 | @word_hash = word_frequencies 21 | end 22 | 23 | # Use this to fetch the appropriate search vector. 24 | def search_vector 25 | @lsi_vector || @raw_vector 26 | end 27 | 28 | # Use this to fetch the appropriate search vector in normalized form. 29 | def search_norm 30 | @lsi_norm || @raw_norm 31 | end 32 | 33 | # Creates the raw vector out of word_hash using word_list as the 34 | # key for mapping the vector space. 35 | def raw_vector_with(word_list) 36 | vec = if Classifier::LSI.gsl_available 37 | GSL::Vector.alloc(word_list.size) 38 | else 39 | Array.new(word_list.size, 0) 40 | end 41 | 42 | @word_hash.each_key do |word| 43 | vec[word_list[word]] = @word_hash[word] if word_list[word] 44 | end 45 | 46 | # Perform the scaling transform 47 | total_words = Classifier::LSI.gsl_available ? 
vec.sum : vec.sum_with_identity 48 | total_unique_words = vec.count { |word| word != 0 } 49 | 50 | # Perform first-order association transform if this vector has more 51 | # than one word in it. 52 | if total_words > 1.0 && total_unique_words > 1 53 | weighted_total = 0.0 54 | 55 | vec.each do |term| 56 | next unless term.positive? 57 | next if total_words.zero? 58 | 59 | term_over_total = term / total_words 60 | val = term_over_total * Math.log(term_over_total) 61 | weighted_total += val unless val.nan? 62 | end 63 | vec = vec.collect { |val| Math.log(val + 1) / -weighted_total } 64 | end 65 | 66 | if Classifier::LSI.gsl_available 67 | @raw_norm = vec.normalize 68 | @raw_vector = vec 69 | else 70 | @raw_norm = Vector[*vec].normalize 71 | @raw_vector = Vector[*vec] 72 | end 73 | end 74 | end 75 | end 76 | -------------------------------------------------------------------------------- /lib/classifier/lsi/summary.rb: -------------------------------------------------------------------------------- 1 | # Author:: Lucas Carlson (mailto:lucas@rufy.com) 2 | # Copyright:: Copyright (c) 2005 Lucas Carlson 3 | # License:: LGPL 4 | 5 | class String 6 | def summary(count = 10, separator = ' [...] ') 7 | perform_lsi split_sentences, count, separator 8 | end 9 | 10 | def paragraph_summary(count = 1, separator = ' [...] ') 11 | perform_lsi split_paragraphs, count, separator 12 | end 13 | 14 | def split_sentences 15 | split(/(\.|!|\?)/) # TODO: make this less primitive 16 | end 17 | 18 | def split_paragraphs 19 | split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive 20 | end 21 | 22 | private 23 | 24 | def perform_lsi(chunks, count, separator) 25 | lsi = Classifier::LSI.new auto_rebuild: false 26 | chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? 
|| chunk.strip.split.size == 1 } 27 | lsi.build_index 28 | summaries = lsi.highest_relative_content count 29 | summaries.select { |chunk| summaries.include?(chunk) }.map(&:strip).join(separator) 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /lib/classifier/lsi/word_list.rb: -------------------------------------------------------------------------------- 1 | # Author:: David Fayram (mailto:dfayram@lensmen.net) 2 | # Copyright:: Copyright (c) 2005 David Fayram II 3 | # License:: LGPL 4 | 5 | module Classifier 6 | # This class keeps a word => index mapping. It is used to map stemmed words 7 | # to dimensions of a vector. 8 | 9 | class WordList 10 | def initialize 11 | @location_table = {} 12 | end 13 | 14 | # Adds a word (if it is new) and assigns it a unique dimension. 15 | def add_word(word) 16 | term = word 17 | @location_table[term] = @location_table.size unless @location_table[term] 18 | end 19 | 20 | # Returns the dimension of the word or nil if the word is not in the space. 21 | def [](lookup) 22 | term = lookup 23 | @location_table[term] 24 | end 25 | 26 | def word_for_index(ind) 27 | @location_table.invert[ind] 28 | end 29 | 30 | # Returns the number of words mapped. 
31 | def size 32 | @location_table.size 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /test/bayes/bayesian_test.rb: -------------------------------------------------------------------------------- 1 | require_relative '../test_helper' 2 | 3 | class BayesianTest < Minitest::Test 4 | def setup 5 | @classifier = Classifier::Bayes.new 'Interesting', 'Uninteresting' 6 | end 7 | 8 | def test_bad_training 9 | assert_raises(StandardError) { @classifier.train_no_category 'words' } 10 | end 11 | 12 | def test_bad_method 13 | assert_raises(NoMethodError) { @classifier.forget_everything_you_know '' } 14 | end 15 | 16 | def test_categories 17 | assert_equal %w[Interesting Uninteresting].sort, @classifier.categories.sort 18 | end 19 | 20 | def test_add_category 21 | @classifier.add_category 'Test' 22 | assert_equal %w[Test Interesting Uninteresting].sort, @classifier.categories.sort 23 | end 24 | 25 | def test_classification 26 | @classifier.train_interesting 'here are some good words. 
I hope you love them' 27 | @classifier.train_uninteresting 'here are some bad words, I hate you' 28 | assert_equal 'Uninteresting', @classifier.classify('I hate bad words and you') 29 | end 30 | 31 | def test_safari_animals 32 | bayes = Classifier::Bayes.new 'Lion', 'Elephant' 33 | bayes.train_lion 'lion' 34 | bayes.train_lion 'zebra' 35 | bayes.train_elephant 'elephant' 36 | bayes.train_elephant 'trunk' 37 | bayes.train_elephant 'tusk' 38 | 39 | assert_equal 'Lion', bayes.classify('zebra') 40 | assert_equal 'Elephant', bayes.classify('trunk') 41 | assert_equal 'Elephant', bayes.classify('tusk') 42 | assert_equal 'Lion', bayes.classify('lion') 43 | assert_equal 'Elephant', bayes.classify('elephant') 44 | end 45 | 46 | def test_remove_category 47 | @classifier.train_interesting 'This is interesting content' 48 | @classifier.train_uninteresting 'This is uninteresting content' 49 | 50 | assert_equal %w[Interesting Uninteresting].sort, @classifier.categories.sort 51 | 52 | @classifier.remove_category 'Uninteresting' 53 | 54 | assert_equal ['Interesting'], @classifier.categories 55 | end 56 | 57 | def test_remove_category_affects_classification 58 | @classifier.train_interesting 'This is interesting content' 59 | @classifier.train_uninteresting 'This is uninteresting content' 60 | 61 | assert_equal 'Uninteresting', @classifier.classify('This is uninteresting') 62 | 63 | @classifier.remove_category 'Uninteresting' 64 | 65 | assert_equal 'Interesting', @classifier.classify('This is uninteresting') 66 | end 67 | 68 | def test_remove_all_categories 69 | @classifier.remove_category 'Interesting' 70 | @classifier.remove_category 'Uninteresting' 71 | 72 | assert_empty @classifier.categories 73 | end 74 | 75 | def test_remove_and_add_category 76 | @classifier.remove_category 'Uninteresting' 77 | @classifier.add_category 'Neutral' 78 | 79 | assert_equal %w[Interesting Neutral].sort, @classifier.categories.sort 80 | end 81 | 82 | def 
test_remove_category_preserves_other_category_data 83 | @classifier.train_interesting 'This is interesting content' 84 | @classifier.train_uninteresting 'This is uninteresting content' 85 | 86 | interesting_classification = @classifier.classify('This is interesting') 87 | @classifier.remove_category 'Uninteresting' 88 | 89 | assert_equal interesting_classification, @classifier.classify('This is interesting') 90 | end 91 | 92 | def test_remove_category_check_counts 93 | initial_total_words = @classifier.instance_variable_get(:@total_words) 94 | category_word_count = @classifier.instance_variable_get(:@category_word_count)['Interesting'] 95 | 96 | @classifier.remove_category('Interesting') 97 | 98 | assert_nil @classifier.instance_variable_get(:@categories)['Interesting'] 99 | assert_equal @classifier.instance_variable_get(:@category_counts)['Interesting'], 0 100 | assert_equal @classifier.instance_variable_get(:@category_word_count)['Interesting'], 0 101 | 102 | new_total_words = @classifier.instance_variable_get(:@total_words) 103 | assert_equal initial_total_words - category_word_count, new_total_words 104 | end 105 | 106 | def test_remove_category_updates_total_words_before_deletion 107 | initial_total_words = @classifier.instance_variable_get(:@total_words) 108 | category_word_count = @classifier.instance_variable_get(:@category_word_count)['Interesting'] 109 | 110 | @classifier.remove_category('Interesting') 111 | 112 | new_total_words = @classifier.instance_variable_get(:@total_words) 113 | assert_equal initial_total_words - category_word_count, new_total_words 114 | end 115 | 116 | def test_remove_nonexistent_category 117 | assert_raises(StandardError, 'No such category: Nonexistent Category') do 118 | @classifier.remove_category('Nonexistent Category') 119 | end 120 | end 121 | end 122 | -------------------------------------------------------------------------------- /test/extensions/word_hash_test.rb: 
# --------------------------------------------------------------------------------
require_relative '../test_helper'

# Tests for the String word-hash extensions.
class StringExtensionsTest < Minitest::Test
  def test_word_hash
    hash = { good: 1, "!": 1, hope: 1, "'": 1, ".": 1, love: 1, word: 1, them: 1, test: 1 }
    assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
  end

  def test_clean_word_hash
    hash = { good: 1, word: 1, hope: 1, love: 1, them: 1, test: 1 }
    assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
  end
end

# Tests for Array#sum_with_identity. Expected values come first so that
# Minitest failure messages label them correctly.
class ArrayExtensionsTest < Minitest::Test
  def test_monkey_path_array_sum
    assert_equal 6, [1, 2, 3].sum_with_identity
  end

  def test_summing_a_nil_array
    assert_equal 0, [nil].sum_with_identity
  end

  def test_summing_an_empty_array
    assert_equal 0, Array[].sum_with_identity
  end
end

# --------------------------------------------------------------------------------
# /test/lsi/lsi_test.rb:
# --------------------------------------------------------------------------------
require_relative '../test_helper'

# Tests for Classifier::LSI: indexing, classification, search and summaries.
class LSITest < Minitest::Test
  def setup
    # we repeat principle words to help weight them.
    # This test is rather delicate, since this system is mostly noise.
    @str1 = 'This text deals with dogs. Dogs.'
    @str2 = 'This text involves dogs too. Dogs! '
    @str3 = 'This text revolves around cats. Cats.'
    @str4 = 'This text also involves cats. Cats!'
    @str5 = 'This text involves birds. Birds.'
  end

  def test_basic_indexing
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
    refute lsi.needs_rebuild?

    # NOTE: that the closest match to str1 is str2, even though it is not
    # the closest text match.
    assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
  end

  def test_not_auto_rebuild
    lsi = Classifier::LSI.new auto_rebuild: false
    lsi.add_item @str1, 'Dog'
    lsi.add_item @str2, 'Dog'
    assert lsi.needs_rebuild?
    lsi.build_index
    refute lsi.needs_rebuild?
  end

  def test_basic_categorizing
    lsi = Classifier::LSI.new
    lsi.add_item @str2, 'Dog'
    lsi.add_item @str3, 'Cat'
    lsi.add_item @str4, 'Cat'
    lsi.add_item @str5, 'Bird'

    assert_equal 'Dog', lsi.classify(@str1)
    assert_equal 'Cat', lsi.classify(@str3)
    assert_equal 'Bird', lsi.classify(@str5)
    assert_equal 'Bird', lsi.classify('Bird me to Bird')
  end

  def test_external_classifying
    lsi = Classifier::LSI.new
    bayes = Classifier::Bayes.new 'Dog', 'Cat', 'Bird'
    lsi.add_item @str1, 'Dog'
    bayes.train_dog @str1
    lsi.add_item @str2, 'Dog'
    bayes.train_dog @str2
    lsi.add_item @str3, 'Cat'
    bayes.train_cat @str3
    lsi.add_item @str4, 'Cat'
    bayes.train_cat @str4
    lsi.add_item @str5, 'Bird'
    bayes.train_bird @str5

    # We're talking about dogs. Even though the text matches the corpus on
    # cats better. Dogs have more semantic weight than cats. So bayes
    # will fail here, but the LSI recognizes content.
    tricky_case = 'This text revolves around dogs.'
    assert_equal 'Dog', lsi.classify(tricky_case)
    assert_equal 'Cat', bayes.classify(tricky_case)
  end

  def test_recategorize_interface
    lsi = Classifier::LSI.new
    lsi.add_item @str1, 'Dog'
    lsi.add_item @str2, 'Dog'
    lsi.add_item @str3, 'Cat'
    lsi.add_item @str4, 'Cat'
    lsi.add_item @str5, 'Bird'

    tricky_case = 'This text revolves around dogs.'
    assert_equal 'Dog', lsi.classify(tricky_case)

    # Recategorize as needed.
    lsi.categories_for(@str1).clear.push 'Cow'
    lsi.categories_for(@str2).clear.push 'Cow'

    refute lsi.needs_rebuild?
    assert_equal 'Cow', lsi.classify(tricky_case)
  end

  def test_classify_with_confidence
    lsi = Classifier::LSI.new
    lsi.add_item @str2, 'Dog'
    lsi.add_item @str3, 'Cat'
    lsi.add_item @str4, 'Cat'
    lsi.add_item @str5, 'Bird'

    category, confidence = lsi.classify_with_confidence(@str1)
    assert_equal 'Dog', category
    assert confidence > 0.5, "Confidence should be greater than 0.5, but was #{confidence}"

    category, confidence = lsi.classify_with_confidence(@str3)
    assert_equal 'Cat', category
    assert confidence > 0.5, "Confidence should be greater than 0.5, but was #{confidence}"

    category, confidence = lsi.classify_with_confidence(@str5)
    assert_equal 'Bird', category
    assert confidence > 0.5, "Confidence should be greater than 0.5, but was #{confidence}"

    tricky_case = 'This text revolves around dogs.'
    category, confidence = lsi.classify_with_confidence(tricky_case)
    assert_equal 'Dog', category
    assert confidence > 0.3, "Confidence should be greater than 0.3, but was #{confidence}"
  end

  def test_search
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }

    # Searching by content and text, note that @str2 comes up first, because
    # both "dog" and "involve" are present. But, the next match is @str1 instead
    # of @str4, because "dog" carries more weight than involves.
    assert_equal([@str2, @str1, @str4, @str5, @str3],
                 lsi.search('dog involves', 100))

    # Keyword search shows how the space is mapped out in relation to
    # dog when magnitude is removed. Note the relations. We move from dog
    # through involve and then finally to other words.
    assert_equal([@str1, @str2, @str4, @str5, @str3],
                 lsi.search('dog', 5))
  end

  def test_serialize_safe
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }

    lsi_md = Marshal.dump lsi
    lsi_m = Marshal.load lsi_md

    # The original index is the expectation, the round-tripped copy the actual.
    assert_equal lsi.search('cat', 3), lsi_m.search('cat', 3)
    assert_equal lsi.find_related(@str1, 3), lsi_m.find_related(@str1, 3)
  end

  def test_keyword_search
    lsi = Classifier::LSI.new
    lsi.add_item @str1, 'Dog'
    lsi.add_item @str2, 'Dog'
    lsi.add_item @str3, 'Cat'
    lsi.add_item @str4, 'Cat'
    lsi.add_item @str5, 'Bird'

    assert_equal %i[dog text deal], lsi.highest_ranked_stems(@str1)
  end

  def test_summary
    assert_equal 'This text involves dogs too [...] This text also involves cats',
                 [@str1, @str2, @str3, @str4, @str5].join.summary(2)
  end
end

# --------------------------------------------------------------------------------
# /test/test_helper.rb:
# --------------------------------------------------------------------------------
# Put the project's lib directory first on the load path so the tests run
# against the working tree.
$LOAD_PATH.unshift(File.expand_path('../lib', __dir__))

require 'minitest'
require 'minitest/autorun'
require 'classifier'

# --------------------------------------------------------------------------------