├── .github
    └── workflows
    │   └── ruby.yml
├── Gemfile
├── Gemfile.lock
├── LICENSE
├── README.markdown
├── Rakefile
├── bin
    ├── bayes.rb
    └── summarize.rb
├── classifier.gemspec
├── cloving.json
├── install.rb
├── lib
    ├── classifier.rb
    └── classifier
    │   ├── bayes.rb
    │   ├── extensions
    │       ├── string.rb
    │       ├── vector.rb
    │       ├── vector_serialize.rb
    │       └── word_hash.rb
    │   ├── lsi.rb
    │   └── lsi
    │       ├── content_node.rb
    │       ├── summary.rb
    │       └── word_list.rb
└── test
    ├── bayes
        └── bayesian_test.rb
    ├── extensions
        └── word_hash_test.rb
    ├── lsi
        └── lsi_test.rb
    └── test_helper.rb


/.github/workflows/ruby.yml:
--------------------------------------------------------------------------------
 1 | # This workflow uses actions that are not certified by GitHub.
 2 | # They are provided by a third party and are governed by
 3 | # separate terms of service, privacy policy, and support
 4 | # documentation.
 5 | # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake
 6 | # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby
 7 | 
 8 | name: Ruby
 9 | 
10 | on:
11 |   push:
12 |     branches: [ "master" ]
13 |   pull_request:
14 |     branches: [ "master" ]
15 | 
16 | permissions:
17 |   contents: read  # grants the workflow token read-only access to repository contents
18 | 
19 | jobs:
20 |   test:
21 | 
22 |     runs-on: ubuntu-latest
23 |     strategy:
24 |       matrix:
25 |         ruby-version: ['2.7', 'head']  # one job is run per listed Ruby version
26 | 
27 |     steps:
28 |     - uses: actions/checkout@v4
29 |     - name: Set up Ruby
30 |       # The action below is pinned to a commit SHA. To automatically get bug fixes
31 |       # and new Ruby versions for ruby/setup-ruby, change it to
32 |       # `uses: ruby/setup-ruby@v1` (see https://github.com/ruby/setup-ruby#versioning).
33 |       uses: ruby/setup-ruby@55283cc23133118229fd3f97f9336ee23a179fcf # v1.146.0
34 |       with:
35 |         ruby-version: ${{ matrix.ruby-version }}
36 |         bundler-cache: true # runs 'bundle install' and caches installed gems automatically
37 |     - name: Run tests
38 |       run: bundle exec rake test
39 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | gemspec  # load the dependencies declared in classifier.gemspec
3 | 
4 | gem 'fast-stemmer'
5 | gem 'matrix'  # not declared by the gemspec (see the PATH specs in Gemfile.lock)
6 | gem 'mutex_m'
7 | 


--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
 1 | PATH
 2 |   remote: .
 3 |   specs:
 4 |     classifier (1.4.4)
 5 |       fast-stemmer (~> 1.0)
 6 |       mutex_m (~> 0.2)
 7 |       rake
 8 | 
 9 | GEM
10 |   remote: https://rubygems.org/
11 |   specs:
12 |     fast-stemmer (1.0.2)
13 |     matrix (0.4.2)
14 |     minitest (5.18.1)
15 |     mutex_m (0.2.0)
16 |     psych (5.1.2)
17 |       stringio
18 |     rake (13.0.6)
19 |     rdoc (6.5.1.1)
20 |       psych (>= 4.0.0)
21 |     stringio (3.1.0)
22 | 
23 | PLATFORMS
24 |   arm64-darwin-22
25 |   arm64-darwin-23
26 |   x86_64-linux
27 | 
28 | DEPENDENCIES
29 |   classifier!
30 |   fast-stemmer
31 |   matrix
32 |   minitest
33 |   mutex_m
34 |   rdoc
35 | 
36 | BUNDLED WITH
37 |    2.4.17
38 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 		  GNU LESSER GENERAL PUBLIC LICENSE
  2 | 		       Version 2.1, February 1999
  3 | 
  4 |  Copyright (C) 1991, 1999 Free Software Foundation, Inc.
  5 |      59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 | [This is the first released version of the Lesser GPL.  It also counts
 10 |  as the successor of the GNU Library Public License, version 2, hence
 11 |  the version number 2.1.]
 12 | 
 13 | 			    Preamble
 14 | 
 15 |   The licenses for most software are designed to take away your
 16 | freedom to share and change it.  By contrast, the GNU General Public
 17 | Licenses are intended to guarantee your freedom to share and change
 18 | free software--to make sure the software is free for all its users.
 19 | 
 20 |   This license, the Lesser General Public License, applies to some
 21 | specially designated software packages--typically libraries--of the
 22 | Free Software Foundation and other authors who decide to use it.  You
 23 | can use it too, but we suggest you first think carefully about whether
 24 | this license or the ordinary General Public License is the better
 25 | strategy to use in any particular case, based on the explanations below.
 26 | 
 27 |   When we speak of free software, we are referring to freedom of use,
 28 | not price.  Our General Public Licenses are designed to make sure that
 29 | you have the freedom to distribute copies of free software (and charge
 30 | for this service if you wish); that you receive source code or can get
 31 | it if you want it; that you can change the software and use pieces of
 32 | it in new free programs; and that you are informed that you can do
 33 | these things.
 34 | 
 35 |   To protect your rights, we need to make restrictions that forbid
 36 | distributors to deny you these rights or to ask you to surrender these
 37 | rights.  These restrictions translate to certain responsibilities for
 38 | you if you distribute copies of the library or if you modify it.
 39 | 
 40 |   For example, if you distribute copies of the library, whether gratis
 41 | or for a fee, you must give the recipients all the rights that we gave
 42 | you.  You must make sure that they, too, receive or can get the source
 43 | code.  If you link other code with the library, you must provide
 44 | complete object files to the recipients, so that they can relink them
 45 | with the library after making changes to the library and recompiling
 46 | it.  And you must show them these terms so they know their rights.
 47 | 
 48 |   We protect your rights with a two-step method: (1) we copyright the
 49 | library, and (2) we offer you this license, which gives you legal
 50 | permission to copy, distribute and/or modify the library.
 51 | 
 52 |   To protect each distributor, we want to make it very clear that
 53 | there is no warranty for the free library.  Also, if the library is
 54 | modified by someone else and passed on, the recipients should know
 55 | that what they have is not the original version, so that the original
 56 | author's reputation will not be affected by problems that might be
 57 | introduced by others.
 58 | 
 59 |   Finally, software patents pose a constant threat to the existence of
 60 | any free program.  We wish to make sure that a company cannot
 61 | effectively restrict the users of a free program by obtaining a
 62 | restrictive license from a patent holder.  Therefore, we insist that
 63 | any patent license obtained for a version of the library must be
 64 | consistent with the full freedom of use specified in this license.
 65 | 
 66 |   Most GNU software, including some libraries, is covered by the
 67 | ordinary GNU General Public License.  This license, the GNU Lesser
 68 | General Public License, applies to certain designated libraries, and
 69 | is quite different from the ordinary General Public License.  We use
 70 | this license for certain libraries in order to permit linking those
 71 | libraries into non-free programs.
 72 | 
 73 |   When a program is linked with a library, whether statically or using
 74 | a shared library, the combination of the two is legally speaking a
 75 | combined work, a derivative of the original library.  The ordinary
 76 | General Public License therefore permits such linking only if the
 77 | entire combination fits its criteria of freedom.  The Lesser General
 78 | Public License permits more lax criteria for linking other code with
 79 | the library.
 80 | 
 81 |   We call this license the "Lesser" General Public License because it
 82 | does Less to protect the user's freedom than the ordinary General
 83 | Public License.  It also provides other free software developers Less
 84 | of an advantage over competing non-free programs.  These disadvantages
 85 | are the reason we use the ordinary General Public License for many
 86 | libraries.  However, the Lesser license provides advantages in certain
 87 | special circumstances.
 88 | 
 89 |   For example, on rare occasions, there may be a special need to
 90 | encourage the widest possible use of a certain library, so that it becomes
 91 | a de-facto standard.  To achieve this, non-free programs must be
 92 | allowed to use the library.  A more frequent case is that a free
 93 | library does the same job as widely used non-free libraries.  In this
 94 | case, there is little to gain by limiting the free library to free
 95 | software only, so we use the Lesser General Public License.
 96 | 
 97 |   In other cases, permission to use a particular library in non-free
 98 | programs enables a greater number of people to use a large body of
 99 | free software.  For example, permission to use the GNU C Library in
100 | non-free programs enables many more people to use the whole GNU
101 | operating system, as well as its variant, the GNU/Linux operating
102 | system.
103 | 
104 |   Although the Lesser General Public License is Less protective of the
105 | users' freedom, it does ensure that the user of a program that is
106 | linked with the Library has the freedom and the wherewithal to run
107 | that program using a modified version of the Library.
108 | 
109 |   The precise terms and conditions for copying, distribution and
110 | modification follow.  Pay close attention to the difference between a
111 | "work based on the library" and a "work that uses the library".  The
112 | former contains code derived from the library, whereas the latter must
113 | be combined with the library in order to run.
114 | 
115 | 		  GNU LESSER GENERAL PUBLIC LICENSE
116 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
117 | 
118 |   0. This License Agreement applies to any software library or other
119 | program which contains a notice placed by the copyright holder or
120 | other authorized party saying it may be distributed under the terms of
121 | this Lesser General Public License (also called "this License").
122 | Each licensee is addressed as "you".
123 | 
124 |   A "library" means a collection of software functions and/or data
125 | prepared so as to be conveniently linked with application programs
126 | (which use some of those functions and data) to form executables.
127 | 
128 |   The "Library", below, refers to any such software library or work
129 | which has been distributed under these terms.  A "work based on the
130 | Library" means either the Library or any derivative work under
131 | copyright law: that is to say, a work containing the Library or a
132 | portion of it, either verbatim or with modifications and/or translated
133 | straightforwardly into another language.  (Hereinafter, translation is
134 | included without limitation in the term "modification".)
135 | 
136 |   "Source code" for a work means the preferred form of the work for
137 | making modifications to it.  For a library, complete source code means
138 | all the source code for all modules it contains, plus any associated
139 | interface definition files, plus the scripts used to control compilation
140 | and installation of the library.
141 | 
142 |   Activities other than copying, distribution and modification are not
143 | covered by this License; they are outside its scope.  The act of
144 | running a program using the Library is not restricted, and output from
145 | such a program is covered only if its contents constitute a work based
146 | on the Library (independent of the use of the Library in a tool for
147 | writing it).  Whether that is true depends on what the Library does
148 | and what the program that uses the Library does.
149 | 
150 |   1. You may copy and distribute verbatim copies of the Library's
151 | complete source code as you receive it, in any medium, provided that
152 | you conspicuously and appropriately publish on each copy an
153 | appropriate copyright notice and disclaimer of warranty; keep intact
154 | all the notices that refer to this License and to the absence of any
155 | warranty; and distribute a copy of this License along with the
156 | Library.
157 | 
158 |   You may charge a fee for the physical act of transferring a copy,
159 | and you may at your option offer warranty protection in exchange for a
160 | fee.
161 | 
162 |   2. You may modify your copy or copies of the Library or any portion
163 | of it, thus forming a work based on the Library, and copy and
164 | distribute such modifications or work under the terms of Section 1
165 | above, provided that you also meet all of these conditions:
166 | 
167 |     a) The modified work must itself be a software library.
168 | 
169 |     b) You must cause the files modified to carry prominent notices
170 |     stating that you changed the files and the date of any change.
171 | 
172 |     c) You must cause the whole of the work to be licensed at no
173 |     charge to all third parties under the terms of this License.
174 | 
175 |     d) If a facility in the modified Library refers to a function or a
176 |     table of data to be supplied by an application program that uses
177 |     the facility, other than as an argument passed when the facility
178 |     is invoked, then you must make a good faith effort to ensure that,
179 |     in the event an application does not supply such function or
180 |     table, the facility still operates, and performs whatever part of
181 |     its purpose remains meaningful.
182 | 
183 |     (For example, a function in a library to compute square roots has
184 |     a purpose that is entirely well-defined independent of the
185 |     application.  Therefore, Subsection 2d requires that any
186 |     application-supplied function or table used by this function must
187 |     be optional: if the application does not supply it, the square
188 |     root function must still compute square roots.)
189 | 
190 | These requirements apply to the modified work as a whole.  If
191 | identifiable sections of that work are not derived from the Library,
192 | and can be reasonably considered independent and separate works in
193 | themselves, then this License, and its terms, do not apply to those
194 | sections when you distribute them as separate works.  But when you
195 | distribute the same sections as part of a whole which is a work based
196 | on the Library, the distribution of the whole must be on the terms of
197 | this License, whose permissions for other licensees extend to the
198 | entire whole, and thus to each and every part regardless of who wrote
199 | it.
200 | 
201 | Thus, it is not the intent of this section to claim rights or contest
202 | your rights to work written entirely by you; rather, the intent is to
203 | exercise the right to control the distribution of derivative or
204 | collective works based on the Library.
205 | 
206 | In addition, mere aggregation of another work not based on the Library
207 | with the Library (or with a work based on the Library) on a volume of
208 | a storage or distribution medium does not bring the other work under
209 | the scope of this License.
210 | 
211 |   3. You may opt to apply the terms of the ordinary GNU General Public
212 | License instead of this License to a given copy of the Library.  To do
213 | this, you must alter all the notices that refer to this License, so
214 | that they refer to the ordinary GNU General Public License, version 2,
215 | instead of to this License.  (If a newer version than version 2 of the
216 | ordinary GNU General Public License has appeared, then you can specify
217 | that version instead if you wish.)  Do not make any other change in
218 | these notices.
219 | 
220 |   Once this change is made in a given copy, it is irreversible for
221 | that copy, so the ordinary GNU General Public License applies to all
222 | subsequent copies and derivative works made from that copy.
223 | 
224 |   This option is useful when you wish to copy part of the code of
225 | the Library into a program that is not a library.
226 | 
227 |   4. You may copy and distribute the Library (or a portion or
228 | derivative of it, under Section 2) in object code or executable form
229 | under the terms of Sections 1 and 2 above provided that you accompany
230 | it with the complete corresponding machine-readable source code, which
231 | must be distributed under the terms of Sections 1 and 2 above on a
232 | medium customarily used for software interchange.
233 | 
234 |   If distribution of object code is made by offering access to copy
235 | from a designated place, then offering equivalent access to copy the
236 | source code from the same place satisfies the requirement to
237 | distribute the source code, even though third parties are not
238 | compelled to copy the source along with the object code.
239 | 
240 |   5. A program that contains no derivative of any portion of the
241 | Library, but is designed to work with the Library by being compiled or
242 | linked with it, is called a "work that uses the Library".  Such a
243 | work, in isolation, is not a derivative work of the Library, and
244 | therefore falls outside the scope of this License.
245 | 
246 |   However, linking a "work that uses the Library" with the Library
247 | creates an executable that is a derivative of the Library (because it
248 | contains portions of the Library), rather than a "work that uses the
249 | library".  The executable is therefore covered by this License.
250 | Section 6 states terms for distribution of such executables.
251 | 
252 |   When a "work that uses the Library" uses material from a header file
253 | that is part of the Library, the object code for the work may be a
254 | derivative work of the Library even though the source code is not.
255 | Whether this is true is especially significant if the work can be
256 | linked without the Library, or if the work is itself a library.  The
257 | threshold for this to be true is not precisely defined by law.
258 | 
259 |   If such an object file uses only numerical parameters, data
260 | structure layouts and accessors, and small macros and small inline
261 | functions (ten lines or less in length), then the use of the object
262 | file is unrestricted, regardless of whether it is legally a derivative
263 | work.  (Executables containing this object code plus portions of the
264 | Library will still fall under Section 6.)
265 | 
266 |   Otherwise, if the work is a derivative of the Library, you may
267 | distribute the object code for the work under the terms of Section 6.
268 | Any executables containing that work also fall under Section 6,
269 | whether or not they are linked directly with the Library itself.
270 | 
271 |   6. As an exception to the Sections above, you may also combine or
272 | link a "work that uses the Library" with the Library to produce a
273 | work containing portions of the Library, and distribute that work
274 | under terms of your choice, provided that the terms permit
275 | modification of the work for the customer's own use and reverse
276 | engineering for debugging such modifications.
277 | 
278 |   You must give prominent notice with each copy of the work that the
279 | Library is used in it and that the Library and its use are covered by
280 | this License.  You must supply a copy of this License.  If the work
281 | during execution displays copyright notices, you must include the
282 | copyright notice for the Library among them, as well as a reference
283 | directing the user to the copy of this License.  Also, you must do one
284 | of these things:
285 | 
286 |     a) Accompany the work with the complete corresponding
287 |     machine-readable source code for the Library including whatever
288 |     changes were used in the work (which must be distributed under
289 |     Sections 1 and 2 above); and, if the work is an executable linked
290 |     with the Library, with the complete machine-readable "work that
291 |     uses the Library", as object code and/or source code, so that the
292 |     user can modify the Library and then relink to produce a modified
293 |     executable containing the modified Library.  (It is understood
294 |     that the user who changes the contents of definitions files in the
295 |     Library will not necessarily be able to recompile the application
296 |     to use the modified definitions.)
297 | 
298 |     b) Use a suitable shared library mechanism for linking with the
299 |     Library.  A suitable mechanism is one that (1) uses at run time a
300 |     copy of the library already present on the user's computer system,
301 |     rather than copying library functions into the executable, and (2)
302 |     will operate properly with a modified version of the library, if
303 |     the user installs one, as long as the modified version is
304 |     interface-compatible with the version that the work was made with.
305 | 
306 |     c) Accompany the work with a written offer, valid for at
307 |     least three years, to give the same user the materials
308 |     specified in Subsection 6a, above, for a charge no more
309 |     than the cost of performing this distribution.
310 | 
311 |     d) If distribution of the work is made by offering access to copy
312 |     from a designated place, offer equivalent access to copy the above
313 |     specified materials from the same place.
314 | 
315 |     e) Verify that the user has already received a copy of these
316 |     materials or that you have already sent this user a copy.
317 | 
318 |   For an executable, the required form of the "work that uses the
319 | Library" must include any data and utility programs needed for
320 | reproducing the executable from it.  However, as a special exception,
321 | the materials to be distributed need not include anything that is
322 | normally distributed (in either source or binary form) with the major
323 | components (compiler, kernel, and so on) of the operating system on
324 | which the executable runs, unless that component itself accompanies
325 | the executable.
326 | 
327 |   It may happen that this requirement contradicts the license
328 | restrictions of other proprietary libraries that do not normally
329 | accompany the operating system.  Such a contradiction means you cannot
330 | use both them and the Library together in an executable that you
331 | distribute.
332 | 
333 |   7. You may place library facilities that are a work based on the
334 | Library side-by-side in a single library together with other library
335 | facilities not covered by this License, and distribute such a combined
336 | library, provided that the separate distribution of the work based on
337 | the Library and of the other library facilities is otherwise
338 | permitted, and provided that you do these two things:
339 | 
340 |     a) Accompany the combined library with a copy of the same work
341 |     based on the Library, uncombined with any other library
342 |     facilities.  This must be distributed under the terms of the
343 |     Sections above.
344 | 
345 |     b) Give prominent notice with the combined library of the fact
346 |     that part of it is a work based on the Library, and explaining
347 |     where to find the accompanying uncombined form of the same work.
348 | 
349 |   8. You may not copy, modify, sublicense, link with, or distribute
350 | the Library except as expressly provided under this License.  Any
351 | attempt otherwise to copy, modify, sublicense, link with, or
352 | distribute the Library is void, and will automatically terminate your
353 | rights under this License.  However, parties who have received copies,
354 | or rights, from you under this License will not have their licenses
355 | terminated so long as such parties remain in full compliance.
356 | 
357 |   9. You are not required to accept this License, since you have not
358 | signed it.  However, nothing else grants you permission to modify or
359 | distribute the Library or its derivative works.  These actions are
360 | prohibited by law if you do not accept this License.  Therefore, by
361 | modifying or distributing the Library (or any work based on the
362 | Library), you indicate your acceptance of this License to do so, and
363 | all its terms and conditions for copying, distributing or modifying
364 | the Library or works based on it.
365 | 
366 |   10. Each time you redistribute the Library (or any work based on the
367 | Library), the recipient automatically receives a license from the
368 | original licensor to copy, distribute, link with or modify the Library
369 | subject to these terms and conditions.  You may not impose any further
370 | restrictions on the recipients' exercise of the rights granted herein.
371 | You are not responsible for enforcing compliance by third parties with
372 | this License.
373 | 
374 |   11. If, as a consequence of a court judgment or allegation of patent
375 | infringement or for any other reason (not limited to patent issues),
376 | conditions are imposed on you (whether by court order, agreement or
377 | otherwise) that contradict the conditions of this License, they do not
378 | excuse you from the conditions of this License.  If you cannot
379 | distribute so as to satisfy simultaneously your obligations under this
380 | License and any other pertinent obligations, then as a consequence you
381 | may not distribute the Library at all.  For example, if a patent
382 | license would not permit royalty-free redistribution of the Library by
383 | all those who receive copies directly or indirectly through you, then
384 | the only way you could satisfy both it and this License would be to
385 | refrain entirely from distribution of the Library.
386 | 
387 | If any portion of this section is held invalid or unenforceable under any
388 | particular circumstance, the balance of the section is intended to apply,
389 | and the section as a whole is intended to apply in other circumstances.
390 | 
391 | It is not the purpose of this section to induce you to infringe any
392 | patents or other property right claims or to contest validity of any
393 | such claims; this section has the sole purpose of protecting the
394 | integrity of the free software distribution system which is
395 | implemented by public license practices.  Many people have made
396 | generous contributions to the wide range of software distributed
397 | through that system in reliance on consistent application of that
398 | system; it is up to the author/donor to decide if he or she is willing
399 | to distribute software through any other system and a licensee cannot
400 | impose that choice.
401 | 
402 | This section is intended to make thoroughly clear what is believed to
403 | be a consequence of the rest of this License.
404 | 
405 |   12. If the distribution and/or use of the Library is restricted in
406 | certain countries either by patents or by copyrighted interfaces, the
407 | original copyright holder who places the Library under this License may add
408 | an explicit geographical distribution limitation excluding those countries,
409 | so that distribution is permitted only in or among countries not thus
410 | excluded.  In such case, this License incorporates the limitation as if
411 | written in the body of this License.
412 | 
413 |   13. The Free Software Foundation may publish revised and/or new
414 | versions of the Lesser General Public License from time to time.
415 | Such new versions will be similar in spirit to the present version,
416 | but may differ in detail to address new problems or concerns.
417 | 
418 | Each version is given a distinguishing version number.  If the Library
419 | specifies a version number of this License which applies to it and
420 | "any later version", you have the option of following the terms and
421 | conditions either of that version or of any later version published by
422 | the Free Software Foundation.  If the Library does not specify a
423 | license version number, you may choose any version ever published by
424 | the Free Software Foundation.
425 | 
426 |   14. If you wish to incorporate parts of the Library into other free
427 | programs whose distribution conditions are incompatible with these,
428 | write to the author to ask for permission.  For software which is
429 | copyrighted by
430 | 


--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
  1 | ## Welcome to Classifier
  2 | 
  3 | Classifier is a general module to allow Bayesian and other types of classifications.
  4 | 
  5 | ## Download
  6 | 
  7 | * https://github.com/cardmagic/classifier
  8 | * gem install classifier
  9 | * git clone https://github.com/cardmagic/classifier.git
 10 | 
 11 | ## Dependencies
 12 | 
 13 | If you install Classifier from source, you'll need to install Roman Shterenzon's fast-stemmer gem with RubyGems as follows:
 14 | 
 15 |     gem install fast-stemmer
 16 | 
 17 | If you would like to speed up LSI classification by at least 10x, please install the following libraries:
 18 | * GNU GSL: http://www.gnu.org/software/gsl
 19 | * rb-gsl: https://github.com/SciRuby/rb-gsl
 20 | 
 21 | Note that LSI will work without these libraries, but as soon as they are installed, Classifier will make use of them. No configuration changes are needed; we like to keep things ridiculously easy for you.
 22 | 
 23 | ## Bayes
 24 | 
 25 | A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast, and have modest memory requirements.
 26 | 
 27 | ### Usage
 28 | 
 29 |     require 'classifier'
 30 |     b = Classifier::Bayes.new 'Interesting', 'Uninteresting'
 31 |     b.train_interesting "here are some good words. I hope you love them"
 32 |     b.train_uninteresting "here are some bad words, I hate you"
 33 |     b.classify "I hate bad words and you" # returns 'Uninteresting'
 34 | 
 35 |     require 'madeleine'
 36 |     m = SnapshotMadeleine.new("bayes_data") {
 37 |         Classifier::Bayes.new 'Interesting', 'Uninteresting'
 38 |     }
 39 |     m.system.train_interesting "here are some good words. I hope you love them"
 40 |     m.system.train_uninteresting "here are some bad words, I hate you"
 41 |     m.take_snapshot
 42 |     m.system.classify "I love you" # returns 'Interesting'
 43 | 
 44 | Using Madeleine, your application can persist the learned data over time.
 45 | 
 46 | ### Bayesian Classification
 47 | 
 48 | * http://www.process.com/precisemail/bayesian_filtering.htm
 49 | * http://en.wikipedia.org/wiki/Bayesian_filtering
 50 | * http://www.paulgraham.com/spam.html
 51 | 
 52 | ## LSI
 53 | 
 54 | A Latent Semantic Indexer by David Fayram. Latent Semantic Indexing engines
 55 | are not as fast or as small as Bayesian classifiers, but are more flexible, providing
 56 | fast search and clustering detection as well as semantic analysis of the text that
 57 | theoretically simulates human learning.
 58 | 
 59 | ### Usage
 60 | 
 61 |     require 'classifier'
 62 |     lsi = Classifier::LSI.new
 63 |     strings = [ ["This text deals with dogs. Dogs.", :dog],
 64 |               ["This text involves dogs too. Dogs! ", :dog],
 65 |               ["This text revolves around cats. Cats.", :cat],
 66 |               ["This text also involves cats. Cats!", :cat],
 67 |               ["This text involves birds. Birds.",:bird ]]
 68 |     strings.each {|x| lsi.add_item x.first, x.last}
 69 | 
 70 |     lsi.search("dog", 3)
 71 |     # returns => ["This text deals with dogs. Dogs.", "This text involves dogs too. Dogs! ",
 72 |     #             "This text also involves cats. Cats!"]
 73 | 
 74 |     lsi.find_related(strings[2], 2)
 75 |     # returns => ["This text revolves around cats. Cats.", "This text also involves cats. Cats!"]
 76 | 
 77 |     lsi.classify "This text is also about dogs!"
 78 |     # returns => :dog
 79 | 
 80 |     lsi.classify_with_confidence "This text is also about dogs!"
 81 |     # returns => [:dog, 1.0]
 82 | 
 83 | Please see the Classifier::LSI documentation for more information. It is possible to index, search and classify
 84 | with more than just simple strings.
 85 | 
 86 | ### Latent Semantic Indexing
 87 | 
 88 | * http://www.c2.com/cgi/wiki?LatentSemanticIndexing
 89 | * http://www.chadfowler.com/index.cgi/Computing/LatentSemanticIndexing.rdoc
 90 | * http://en.wikipedia.org/wiki/Latent_semantic_analysis
 91 | 
 92 | ## Authors
 93 | 
 94 | * Lucas Carlson  (lucas@rufy.com)
 95 | * David Fayram II (dfayram@gmail.com)
 96 | * Cameron McBride (cameron.mcbride@gmail.com)
 97 | * Ivan Acosta-Rubio (ivan@softwarecriollo.com)
 98 | 
 99 | This library is released under the terms of the GNU LGPL. See LICENSE for more details.
100 | 
101 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | require 'rake'
 2 | require 'rake/testtask'
 3 | require 'rdoc/task'
 4 | 
 5 | desc 'Default Task'
 6 | task default: [:test]
 7 | 
 8 | # Run the unit tests
 9 | desc 'Run all unit tests'
10 | Rake::TestTask.new('test') do |t|
11 |   t.libs << 'lib'
12 |   t.pattern = 'test/*/*_test.rb'
13 |   t.verbose = true
14 | end
15 | 
16 | # Make a console, useful when working on tests
17 | desc 'Generate a test console'
18 | task :console do
19 |   verbose(false) { sh "irb -I lib/ -r 'classifier'" }
20 | end
21 | 
22 | # Generate the RDoc documentation
23 | desc 'Create documentation'
24 | Rake::RDocTask.new('doc') do |rdoc|
25 |   rdoc.title = 'Ruby Classifier - Bayesian and LSI classification library'
26 |   rdoc.rdoc_dir = 'html'
27 |   rdoc.rdoc_files.include('README.markdown')
28 |   rdoc.rdoc_files.include('lib/**/*.rb')
29 | end
30 | 
31 | desc 'Report code statistics (KLOCs, etc) from the application'
32 | task :stats do
33 |   require 'code_statistics'
34 |   CodeStatistics.new(
35 |     %w[Library lib],
36 |     %w[Units test]
37 |   ).to_s
38 | end
39 | 
40 | desc 'Publish new documentation'
41 | task :publish do
42 |   `ssh rufy update-classifier-doc`
43 |   Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
44 | end
45 | 


--------------------------------------------------------------------------------
/bin/bayes.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | begin
 4 | 	require 'rubygems'
 5 | 	require 'classifier'
 6 | rescue
 7 | 	require 'classifier'
 8 | end
 9 | 
10 | require 'madeleine'
11 | 
12 | m = SnapshotMadeleine.new(File.expand_path("~/.bayes_data")) {
13 | 	Classifier::Bayes.new 'Interesting', 'Uninteresting'
14 | }
15 | 
16 | case ARGV[0]
17 | when "add"
18 | 	case ARGV[1].downcase
19 | 	when "interesting"
20 | 		m.system.train_interesting File.open(ARGV[2]).read
21 | 		puts "#{ARGV[2]} has been classified as interesting"
22 | 	when "uninteresting"
23 | 		m.system.train_uninteresting File.open(ARGV[2]).read
24 | 		puts "#{ARGV[2]} has been classified as uninteresting"
25 | 	else
26 | 		puts "Invalid category: choose between interesting and uninteresting"
27 | 		exit(1)
28 | 	end
29 | when "classify"
30 | 	puts m.system.classify(File.open(ARGV[1]).read)
31 | else
32 | 	puts "Invalid option: choose add [category] [file] or clasify [file]"
33 | 	exit(-1)
34 | end
35 | 
36 | m.take_snapshot
37 | 


--------------------------------------------------------------------------------
/bin/summarize.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | begin
 4 |         require 'rubygems'
 5 |         require 'classifier'
 6 | rescue
 7 |         require 'classifier'
 8 | end
 9 | 
10 | require 'open-uri'
11 | 
12 | num = ARGV[1].to_i
13 | num = num < 1 ? 10 : num
14 | 
15 | text = open(ARGV.first).read
16 | puts text.gsub(/<[^>]+>/,"").gsub(/[\s]+/," ").summary(num)
17 | 


--------------------------------------------------------------------------------
/classifier.gemspec:
--------------------------------------------------------------------------------
 1 | Gem::Specification.new do |s|
 2 |   s.name        = 'classifier'
 3 |   s.version     = '1.4.4'
 4 |   s.summary     = 'A general classifier module to allow Bayesian and other types of classifications.'
 5 |   s.description = 'A general classifier module to allow Bayesian and other types of classifications.'
 6 |   s.author = 'Lucas Carlson'
 7 |   s.email = 'lucas@rufy.com'
 8 |   s.homepage = 'https://github.com/cardmagic/classifier'
 9 |   s.files = Dir['{lib}/**/*.rb', 'bin/*', 'LICENSE', '*.md', 'test/*']
10 |   s.license = 'LGPL'
11 | 
12 |   s.add_dependency 'fast-stemmer', '~> 1.0'
13 |   s.add_dependency 'mutex_m', '~> 0.2'
14 |   s.add_dependency 'rake'
15 |   s.add_development_dependency 'minitest'
16 |   s.add_development_dependency 'rdoc'
17 | end
18 | 


--------------------------------------------------------------------------------
/cloving.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "languages": [
 3 |     {
 4 |       "name": "Ruby",
 5 |       "primary": true,
 6 |       "directory": "lib",
 7 |       "extension": ".rb"
 8 |     }
 9 |   ],
10 |   "frameworks": [],
11 |   "testingFrameworks": [
12 |     {
13 |       "name": "Test::Unit",
14 |       "type": "Testing framework",
15 |       "directory": "test"
16 |     }
17 |   ],
18 |   "buildTools": [
19 |     {
20 |       "name": "Rake",
21 |       "type": "Build tool"
22 |     }
23 |   ],
24 |   "packageManager": "Bundler",
25 |   "linters": [],
26 |   "projectType": "Ruby Gem"
27 | }


--------------------------------------------------------------------------------
/install.rb:
--------------------------------------------------------------------------------
 1 | require 'rbconfig'
 2 | require 'find'
 3 | require 'ftools'
 4 | 
 5 | include Config
 6 | 
 7 | # this was adapted from rdoc's install.rb by ways of Log4r
 8 | 
 9 | $sitedir = CONFIG["sitelibdir"]
10 | unless $sitedir
11 |   version = CONFIG["MAJOR"] + "." + CONFIG["MINOR"]
12 |   $libdir = File.join(CONFIG["libdir"], "ruby", version)
13 |   $sitedir = $:.find {|x| x =~ /site_ruby/ }
14 |   if !$sitedir
15 |     $sitedir = File.join($libdir, "site_ruby")
16 |   elsif $sitedir !~ Regexp.quote(version)
17 |     $sitedir = File.join($sitedir, version)
18 |   end
19 | end
20 | 
21 | makedirs = %w{ classifier }
22 | makedirs = %w{ classifier/extensions }
23 | makedirs = %w{ classifier/lsi }
24 | makedirs.each {|f| File::makedirs(File.join($sitedir, *f.split(/\//)))}
25 | 
26 | Dir.chdir("lib")
27 | begin
28 |   require 'rubygems'
29 |   require 'rake'
30 | rescue LoadError
31 |   puts
32 |   puts "Please install Gem and Rake from http://rubyforge.org/projects/rubygems and http://rubyforge.org/projects/rake"
33 |   puts
34 |   exit(-1)
35 | end
36 | 
37 | files = FileList["**/*"]
38 | 
39 | # File::safe_unlink *deprecated.collect{|f| File.join($sitedir, f.split(/\//))}
40 | files.each {|f|
41 |   File::install(f, File.join($sitedir, *f.split(/\//)), 0644, true)
42 | }
43 | 
44 | begin
45 |   require 'stemmer'
46 | rescue LoadError
47 |   puts
48 |   puts "Please install Stemmer from http://rubyforge.org/projects/stemmer or via 'gem install stemmer'"
49 |   puts
50 | end
51 | 


--------------------------------------------------------------------------------
/lib/classifier.rb:
--------------------------------------------------------------------------------
 1 | #--
 2 | # Copyright (c) 2005 Lucas Carlson
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining
 5 | # a copy of this software and associated documentation files (the
 6 | # "Software"), to deal in the Software without restriction, including
 7 | # without limitation the rights to use, copy, modify, merge, publish,
 8 | # distribute, sublicense, and/or sell copies of the Software, and to
 9 | # permit persons to whom the Software is furnished to do so, subject to
10 | # the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be
13 | # included in all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 | #++
23 | # Author::    Lucas Carlson  (mailto:lucas@rufy.com)
24 | # Copyright:: Copyright (c) 2005 Lucas Carlson
25 | # License::   LGPL
26 | 
27 | require 'rubygems'
28 | require 'classifier/extensions/string'
29 | require 'classifier/extensions/vector'
30 | require 'classifier/bayes'
31 | require 'classifier/lsi'
32 | 


--------------------------------------------------------------------------------
/lib/classifier/bayes.rb:
--------------------------------------------------------------------------------
  1 | # Author::    Lucas Carlson  (mailto:lucas@rufy.com)
  2 | # Copyright:: Copyright (c) 2005 Lucas Carlson
  3 | # License::   LGPL
  4 | 
  5 | module Classifier
  6 |   class Bayes
  7 |     # The class can be created with one or more categories, each of which will be
  8 |     # initialized and given a training method. E.g.,
  9 |     #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
 10 |     def initialize(*categories)
 11 |       @categories = {}
 12 |       categories.each { |category| @categories[category.prepare_category_name] = {} }
 13 |       @total_words = 0
 14 |       @category_counts = Hash.new(0)
 15 |       @category_word_count = Hash.new(0)
 16 |     end
 17 | 
 18 |     #
 19 |     # Provides a general training method for all categories specified in Bayes#new
 20 |     # For example:
 21 |     #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
 22 |     #     b.train :this, "This text"
 23 |     #     b.train "that", "That text"
 24 |     #     b.train "The other", "The other text"
 25 |     def train(category, text)
 26 |       category = category.prepare_category_name
 27 |       @category_counts[category] += 1
 28 |       text.word_hash.each do |word, count|
 29 |         @categories[category][word] ||= 0
 30 |         @categories[category][word] += count
 31 |         @total_words += count
 32 |         @category_word_count[category] += count
 33 |       end
 34 |     end
 35 | 
 36 |     #
 37 |     # Provides a untraining method for all categories specified in Bayes#new
 38 |     # Be very careful with this method.
 39 |     #
 40 |     # For example:
 41 |     #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
 42 |     #     b.train :this, "This text"
 43 |     #     b.untrain :this, "This text"
 44 |     def untrain(category, text)
 45 |       category = category.prepare_category_name
 46 |       @category_counts[category] -= 1
 47 |       text.word_hash.each do |word, count|
 48 |         next unless @total_words >= 0
 49 | 
 50 |         orig = @categories[category][word] || 0
 51 |         @categories[category][word] ||= 0
 52 |         @categories[category][word] -= count
 53 |         if @categories[category][word] <= 0
 54 |           @categories[category].delete(word)
 55 |           count = orig
 56 |         end
 57 |         @category_word_count[category] -= count if @category_word_count[category] >= count
 58 |         @total_words -= count
 59 |       end
 60 |     end
 61 | 
 62 |     #
 63 |     # Returns the scores in each category the provided +text+. E.g.,
 64 |     #    b.classifications "I hate bad words and you"
 65 |     #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
 66 |     # The largest of these scores (the one closest to 0) is the one picked out by #classify
 67 |     def classifications(text)
 68 |       score = {}
 69 |       word_hash = text.word_hash
 70 |       training_count = @category_counts.values.inject { |x, y| x + y }.to_f
 71 |       @categories.each do |category, category_words|
 72 |         score[category.to_s] = 0
 73 |         total = (@category_word_count[category] || 1).to_f
 74 |         word_hash.each_key do |word|
 75 |           s = category_words.key?(word) ? category_words[word] : 0.1
 76 |           score[category.to_s] += Math.log(s / total)
 77 |         end
 78 |         # now add prior probability for the category
 79 |         s = @category_counts.key?(category) ? @category_counts[category] : 0.1
 80 |         score[category.to_s] += Math.log(s / training_count)
 81 |       end
 82 |       score
 83 |     end
 84 | 
 85 |     #
 86 |     # Returns the classification of the provided +text+, which is one of the
 87 |     # categories given in the initializer. E.g.,
 88 |     #    b.classify "I hate bad words and you"
 89 |     #    =>  'Uninteresting'
 90 |     def classify(text)
 91 |       (classifications(text).sort_by { |a| -a[1] })[0][0]
 92 |     end
 93 | 
 94 |     #
 95 |     # Provides training and untraining methods for the categories specified in Bayes#new
 96 |     # For example:
 97 |     #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
 98 |     #     b.train_this "This text"
 99 |     #     b.train_that "That text"
100 |     #     b.untrain_that "That text"
101 |     #     b.train_the_other "The other text"
102 |     def method_missing(name, *args)
103 |       category = name.to_s.gsub(/(un)?train_(\w+)/, '\2').prepare_category_name
104 |       if @categories.key?(category)
105 |         args.each do |text|
106 |           if name.to_s.start_with?('untrain_')
107 |             untrain(category, text)
108 |           else
109 |             train(category, text)
110 |           end
111 |         end
112 |       elsif name.to_s =~ /(un)?train_(\w+)/
113 |         raise StandardError, "No such category: #{category}"
114 |       else
115 |         super
116 |       end
117 |     end
118 | 
119 |     #
120 |     # Provides a list of category names
121 |     # For example:
122 |     #     b.categories
123 |     #     =>   ['This', 'That', 'the_other']
124 |     def categories # :nodoc:
125 |       @categories.keys.collect(&:to_s)
126 |     end
127 | 
128 |     #
129 |     # Allows you to add categories to the classifier.
130 |     # For example:
131 |     #     b.add_category "Not spam"
132 |     #
133 |     # WARNING: Adding categories to a trained classifier will
134 |     # result in an undertrained category that will tend to match
135 |     # more criteria than the trained selective categories. In short,
136 |     # try to initialize your categories at initialization.
137 |     def add_category(category)
138 |       @categories[category.prepare_category_name] = {}
139 |     end
140 | 
141 |     alias append_category add_category
142 | 
143 |     #
144 |     # Allows you to remove categories from the classifier.
145 |     # For example:
146 |     #     b.remove_category "Spam"
147 |     #
148 |     # WARNING: Removing categories from a trained classifier will
149 |     # result in the loss of all training data for that category.
150 |     # Make sure you really want to do this before calling this method.
151 |     def remove_category(category)
152 |       category = category.prepare_category_name
153 |       raise StandardError, "No such category: #{category}" unless @categories.key?(category)
154 | 
155 |       @total_words -= @category_word_count[category].to_i
156 | 
157 |       @categories.delete(category)
158 |       @category_counts.delete(category)
159 |       @category_word_count.delete(category)
160 |     end
161 |   end
162 | end
163 | 


--------------------------------------------------------------------------------
/lib/classifier/extensions/string.rb:
--------------------------------------------------------------------------------
 1 | # Author::    Lucas Carlson  (mailto:lucas@rufy.com)
 2 | # Copyright:: Copyright (c) 2005 Lucas Carlson
 3 | # License::   LGPL
 4 | 
 5 | require 'fast_stemmer'
 6 | require 'classifier/extensions/word_hash'
 7 | 
 8 | class Object
 9 |   def prepare_category_name
10 |     to_s.gsub('_', ' ').capitalize.intern
11 |   end
12 | end
13 | 


--------------------------------------------------------------------------------
/lib/classifier/extensions/vector.rb:
--------------------------------------------------------------------------------
  1 | # Author::    Ernest Ellingson
  2 | # Copyright:: Copyright (c) 2005
  3 | 
  4 | # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
  5 | 
  6 | require 'matrix'
  7 | 
  8 | class Array
  9 |   def sum_with_identity(identity = 0.0, &block)
 10 |     return identity unless size.to_i.positive?
 11 | 
 12 |     if block_given?
 13 |       map(&block).sum_with_identity(identity)
 14 |     else
 15 |       compact.reduce(:+).to_f || identity.to_f
 16 |     end
 17 |   end
 18 | end
 19 | 
 20 | module VectorExtensions
 21 |   def magnitude
 22 |     sum_of_squares = 0.to_r
 23 |     size.times do |i|
 24 |       sum_of_squares += self[i]**2.to_r
 25 |     end
 26 |     Math.sqrt(sum_of_squares.to_f)
 27 |   end
 28 | 
 29 |   def normalize
 30 |     normalized_values = []
 31 |     magnitude_value = magnitude.to_r
 32 |     size.times do |i|
 33 |       normalized_values << (self[i] / magnitude_value)
 34 |     end
 35 |     Vector[*normalized_values]
 36 |   end
 37 | end
 38 | 
 39 | class Vector
 40 |   include VectorExtensions
 41 | end
 42 | 
 43 | class Matrix
 44 |   def self.diag(diagonal_elements)
 45 |     Matrix.diagonal(*diagonal_elements)
 46 |   end
 47 | 
 48 |   alias trans transpose
 49 | 
 50 |   def SV_decomp(max_sweeps = 20)
 51 |     q_matrix = if row_size >= column_size
 52 |                  trans * self
 53 |                else
 54 |                  self * trans
 55 |                end
 56 | 
 57 |     q_rotation_matrix = q_matrix.dup
 58 |     v_matrix = Matrix.identity(q_matrix.row_size)
 59 |     iteration_count = 0
 60 |     previous_s_matrix = nil
 61 | 
 62 |     loop do
 63 |       iteration_count += 1
 64 |       (0...q_rotation_matrix.row_size - 1).each do |row|
 65 |         (1..q_rotation_matrix.row_size - 1).each do |col|
 66 |           next if row == col
 67 | 
 68 |           angle = Math.atan((2.to_r * q_rotation_matrix[row,
 69 |                                                         col]) / (q_rotation_matrix[row,
 70 |                                                                                    row] - q_rotation_matrix[col,
 71 |                                                                                                             col])) / 2.0
 72 |           cosine = Math.cos(angle)
 73 |           sine = Math.sin(angle)
 74 |           rotation_matrix = Matrix.identity(q_rotation_matrix.row_size)
 75 |           rotation_matrix[row, row] = cosine
 76 |           rotation_matrix[row, col] = -sine
 77 |           rotation_matrix[col, row] = sine
 78 |           rotation_matrix[col, col] = cosine
 79 |           q_rotation_matrix = rotation_matrix.trans * q_rotation_matrix * rotation_matrix
 80 |           v_matrix *= rotation_matrix
 81 |         end
 82 |       end
 83 |       previous_s_matrix = q_rotation_matrix.dup if iteration_count == 1
 84 |       sum_of_differences = 0.to_r
 85 |       if iteration_count > 1
 86 |         q_rotation_matrix.row_size.times do |r|
 87 |           difference = (q_rotation_matrix[r, r] - previous_s_matrix[r, r]).abs
 88 |           sum_of_differences += difference.to_r if difference > 0.001
 89 |         end
 90 |         previous_s_matrix = q_rotation_matrix.dup
 91 |       end
 92 |       break if (sum_of_differences <= 0.001 && iteration_count > 1) || iteration_count >= max_sweeps
 93 |     end
 94 | 
 95 |     singular_values = []
 96 |     q_rotation_matrix.row_size.times do |r|
 97 |       singular_values << Math.sqrt(q_rotation_matrix[r, r].to_f)
 98 |     end
 99 |     u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*singular_values).inverse
100 |     [u_matrix, v_matrix, singular_values]
101 |   end
102 | 
103 |   def []=(row_index, col_index, value)
104 |     @rows[row_index][col_index] = value
105 |   end
106 | end
107 | 


--------------------------------------------------------------------------------
/lib/classifier/extensions/vector_serialize.rb:
--------------------------------------------------------------------------------
 1 | module GSL
 2 |   class Vector
 3 |     def _dump(_v)
 4 |       Marshal.dump(to_a)
 5 |     end
 6 | 
 7 |     def self._load(arr)
 8 |       arry = Marshal.load(arr)
 9 |       GSL::Vector.alloc(arry)
10 |     end
11 |   end
12 | 
13 |   class Matrix
14 |     class << self
15 |       alias diag diagonal
16 |     end
17 |   end
18 | end
19 | 


--------------------------------------------------------------------------------
/lib/classifier/extensions/word_hash.rb:
--------------------------------------------------------------------------------
  1 | # Author::    Lucas Carlson  (mailto:lucas@rufy.com)
  2 | # Copyright:: Copyright (c) 2005 Lucas Carlson
  3 | # License::   LGPL
  4 | 
  5 | require 'set'
  6 | 
  7 | # These are extensions to the String class to provide convenience
  8 | # methods for the Classifier package.
  9 | class String
 10 |   # Removes common punctuation symbols, returning a new string.
 11 |   # E.g.,
 12 |   #   "Hello (greeting's), with {braces} < >...?".without_punctuation
 13 |   #   => "Hello  greetings   with  braces         "
 14 |   def without_punctuation
 15 |     tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', ' ').tr("'\-", '')
 16 |   end
 17 | 
 18 |   # Return a Hash of symbols => ints. Each word in the string is stemmed,
 19 |   # interned, and mapped to its frequency in the document.
 20 |   def word_hash
 21 |     word_hash = clean_word_hash
 22 |     symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
 23 |     word_hash.merge(symbol_hash)
 24 |   end
 25 | 
 26 |   # Return a word hash without extra punctuation or short symbols, just stemmed words
 27 |   def clean_word_hash
 28 |     word_hash_for_words gsub(/[^\w\s]/, '').split
 29 |   end
 30 | 
 31 |   private
 32 | 
 33 |   def word_hash_for_words(words)
 34 |     d = Hash.new(0)
 35 |     words.each do |word|
 36 |       word.downcase!
 37 |       d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
 38 |     end
 39 |     d
 40 |   end
 41 | 
 42 |   def word_hash_for_symbols(words)
 43 |     d = Hash.new(0)
 44 |     words.each do |word|
 45 |       d[word.intern] += 1
 46 |     end
 47 |     d
 48 |   end
 49 | 
 50 |   CORPUS_SKIP_WORDS = ::Set.new(%w[
 51 |                                   a
 52 |                                   again
 53 |                                   all
 54 |                                   along
 55 |                                   are
 56 |                                   also
 57 |                                   an
 58 |                                   and
 59 |                                   as
 60 |                                   at
 61 |                                   but
 62 |                                   by
 63 |                                   came
 64 |                                   can
 65 |                                   cant
 66 |                                   couldnt
 67 |                                   did
 68 |                                   didn
 69 |                                   didnt
 70 |                                   do
 71 |                                   doesnt
 72 |                                   dont
 73 |                                   ever
 74 |                                   first
 75 |                                   from
 76 |                                   have
 77 |                                   her
 78 |                                   here
 79 |                                   him
 80 |                                   how
 81 |                                   i
 82 |                                   if
 83 |                                   in
 84 |                                   into
 85 |                                   is
 86 |                                   isnt
 87 |                                   it
 88 |                                   itll
 89 |                                   just
 90 |                                   last
 91 |                                   least
 92 |                                   like
 93 |                                   most
 94 |                                   my
 95 |                                   new
 96 |                                   no
 97 |                                   not
 98 |                                   now
 99 |                                   of
100 |                                   on
101 |                                   or
102 |                                   should
103 |                                   sinc
104 |                                   so
105 |                                   some
106 |                                   th
107 |                                   than
108 |                                   this
109 |                                   that
110 |                                   the
111 |                                   their
112 |                                   then
113 |                                   those
114 |                                   to
115 |                                   told
116 |                                   too
117 |                                   true
118 |                                   try
119 |                                   until
120 |                                   url
121 |                                   us
122 |                                   were
123 |                                   when
124 |                                   whether
125 |                                   while
126 |                                   with
127 |                                   within
128 |                                   yes
129 |                                   you
130 |                                   youll
131 |                                 ])
132 | end
133 | 


--------------------------------------------------------------------------------
/lib/classifier/lsi.rb:
--------------------------------------------------------------------------------
  1 | # Author::    David Fayram  (mailto:dfayram@lensmen.net)
  2 | # Copyright:: Copyright (c) 2005 David Fayram II
  3 | # License::   LGPL
  4 | 
  5 | module Classifier
  6 |   class LSI
  7 |     @gsl_available = false
  8 | 
  9 |     class << self
 10 |       attr_accessor :gsl_available
 11 |     end
 12 |   end
 13 | end
 14 | 
 15 | begin
 16 |   # to test the native vector class, try `rake test NATIVE_VECTOR=true`
 17 |   raise LoadError if ENV['NATIVE_VECTOR'] == 'true'
 18 |   raise LoadError unless Gem::Specification.find_all_by_name('gsl').any?
 19 | 
 20 |   require 'gsl' # requires https://github.com/SciRuby/rb-gsl/
 21 |   require 'classifier/extensions/vector_serialize'
 22 |   Classifier::LSI.gsl_available = true
 23 | rescue LoadError
 24 |   warn 'Notice: for 10x faster LSI support in the classifier gem, please install the gsl gem'
 25 |   Classifier::LSI.gsl_available = false
 26 |   require 'classifier/extensions/vector'
 27 | end
 28 | 
 29 | require 'classifier/lsi/word_list'
 30 | require 'classifier/lsi/content_node'
 31 | require 'classifier/lsi/summary'
 32 | 
 33 | module Classifier
 34 |   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
 35 |   # data based on underlying semantic relations. For more information on the algorithms used,
 36 |   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
 37 |   class LSI
 38 |     attr_reader :word_list
 39 |     attr_accessor :auto_rebuild
 40 | 
 41 |     # Create a fresh index.
 42 |     # If you want to call #build_index manually, use
 43 |     #      Classifier::LSI.new auto_rebuild: false
 44 |     #
 45 |     def initialize(options = {})
 46 |       @auto_rebuild = true unless options[:auto_rebuild] == false
 47 |       @word_list = WordList.new
 48 |       @items = {}
 49 |       @version = 0
 50 |       @built_at_version = -1
 51 |     end
 52 | 
 53 |     # Returns true if the index needs to be rebuilt.  The index needs
 54 |     # to be built after all information is added, but before you start
 55 |     # using it for search, classification and cluster detection.
 56 |     def needs_rebuild?
 57 |       (@items.keys.size > 1) && (@version != @built_at_version)
 58 |     end
 59 | 
 60 |     # Adds an item to the index. item is assumed to be a string, but
 61 |     # any item may be indexed so long as it responds to #to_s or if
 62 |     # you provide an optional block explaining how the indexer can
 63 |     # fetch fresh string data. This optional block is passed the item,
 64 |     # so the item may only be a reference to a URL or file name.
 65 |     #
 66 |     # For example:
 67 |     #   lsi = Classifier::LSI.new
 68 |     #   lsi.add_item "This is just plain text"
 69 |     #   lsi.add_item("/home/me/filename.txt") { |x| File.read x }
 70 |     #   ar = ActiveRecordObject.find( :all )
 71 |     #   lsi.add_item(ar, *ar.categories) { |x| ar.content }
 72 |     #
 73 |     def add_item(item, *categories, &block)
 74 |       clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
 75 |       @items[item] = ContentNode.new(clean_word_hash, *categories)
 76 |       @version += 1
 77 |       build_index if @auto_rebuild
 78 |     end
 79 | 
 80 |     # A less flexible shorthand for add_item that assumes
 81 |     # you are passing in a string with no categories. item
 82 |     # will be duck typed via to_s .
 83 |     #
 84 |     def <<(item)
 85 |       add_item(item)
 86 |     end
 87 | 
 88 |     # Returns the categories for a given indexed item. You are free to add and remove
 89 |     # items from this as you see fit. It does not invalidate an index to change its categories.
 90 |     def categories_for(item)
 91 |       return [] unless @items[item]
 92 | 
 93 |       @items[item].categories
 94 |     end
 95 | 
 96 |     # Removes an item from the database, if it is indexed.
 97 |     #
 98 |     def remove_item(item)
 99 |       return unless @items.key?(item)
100 | 
101 |       @items.delete(item)
102 |       @version += 1
103 |     end
104 | 
105 |     # Returns an array of items that are indexed.
106 |     def items
107 |       @items.keys
108 |     end
109 | 
110 |     # This function rebuilds the index if needs_rebuild? returns true.
111 |     # For very large document spaces, this indexing operation may take some
112 |     # time to complete, so it may be wise to place the operation in another
113 |     # thread.
114 |     #
115 |     # As a rule, indexing will be fairly swift on modern machines until
116 |     # you have well over 500 documents indexed, or have an incredibly diverse
117 |     # vocabulary for your documents.
118 |     #
119 |     # The optional parameter "cutoff" is a tuning parameter. When the index is
120 |     # built, a certain number of s-values are discarded from the system. The
121 |     # cutoff parameter tells the indexer how many of these values to keep.
122 |     # A value of 1 for cutoff means that no semantic analysis will take place,
123 |     # turning the LSI class into a simple vector search engine.
124 |     def build_index(cutoff = 0.75)
125 |       return unless needs_rebuild?
126 | 
127 |       make_word_list
128 | 
129 |       doc_list = @items.values
130 |       tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
131 | 
132 |       if self.class.gsl_available
133 |         tdm = GSL::Matrix.alloc(*tda).trans
134 |         ntdm = build_reduced_matrix(tdm, cutoff)
135 | 
136 |         ntdm.size[1].times do |col|
137 |           vec = GSL::Vector.alloc(ntdm.column(col)).row
138 |           doc_list[col].lsi_vector = vec
139 |           doc_list[col].lsi_norm = vec.normalize
140 |         end
141 |       else
142 |         tdm = Matrix.rows(tda).trans
143 |         ntdm = build_reduced_matrix(tdm, cutoff)
144 | 
145 |         ntdm.row_size.times do |col|
146 |           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
147 |           doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
148 |         end
149 |       end
150 | 
151 |       @built_at_version = @version
152 |     end
153 | 
154 |     # This method returns max_chunks entries, ordered by their average semantic rating.
155 |     # Essentially, the average distance of each entry from all other entries is calculated,
156 |     # the highest are returned.
157 |     #
158 |     # This can be used to build a summary service, or to provide more information about
159 |     # your dataset's general content. For example, if you were to use categorize on the
160 |     # results of this data, you could gather information on what your dataset is generally
161 |     # about.
162 |     def highest_relative_content(max_chunks = 10)
163 |       return [] if needs_rebuild?
164 | 
165 |       avg_density = {}
166 |       @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |i, j| i + j[1] } }
167 | 
168 |       avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks - 1].map
169 |     end
170 | 
171 |     # This function is the primitive that find_related and classify
172 |     # build upon. It returns an array of 2-element arrays. The first element
173 |     # of this array is a document, and the second is its "score", defining
174 |     # how "close" it is to other indexed items.
175 |     #
176 |     # These values are somewhat arbitrary, having to do with the vector space
177 |     # created by your content, so the magnitude is interpretable but not always
178 |     # meaningful between indexes.
179 |     #
180 |     # The parameter doc is the content to compare. If that content is not
181 |     # indexed, you can pass an optional block to define how to create the
182 |     # text data. See add_item for examples of how this works.
183 |     def proximity_array_for_content(doc, &block)
184 |       return [] if needs_rebuild?
185 | 
186 |       content_node = node_for_content(doc, &block)
187 |       result =
188 |         @items.keys.collect do |item|
189 |           val = if self.class.gsl_available
190 |                   content_node.search_vector * @items[item].search_vector.col
191 |                 else
192 |                   (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
193 |                 end
194 |           [item, val]
195 |         end
196 |       result.sort_by { |x| x[1] }.reverse
197 |     end
198 | 
199 |     # Similar to proximity_array_for_content, this function takes similar
200 |     # arguments and returns a similar array. However, it uses the normalized
201 |     # calculated vectors instead of their full versions. This is useful when
202 |     # you're trying to perform operations on content that is much smaller than
203 |     # the text you're working with. search uses this primitive.
204 |     def proximity_norms_for_content(doc, &block)
205 |       return [] if needs_rebuild?
206 | 
207 |       content_node = node_for_content(doc, &block)
208 |       result =
209 |         @items.keys.collect do |item|
210 |           val = if self.class.gsl_available
211 |                   content_node.search_norm * @items[item].search_norm.col
212 |                 else
213 |                   (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
214 |                 end
215 |           [item, val]
216 |         end
217 |       result.sort_by { |x| x[1] }.reverse
218 |     end
219 | 
220 |     # This function allows for text-based search of your index. Unlike other functions
221 |     # like find_related and classify, search only takes short strings. It will also ignore
222 |     # factors like repeated words. It is best for short, google-like search terms.
223 |     # A search will first prioritize lexical relationships, then semantic ones.
224 |     #
225 |     # While this may seem backwards compared to the other functions that LSI supports,
226 |     # it is actually the same algorithm, just applied on a smaller document.
227 |     def search(string, max_nearest = 3)
228 |       return [] if needs_rebuild?
229 | 
230 |       carry = proximity_norms_for_content(string)
231 |       result = carry.collect { |x| x[0] }
232 |       result[0..max_nearest - 1]
233 |     end
234 | 
235 |     # This function takes content and finds other documents
236 |     # that are semantically "close", returning an array of documents sorted
237 |     # from most to least relevant.
238 |     # max_nearest specifies the number of documents to return. A value of
239 |     # 0 means that it returns all the indexed documents, sorted by relevance.
240 |     #
241 |     # This is particularly useful for identifying clusters in your document space.
242 |     # For example you may want to identify several "What's Related" items for weblog
243 |     # articles, or find paragraphs that relate to each other in an essay.
244 |     def find_related(doc, max_nearest = 3, &block)
245 |       carry =
246 |         proximity_array_for_content(doc, &block).reject { |pair| pair[0] == doc }
247 |       result = carry.collect { |x| x[0] }
248 |       result[0..max_nearest - 1]
249 |     end
250 | 
251 |     # This function uses a voting system to categorize documents, based on
252 |     # the categories of other documents. It uses the same logic as the
253 |     # find_related function to find related documents, then returns the
254 |     # most obvious category from this list.
255 |     #
256 |     # cutoff signifies the number of documents to consider when classifying
257 |     # text. A cutoff of 1 means that every document in the index votes on
258 |     # what category the document is in. This may not always make sense.
259 |     #
260 |     def classify(doc, cutoff = 0.30, &block)
261 |       votes = vote(doc, cutoff, &block)
262 | 
263 |       ranking = votes.keys.sort_by { |x| votes[x] }
264 |       ranking[-1]
265 |     end
266 | 
267 |     def vote(doc, cutoff = 0.30, &block)
268 |       icutoff = (@items.size * cutoff).round
269 |       carry = proximity_array_for_content(doc, &block)
270 |       carry = carry[0..icutoff - 1]
271 |       votes = {}
272 |       carry.each do |pair|
273 |         categories = @items[pair[0]].categories
274 |         categories.each do |category|
275 |           votes[category] ||= 0.0
276 |           votes[category] += pair[1]
277 |         end
278 |       end
279 |       votes
280 |     end
281 | 
282 |     # Returns the same category as classify() but also returns
283 |     # a confidence value derived from the vote share that the
284 |     # winning category got.
285 |     #
286 |     # e.g.
287 |     # category,confidence = classify_with_confidence(doc)
288 |     # if confidence < 0.3
289 |     #   category = nil
290 |     # end
291 |     #
292 |     #
293 |     # See classify() for argument docs
294 |     def classify_with_confidence(doc, cutoff = 0.30, &block)
295 |       votes = vote(doc, cutoff, &block)
296 |       votes_sum = votes.values.inject(0.0) { |sum, v| sum + v }
297 |       return [nil, nil] if votes_sum.zero?
298 | 
299 |       ranking = votes.keys.sort_by { |x| votes[x] }
300 |       winner = ranking[-1]
301 |       vote_share = votes[winner] / votes_sum.to_f
302 |       [winner, vote_share]
303 |     end
304 | 
305 |     # Prototype, only works on indexed documents.
306 |     # I have no clue if this is going to work, but in theory
307 |     # it's supposed to.
308 |     def highest_ranked_stems(doc, count = 3)
309 |       raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
310 | 
311 |       arr = node_for_content(doc).lsi_vector.to_a
312 |       top_n = arr.sort.reverse[0..count - 1]
313 |       top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
314 |     end
315 | 
316 |     private
317 | 
318 |     def build_reduced_matrix(matrix, cutoff = 0.75)
319 |       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
320 |       u, v, s = matrix.SV_decomp
321 | 
322 |       # TODO: Better than 75% term, please. :\
323 |       s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
324 |       s.size.times do |ord|
325 |         s[ord] = 0.0 if s[ord] < s_cutoff
326 |       end
327 |       # Reconstruct the term document matrix, only with reduced rank
328 |       u * (self.class.gsl_available ? GSL::Matrix : ::Matrix).diag(s) * v.trans
329 |     end
330 | 
331 |     def node_for_content(item, &block)
332 |       return @items[item] if @items[item]
333 | 
334 |       clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
335 | 
336 |       cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
337 | 
338 |       unless needs_rebuild?
339 |         cn.raw_vector_with(@word_list) # make the lsi raw and norm vectors
340 |       end
341 | 
342 |       cn
343 |     end
344 | 
345 |     def make_word_list
346 |       @word_list = WordList.new
347 |       @items.each_value do |node|
348 |         node.word_hash.each_key { |key| @word_list.add_word key }
349 |       end
350 |     end
351 |   end
352 | end
353 | 


--------------------------------------------------------------------------------
/lib/classifier/lsi/content_node.rb:
--------------------------------------------------------------------------------
 1 | # Author::    David Fayram  (mailto:dfayram@lensmen.net)
 2 | # Copyright:: Copyright (c) 2005 David Fayram II
 3 | # License::   LGPL
 4 | 
 5 | module Classifier
 6 |   # This is an internal data structure class for the LSI node. Save for
 7 |   # raw_vector_with, it should be fairly straightforward to understand.
 8 |   # You should never have to use it directly.
 9 |   class ContentNode
10 |     attr_accessor :raw_vector, :raw_norm,
11 |                   :lsi_vector, :lsi_norm,
12 |                   :categories
13 | 
14 |     attr_reader :word_hash
15 | 
16 |     # If text_proc is not specified, the source will be duck-typed
17 |     # via source.to_s
18 |     def initialize(word_frequencies, *categories)
19 |       @categories = categories || []
20 |       @word_hash = word_frequencies
21 |     end
22 | 
23 |     # Use this to fetch the appropriate search vector.
24 |     def search_vector
25 |       @lsi_vector || @raw_vector
26 |     end
27 | 
28 |     # Use this to fetch the appropriate search vector in normalized form.
29 |     def search_norm
30 |       @lsi_norm || @raw_norm
31 |     end
32 | 
33 |     # Creates the raw vector out of word_hash using word_list as the
34 |     # key for mapping the vector space.
35 |     def raw_vector_with(word_list)
36 |       vec = if Classifier::LSI.gsl_available
37 |               GSL::Vector.alloc(word_list.size)
38 |             else
39 |               Array.new(word_list.size, 0)
40 |             end
41 | 
42 |       @word_hash.each_key do |word|
43 |         vec[word_list[word]] = @word_hash[word] if word_list[word]
44 |       end
45 | 
46 |       # Perform the scaling transform
47 |       total_words = Classifier::LSI.gsl_available ? vec.sum : vec.sum_with_identity
48 |       total_unique_words = vec.count { |word| word != 0 }
49 | 
50 |       # Perform first-order association transform if this vector has more
51 |       # than one word in it.
52 |       if total_words > 1.0 && total_unique_words > 1
53 |         weighted_total = 0.0
54 | 
55 |         vec.each do |term|
56 |           next unless term.positive?
57 |           next if total_words.zero?
58 | 
59 |           term_over_total = term / total_words
60 |           val = term_over_total * Math.log(term_over_total)
61 |           weighted_total += val unless val.nan?
62 |         end
63 |         vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
64 |       end
65 | 
66 |       if Classifier::LSI.gsl_available
67 |         @raw_norm   = vec.normalize
68 |         @raw_vector = vec
69 |       else
70 |         @raw_norm   = Vector[*vec].normalize
71 |         @raw_vector = Vector[*vec]
72 |       end
73 |     end
74 |   end
75 | end
76 | 


--------------------------------------------------------------------------------
/lib/classifier/lsi/summary.rb:
--------------------------------------------------------------------------------
 1 | # Author::    Lucas Carlson  (mailto:lucas@rufy.com)
 2 | # Copyright:: Copyright (c) 2005 Lucas Carlson
 3 | # License::   LGPL
 4 | 
 5 | class String
 6 |   def summary(count = 10, separator = ' [...] ')
 7 |     perform_lsi split_sentences, count, separator
 8 |   end
 9 | 
10 |   def paragraph_summary(count = 1, separator = ' [...] ')
11 |     perform_lsi split_paragraphs, count, separator
12 |   end
13 | 
14 |   def split_sentences
15 |     split(/(\.|!|\?)/) # TODO: make this less primitive
16 |   end
17 | 
18 |   def split_paragraphs
19 |     split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
20 |   end
21 | 
22 |   private
23 | 
24 |   def perform_lsi(chunks, count, separator)
25 |     lsi = Classifier::LSI.new auto_rebuild: false
26 |     chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
27 |     lsi.build_index
28 |     summaries = lsi.highest_relative_content count
29 |     summaries.select { |chunk| summaries.include?(chunk) }.map(&:strip).join(separator)
30 |   end
31 | end
32 | 


--------------------------------------------------------------------------------
/lib/classifier/lsi/word_list.rb:
--------------------------------------------------------------------------------
 1 | # Author::    David Fayram  (mailto:dfayram@lensmen.net)
 2 | # Copyright:: Copyright (c) 2005 David Fayram II
 3 | # License::   LGPL
 4 | 
 5 | module Classifier
 6 |   # This class keeps a word => index mapping. It is used to map stemmed words
 7 |   # to dimensions of a vector.
 8 | 
 9 |   class WordList
10 |     def initialize
11 |       @location_table = {}
12 |     end
13 | 
14 |     # Adds a word (if it is new) and assigns it a unique dimension.
15 |     def add_word(word)
16 |       term = word
17 |       @location_table[term] = @location_table.size unless @location_table[term]
18 |     end
19 | 
20 |     # Returns the dimension of the word or nil if the word is not in the space.
21 |     def [](lookup)
22 |       term = lookup
23 |       @location_table[term]
24 |     end
25 | 
26 |     def word_for_index(ind)
27 |       @location_table.invert[ind]
28 |     end
29 | 
30 |     # Returns the number of words mapped.
31 |     def size
32 |       @location_table.size
33 |     end
34 |   end
35 | end
36 | 


--------------------------------------------------------------------------------
/test/bayes/bayesian_test.rb:
--------------------------------------------------------------------------------
  1 | require_relative '../test_helper'
  2 | 
  3 | class BayesianTest < Minitest::Test
  4 |   def setup
  5 |     @classifier = Classifier::Bayes.new 'Interesting', 'Uninteresting'
  6 |   end
  7 | 
  8 |   def test_bad_training
  9 |     assert_raises(StandardError) { @classifier.train_no_category 'words' }
 10 |   end
 11 | 
 12 |   def test_bad_method
 13 |     assert_raises(NoMethodError) { @classifier.forget_everything_you_know '' }
 14 |   end
 15 | 
 16 |   def test_categories
 17 |     assert_equal %w[Interesting Uninteresting].sort, @classifier.categories.sort
 18 |   end
 19 | 
 20 |   def test_add_category
 21 |     @classifier.add_category 'Test'
 22 |     assert_equal %w[Test Interesting Uninteresting].sort, @classifier.categories.sort
 23 |   end
 24 | 
 25 |   def test_classification
 26 |     @classifier.train_interesting 'here are some good words. I hope you love them'
 27 |     @classifier.train_uninteresting 'here are some bad words, I hate you'
 28 |     assert_equal 'Uninteresting', @classifier.classify('I hate bad words and you')
 29 |   end
 30 | 
 31 |   def test_safari_animals
 32 |     bayes = Classifier::Bayes.new 'Lion', 'Elephant'
 33 |     bayes.train_lion 'lion'
 34 |     bayes.train_lion 'zebra'
 35 |     bayes.train_elephant 'elephant'
 36 |     bayes.train_elephant 'trunk'
 37 |     bayes.train_elephant 'tusk'
 38 | 
 39 |     assert_equal 'Lion', bayes.classify('zebra')
 40 |     assert_equal 'Elephant', bayes.classify('trunk')
 41 |     assert_equal 'Elephant', bayes.classify('tusk')
 42 |     assert_equal 'Lion', bayes.classify('lion')
 43 |     assert_equal 'Elephant', bayes.classify('elephant')
 44 |   end
 45 | 
 46 |   def test_remove_category
 47 |     @classifier.train_interesting 'This is interesting content'
 48 |     @classifier.train_uninteresting 'This is uninteresting content'
 49 | 
 50 |     assert_equal %w[Interesting Uninteresting].sort, @classifier.categories.sort
 51 | 
 52 |     @classifier.remove_category 'Uninteresting'
 53 | 
 54 |     assert_equal ['Interesting'], @classifier.categories
 55 |   end
 56 | 
 57 |   def test_remove_category_affects_classification
 58 |     @classifier.train_interesting 'This is interesting content'
 59 |     @classifier.train_uninteresting 'This is uninteresting content'
 60 | 
 61 |     assert_equal 'Uninteresting', @classifier.classify('This is uninteresting')
 62 | 
 63 |     @classifier.remove_category 'Uninteresting'
 64 | 
 65 |     assert_equal 'Interesting', @classifier.classify('This is uninteresting')
 66 |   end
 67 | 
 68 |   def test_remove_all_categories
 69 |     @classifier.remove_category 'Interesting'
 70 |     @classifier.remove_category 'Uninteresting'
 71 | 
 72 |     assert_empty @classifier.categories
 73 |   end
 74 | 
 75 |   def test_remove_and_add_category
 76 |     @classifier.remove_category 'Uninteresting'
 77 |     @classifier.add_category 'Neutral'
 78 | 
 79 |     assert_equal %w[Interesting Neutral].sort, @classifier.categories.sort
 80 |   end
 81 | 
 82 |   def test_remove_category_preserves_other_category_data
 83 |     @classifier.train_interesting 'This is interesting content'
 84 |     @classifier.train_uninteresting 'This is uninteresting content'
 85 | 
 86 |     interesting_classification = @classifier.classify('This is interesting')
 87 |     @classifier.remove_category 'Uninteresting'
 88 | 
 89 |     assert_equal interesting_classification, @classifier.classify('This is interesting')
 90 |   end
 91 | 
 92 |   def test_remove_category_check_counts
 93 |     initial_total_words = @classifier.instance_variable_get(:@total_words)
 94 |     category_word_count = @classifier.instance_variable_get(:@category_word_count)['Interesting']
 95 | 
 96 |     @classifier.remove_category('Interesting')
 97 | 
 98 |     assert_nil @classifier.instance_variable_get(:@categories)['Interesting']
 99 |     assert_equal @classifier.instance_variable_get(:@category_counts)['Interesting'], 0
100 |     assert_equal @classifier.instance_variable_get(:@category_word_count)['Interesting'], 0
101 | 
102 |     new_total_words = @classifier.instance_variable_get(:@total_words)
103 |     assert_equal initial_total_words - category_word_count, new_total_words
104 |   end
105 | 
106 |   def test_remove_category_updates_total_words_before_deletion
107 |     initial_total_words = @classifier.instance_variable_get(:@total_words)
108 |     category_word_count = @classifier.instance_variable_get(:@category_word_count)['Interesting']
109 | 
110 |     @classifier.remove_category('Interesting')
111 | 
112 |     new_total_words = @classifier.instance_variable_get(:@total_words)
113 |     assert_equal initial_total_words - category_word_count, new_total_words
114 |   end
115 | 
116 |   def test_remove_nonexistent_category
117 |     assert_raises(StandardError, 'No such category: Nonexistent Category') do
118 |       @classifier.remove_category('Nonexistent Category')
119 |     end
120 |   end
121 | end
122 | 


--------------------------------------------------------------------------------
/test/extensions/word_hash_test.rb:
--------------------------------------------------------------------------------
 1 | require_relative '../test_helper'
 2 | 
 3 | class StringExtensionsTest < Minitest::Test
 4 |   def test_word_hash
 5 |     hash = { good: 1, "!": 1, hope: 1, "'": 1, ".": 1, love: 1, word: 1, them: 1, test: 1 }
 6 |     assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
 7 |   end
 8 | 
 9 |   def test_clean_word_hash
10 |     hash = { good: 1, word: 1, hope: 1, love: 1, them: 1, test: 1 }
11 |     assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
12 |   end
13 | end
14 | 
15 | class ArrayExtensionsTest < Minitest::Test
16 |   def test_monkey_path_array_sum
17 |     assert_equal [1, 2, 3].sum_with_identity, 6
18 |   end
19 | 
20 |   def test_summing_a_nil_array
21 |     assert_equal [nil].sum_with_identity, 0
22 |   end
23 | 
24 |   def test_summing_an_empty_array
25 |     assert_equal Array[].sum_with_identity, 0
26 |   end
27 | end
28 | 


--------------------------------------------------------------------------------
/test/lsi/lsi_test.rb:
--------------------------------------------------------------------------------
  1 | require_relative '../test_helper'
  2 | 
  3 | class LSITest < Minitest::Test
  4 |   def setup
  5 |     # we repeat principle words to help weight them.
  6 |     # This test is rather delicate, since this system is mostly noise.
  7 |     @str1 = 'This text deals with dogs. Dogs.'
  8 |     @str2 = 'This text involves dogs too. Dogs! '
  9 |     @str3 = 'This text revolves around cats. Cats.'
 10 |     @str4 = 'This text also involves cats. Cats!'
 11 |     @str5 = 'This text involves birds. Birds.'
 12 |   end
 13 | 
 14 |   def test_basic_indexing
 15 |     lsi = Classifier::LSI.new
 16 |     [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
 17 |     assert !lsi.needs_rebuild?
 18 | 
 19 |     # NOTE: that the closest match to str1 is str2, even though it is not
 20 |     # the closest text match.
 21 |     assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
 22 |   end
 23 | 
 24 |   def test_not_auto_rebuild
 25 |     lsi = Classifier::LSI.new auto_rebuild: false
 26 |     lsi.add_item @str1, 'Dog'
 27 |     lsi.add_item @str2, 'Dog'
 28 |     assert lsi.needs_rebuild?
 29 |     lsi.build_index
 30 |     assert !lsi.needs_rebuild?
 31 |   end
 32 | 
 33 |   def test_basic_categorizing
 34 |     lsi = Classifier::LSI.new
 35 |     lsi.add_item @str2, 'Dog'
 36 |     lsi.add_item @str3, 'Cat'
 37 |     lsi.add_item @str4, 'Cat'
 38 |     lsi.add_item @str5, 'Bird'
 39 | 
 40 |     assert_equal 'Dog', lsi.classify(@str1)
 41 |     assert_equal 'Cat', lsi.classify(@str3)
 42 |     assert_equal 'Bird', lsi.classify(@str5)
 43 |     assert_equal 'Bird', lsi.classify('Bird me to Bird')
 44 |   end
 45 | 
 46 |   def test_external_classifying
 47 |     lsi = Classifier::LSI.new
 48 |     bayes = Classifier::Bayes.new 'Dog', 'Cat', 'Bird'
 49 |     lsi.add_item @str1, 'Dog'
 50 |     bayes.train_dog @str1
 51 |     lsi.add_item @str2, 'Dog'
 52 |     bayes.train_dog @str2
 53 |     lsi.add_item @str3, 'Cat'
 54 |     bayes.train_cat @str3
 55 |     lsi.add_item @str4, 'Cat'
 56 |     bayes.train_cat @str4
 57 |     lsi.add_item @str5, 'Bird'
 58 |     bayes.train_bird @str5
 59 | 
 60 |     # We're talking about dogs. Even though the text matches the corpus on
 61 |     # cats better.  Dogs have more semantic weight than cats. So bayes
 62 |     # will fail here, but the LSI recognizes content.
 63 |     tricky_case = 'This text revolves around dogs.'
 64 |     assert_equal 'Dog', lsi.classify(tricky_case)
 65 |     assert_equal 'Cat', bayes.classify(tricky_case)
 66 |   end
 67 | 
 68 |   def test_recategorize_interface
 69 |     lsi = Classifier::LSI.new
 70 |     lsi.add_item @str1, 'Dog'
 71 |     lsi.add_item @str2, 'Dog'
 72 |     lsi.add_item @str3, 'Cat'
 73 |     lsi.add_item @str4, 'Cat'
 74 |     lsi.add_item @str5, 'Bird'
 75 | 
 76 |     tricky_case = 'This text revolves around dogs.'
 77 |     assert_equal 'Dog', lsi.classify(tricky_case)
 78 | 
 79 |     # Recategorize as needed.
 80 |     lsi.categories_for(@str1).clear.push 'Cow'
 81 |     lsi.categories_for(@str2).clear.push 'Cow'
 82 | 
 83 |     assert !lsi.needs_rebuild?
 84 |     assert_equal 'Cow', lsi.classify(tricky_case)
 85 |   end
 86 | 
 87 |   def test_classify_with_confidence
 88 |     lsi = Classifier::LSI.new
 89 |     lsi.add_item @str2, 'Dog'
 90 |     lsi.add_item @str3, 'Cat'
 91 |     lsi.add_item @str4, 'Cat'
 92 |     lsi.add_item @str5, 'Bird'
 93 | 
 94 |     category, confidence = lsi.classify_with_confidence(@str1)
 95 |     assert_equal 'Dog', category
 96 |     assert confidence > 0.5, "Confidence should be greater than 0.5, but was #{confidence}"
 97 | 
 98 |     category, confidence = lsi.classify_with_confidence(@str3)
 99 |     assert_equal 'Cat', category
100 |     assert confidence > 0.5, "Confidence should be greater than 0.5, but was #{confidence}"
101 | 
102 |     category, confidence = lsi.classify_with_confidence(@str5)
103 |     assert_equal 'Bird', category
104 |     assert confidence > 0.5, "Confidence should be greater than 0.5, but was #{confidence}"
105 | 
106 |     tricky_case = 'This text revolves around dogs.'
107 |     category, confidence = lsi.classify_with_confidence(tricky_case)
108 |     assert_equal 'Dog', category
109 |     assert confidence > 0.3, "Confidence should be greater than 0.3, but was #{confidence}"
110 |   end
111 | 
112 |   def test_search
113 |     lsi = Classifier::LSI.new
114 |     [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
115 | 
116 |     # Searching by content and text, note that @str2 comes up first, because
117 |     # both "dog" and "involve" are present. But, the next match is @str1 instead
118 |     # of @str4, because "dog" carries more weight than involves.
119 |     assert_equal([@str2, @str1, @str4, @str5, @str3],
120 |                  lsi.search('dog involves', 100))
121 | 
122 |     # Keyword search shows how the space is mapped out in relation to
123 |     # dog when magnitude is remove. Note the relations. We move from dog
124 |     # through involve and then finally to other words.
125 |     assert_equal([@str1, @str2, @str4, @str5, @str3],
126 |                  lsi.search('dog', 5))
127 |   end
128 | 
129 |   def test_serialize_safe
130 |     lsi = Classifier::LSI.new
131 |     [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
132 | 
133 |     lsi_md = Marshal.dump lsi
134 |     lsi_m = Marshal.load lsi_md
135 | 
136 |     assert_equal lsi_m.search('cat', 3), lsi.search('cat', 3)
137 |     assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
138 |   end
139 | 
140 |   def test_keyword_search
141 |     lsi = Classifier::LSI.new
142 |     lsi.add_item @str1, 'Dog'
143 |     lsi.add_item @str2, 'Dog'
144 |     lsi.add_item @str3, 'Cat'
145 |     lsi.add_item @str4, 'Cat'
146 |     lsi.add_item @str5, 'Bird'
147 | 
148 |     assert_equal %i[dog text deal], lsi.highest_ranked_stems(@str1)
149 |   end
150 | 
151 |   def test_summary
152 |     assert_equal 'This text involves dogs too [...] This text also involves cats',
153 |                  [@str1, @str2, @str3, @str4, @str5].join.summary(2)
154 |   end
155 | end
156 | 


--------------------------------------------------------------------------------
/test/test_helper.rb:
--------------------------------------------------------------------------------
1 | $:.unshift(File.dirname(__FILE__) + '/../lib')
2 | 
3 | require 'minitest'
4 | require 'minitest/autorun'
5 | require 'classifier'
6 | 


--------------------------------------------------------------------------------