├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── Makefile ├── README.md ├── bin ├── digit_conflate.pl ├── flat_clusters2json.pl ├── hier2flat_no_freqs.sh ├── hier2flat_with_freqs.sh ├── lowercase.pl ├── mkcls ├── mkcls4brown ├── mkcls4word2vec └── ngram_counts.py ├── python ├── README.md └── clustercat.py ├── src ├── clustercat-array.c ├── clustercat-array.h ├── clustercat-cluster.c ├── clustercat-cluster.h ├── clustercat-data.h ├── clustercat-dbg.c ├── clustercat-dbg.h ├── clustercat-import-class-file.c ├── clustercat-import-class-file.h ├── clustercat-io.c ├── clustercat-io.h ├── clustercat-map.c ├── clustercat-map.h ├── clustercat-math.c ├── clustercat-math.h ├── clustercat-tokenize.c ├── clustercat-tokenize.h ├── clustercat.c ├── clustercat.h └── ext │ ├── uthash │ ├── LICENSE │ ├── README.md │ └── src │ │ └── uthash.h │ └── word2vec │ ├── LICENSE │ ├── README.txt │ ├── distance.c │ ├── makefile │ └── word-analogy.c └── visualization └── d3 ├── basque_cluster_thumbnail.png ├── french_cluster_thumbnail.png ├── index.html └── russian_cluster_thumbnail.png /.gitignore: -------------------------------------------------------------------------------- 1 | bin/clustercat 2 | src/ext/word2vec/distance 3 | src/ext/word2vec/word-analogy 4 | *.[oa~] 5 | .*.sw[op] 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | cache: brew 3 | compiler: 4 | - clang 5 | - gcc 6 | os: 7 | - linux 8 | #- osx 9 | #before_install: 10 | # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi 11 | # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew tap homebrew/versions; fi 12 | # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi 13 | # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install llvm38; fi 14 | script: 15 | #- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then make CC=clang-omp; fi 16 | #- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then make CC=/usr/local/bin/clang-3.8 CFLAGS="$CFLAGS -I/usr/local/opt/llvm38/lib/llvm-3.8/include/" LDFLAGS="$LDFLAGS -L/usr/local/opt/llvm38/lib/llvm-3.8/lib" ; fi 17 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make; fi 18 | notifications: 19 | email: false 20 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | This software is licensed under either the GNU LGPL version 3 or the Mozilla 2 | Public License version 2.0 . Both licenses are listed below. 3 | 4 | 5 | 6 | 7 | 8 | 9 | GNU LESSER GENERAL PUBLIC LICENSE 10 | Version 3, 29 June 2007 11 | 12 | Copyright (C) 2007 Free Software Foundation, Inc. 13 | Everyone is permitted to copy and distribute verbatim copies 14 | of this license document, but changing it is not allowed. 15 | 16 | 17 | This version of the GNU Lesser General Public License incorporates 18 | the terms and conditions of version 3 of the GNU General Public 19 | License, supplemented by the additional permissions listed below. 20 | 21 | 0. Additional Definitions. 22 | 23 | As used herein, "this License" refers to version 3 of the GNU Lesser 24 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 25 | General Public License. 26 | 27 | "The Library" refers to a covered work governed by this License, 28 | other than an Application or a Combined Work as defined below. 
29 | 30 | An "Application" is any work that makes use of an interface provided 31 | by the Library, but which is not otherwise based on the Library. 32 | Defining a subclass of a class defined by the Library is deemed a mode 33 | of using an interface provided by the Library. 34 | 35 | A "Combined Work" is a work produced by combining or linking an 36 | Application with the Library. The particular version of the Library 37 | with which the Combined Work was made is also called the "Linked 38 | Version". 39 | 40 | The "Minimal Corresponding Source" for a Combined Work means the 41 | Corresponding Source for the Combined Work, excluding any source code 42 | for portions of the Combined Work that, considered in isolation, are 43 | based on the Application, and not on the Linked Version. 44 | 45 | The "Corresponding Application Code" for a Combined Work means the 46 | object code and/or source code for the Application, including any data 47 | and utility programs needed for reproducing the Combined Work from the 48 | Application, but excluding the System Libraries of the Combined Work. 49 | 50 | 1. Exception to Section 3 of the GNU GPL. 51 | 52 | You may convey a covered work under sections 3 and 4 of this License 53 | without being bound by section 3 of the GNU GPL. 54 | 55 | 2. Conveying Modified Versions. 56 | 57 | If you modify a copy of the Library, and, in your modifications, a 58 | facility refers to a function or data to be supplied by an Application 59 | that uses the facility (other than as an argument passed when the 60 | facility is invoked), then you may convey a copy of the modified 61 | version: 62 | 63 | a) under this License, provided that you make a good faith effort to 64 | ensure that, in the event an Application does not supply the 65 | function or data, the facility still operates, and performs 66 | whatever part of its purpose remains meaningful, or 67 | 68 | b) under the GNU GPL, with none of the additional permissions of 69 | this License applicable to that copy. 70 | 71 | 3. Object Code Incorporating Material from Library Header Files. 72 | 73 | The object code form of an Application may incorporate material from 74 | a header file that is part of the Library. You may convey such object 75 | code under terms of your choice, provided that, if the incorporated 76 | material is not limited to numerical parameters, data structure 77 | layouts and accessors, or small macros, inline functions and templates 78 | (ten or fewer lines in length), you do both of the following: 79 | 80 | a) Give prominent notice with each copy of the object code that the 81 | Library is used in it and that the Library and its use are 82 | covered by this License. 83 | 84 | b) Accompany the object code with a copy of the GNU GPL and this license 85 | document. 86 | 87 | 4. Combined Works. 88 | 89 | You may convey a Combined Work under terms of your choice that, 90 | taken together, effectively do not restrict modification of the 91 | portions of the Library contained in the Combined Work and reverse 92 | engineering for debugging such modifications, if you also do each of 93 | the following: 94 | 95 | a) Give prominent notice with each copy of the Combined Work that 96 | the Library is used in it and that the Library and its use are 97 | covered by this License. 98 | 99 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 100 | document. 
101 | 102 | c) For a Combined Work that displays copyright notices during 103 | execution, include the copyright notice for the Library among 104 | these notices, as well as a reference directing the user to the 105 | copies of the GNU GPL and this license document. 106 | 107 | d) Do one of the following: 108 | 109 | 0) Convey the Minimal Corresponding Source under the terms of this 110 | License, and the Corresponding Application Code in a form 111 | suitable for, and under terms that permit, the user to 112 | recombine or relink the Application with a modified version of 113 | the Linked Version to produce a modified Combined Work, in the 114 | manner specified by section 6 of the GNU GPL for conveying 115 | Corresponding Source. 116 | 117 | 1) Use a suitable shared library mechanism for linking with the 118 | Library. A suitable mechanism is one that (a) uses at run time 119 | a copy of the Library already present on the user's computer 120 | system, and (b) will operate properly with a modified version 121 | of the Library that is interface-compatible with the Linked 122 | Version. 123 | 124 | e) Provide Installation Information, but only if you would otherwise 125 | be required to provide such information under section 6 of the 126 | GNU GPL, and only to the extent that such information is 127 | necessary to install and execute a modified version of the 128 | Combined Work produced by recombining or relinking the 129 | Application with a modified version of the Linked Version. (If 130 | you use option 4d0, the Installation Information must accompany 131 | the Minimal Corresponding Source and Corresponding Application 132 | Code. If you use option 4d1, you must provide the Installation 133 | Information in the manner specified by section 6 of the GNU GPL 134 | for conveying Corresponding Source.) 135 | 136 | 5. Combined Libraries. 137 | 138 | You may place library facilities that are a work based on the 139 | Library side by side in a single library together with other library 140 | facilities that are not Applications and are not covered by this 141 | License, and convey such a combined library under terms of your 142 | choice, if you do both of the following: 143 | 144 | a) Accompany the combined library with a copy of the same work based 145 | on the Library, uncombined with any other library facilities, 146 | conveyed under the terms of this License. 147 | 148 | b) Give prominent notice with the combined library that part of it 149 | is a work based on the Library, and explaining where to find the 150 | accompanying uncombined form of the same work. 151 | 152 | 6. Revised Versions of the GNU Lesser General Public License. 153 | 154 | The Free Software Foundation may publish revised and/or new versions 155 | of the GNU Lesser General Public License from time to time. Such new 156 | versions will be similar in spirit to the present version, but may 157 | differ in detail to address new problems or concerns. 158 | 159 | Each version is given a distinguishing version number. If the 160 | Library as you received it specifies that a certain numbered version 161 | of the GNU Lesser General Public License "or any later version" 162 | applies to it, you have the option of following the terms and 163 | conditions either of that published version or of any later version 164 | published by the Free Software Foundation. 
If the Library as you 165 | received it does not specify a version number of the GNU Lesser 166 | General Public License, you may choose any version of the GNU Lesser 167 | General Public License ever published by the Free Software Foundation. 168 | 169 | If the Library as you received it specifies that a proxy can decide 170 | whether future versions of the GNU Lesser General Public License shall 171 | apply, that proxy's public statement of acceptance of any version is 172 | permanent authorization for you to choose that version for the 173 | Library. 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | Mozilla Public License Version 2.0 203 | ================================== 204 | 205 | 1. Definitions 206 | -------------- 207 | 208 | 1.1. "Contributor" 209 | means each individual or legal entity that creates, contributes to 210 | the creation of, or owns Covered Software. 211 | 212 | 1.2. "Contributor Version" 213 | means the combination of the Contributions of others (if any) used 214 | by a Contributor and that particular Contributor's Contribution. 215 | 216 | 1.3. "Contribution" 217 | means Covered Software of a particular Contributor. 218 | 219 | 1.4. "Covered Software" 220 | means Source Code Form to which the initial Contributor has attached 221 | the notice in Exhibit A, the Executable Form of such Source Code 222 | Form, and Modifications of such Source Code Form, in each case 223 | including portions thereof. 224 | 225 | 1.5. "Incompatible With Secondary Licenses" 226 | means 227 | 228 | (a) that the initial Contributor has attached the notice described 229 | in Exhibit B to the Covered Software; or 230 | 231 | (b) that the Covered Software was made available under the terms of 232 | version 1.1 or earlier of the License, but not also under the 233 | terms of a Secondary License. 234 | 235 | 1.6. "Executable Form" 236 | means any form of the work other than Source Code Form. 237 | 238 | 1.7. "Larger Work" 239 | means a work that combines Covered Software with other material, in 240 | a separate file or files, that is not Covered Software. 241 | 242 | 1.8. "License" 243 | means this document. 244 | 245 | 1.9. "Licensable" 246 | means having the right to grant, to the maximum extent possible, 247 | whether at the time of the initial grant or subsequently, any and 248 | all of the rights conveyed by this License. 249 | 250 | 1.10. "Modifications" 251 | means any of the following: 252 | 253 | (a) any file in Source Code Form that results from an addition to, 254 | deletion from, or modification of the contents of Covered 255 | Software; or 256 | 257 | (b) any new file in Source Code Form that contains any Covered 258 | Software. 259 | 260 | 1.11. "Patent Claims" of a Contributor 261 | means any patent claim(s), including without limitation, method, 262 | process, and apparatus claims, in any patent Licensable by such 263 | Contributor that would be infringed, but for the grant of the 264 | License, by the making, using, selling, offering for sale, having 265 | made, import, or transfer of either its Contributions or its 266 | Contributor Version. 267 | 268 | 1.12. "Secondary License" 269 | means either the GNU General Public License, Version 2.0, the GNU 270 | Lesser General Public License, Version 2.1, the GNU Affero General 271 | Public License, Version 3.0, or any later versions of those 272 | licenses. 273 | 274 | 1.13. 
"Source Code Form" 275 | means the form of the work preferred for making modifications. 276 | 277 | 1.14. "You" (or "Your") 278 | means an individual or a legal entity exercising rights under this 279 | License. For legal entities, "You" includes any entity that 280 | controls, is controlled by, or is under common control with You. For 281 | purposes of this definition, "control" means (a) the power, direct 282 | or indirect, to cause the direction or management of such entity, 283 | whether by contract or otherwise, or (b) ownership of more than 284 | fifty percent (50%) of the outstanding shares or beneficial 285 | ownership of such entity. 286 | 287 | 2. License Grants and Conditions 288 | -------------------------------- 289 | 290 | 2.1. Grants 291 | 292 | Each Contributor hereby grants You a world-wide, royalty-free, 293 | non-exclusive license: 294 | 295 | (a) under intellectual property rights (other than patent or trademark) 296 | Licensable by such Contributor to use, reproduce, make available, 297 | modify, display, perform, distribute, and otherwise exploit its 298 | Contributions, either on an unmodified basis, with Modifications, or 299 | as part of a Larger Work; and 300 | 301 | (b) under Patent Claims of such Contributor to make, use, sell, offer 302 | for sale, have made, import, and otherwise transfer either its 303 | Contributions or its Contributor Version. 304 | 305 | 2.2. Effective Date 306 | 307 | The licenses granted in Section 2.1 with respect to any Contribution 308 | become effective for each Contribution on the date the Contributor first 309 | distributes such Contribution. 310 | 311 | 2.3. Limitations on Grant Scope 312 | 313 | The licenses granted in this Section 2 are the only rights granted under 314 | this License. No additional rights or licenses will be implied from the 315 | distribution or licensing of Covered Software under this License. 316 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 317 | Contributor: 318 | 319 | (a) for any code that a Contributor has removed from Covered Software; 320 | or 321 | 322 | (b) for infringements caused by: (i) Your and any other third party's 323 | modifications of Covered Software, or (ii) the combination of its 324 | Contributions with other software (except as part of its Contributor 325 | Version); or 326 | 327 | (c) under Patent Claims infringed by Covered Software in the absence of 328 | its Contributions. 329 | 330 | This License does not grant any rights in the trademarks, service marks, 331 | or logos of any Contributor (except as may be necessary to comply with 332 | the notice requirements in Section 3.4). 333 | 334 | 2.4. Subsequent Licenses 335 | 336 | No Contributor makes additional grants as a result of Your choice to 337 | distribute the Covered Software under a subsequent version of this 338 | License (see Section 10.2) or under the terms of a Secondary License (if 339 | permitted under the terms of Section 3.3). 340 | 341 | 2.5. Representation 342 | 343 | Each Contributor represents that the Contributor believes its 344 | Contributions are its original creation(s) or it has sufficient rights 345 | to grant the rights to its Contributions conveyed by this License. 346 | 347 | 2.6. Fair Use 348 | 349 | This License is not intended to limit any rights You have under 350 | applicable copyright doctrines of fair use, fair dealing, or other 351 | equivalents. 352 | 353 | 2.7. 
Conditions 354 | 355 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 356 | in Section 2.1. 357 | 358 | 3. Responsibilities 359 | ------------------- 360 | 361 | 3.1. Distribution of Source Form 362 | 363 | All distribution of Covered Software in Source Code Form, including any 364 | Modifications that You create or to which You contribute, must be under 365 | the terms of this License. You must inform recipients that the Source 366 | Code Form of the Covered Software is governed by the terms of this 367 | License, and how they can obtain a copy of this License. You may not 368 | attempt to alter or restrict the recipients' rights in the Source Code 369 | Form. 370 | 371 | 3.2. Distribution of Executable Form 372 | 373 | If You distribute Covered Software in Executable Form then: 374 | 375 | (a) such Covered Software must also be made available in Source Code 376 | Form, as described in Section 3.1, and You must inform recipients of 377 | the Executable Form how they can obtain a copy of such Source Code 378 | Form by reasonable means in a timely manner, at a charge no more 379 | than the cost of distribution to the recipient; and 380 | 381 | (b) You may distribute such Executable Form under the terms of this 382 | License, or sublicense it under different terms, provided that the 383 | license for the Executable Form does not attempt to limit or alter 384 | the recipients' rights in the Source Code Form under this License. 385 | 386 | 3.3. Distribution of a Larger Work 387 | 388 | You may create and distribute a Larger Work under terms of Your choice, 389 | provided that You also comply with the requirements of this License for 390 | the Covered Software. If the Larger Work is a combination of Covered 391 | Software with a work governed by one or more Secondary Licenses, and the 392 | Covered Software is not Incompatible With Secondary Licenses, this 393 | License permits You to additionally distribute such Covered Software 394 | under the terms of such Secondary License(s), so that the recipient of 395 | the Larger Work may, at their option, further distribute the Covered 396 | Software under the terms of either this License or such Secondary 397 | License(s). 398 | 399 | 3.4. Notices 400 | 401 | You may not remove or alter the substance of any license notices 402 | (including copyright notices, patent notices, disclaimers of warranty, 403 | or limitations of liability) contained within the Source Code Form of 404 | the Covered Software, except that You may alter any license notices to 405 | the extent required to remedy known factual inaccuracies. 406 | 407 | 3.5. Application of Additional Terms 408 | 409 | You may choose to offer, and to charge a fee for, warranty, support, 410 | indemnity or liability obligations to one or more recipients of Covered 411 | Software. However, You may do so only on Your own behalf, and not on 412 | behalf of any Contributor. You must make it absolutely clear that any 413 | such warranty, support, indemnity, or liability obligation is offered by 414 | You alone, and You hereby agree to indemnify every Contributor for any 415 | liability incurred by such Contributor as a result of warranty, support, 416 | indemnity or liability terms You offer. You may include additional 417 | disclaimers of warranty and limitations of liability specific to any 418 | jurisdiction. 419 | 420 | 4. 
Inability to Comply Due to Statute or Regulation 421 | --------------------------------------------------- 422 | 423 | If it is impossible for You to comply with any of the terms of this 424 | License with respect to some or all of the Covered Software due to 425 | statute, judicial order, or regulation then You must: (a) comply with 426 | the terms of this License to the maximum extent possible; and (b) 427 | describe the limitations and the code they affect. Such description must 428 | be placed in a text file included with all distributions of the Covered 429 | Software under this License. Except to the extent prohibited by statute 430 | or regulation, such description must be sufficiently detailed for a 431 | recipient of ordinary skill to be able to understand it. 432 | 433 | 5. Termination 434 | -------------- 435 | 436 | 5.1. The rights granted under this License will terminate automatically 437 | if You fail to comply with any of its terms. However, if You become 438 | compliant, then the rights granted under this License from a particular 439 | Contributor are reinstated (a) provisionally, unless and until such 440 | Contributor explicitly and finally terminates Your grants, and (b) on an 441 | ongoing basis, if such Contributor fails to notify You of the 442 | non-compliance by some reasonable means prior to 60 days after You have 443 | come back into compliance. Moreover, Your grants from a particular 444 | Contributor are reinstated on an ongoing basis if such Contributor 445 | notifies You of the non-compliance by some reasonable means, this is the 446 | first time You have received notice of non-compliance with this License 447 | from such Contributor, and You become compliant prior to 30 days after 448 | Your receipt of the notice. 449 | 450 | 5.2. If You initiate litigation against any entity by asserting a patent 451 | infringement claim (excluding declaratory judgment actions, 452 | counter-claims, and cross-claims) alleging that a Contributor Version 453 | directly or indirectly infringes any patent, then the rights granted to 454 | You by any and all Contributors for the Covered Software under Section 455 | 2.1 of this License shall terminate. 456 | 457 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 458 | end user license agreements (excluding distributors and resellers) which 459 | have been validly granted by You or Your distributors under this License 460 | prior to termination shall survive termination. 461 | 462 | ************************************************************************ 463 | * * 464 | * 6. Disclaimer of Warranty * 465 | * ------------------------- * 466 | * * 467 | * Covered Software is provided under this License on an "as is" * 468 | * basis, without warranty of any kind, either expressed, implied, or * 469 | * statutory, including, without limitation, warranties that the * 470 | * Covered Software is free of defects, merchantable, fit for a * 471 | * particular purpose or non-infringing. The entire risk as to the * 472 | * quality and performance of the Covered Software is with You. * 473 | * Should any Covered Software prove defective in any respect, You * 474 | * (not any Contributor) assume the cost of any necessary servicing, * 475 | * repair, or correction. This disclaimer of warranty constitutes an * 476 | * essential part of this License. No use of any Covered Software is * 477 | * authorized under this License except under this disclaimer. 
* 478 | * * 479 | ************************************************************************ 480 | 481 | ************************************************************************ 482 | * * 483 | * 7. Limitation of Liability * 484 | * -------------------------- * 485 | * * 486 | * Under no circumstances and under no legal theory, whether tort * 487 | * (including negligence), contract, or otherwise, shall any * 488 | * Contributor, or anyone who distributes Covered Software as * 489 | * permitted above, be liable to You for any direct, indirect, * 490 | * special, incidental, or consequential damages of any character * 491 | * including, without limitation, damages for lost profits, loss of * 492 | * goodwill, work stoppage, computer failure or malfunction, or any * 493 | * and all other commercial damages or losses, even if such party * 494 | * shall have been informed of the possibility of such damages. This * 495 | * limitation of liability shall not apply to liability for death or * 496 | * personal injury resulting from such party's negligence to the * 497 | * extent applicable law prohibits such limitation. Some * 498 | * jurisdictions do not allow the exclusion or limitation of * 499 | * incidental or consequential damages, so this exclusion and * 500 | * limitation may not apply to You. * 501 | * * 502 | ************************************************************************ 503 | 504 | 8. Litigation 505 | ------------- 506 | 507 | Any litigation relating to this License may be brought only in the 508 | courts of a jurisdiction where the defendant maintains its principal 509 | place of business and such litigation shall be governed by laws of that 510 | jurisdiction, without reference to its conflict-of-law provisions. 511 | Nothing in this Section shall prevent a party's ability to bring 512 | cross-claims or counter-claims. 513 | 514 | 9. Miscellaneous 515 | ---------------- 516 | 517 | This License represents the complete agreement concerning the subject 518 | matter hereof. If any provision of this License is held to be 519 | unenforceable, such provision shall be reformed only to the extent 520 | necessary to make it enforceable. Any law or regulation which provides 521 | that the language of a contract shall be construed against the drafter 522 | shall not be used to construe this License against a Contributor. 523 | 524 | 10. Versions of the License 525 | --------------------------- 526 | 527 | 10.1. New Versions 528 | 529 | Mozilla Foundation is the license steward. Except as provided in Section 530 | 10.3, no one other than the license steward has the right to modify or 531 | publish new versions of this License. Each version will be given a 532 | distinguishing version number. 533 | 534 | 10.2. Effect of New Versions 535 | 536 | You may distribute the Covered Software under the terms of the version 537 | of the License under which You originally received the Covered Software, 538 | or under the terms of any subsequent version published by the license 539 | steward. 540 | 541 | 10.3. Modified Versions 542 | 543 | If you create software not governed by this License, and you want to 544 | create a new license for such software, you may create and use a 545 | modified version of this License if you rename the license and remove 546 | any references to the name of the license steward (except to note that 547 | such modified license differs from this License). 548 | 549 | 10.4. 
Distributing Source Code Form that is Incompatible With Secondary 550 | Licenses 551 | 552 | If You choose to distribute Source Code Form that is Incompatible With 553 | Secondary Licenses under the terms of this version of the License, the 554 | notice described in Exhibit B of this License must be attached. 555 | 556 | Exhibit A - Source Code Form License Notice 557 | ------------------------------------------- 558 | 559 | This Source Code Form is subject to the terms of the Mozilla Public 560 | License, v. 2.0. If a copy of the MPL was not distributed with this 561 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 562 | 563 | If it is not possible or desirable to put the notice in a particular 564 | file, then You may include the notice in a location (such as a LICENSE 565 | file in a relevant directory) where a recipient would be likely to look 566 | for such a notice. 567 | 568 | You may add additional accurate notices of copyright ownership. 569 | 570 | Exhibit B - "Incompatible With Secondary Licenses" Notice 571 | --------------------------------------------------------- 572 | 573 | This Source Code Form is "Incompatible With Secondary Licenses", as 574 | defined by the Mozilla Public License, v. 2.0. 575 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env make 2 | 3 | CC=cc 4 | INCLUDE=-I ./src/ext/uthash/src/ 5 | ## * For -march info on your platform, type: gcc -march=native -Q --help=target (or just compile with -march=native ) 6 | ## * We include the argument -Wno-unknown-pragmas to suppress clang's lack of support for openmp 7 | ## Since we use the gnuism 'override', you don't need to modify this makefile; you can just run: make -j4 CFLAGS=-DATA_STORE_TRIE_LCRS 8 | override CFLAGS += -march=native -std=c99 -O3 -fopenmp -finline-functions -fno-math-errno -fstrict-aliasing -DHASH_FUNCTION=HASH_SAX -DHASH_BLOOM=25 -Wall -Wextra -Winline -Wstrict-aliasing -Wno-unknown-pragmas -Wno-comment -Wno-missing-field-initializers ${INCLUDE} 9 | LDLIBS=-lm -fopenmp #-ltcmalloc_minimal 10 | BIN=bin/ 11 | SRC=src/ 12 | OBJS=${SRC}/clustercat-array.o ${SRC}/clustercat-cluster.o ${SRC}/clustercat-dbg.o ${SRC}/clustercat-io.o ${SRC}/clustercat-import-class-file.o ${SRC}/clustercat-map.o ${SRC}/clustercat-math.o ${SRC}/clustercat-tokenize.o 13 | includes=${SRC}/$(wildcard *.h) 14 | date:=$(shell date +%F) 15 | machine_type:=$(shell uname -m) 16 | 17 | all: ${BIN}/clustercat 18 | .PHONY : all install tar clean 19 | 20 | clustercat.h: ${SRC}/clustercat-array.h ${SRC}/clustercat-data.h ${SRC}/clustercat-map.h 21 | 22 | 23 | ${BIN}/clustercat: ${SRC}/clustercat.c ${OBJS} 24 | ${CC} -Wl,-s $^ -o $@ ${CFLAGS} ${LDLIBS} 25 | 26 | clustercat.c: ${SRC}/clustercat.h ${SRC}/clustercat-cluster.h ${SRC}/clustercat-dbg.h ${SRC}/clustercat-io.h ${SRC}/clustercat-import-class-file.h ${SRC}/clustercat-math.h ${SRC}/clustercat-tokenize.h 27 | 28 | install: ${BIN}/clustercat 29 | cp -p ${BIN}/clustercat /usr/bin/ 2>/dev/null || \ 30 | mkdir --parents ${HOME}/bin/ && \ 31 | cp -p ${BIN}/clustercat ${HOME}/bin/ 32 | 33 | tar: ${BIN}/clustercat 34 | mkdir clustercat-${date} && \ 35 | mkdir clustercat-${date}/bin && \ 36 | mkdir clustercat-${date}/src && \ 37 | mkdir --parents clustercat-${date}/src/ext/uthash/src && \ 38 | cp -a ${BIN}/clustercat clustercat-${date}/bin/ && \ 39 | cp -a ${BIN}/clustercat clustercat-${date}/bin/clustercat.${machine_type} && \ 40 | cp -a 
${SRC}/*.c ${SRC}/*.h clustercat-${date}/src/ && \ 41 | cp -a Makefile README.md LICENSE.txt clustercat-${date}/ && \ 42 | cp -a ${SRC}/ext/uthash/src/uthash.h clustercat-${date}/src/ext/uthash/src/ && \ 43 | tar -cf clustercat-${date}.tar clustercat-${date}/ && \ 44 | gzip -9 clustercat-${date}.tar && \ 45 | rm -rf clustercat-${date}/ 46 | 47 | clean: 48 | \rm -f ${BIN}/clustercat ${SRC}/*.o 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ClusterCat: Fast, Flexible Word Clustering Software 2 | 3 | [![Build Status](https://travis-ci.org/jonsafari/clustercat.svg?branch=master)](https://travis-ci.org/jonsafari/clustercat) 4 | [![License: LGPL v3](https://img.shields.io/badge/License-LGPL%20v3-blue.svg)](http://www.gnu.org/licenses/lgpl-3.0) 5 | [![License: MPL 2.0](https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg)](https://opensource.org/licenses/MPL-2.0) 6 | 7 | 8 | ## Overview 9 | 10 | ClusterCat induces word classes from unannotated text. 11 | It is programmed in modern C, with no external libraries. 12 | A Python wrapper is also provided. 13 | 14 | Word classes are unsupervised part-of-speech tags, requiring no manually-annotated corpus. 15 | Words that share syntactic/semantic similarities are grouped together. 16 | They are used in many dozens of applications within natural language processing, machine translation, neural net training, and related fields. 17 | 18 | 19 | ## Installation 20 | ### Linux 21 | You can use either GCC 4.6+ or Clang 3.7+, but GCC is usually faster. 22 | 23 | sudo apt update && sudo apt install gcc make libomp-dev 24 | make -j 25 | 26 | ### macOS / OSX 27 | The current version of Clang in Xcode doesn't fully support [OpenMP][], so instead install GCC from [Homebrew][]: 28 | 29 | brew update && brew install gcc@9 libomp && xcode-select --install 30 | make -j CC=/opt/homebrew/bin/gcc-9 31 | 32 | 33 | ## Commands 34 | The binary program `clustercat` gets compiled into the `bin` directory. 35 | 36 | **Clustering** preprocessed text (already tokenized, normalized, etc.) is pretty simple: 37 | 38 | bin/clustercat [options] < train.tok.txt > clusters.tsv 39 | 40 | The word classes are induced by a bidirectional [predictive][] [exchange algorithm][]. 41 | The format of the output class file has each line consisting of `word`*TAB*`class` (a word type, then tab, then class). 42 | 43 | Command-line argument **usage** may be obtained by running the program with the **`--help`** flag: 44 | 45 | bin/clustercat --help 46 | 47 | 48 | ## Python 49 | Installation and usage details for the Python module are described in a separate [readme](python/README.md). 50 | 51 | 52 | ## Features 53 | - Print **[word vectors][]** (a.k.a. word embeddings) using the `--word-vectors` flag. The binary format is compatible with word2vec's tools. 54 | - Start training using an **existing word cluster mapping** from other clustering software (eg. mkcls) using the `--class-file` flag. 55 | - Adjust the number of **threads** to use with the `--threads` flag. The default is 8. 56 | - Adjust the **number of clusters** or vector dimensions using the `--classes` flag. The default is approximately the square root of the vocabulary size. 57 | - Includes **compatibility wrapper script `bin/mkcls`** that can be run just like mkcls.
You can use more classes now :-) 58 | 59 | 60 | ## Comparison 61 | | Training Set | [Brown][] | ClusterCat | [mkcls][] | [Phrasal][] | [word2vec][] | 62 | | ------------ | --------- | ---------- | --------- | ----------- | ------------ | 63 | | 1 Billion English tokens, 800 clusters | 12.5 hr | **1.4** hr | 48.8 hr | 5.1 hr | 20.6 hr | 64 | | 1 Billion English tokens, 1200 clusters | 25.5 hr | **1.7** hr | 68.8 hr | 6.2 hr | 33.7 hr | 65 | | 550 Million Russian tokens, 800 clusters | 14.6 hr | **1.5** hr | 75.0 hr | 5.5 hr | 12.0 hr | 66 | 67 | 68 | ## Visualization 69 | See [bl.ocks.org][] for nice data visualizations of the clusters for various languages, including English, German, Persian, Hindi, Czech, Catalan, Tajik, Basque, Russian, French, and Maltese. 70 | 71 | For example: 72 | 73 | ![French Clustering Thumbnail](visualization/d3/french_cluster_thumbnail.png) 74 | ![Russian Clustering Thumbnail](visualization/d3/russian_cluster_thumbnail.png) 75 | ![Basque Clustering Thumbnail](visualization/d3/basque_cluster_thumbnail.png) 76 | 77 | You can generate your own graphics from ClusterCat's output. 78 | Add the flag `--print-freqs` to ClusterCat, then type the command: 79 | 80 | bin/flat_clusters2json.pl --word-labels < clusters.tsv > visualization/d3/clusters.json 81 | 82 | You can either upload the [JSON][] file to [gist.github.com][], following instructions on the [bl.ocks.org](http://bl.ocks.org) front page, or you can view the graphic locally by running a minimal webserver in the `visualization/d3` directory (the command below is for Python 2; with Python 3, use `python3 -m http.server 8116`): 83 | 84 | python -m SimpleHTTPServer 8116 2>/dev/null & 85 | 86 | Then open a tab in your browser to [localhost:8116](http://localhost:8116). 87 | 88 | The default settings are sensible for normal usage, but for visualization you probably want far fewer word types and clusters -- fewer than 10,000 word types and 120 clusters. 89 | Your browser will thank you. 90 | 91 | 92 | ## Perplexity 93 | The perplexity that ClusterCat reports uses a bidirectional bigram class language model, which is richer than the unidirectional bigram-based perplexities reported by most other software. 94 | Richer models provide a better evaluation of the quality of clusters, having more sensitivity (power) to detect improvements. 95 | If you want to directly compare the quality of clusters with a different program's output, you have a few options: 96 | 97 | 1. Load another clustering using `--class-file`, and see what the other clustering's initial bidirectional bigram perplexity is before any words get exchanged. 98 | 2. Use an external class-based language model. These are usually two-sided (unlexicalized) models, so they favor two-sided clusterers. 99 | 3. Evaluate on a downstream task. This is best. 100 | 101 | 102 | ## Contributions 103 | Contributions are welcome, via [pull requests][]. 104 | 105 | 106 | ## Citation 107 | If you use this software, please cite the following: 108 | 109 | Dehdari, Jon, Liling Tan, and Josef van Genabith. 2016. [BIRA: Improved Predictive Exchange Word Clustering](http://www.aclweb.org/anthology/N16-1139.pdf). 110 | In *Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL)*, pages 1169–1174, San Diego, CA, USA. Association for Computational Linguistics.
111 | 112 | @inproceedings{dehdari-etal2016, 113 | author = {Dehdari, Jon and Tan, Liling and van Genabith, Josef}, 114 | title = {{BIRA}: Improved Predictive Exchange Word Clustering}, 115 | booktitle = {Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL)}, 116 | month = {June}, 117 | year = {2016}, 118 | address = {San Diego, CA, USA}, 119 | publisher = {Association for Computational Linguistics}, 120 | pages = {1169--1174}, 121 | url = {http://www.aclweb.org/anthology/N16-1139.pdf} 122 | } 123 | 124 | [lgpl3]: https://www.gnu.org/copyleft/lesser.html 125 | [mpl2]: https://www.mozilla.org/MPL/2.0 126 | [c99]: https://en.wikipedia.org/wiki/C99 127 | [homebrew]: http://brew.sh 128 | [openmp]: https://en.wikipedia.org/wiki/OpenMP 129 | [predictive]: https://www.aclweb.org/anthology/P/P08/P08-1086.pdf 130 | [exchange algorithm]: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.53.2354 131 | [brown]: https://github.com/percyliang/brown-cluster 132 | [mkcls]: https://github.com/moses-smt/mgiza 133 | [phrasal]: https://github.com/stanfordnlp/phrasal 134 | [word2vec]: https://code.google.com/archive/p/word2vec/ 135 | [word vectors]: https://en.wikipedia.org/wiki/Word_embedding 136 | [bl.ocks.org]: http://bl.ocks.org/jonsafari 137 | [JSON]: https://en.wikipedia.org/wiki/JSON 138 | [gist.github.com]: https://gist.github.com 139 | [pull requests]: https://help.github.com/articles/creating-a-pull-request 140 | -------------------------------------------------------------------------------- /bin/digit_conflate.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | ## By Jon Dehdari 2013 3 | ## Conflates all digits to the same digit 4 | ## Usage: perl digit_conflate.pl [options] < in > out 5 | 6 | use strict; 7 | use Getopt::Long; 8 | 9 | ## Defaults 10 | my $digit = 5; 11 | 12 | my $usage = <<"END_OF_USAGE"; 13 | digit_conflate.pl (c) 2013 Jon Dehdari - LGPL v3 14 | 15 | Usage: perl $0 [options] < in > out 16 | 17 | Function: Conflates all digits to the same digit 18 | For example, "12,629.24" -> "55,555.55" 19 | 20 | Options: 21 | -h, --help Print this usage 22 | -d, --digit Set output digit to (default: $digit) 23 | 24 | END_OF_USAGE 25 | 26 | GetOptions( 27 | 'h|help|?' => sub { print $usage; exit; }, 28 | 'd|digit=i' => \$digit, 29 | ) or die $usage; 30 | 31 | 32 | while (<>) { 33 | s/\d/$digit/g; 34 | print; 35 | } 36 | -------------------------------------------------------------------------------- /bin/flat_clusters2json.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | ## By Jon Dehdari 2015 3 | ## Converts boring flat tsv clustering format to json for visualization 4 | ## Usage: perl clusters2json.pl [options] < in > out 5 | 6 | use strict; 7 | use Getopt::Long; 8 | 9 | my $word_labels = undef; 10 | 11 | my $usage = <<"END_OF_USAGE"; 12 | clusters2json.pl (c) 2015 Jon Dehdari - LGPL v3 or Mozilla Public License v2 13 | 14 | Usage: perl $0 [options] < in > out 15 | 16 | Function: Converts tsv clustering format to json for visualization 17 | 18 | Options: 19 | -h, --help Print this usage 20 | --word-labels Use the first word in a cluster series as the cluster label. 21 | This option is useful if the input is already sorted by frequency. 22 | 23 | END_OF_USAGE 24 | 25 | GetOptions( 26 | 'h|help|?' 
=> sub { print $usage; exit; }, 27 | 'word-labels' => \$word_labels, 28 | ) or die $usage; 29 | 30 | my ($word, $cluster, $freq) = undef; 31 | my $last_cluster = -1; 32 | 33 | print < 40 | while (<>) { 41 | chomp; 42 | ($word, $cluster, $freq) = split; 43 | $freq or $freq = 1; # if word frequencies aren't provided 44 | 45 | $word =~ s/(["\/])/\\$1/g; # escape problematic characters 46 | #$word =~ s//>/g; 48 | 49 | if ($cluster != $last_cluster) { # We've reached a new cluster 50 | 51 | if ($last_cluster != -1) { # end cluster's children (ie words), then start new cluster 52 | print < 82 | } # end of the while (<>) loop 83 | 84 | print < -------------------------------------------------------------------------------- /bin/lowercase.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | ## Lowercases text 3 | ## Usage: perl lowercase.pl < input > output 4 | 5 | binmode(STDIN, ":utf8"); 6 | binmode(STDOUT, ":utf8"); 7 | 8 | print lc while <>; 9 | -------------------------------------------------------------------------------- /bin/mkcls: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | ## By Jon Dehdari, 2015, public domain 3 | ## Compatibility wrapper for clustercat, using mkcls's command-line arguments 4 | ## If you find an error in the interpretation of mkcls's arcane command-line arguments, please let me know 5 | 6 | mkcls_cmd_args=' 7 | mkcls command-line arguments: 8 | 9 | -p training input text file (default: train) 10 | -V cluster output file 11 | -c number of word clusters (default: 100) 12 | -m minimum word count (default: 1) 13 | -v verbose mode 14 | 15 | Ignored arguments: 16 | -a set stochastic optimization algorithm {rrt,ta,gda,sa,hc} (default: ta == Threshold Annealing)) 17 | -e set stochastic optimization parameter (for gamma, nu, alpha) 18 | -h set hapax init name 19 | -i set initialization value {ran,aio,gda,freq,other} (default: ran) 20 | -k set category selection {det,ran,best} (default: best) 21 | -l use LO, and set rho 22 | -M maximum number of optimization steps 23 | -n number of optimization runs (default: 1) 24 | -N set optimize parameter count (default: 10) 25 | -o graph output 26 | -O set one-with-hapas (default: 1) 27 | -P training ngram file 28 | -r set random seed (default: 532567487) 29 | -s set maximum runtime seconds 30 | -w set word selection {det,ran,incr} (default: det) 31 | -y use special criterion, and set sigma distortion (default: 5.0) 32 | ' 33 | 34 | ## Set defaults to be like mkcls, unless they're overwritten later by manually specifying them 35 | cmd_string="$(dirname $0)/clustercat --min-count 1 --num-classes 100 --in train " 36 | 37 | 38 | while [ $# -gt 0 ]; do 39 | 40 | ## Let me know if you actually use the original -h argument (hapax init name), and I'll change this 41 | if [ $1 = '--help' ] || [ $1 = '-h' ]; then 42 | echo "$mkcls_cmd_args" 43 | exit 44 | fi 45 | 46 | ## Ugh. Use a space between flags and their values
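## For example, a hypothetical invocation `-c100` is split by the next two
## lines into flag='-c' and arg='100', which the case statement below maps
## to `--num-classes 100`.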
47 | flag=$(echo "$1" | grep -o '^-.') 48 | arg=${1#-?} 49 | case $flag in 50 | -p) 51 | cmd_string="$cmd_string --in $arg " 52 | shift 53 | ;; 54 | -V) 55 | cmd_string="$cmd_string --out $arg " 56 | shift 57 | ;; 58 | -c) 59 | cmd_string="$cmd_string --num-classes $arg " 60 | shift 61 | ;; 62 | -m) 63 | cmd_string="$cmd_string --min-count $arg " 64 | shift 65 | ;; 66 | -v) 67 | cmd_string="$cmd_string --verbose " 68 | shift 69 | ;; 70 | *) 71 | shift 72 | ;; 73 | esac 74 | done 75 | 76 | echo 'Executing:' >&2 77 | echo "$cmd_string" >&2 78 | eval "$cmd_string" 79 | -------------------------------------------------------------------------------- /bin/mkcls4brown: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | ## By Jon Dehdari, 2015, public domain 3 | ## Compatibility wrapper for brown-cluster, using mkcls's command-line arguments 4 | ## If you find an error in the interpretation of mkcls's arcane command-line arguments, please let me know 5 | 6 | mkcls_cmd_args=' 7 | mkcls command-line arguments: 8 | 9 | -p training input text file (default: train) 10 | -V cluster output file 11 | -c number of word clusters (default: 100) 12 | -m minimum word count (default: 1) 13 | 14 | Ignored arguments: 15 | -a set stochastic optimization algorithm {rrt,ta,gda,sa,hc} (default: ta == Threshold Annealing)) 16 | -e set stochastic optimization parameter (for gamma, nu, alpha) 17 | -h set hapax init name 18 | -i set initialization value {ran,aio,gda,freq,other} (default: ran) 19 | -k set category selection {det,ran,best} (default: best) 20 | -l use LO, and set rho 21 | -M maximum number of optimization steps 22 | -n number of optimization runs (default: 1) 23 | -N set optimize parameter count (default: 10) 24 | -o graph output 25 | -O set one-with-hapas (default: 1) 26 | -P training ngram file 27 | -r set random seed (default: 532567487) 28 | -s set maximum runtime seconds 29 | -v verbose mode 30 | -w set word selection {det,ran,incr} (default: det) 31 | -y use special criterion, and set sigma distortion (default: 5.0) 32 | ' 33 | 34 | ## Set defaults to be like mkcls, unless they're overwritten later by manually specifying them 35 | min_count=1 36 | classes=100 37 | in_file='train' 38 | 39 | 40 | while [ $# -gt 0 ]; do 41 | 42 | ## Let me know if you actually use the original -h argument (hapax init name), and I'll change this 43 | if [ $1 = '--help' ] || [ $1 = '-h' ]; then 44 | echo "$mkcls_cmd_args" 45 | exit 46 | fi 47 | 48 | ## Ugh. Use a space between flags and their values
49 | flag=$(echo "$1" | grep -o '^-.') 50 | arg=${1#-?} 51 | case $flag in 52 | -p) 53 | in_file="$arg" 54 | shift 55 | ;; 56 | -V) 57 | out_file="$arg" 58 | shift 59 | ;; 60 | -c) 61 | classes="$arg" 62 | shift 63 | ;; 64 | -m) 65 | min_count="$arg" 66 | shift 67 | ;; 68 | *) 69 | shift 70 | ;; 71 | esac 72 | done 73 | 74 | cmd_string="$(dirname $0)/wcluster --threads 4 --min-occur $min_count --c $classes --text $in_file --output_dir ${out_file}_brown_dir " 75 | 76 | echo 'Executing:' >&2 77 | echo "$cmd_string" >&2 78 | eval "$cmd_string" && \ 79 | $(dirname $0)/hier2flat_no_freqs.sh < ${out_file}_brown_dir/paths > $out_file && \ 80 | \rm ${out_file}_brown_dir/log # really verbose for large corpora 81 | -------------------------------------------------------------------------------- /bin/mkcls4word2vec: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | ## By Jon Dehdari, 2015, public domain 3 | ## Compatibility wrapper for word2vec, using mkcls's command-line arguments 4 | ## If you find an error in the interpretation of mkcls's arcane command-line arguments, please let me know 5 | 6 | mkcls_cmd_args=' 7 | mkcls command-line arguments: 8 | 9 | -p training input text file (default: train) 10 | -V cluster output file 11 | -c number of word clusters (default: 100) 12 | -m minimum word count (default: 1) 13 | 14 | Ignored arguments: 15 | -a set stochastic optimization algorithm {rrt,ta,gda,sa,hc} (default: ta == Threshold Annealing)) 16 | -e set stochastic optimization parameter (for gamma, nu, alpha) 17 | -h set hapax init name 18 | -i set initialization value {ran,aio,gda,freq,other} (default: ran) 19 | -k set category selection {det,ran,best} (default: best) 20 | -l use LO, and set rho 21 | -M maximum number of optimization steps 22 | -n number of optimization runs (default: 1) 23 | -N set optimize parameter count (default: 10) 24 | -o graph output 25 | -O set one-with-hapas (default: 1) 26 | -P training ngram file 27 | -r set random seed (default: 532567487) 28 | -s set maximum runtime seconds 29 | -v verbose mode 30 | -w set word selection {det,ran,incr} (default: det) 31 | -y use special criterion, and set sigma distortion (default: 5.0) 32 | ' 33 | 34 | ## Set defaults to be like mkcls, unless they're overwritten later by manually specifying them 35 | min_count=1 36 | classes=100 37 | in_file='train' 38 | 39 | 40 | while [ $# -gt 0 ]; do 41 | 42 | ## Let me know if you actually use the original -h argument (hapax init name), and I'll change this 43 | if [ $1 = '--help' ] || [ $1 = '-h' ]; then 44 | echo "$mkcls_cmd_args" 45 | exit 46 | fi 47 | 48 | ## Ugh. Use a space between flags and their values 49 | flag=$(echo "$1" | grep -o '^-.') 50 | arg=${1#-?} 51 | case $flag in 52 | -p) 53 | in_file="$arg" 54 | shift 55 | ;; 56 | -V) 57 | out_file="$arg" 58 | shift 59 | ;; 60 | -c) 61 | classes="$arg" 62 | shift 63 | ;; 64 | -m) 65 | min_count="$arg" 66 | shift 67 | ;; 68 | *) 69 | shift 70 | ;; 71 | esac 72 | done 73 | 74 | cmd_string="$(dirname $0)/word2vec -min-count $min_count -classes $classes -size $classes -train $in_file -output $out_file "
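## Note: the class count doubles as the vector size above (`-size $classes`);
## that equivalence is a choice of this wrapper, not something mkcls or
## word2vec requires.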
75 | 76 | echo 'Executing:' >&2 77 | echo "$cmd_string" >&2 78 | eval "$cmd_string" && \ 79 | perl -p -i -e 's/ /\t/g' $out_file 80 | -------------------------------------------------------------------------------- /bin/ngram_counts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ## By Jon Dehdari, 2015, public domain 3 | ## Counts ngrams, including joined ngrams, from text corpus 4 | 5 | import sys 6 | 7 | ngram_order = 4 8 | ngrams = [] 9 | for i in range(ngram_order): 10 | ngrams.append({}) 11 | 12 | for line in sys.stdin: 13 | line = line.rstrip() 14 | tokens = line.split() 15 | #tokens.insert(0, "<s>") 16 | #tokens.append("</s>") 17 | #print(tokens) 18 | len_tokens = len(tokens) 19 | 20 | for i in range(len_tokens): 21 | 22 | # i := leftmost position 23 | # j := rightmost position of current sub-ngram 24 | # k := rightmost position of all sub-ngrams 25 | 26 | k = len_tokens if i+ngram_order >= len_tokens else i + ngram_order 27 | #print("i=",i, "k=", k, tokens[i:k]) 28 | 29 | # Build-up joined ngrams 30 | for j in range(i+1,k+1): 31 | joined_ngram = '_'.join(tokens[i:j]) 32 | if (j+1 < k): 33 | if joined_ngram in ngrams[0]: 34 | ngrams[0][joined_ngram] += 1 35 | else : 36 | ngrams[0][joined_ngram] = 1 37 | 38 | #print(" j=",j, joined_ngram) 39 | 40 | # Process sub-ngrams 41 | num_subcuts = j - (i+1) 42 | while (num_subcuts >= 1): 43 | if ( (j == k) and (num_subcuts % 2)): # skip imbalanced subcuts 44 | num_subcuts -= 1 45 | continue 46 | subcut = ' '.join([ '_'.join(tokens[i:i+num_subcuts]), '_'.join(tokens[i+num_subcuts:j]) ]) 47 | if (subcut in ngrams[1]): 48 | ngrams[1][subcut] +=1 49 | else : 50 | ngrams[1][subcut] = 1 51 | 52 | #print(" num_subcuts=", num_subcuts, "subcut=<<",subcut, ">>") 53 | num_subcuts -= 1 54 | 55 | for i in range(ngram_order): 56 | print() 57 | for k, v in sorted(ngrams[i].items()): 58 | print(k, "\t", v, sep='') 59 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # Python ClusterCat 2 | 3 | 4 | ## Installation 5 | First follow the [installation instructions](../README.md) in the above directory. 6 | After that, you normally don't need to install anything here. You can load the module `clustercat` using either Python 2 or 3. 7 | 8 | cd python 9 | python3 10 | >>> import clustercat as cc 11 | >>> clustering = cc.cluster(text=['this is a test', 'that is only a test', 'bye'], min_count=1) 12 | >>> print(clustering) 13 | 14 | If you get an error message saying that it is unable to access the clustercat binary, follow all the instructions in the error message. 15 | You'll need more text input than the toy example above to produce useful clusters.
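A fuller sketch of the same workflow (the corpus path `/tmp/corpus.tok.txt` is hypothetical; `cluster`, `save`, and `tag_string` are documented below) trains from a file, saves the mapping, and tags a sentence:

```Python
import clustercat as cc

# Train on a preprocessed (tokenized, one-sentence-per-line) corpus
clustering = cc.cluster(in_file='/tmp/corpus.tok.txt', min_count=3)
cc.save(clustering, 'clusters.tsv')  # word<TAB>cluster-ID listing
print(cc.tag_string(clustering, 'this is a test'))  # e.g. '14 3 7 3' (IDs vary)
```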
16 | 17 | To import this module from a different directory, you can add the module's directory to `$PYTHONPATH`: 18 | 19 | cd python 20 | echo "export PYTHONPATH=\$PYTHONPATH:`pwd`" >> ~/.bashrc 21 | source ~/.bashrc 22 | 23 | ## Python ClusterCat Functions 24 | ### `cluster(text=None, in_file=None, ...)` 25 | Produce a clustering, given a textual input. There is one required argument (the training input text), and many optional arguments. The one required argument is **either** `text` **or** `in_file`. The argument `text` is a list of Python strings. The argument `in_file` is a path to a text file, consisting of preprocessed (eg. tokenized) one-sentence-per-line text. The use of `text` is probably not a good idea for large corpora. 26 | 27 | ```Python 28 | cc.cluster(text=['this is a test', 'that is only a test', 'bye'], min_count=1) 29 | cc.cluster(in_file='/tmp/corpus.tok.txt', min_count=3) 30 | ``` 31 | 32 | The other optional arguments are described by running the compiled clustercat binary with the `--help` argument, except that the leading `--` from the shell argument is removed, and `-` is replaced with `_`. So for example, instead of `--tune-cycles 15`, the Python function argument would be `tune_cycles=15`. 33 | 34 | Returns a dictionary of the form `{ word : cluster_id }`. 35 | 36 | 37 | ### `save(mapping, out, format='tsv')` 38 | Save a clustering (dictionary) to file. By default the output file is a tab-separated listing of words and their cluster ID. 39 | 40 | ```Python 41 | cc.save(clustering, 'clusters.tsv') 42 | ``` 43 | 44 | 45 | ### `load(in_file, format="tsv")` 46 | Load a clustering from a file. By default the input file is a tab-separated listing of words and their cluster ID. 47 | Returns a dictionary of the clustering. 48 | 49 | ```Python 50 | clustering = cc.load('clusters.tsv') 51 | ``` 52 | 53 | 54 | ### `tag_string(mapping, text, unk="<unk>")` 55 | Tag a string with the corresponding cluster IDs. If a word is not found in the clustering, use `unk`. 56 | Returns a string. 57 | 58 | ```Python 59 | tagged_sent = cc.tag_string(clustering, "this is a test") 60 | ``` 61 | 62 | ### `tag_stdin(mapping, unk="<unk>")` 63 | This calls `tag_string()` for each line in `stdin`, and prints the result to `stdout`. 64 | -------------------------------------------------------------------------------- /python/clustercat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # By Jon Dehdari, 2016 3 | # MIT License 4 | """ Fast, flexible word clusters """ 5 | 6 | import sys 7 | import os 8 | import subprocess 9 | import distutils.spawn 10 | 11 | unk = '<unk>' 12 | 13 | def load(in_file=None, format='tsv'): 14 | """ Load a clustering from a file. By default the input file is a 15 | tab-separated listing of words and their cluster ID. Returns a dictionary of 16 | the clustering. 17 | 18 | Args: 19 | in_file (string): path to input file 20 | format (string): input file format (default: tsv) 21 | 22 | Returns: 23 | dict: word-to-tag mapping 24 | """ 25 | 26 | mapping = {} 27 | if format == 'tsv': 28 | with open(in_file) as f: 29 | # Primary sort by value (cluster ID), secondary sort by key (word) 30 | for line in f: 31 | # Keep the full split line instead of key, val to allow for 32 | # counts in optional third column 33 | tokens = line.split() 34 | mapping[tokens[0]] = int(tokens[1]) 35 | 36 | return mapping 37 | 38 |
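# Illustrative tsv input for load() above (the words, IDs, and the optional
# third count column here are hypothetical):
#   the	42
#   cat	17	209
# load() would then return {'the': 42, 'cat': 17}; counts are ignored.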
39 | def save(mapping=None, out=None, format='tsv'): 40 | """ Save a clustering (dictionary) to file. By default the output file is 41 | a tab-separated listing of words and their cluster ID. 42 | 43 | Args: 44 | mapping (dict): word-to-tag mapping 45 | out (string): path to output file 46 | format (string): output file format (default: tsv) 47 | """ 48 | 49 | if format == 'tsv': 50 | with open(out, 'w') as outfile: 51 | # Primary sort by value (cluster ID), secondary sort by key (word) 52 | for key in sorted(sorted(mapping), key=mapping.get): 53 | line = str(key) + '\t' + str(mapping[key]) + '\n' 54 | outfile.write(line) 55 | 56 | 57 | def tag_string(mapping=None, text=None, unk=unk): 58 | """Tag a string with the corresponding cluster IDs. If a word is not 59 | found in the clustering, use unk. 60 | 61 | Args: 62 | mapping (dict): word-to-tag mapping 63 | text (string): the string to be tagged 64 | unk (string): what to label unknown/unseen words that are not in 65 | mapping (default: <unk>) 66 | 67 | Returns: 68 | string: sequence of tags 69 | """ 70 | 71 | newsent = "" 72 | for word in text.split(): 73 | if word in mapping: 74 | newsent += ' ' + str(mapping[word]) 75 | elif unk in mapping: 76 | newsent += ' ' + str(mapping[unk]) 77 | else: 78 | newsent += ' ' + "<unk>" 79 | return newsent.lstrip() 80 | 81 | 82 | def tag_stdin(mapping=None, unk=unk): 83 | """ This calls tag_string() for each line in stdin, and prints the 84 | result to stdout. 85 | 86 | Args: 87 | mapping (dict): word-to-tag mapping 88 | unk (string): what to label unknown/unseen words that are not in 89 | mapping (default: <unk>) 90 | """ 91 | 92 | for line in sys.stdin: 93 | print(tag_string(mapping=mapping, text=line, unk=unk)) 94 | 95 |
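# The keyword arguments of cluster() below mirror the compiled binary's
# command-line flags, with the leading '--' dropped and '-' replaced by '_'.
# For example, cluster(in_file='corpus.txt', tune_cycles=15, quiet=True)
# builds ['--in', 'corpus.txt', '--tune-cycles', '15', '--quiet'] via the
# translation tables defined inside the function.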
In the parent directory, first run 'make install', and then add $HOME/bin/ to your $PATH, by typing the following command:\necho 'PATH=$PATH:$HOME/bin' >> $HOME/.bashrc && source $HOME/.bashrc") 
129 | exit(1) 
130 | 
131 | 
132 | # Now translate function arguments to command-line arguments 
133 | clustercat_params = {"in_file": "--in", "out": "--out", 
134 | "classes": "--classes", 
135 | "class_file": "--class-file", 
136 | "class_offset": "--class-offset", 
137 | "forward_lambda": "--forward-lambda", 
138 | "ngram_input": "--ngram-input", 
139 | "min_count": "--min-count", 
140 | "refine": "--refine", 
141 | "rev_alternate": "--rev-alternate", 
142 | "threads": "--threads", 
143 | "tune_cycles": "--tune-cycles", 
144 | "word_vectors": "--word-vectors" 
145 | } 
146 | 
147 | boolean_params = {"print_freqs": "--print-freqs", 
148 | "quiet": "--quiet", 
149 | "unidirectional": "--unidirectional", 
150 | "verbose": "--verbose" 
151 | } 
152 | 
153 | for arg, value in locals().items(): 
154 | # Check for boolean parameters 
155 | if arg in boolean_params and value is True: 
156 | cmd_str.append(boolean_params[arg]) 
157 | # Other non-boolean parameters that are not None 
158 | elif arg in clustercat_params and value is not None: 
159 | cmd_str.append(clustercat_params[arg]) 
160 | cmd_str.append(str(value)) 
161 | 
162 | #print(cmd_str, file=sys.stderr) # Use Python 3 interpreter 
163 | 
164 | cmd_out = '' 
165 | if text and not in_file: 
166 | p1 = subprocess.Popen(["printf", "\n".join(text)], 
167 | stdout=subprocess.PIPE, universal_newlines=True) 
168 | p2 = subprocess.Popen(cmd_str, stdin=p1.stdout, stdout=subprocess.PIPE, 
169 | universal_newlines=True) 
170 | p1.stdout.close() 
171 | cmd_out = p2.communicate()[0] 
172 | elif in_file and not text: 
173 | cmd_out = subprocess.check_output(cmd_str, universal_newlines=True) 
174 | else: 
175 | print("Error: supply either text or in_file argument to clustercat.cluster(), but not both") 
176 | 
177 | clusters = {} 
178 | for line in cmd_out.split("\n"): 
179 | split_line = line.split("\t") 
180 | try: 
181 | clusters[split_line[0]] = int(split_line[1]) 
182 | except (IndexError, ValueError): # skip blank or malformed output lines 
183 | pass 
184 | return clusters 
185 | 
186 | 
187 | def main(): 
188 | """ No real reason to use this as a standalone script. Just invoke the 
189 | C-compiled binary for standalone applications. But here you 
190 | go, anyways. 
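Example (illustrative file names; training text is read from stdin):
    python3 clustercat.py --out clusters.tsv < corpus.tok.txt
    python3 clustercat.py --tag clusters.tsv < corpus.tok.txt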
191 | """ 192 | import argparse 193 | parser = argparse.ArgumentParser(description='Clusters words, or tags them') 194 | 195 | parser.add_argument('-i', '--in', help="Load input training file") 196 | parser.add_argument('-o', '--out', help="Save final mapping to file") 197 | parser.add_argument('-t', '--tag', help="Tag stdin input, using clustering in supplied argument") 198 | args = parser.parse_args() 199 | 200 | if args.tag: 201 | mapping = load(in_file=args.tag) 202 | tag_stdin(mapping=mapping) 203 | else: 204 | mapping = cluster(text=sys.stdin) 205 | if args.out: 206 | save(mapping=mapping, out=args.out) 207 | else: 208 | print(mapping) 209 | 210 | if __name__ == '__main__': 211 | main() 212 | -------------------------------------------------------------------------------- /src/clustercat-array.c: -------------------------------------------------------------------------------- 1 | #include // variadic functions for arrncat 2 | #include 3 | #include 4 | #include 5 | #include "clustercat.h" // macros 6 | 7 | // Returns 0 if all values in array are 0.0; returns 1 otherwise 8 | int anyf(const float array[], unsigned int arr_len) { 9 | while (arr_len--) { 10 | if (array[arr_len]) 11 | return 1; 12 | } 13 | return 0; 14 | } 15 | 16 | // Returns 0 if all values in array are 0.0; returns 1 otherwise 17 | int any(const double array[], unsigned int arr_len) { 18 | while (arr_len--) { 19 | if (array[arr_len]) 20 | return 1; 21 | } 22 | return 0; 23 | } 24 | 25 | // Returns 1 if all values in array are non-zero; returns 0 otherwise 26 | int allf(const float array[], unsigned int arr_len) { 27 | while (arr_len--) { 28 | if (!array[arr_len]) 29 | return 0; 30 | } 31 | return 1; 32 | } 33 | 34 | // Returns 1 if all values in array are non-zero; returns 0 otherwise 35 | int all(const double array[], unsigned int arr_len) { 36 | while (arr_len--) { 37 | if (!array[arr_len]) 38 | return 0; 39 | } 40 | return 1; 41 | } 42 | 43 | float sumf(const float array[], unsigned int arr_len) { 44 | float sum = 0.0; 45 | while (arr_len--) { 46 | sum += array[arr_len]; 47 | } 48 | return sum; 49 | } 50 | 51 | double sum(const double array[], unsigned int arr_len) { 52 | double sum = 0.0; 53 | while (arr_len--) { 54 | sum += array[arr_len]; 55 | } 56 | return sum; 57 | } 58 | 59 | float productf(const float array[], unsigned int arr_len) { 60 | float product = 1.0; 61 | while (arr_len--) { 62 | product *= array[arr_len]; 63 | } 64 | return product; 65 | } 66 | 67 | double product(const double array[], unsigned int arr_len) { 68 | double product = 1.0; 69 | while (arr_len--) { 70 | product *= array[arr_len]; 71 | } 72 | return product; 73 | } 74 | 75 | float minf(const float array[], unsigned int arr_len) { 76 | arr_len--; 77 | float min = array[arr_len]; 78 | while (1) { 79 | //printf("min=%g, arr_len=%u, val=%g\n", min, arr_len, array[arr_len]); fflush(stdout); 80 | if (array[arr_len] < min) 81 | min = array[arr_len]; 82 | if (arr_len == 0) 83 | break; 84 | arr_len--; 85 | } 86 | return min; 87 | } 88 | 89 | double min(const double array[], unsigned int arr_len) { 90 | arr_len--; 91 | double min = array[arr_len]; 92 | while (1) { 93 | //printf("min=%g, arr_len=%u, val=%g\n", min, arr_len, array[arr_len]); fflush(stdout); 94 | if (array[arr_len] < min) 95 | min = array[arr_len]; 96 | if (arr_len == 0) 97 | break; 98 | arr_len--; 99 | } 100 | return min; 101 | } 102 | 103 | float maxf(const float array[], unsigned int arr_len) { 104 | arr_len--; 105 | float max = array[arr_len]; 106 | while (1) { 107 | if 
(array[arr_len] > max) 108 | max = array[arr_len]; 109 | if (arr_len == 0) 110 | break; 111 | arr_len--; 112 | } 113 | return max; 114 | } 115 | 116 | double max(const double array[], unsigned int arr_len) { 117 | arr_len--; 118 | double max = array[arr_len]; 119 | while (1) { 120 | if (array[arr_len] > max) 121 | max = array[arr_len]; 122 | if (arr_len == 0) 123 | break; 124 | arr_len--; 125 | } 126 | return max; 127 | } 128 | 129 | unsigned int which_minf(const float array[], const unsigned int arr_len) { 130 | unsigned int which_min = 0; 131 | float min = array[0]; 132 | 133 | unsigned int i = 1; 134 | for (; i < arr_len; i++) { 135 | if (array[i] < min) { 136 | which_min = i; 137 | min = array[i]; 138 | } 139 | } 140 | return which_min; 141 | } 142 | 143 | unsigned int which_min(const double array[], const unsigned int arr_len) { 144 | unsigned int which_min = 0; 145 | double min = array[0]; 146 | 147 | unsigned int i = 1; 148 | for (; i < arr_len; i++) { 149 | if (array[i] < min) { 150 | which_min = i; 151 | min = array[i]; 152 | } 153 | } 154 | return which_min; 155 | } 156 | 157 | unsigned int which_maxf(const float array[], const unsigned int arr_len) { 158 | unsigned int which_max = 0; 159 | float max = array[0]; 160 | 161 | unsigned int i = 1; 162 | for (; i < arr_len; i++) { 163 | if (array[i] > max) { 164 | which_max = i; 165 | max = array[i]; 166 | } 167 | } 168 | return which_max; 169 | } 170 | 171 | unsigned int which_max(const double array[], const unsigned int arr_len) { 172 | unsigned int which_max = 0; 173 | double max = array[0]; 174 | 175 | unsigned int i = 1; 176 | for (; i < arr_len; i++) { 177 | if (array[i] > max) { 178 | which_max = i; 179 | max = array[i]; 180 | } 181 | } 182 | return which_max; 183 | } 184 | 185 | void fprint_array(FILE *stream, const double array[const], const unsigned int arr_len, char * restrict sep) { 186 | //fputs("{ ", stream); 187 | unsigned int i = 0; 188 | for (; i < arr_len-1; i++) 189 | fprintf(stream, "%g%s", array[i], sep); 190 | fprintf(stream, "%g\n", array[arr_len-1]); 191 | } 192 | 193 | void fprint_arrayf(FILE *stream, const float array[const], const unsigned int arr_len, char * restrict sep) { 194 | //fputs("{ ", stream); 195 | unsigned int i = 0; 196 | for (; i < arr_len-1; i++) 197 | fprintf(stream, "%g%s", array[i], sep); 198 | fprintf(stream, "%g\n", array[arr_len-1]); 199 | } 200 | 201 | unsigned int scan_array_of_doubles(FILE *stream, double array[], char * restrict sep) { 202 | char line[STDIN_SENT_MAX_CHARS]; 203 | if (fgets(line, sizeof(line), stream) == NULL) // Get line 204 | return 0; 205 | int elems = 0; 206 | char * restrict token; 207 | if ((token = strtok(line, sep)) == NULL) 208 | return 0; 209 | while (token) { 210 | array[elems] = atof(token); 211 | elems++; 212 | token = strtok(NULL, sep); 213 | } 214 | 215 | return elems; 216 | } 217 | 218 | 219 | // Analogous to strncat(), but with variable number of arguments 220 | void arrncat(double full_array[], const unsigned int full_array_len, ...) 
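/* Illustrative usage note (not from the original source): each variadic pair
   is (double *array, unsigned int length), and copying stops once
   full_array_len elements have been filled. For example:

       double a[2] = {1.0, 2.0}, b[3] = {3.0, 4.0, 5.0};
       double merged[5];
       arrncat(merged, 5, a, 2u, b, 3u);   // merged == {1, 2, 3, 4, 5}
*/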
{ 221 | va_list argptr; 222 | va_start(argptr, full_array_len); 223 | 224 | double * restrict offset = full_array; 225 | double * restrict full_array_last = full_array + full_array_len; 226 | //printf("30: full_array=%p, offset=%p, full_array_len=%u, sizeof(double)=%lu, len*size=%lu, full_array_last=%p, diff=%li\n", full_array, offset, full_array_len, sizeof(double), full_array_len*sizeof(double), full_array_last, full_array_last - full_array); 227 | 228 | while (offset < full_array_last) { 229 | double * restrict arr = va_arg(argptr, double*); 230 | //printf("31\n"); 231 | unsigned int arr_len = va_arg(argptr, unsigned int); 232 | //printf("32: arr_len=%u\n", arr_len); 233 | unsigned int arr_len_bytes = arr_len * sizeof(double); 234 | //printf("33: full_array=%p, offset=%p, *<-=%g, *+1=%g, full_array_len=%u, arr_len=%u, arr_len_bytes=%u, arr[0]=%g, arr[1]=%g, arr_last=%g\n", full_array, offset, *offset, *(offset+1), full_array_len, arr_len, arr_len_bytes, arr[0], arr[1], arr[arr_len-1]); fflush(stdout); 235 | memcpy(offset, arr, arr_len_bytes); 236 | //printf("34: offset=%p, *<-=%g, *+1=%g, *-1=%g, full_array_last=%p arr_len_bytes=%u\n", offset, *offset, *(offset+1), *(offset-1), full_array_last, arr_len_bytes); fflush(stdout); 237 | offset += arr_len; 238 | //printf("35: full_array=%p, offset=%p, full_array_last=%p arr_len_bytes=%u\n", full_array, offset, full_array_last, arr_len_bytes); fflush(stdout); 239 | //printf("36: Full array: "); fprint_array(stdout, full_array, full_array_len, ", "); printf("\n"); 240 | } 241 | va_end(argptr); 242 | //printf("37: Full array: "); fprint_array(stdout, full_array, full_array_len, ", "); printf("\n"); 243 | } 244 | -------------------------------------------------------------------------------- /src/clustercat-array.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_DKLM_ARRAY_HEADER 2 | #define INCLUDE_DKLM_ARRAY_HEADER 3 | 4 | int any(const double array[], unsigned int arr_len); 5 | int anyf(const float array[], unsigned int arr_len); 6 | int all(const double array[], unsigned int arr_len); 7 | int allf(const float array[], unsigned int arr_len); 8 | 9 | double sum(const double array[], unsigned int arr_len); 10 | float sumf(const float array[], unsigned int arr_len); 11 | double product(const double array[], unsigned int arr_len); 12 | float productf(const float array[], unsigned int arr_len); 13 | 14 | double min(const double array[], unsigned int arr_len); 15 | float minf(const float array[], unsigned int arr_len); 16 | double max(const double array[], unsigned int arr_len); 17 | float maxf(const float array[], unsigned int arr_len); 18 | 19 | unsigned int which_min(const double array[], const unsigned int arr_len); 20 | unsigned int which_minf(const float array[], const unsigned int arr_len); 21 | unsigned int which_max(const double array[], const unsigned int arr_len); 22 | unsigned int which_maxf(const float array[], const unsigned int arr_len); 23 | 24 | void fprint_array(FILE *stream, const double array[], const unsigned int arr_len, char * restrict sep); 25 | void fprint_arrayf(FILE *stream, const float array[], const unsigned int arr_len, char * restrict sep); 26 | 27 | unsigned int scan_array_of_doubles(FILE *stream, double array[], char * restrict sep); 28 | 29 | void arrncat(double full_array[], const unsigned int full_array_len, ...); 30 | 31 | #endif // INCLUDE_HEADER 32 | -------------------------------------------------------------------------------- /src/clustercat-cluster.c: 
-------------------------------------------------------------------------------- 
1 | #include <time.h> // clock_t, clock(), CLOCKS_PER_SEC, etc. 
2 | #include <float.h> // FLT_MAX, etc. 
3 | #include "clustercat-cluster.h" 
4 | #include "clustercat-array.h" 
5 | #include "clustercat-math.h" 
6 | 
7 | float entropy_term(const float entropy_terms[const], const unsigned int i); 
8 | double pex_remove_word(const struct cmd_args cmd_args, const word_id_t word, const word_count_t word_count, const wclass_t from_class, const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_array_t count_array, const float entropy_terms[const], const bool is_tentative_move); 
9 | double pex_move_word(const struct cmd_args cmd_args, const word_id_t word, const word_count_t word_count, const wclass_t to_class, const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_array_t count_array, const float entropy_terms[const], const bool is_tentative_move); 
10 | 
11 | inline float entropy_term(const float entropy_terms[const], const unsigned int i) { 
12 | if (i < ENTROPY_TERMS_MAX) 
13 | return entropy_terms[i]; 
14 | else 
15 | return i * log2f(i); 
16 | } 
17 | 
18 | inline double pex_remove_word(const struct cmd_args cmd_args, const word_id_t word, const word_count_t word_count, const wclass_t from_class, const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_array_t count_array, const float entropy_terms[const], const bool is_tentative_move) { 
19 | // See Procedure MoveWord on page 758 of Uszkoreit & Brants (2008): https://www.aclweb.org/anthology/P/P08/P08-1086.pdf 
20 | register double delta = 0.0; 
21 | const unsigned int count_class = count_array[from_class]; 
22 | if (count_class > 1) 
23 | delta = entropy_term(entropy_terms, count_class); 
24 | const unsigned int new_count_class = count_class - word_count; 
25 | if (new_count_class > 1) 
26 | delta -= entropy_term(entropy_terms, new_count_class); 
27 | //printf("rm42: word=%u, word_count=%u, from_class=%u, count_class=%u, new_count_class=%u (count_class - word_count), delta=%g\n", word, word_count, from_class, count_class, new_count_class, delta); fflush(stdout); 
28 | 
29 | if (! 
is_tentative_move) 30 | count_array[from_class] = new_count_class; 31 | 32 | for (unsigned int i = 0; i < word_bigrams[word].length; i++) { 33 | word_id_t prev_word = word_bigrams[word].predecessors[i]; 34 | //printf(" rm43: i=%u, len=%u, word=%u, offset=%u (prev_word=%u + num_classes=%u * from_class=%u)\n", i, word_bigrams[word].length, word, (prev_word * cmd_args.num_classes + from_class), prev_word, cmd_args.num_classes, from_class); fflush(stdout); 35 | const unsigned int word_class_count = word_class_counts[prev_word * cmd_args.num_classes + from_class]; 36 | if (word_class_count > 1) // Can't do log(0); no need for 1 37 | delta -= entropy_term(entropy_terms, word_class_count); 38 | const unsigned int new_word_class_count = word_class_count - word_bigrams[word].bigram_counts[i]; 39 | delta += entropy_term(entropy_terms, new_word_class_count); 40 | //printf(" rm45: word=%u (#=%u), prev_word=%u, #()=%u, from_class=%u, i=%u, count_class=%u, new_count_class=%u, =<%u,%u>, #()=%u, new_#()=%u (w-c - %u), delta=%g\n", word, word_count, prev_word, word_bigrams[word].bigram_counts[i], from_class, i, count_class, new_count_class, prev_word, from_class, word_class_count, new_word_class_count, word_bigrams[word].bigram_counts[i], delta); fflush(stdout); 41 | //print_word_class_counts(cmd_args, model_metadata, word_class_counts); 42 | if (! is_tentative_move) 43 | word_class_counts[prev_word * cmd_args.num_classes + from_class] = new_word_class_count; 44 | 45 | } 46 | 47 | if (cmd_args.rev_alternate && (!is_tentative_move)) { // also update reversed word-class counts 48 | for (unsigned int i = 0; i < word_bigrams_rev[word].length; i++) { 49 | const word_id_t next_word = word_bigrams_rev[word].predecessors[i]; 50 | const unsigned int word_class_rev_count = word_class_rev_counts[next_word * cmd_args.num_classes + from_class]; 51 | const unsigned int new_word_class_rev_count = word_class_rev_count - word_bigrams_rev[word].bigram_counts[i]; 52 | //printf(" rm47: rev_next_word=%u, rev_#()=%u, rev_new_#()=%u\n", next_word, word_class_rev_count, new_word_class_rev_count); fflush(stdout); 53 | //print_word_class_counts(cmd_args, model_metadata, word_class_rev_counts); 54 | word_class_rev_counts[next_word * cmd_args.num_classes + from_class] = new_word_class_rev_count; 55 | } 56 | } 57 | 58 | return delta; 59 | } 60 | 61 | inline double pex_move_word(const struct cmd_args cmd_args, const word_id_t word, const word_count_t word_count, const wclass_t to_class, const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_array_t count_array, const float entropy_terms[const], const bool is_tentative_move) { 62 | // See Procedure MoveWord on page 758 of Uszkoreit & Brants (2008): https://www.aclweb.org/anthology/P/P08/P08-1086.pdf 63 | unsigned int count_class = count_array[to_class]; 64 | if (!count_class) // class is empty 65 | count_class = 1; 66 | const unsigned int new_count_class = count_class + word_count; // Differs from paper: replace "-" with "+" 67 | register double delta = entropy_term(entropy_terms, count_class) - entropy_term(entropy_terms, new_count_class); 68 | //printf("mv42: word=%u, word_count=%u, to_class=%u, count_class=%u, new_count_class=%u, delta=%g, is_tentative_move=%d\n", word, word_count, to_class, count_class, new_count_class, delta, is_tentative_move); fflush(stdout); 69 | const float backward_lambda = 1 - cmd_args.forward_lambda; 70 | 71 | 
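/* Added note: `delta` accumulates the change in the class-based training-data
   log-likelihood that would result from placing `word` in `to_class`, built
   from the precomputed n*log2(n) entropy terms. When is_tentative_move is
   true, no counts are modified, so the caller can compare candidate classes
   cheaply; only a committed move (is_tentative_move == false) updates
   count_array and the word-class counts below. */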
if (! is_tentative_move) 72 | count_array[to_class] = new_count_class; 73 | 74 | for (unsigned int i = 0; i < word_bigrams[word].length; i++) { 75 | word_id_t prev_word = word_bigrams[word].predecessors[i]; 76 | //printf(" mv43: i=%u, len=%u, word=%u, offset=%u (prev_word=%u + num_classes=%u * to_class=%u)\n", i, word_bigrams[word].length, word, (prev_word * cmd_args.num_classes + to_class), prev_word, cmd_args.num_classes, to_class); fflush(stdout); 77 | const unsigned int word_class_count = word_class_counts[prev_word * cmd_args.num_classes + to_class]; 78 | if (word_class_count > 1) { // Can't do log(0); no need for 1 79 | if (cmd_args.unidirectional) { 80 | delta -= entropy_term(entropy_terms, word_class_count); 81 | } else { 82 | delta -= entropy_term(entropy_terms, word_class_count) * cmd_args.forward_lambda; 83 | } 84 | } 85 | const unsigned int new_word_class_count = word_class_count + word_bigrams[word].bigram_counts[i]; // Differs from paper: replace "-" with "+" 86 | if (new_word_class_count > 1) { // Can't do log(0) 87 | if (cmd_args.unidirectional) { 88 | delta += entropy_term(entropy_terms, new_word_class_count); 89 | } else { 90 | delta += entropy_term(entropy_terms, new_word_class_count) * cmd_args.forward_lambda; 91 | } 92 | } 93 | //printf(" mv45: word=%u; prev_word=%u, to_class=%u, i=%u, word_count=%u, count_class=%u, new_count_class=%u, =<%u,%hu>, #()=%u, new_#()=%u, delta=%g\n", word, prev_word, to_class, i, word_count, count_class, new_count_class, prev_word, to_class, word_class_count, new_word_class_count, delta); fflush(stdout); 94 | if (! is_tentative_move) 95 | word_class_counts[prev_word * cmd_args.num_classes + to_class] = new_word_class_count; 96 | 97 | } 98 | 99 | if (cmd_args.rev_alternate) { // also update reversed word-class counts; reversed order of conditionals since the first clause here is more common in this function 100 | for (unsigned int i = 0; i < word_bigrams_rev[word].length; i++) { 101 | const word_id_t next_word = word_bigrams_rev[word].predecessors[i]; 102 | const unsigned int word_class_rev_count = word_class_rev_counts[next_word * cmd_args.num_classes + to_class]; 103 | if (word_class_rev_count > 1) // Can't do log(0); no need for 1 104 | if (!cmd_args.unidirectional) 105 | delta -= entropy_term(entropy_terms, word_class_rev_count) * backward_lambda; 106 | 107 | const unsigned int new_word_class_rev_count = word_class_rev_count + word_bigrams_rev[word].bigram_counts[i]; 108 | if (new_word_class_rev_count > 1) // Can't do log(0); no need for 1 109 | if (!cmd_args.unidirectional) 110 | //delta += entropy_term(entropy_terms, word_class_rev_count) * backward_lambda; 111 | delta += entropy_term(entropy_terms, new_word_class_rev_count) * backward_lambda; 112 | //printf("word=%u, word_class_rev_count=%u, new_word_class_rev_count=%u, delta=%g\n", word, word_class_rev_count, new_word_class_rev_count, delta); 113 | if (!is_tentative_move) 114 | word_class_rev_counts[next_word * cmd_args.num_classes + to_class] = new_word_class_rev_count; 115 | } 116 | } 117 | 118 | return delta; 119 | } 120 | 121 | void cluster(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const word_count_t word_counts[const], char * word_list[restrict], wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts) { 122 | unsigned long steps = 0; 123 | 124 | if (cmd_args.class_algo == EXCHANGE || 
cmd_args.class_algo == EXCHANGE_BROWN) { // Exchange algorithm: See Sven Martin, Jörg Liermann, Hermann Ney. 1998. Algorithms For Bigram And Trigram Word Clustering. Speech Communication 24. 19-37. http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.53.2354 
125 | // Get initial logprob 
126 | count_arrays_t count_arrays = malloc(cmd_args.max_array * sizeof(void *)); 
127 | init_count_arrays(cmd_args, count_arrays); 
128 | tally_class_ngram_counts(cmd_args, model_metadata, word_bigrams, word2class, count_arrays); 
129 | unsigned int num_classes_current = (cmd_args.num_classes > 15) && (cmd_args.refine) ? powi(2,cmd_args.refine) : cmd_args.num_classes; // Don't bother with class refinement if the number of classes is really small. powi() is declared in clustercat-math.h 
130 | 
131 | // Build precomputed entropy terms 
132 | float * restrict entropy_terms = malloc(ENTROPY_TERMS_MAX * sizeof(float)); 
133 | build_entropy_terms(cmd_args, entropy_terms, ENTROPY_TERMS_MAX); 
134 | 
135 | if (cmd_args.verbose > 3) { 
136 | printf("cluster(): 42: "); long unsigned int class_sum=0; for (wclass_t i = 0; i < cmd_args.num_classes; i++) { 
137 | printf("c_%u=%lu, ", i, (unsigned long)count_arrays[0][i]); 
138 | class_sum += count_arrays[0][i]; 
139 | } printf("\nClass Sum=%lu; Corpus Tokens=%lu\n", class_sum, model_metadata.token_count); fflush(stdout); 
140 | } 
141 | double best_log_prob = training_data_log_likelihood(cmd_args, model_metadata, count_arrays, word_counts, word2class); 
142 | 
143 | if (cmd_args.verbose >= -1) { 
144 | fprintf(stderr, "%s: Expected Steps: %'lu (%'u word types x %'u classes x %'u cycles); initial logprob=%g, PP=%g\n", argv_0_basename, (unsigned long)model_metadata.type_count * cmd_args.num_classes * cmd_args.tune_cycles, model_metadata.type_count, cmd_args.num_classes, cmd_args.tune_cycles, best_log_prob, perplexity(best_log_prob, (model_metadata.token_count + model_metadata.line_count))); fflush(stderr); 
145 | } 
146 | 
147 | time_t time_start_cycles; 
148 | time(&time_start_cycles); 
149 | unsigned short cycle = 1; // Keep this around afterwards to print out number of actually-completed cycles 
150 | word_id_t moved_count = 0; 
151 | count_arrays_t temp_count_arrays = malloc(cmd_args.max_array * sizeof(void *)); 
152 | init_count_arrays(cmd_args, temp_count_arrays); 
153 | for (; cycle <= cmd_args.tune_cycles; cycle++) { 
154 | if (cmd_args.refine && (cycle == 4)) // Current setting forces bump to full cluster size after 3 iterations, but you can change this line and the next for a different schedule 
155 | num_classes_current = cmd_args.num_classes; 
156 | if ((num_classes_current != cmd_args.num_classes) && (num_classes_current > (cmd_args.num_classes / 4.0))) { // If the coarse cluster size is close to the final size, just go do the final size 
157 | num_classes_current = cmd_args.num_classes; 
158 | time(&time_start_cycles); // restart timer, when full clustering starts 
159 | } 
160 | 
161 | const bool is_nonreversed_cycle = (cmd_args.rev_alternate == 0) || (cycle % (cmd_args.rev_alternate+1)); // Only do a reverse predictive exchange (using the reversed word-class counts) after every cmd_args.rev_alternate cycles; if rev_alternate==0 then always do this part. 
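/* Added note: with rev_alternate == 2, for example, cycle % 3 == 0 marks the
   reversed cycles, so the schedule is: cycles 1-2 normal (forward) exchange,
   cycle 3 reversed exchange, cycles 4-5 normal, cycle 6 reversed, and so on. */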
162 | 
163 | clear_count_arrays(cmd_args, temp_count_arrays); 
164 | double queried_log_prob = 0.0; 
165 | if (model_metadata.token_count < 5e8 || cycle == cmd_args.tune_cycles || cycle == 2 || cycle == 3) { // For large training sets, only calculate PP on the interesting iterations 
166 | tally_class_ngram_counts(cmd_args, model_metadata, word_bigrams, word2class, temp_count_arrays); 
167 | queried_log_prob = training_data_log_likelihood(cmd_args, model_metadata, temp_count_arrays, word_counts, word2class); 
168 | } 
169 | 
170 | // ETA stuff 
171 | const time_t time_this_cycle = time(NULL); 
172 | const double time_elapsed = difftime(time_this_cycle, time_start_cycles) + 7.0; // a little is added since time prediction in early cycles tends to be too optimistic 
173 | const double time_avg_per_cycle = (time_elapsed / ((double)cycle-1)); 
174 | const unsigned int remaining_cycles = cmd_args.tune_cycles - cycle + 1; 
175 | const double time_remaining = ( time_avg_per_cycle * remaining_cycles); 
176 | const time_t eta = time_this_cycle + time_remaining; 
177 | 
178 | if (cmd_args.verbose >= -1) { 
179 | if (is_nonreversed_cycle) 
180 | fprintf(stderr, "ccat: Normal cycle %-2u", cycle); 
181 | else 
182 | fprintf(stderr, "ccat: Rev cycle %-2u", cycle); 
183 | fprintf(stderr, " C=%-3u", num_classes_current); 
184 | if (cycle > 1) { 
185 | fprintf(stderr, " Words moved last cycle: %.2g%% (%u/%u).", (100 * (moved_count / (float)model_metadata.type_count)), moved_count, model_metadata.type_count); 
186 | if (cycle > 4) { 
187 | char eta_string[300]; 
188 | strftime(eta_string, 300, "%x %X", localtime(&eta)); 
189 | fprintf(stderr, " Time left: %lim %lis. ETA: %s", (long)time_remaining/60, ((long)time_remaining % 60), eta_string); 
190 | } 
191 | if (queried_log_prob) { 
192 | if (cmd_args.ngram_input) { 
193 | fprintf(stderr, " LL=%g", queried_log_prob); // can't get reliable PP if input is ngram counts 
194 | } else { 
195 | fprintf(stderr, " LL=%.3g PP=%g", queried_log_prob, perplexity(queried_log_prob,(model_metadata.token_count + model_metadata.line_count))); 
196 | } 
197 | } 
198 | fprintf(stderr, "\n"); 
199 | } 
200 | else if ( cmd_args.refine) 
201 | fprintf(stderr, " Starting with %u coarse classes, for the first few cycles\n", num_classes_current); 
202 | else 
203 | fprintf(stderr, "\n"); 
204 | fflush(stderr); 
205 | } 
206 | moved_count = 0; 
207 | 
208 | //#pragma omp parallel for num_threads(cmd_args.num_threads) reduction(+:steps) // non-determinism 
209 | for (word_id_t word_i = 0; word_i < model_metadata.type_count; word_i++) { 
210 | //for (word_id_t word_i = model_metadata.type_count-1; word_i != -1; word_i--) { 
211 | if (cycle < 3 && word_i < num_classes_current) // don't move high-frequency words in the first (few) iteration(s) 
212 | continue; 
213 | const word_count_t word_i_count = word_bigrams[word_i].headword_count; 
214 | const wclass_t old_class = word2class[word_i]; 
215 | double scores[cmd_args.num_classes]; // This doesn't need to be private in the OMP parallelization since each thread is writing to a different element in the array 
216 | memset(scores, 0, sizeof(double) * cmd_args.num_classes); 
217 | //const double delta_remove_word = pex_remove_word(cmd_args, word_i, word_i_count, old_class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays, true); 
218 | //const double delta_remove_word = 0.0; // Not really necessary 
219 | //const double delta_remove_word_rev = 0.0; // Not really necessary 
220 | 
221 | //printf("cluster(): 43: "); long unsigned int class_sum=0; for (wclass_t i = 0; 
i < cmd_args.num_classes; i++) { 222 | // printf("c_%u=%u, ", i, count_arrays[0][i]); 223 | // class_sum += count_arrays[0][i]; 224 | //} printf("\nClass Sum=%lu; Corpus Tokens=%lu\n", class_sum, model_metadata.token_count); fflush(stdout); 225 | 226 | #pragma omp parallel for num_threads(cmd_args.num_threads) reduction(+:steps) 227 | for (wclass_t class = 0; class < num_classes_current; class++) { // class values range from 0 to num_classes_current-1 228 | if (is_nonreversed_cycle) { 229 | scores[class] = pex_move_word(cmd_args, word_i, word_i_count, class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, true); 230 | } else { // This is the reversed one 231 | scores[class] = pex_move_word(cmd_args, word_i, word_i_count, class, word_bigrams_rev, word_bigrams, word_class_rev_counts, word_class_counts, count_arrays[0], entropy_terms, true); 232 | } 233 | steps++; 234 | } 235 | //scores[old_class] -= 0.80 / cycle; // TA 236 | 237 | const wclass_t best_hypothesis_class = which_max(scores, num_classes_current); 238 | const double best_hypothesis_score = max(scores, num_classes_current); 239 | 240 | if (cmd_args.verbose > 1) { 241 | printf("Orig score for word w_«%u» using class «%hu» is %g; Hypos %u-%u: ", word_i, old_class, scores[old_class], 1, num_classes_current); 242 | fprint_array(stdout, scores, num_classes_current, ","); fflush(stdout); 243 | //if (best_hypothesis_score > 0) { // Shouldn't happen 244 | // fprintf(stderr, "Error: best_hypothesis_score=%g for class %hu > 0\n", best_hypothesis_score, best_hypothesis_class); fflush(stderr); 245 | // exit(9); 246 | //} 247 | } 248 | 249 | if (old_class != best_hypothesis_class) { // We've improved 250 | moved_count++; 251 | 252 | if (cmd_args.verbose > 0) { 253 | fprintf(stderr, " Moving id=%-7u count=%-7lu %-18s %u -> %u\t(%g -> %g)\n", word_i, (unsigned long)word_bigrams[word_i].headword_count, word_list[word_i], old_class, best_hypothesis_class, scores[old_class], best_hypothesis_score); fflush(stderr); 254 | } 255 | //word2class[word_i] = best_hypothesis_class; 256 | word2class[word_i] = best_hypothesis_class; 257 | if (isnan(best_hypothesis_score)) { // shouldn't happen 258 | fprintf(stderr, "Error: best_hypothesis_score=%g :-(\n", best_hypothesis_score); fflush(stderr); 259 | exit(5); 260 | } else { 261 | best_log_prob += best_hypothesis_score; 262 | } 263 | 264 | if (is_nonreversed_cycle) { 265 | pex_remove_word(cmd_args, word_i, word_i_count, old_class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, false); 266 | pex_move_word(cmd_args, word_i, word_i_count, best_hypothesis_class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, false); 267 | } else { // This is the reversed one 268 | pex_remove_word(cmd_args, word_i, word_i_count, old_class, word_bigrams_rev, word_bigrams, word_class_rev_counts, word_class_counts, count_arrays[0], entropy_terms, false); 269 | pex_move_word(cmd_args, word_i, word_i_count, best_hypothesis_class, word_bigrams_rev, word_bigrams, word_class_rev_counts, word_class_counts, count_arrays[0], entropy_terms, false); 270 | } 271 | } 272 | } 273 | 274 | //if (!moved_count) // Nothing moved in last cycle, so that's it 275 | // break; 276 | } 277 | 278 | if (cmd_args.verbose >= -1) { 279 | fprintf(stderr, "%s: Completed steps: %'lu\n", argv_0_basename, steps); fflush(stderr); 280 | } 281 | //fprintf(stderr, "%s: Completed steps: %'lu (%'u word 
types x %'u classes x %'u cycles); best logprob=%g, PP=%g\n", argv_0_basename, steps, model_metadata.type_count, num_classes_current, cycle-1, best_log_prob, perplexity(best_log_prob,(model_metadata.token_count - model_metadata.line_count))); fflush(stderr); 
282 | 
283 | if (cmd_args.class_algo == EXCHANGE_BROWN) 
284 | post_exchange_brown_cluster(cmd_args, model_metadata, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays); 
285 | 
286 | free_count_arrays(cmd_args, temp_count_arrays); 
287 | free(temp_count_arrays); 
288 | free_count_arrays(cmd_args, count_arrays); 
289 | free(count_arrays); 
290 | free(entropy_terms); 
291 | 
292 | } else if (cmd_args.class_algo == BROWN) { // Agglomerative clustering. Stops when the number of current clusters is equal to the desired number in cmd_args.num_classes 
293 | // "Things equal to nothing else are equal to each other." --Anon 
294 | for (unsigned long current_num_classes = model_metadata.type_count; current_num_classes > cmd_args.num_classes; current_num_classes--) { 
295 | for (word_id_t word_i = 0; word_i < model_metadata.type_count; word_i++) { 
296 | float log_probs[cmd_args.num_classes]; 
297 | //#pragma omp parallel for num_threads(cmd_args.num_threads) 
298 | for (wclass_t class = 0; class < cmd_args.num_classes; class++, steps++) { 
299 | // Get log prob 
300 | log_probs[class] = -1 * (class+1); // Dummy predicate 
301 | } 
302 | wclass_t best_class = which_maxf(log_probs, cmd_args.num_classes); 
303 | printf("Moving w_%u to class %u\n", word_i, best_class); 
304 | } 
305 | } 
306 | } 
307 | } 
308 | 
309 | void print_words_and_vectors(FILE * out_file, const struct cmd_args cmd_args, const struct_model_metadata model_metadata, char * word_list[restrict], wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts) { 
310 | count_arrays_t count_arrays = malloc(cmd_args.max_array * sizeof(void *)); 
311 | init_count_arrays(cmd_args, count_arrays); 
312 | tally_class_ngram_counts(cmd_args, model_metadata, word_bigrams, word2class, count_arrays); 
313 | 
314 | // Build precomputed entropy terms 
315 | float * restrict entropy_terms = malloc(ENTROPY_TERMS_MAX * sizeof(float)); 
316 | build_entropy_terms(cmd_args, entropy_terms, ENTROPY_TERMS_MAX); 
317 | 
318 | if ( ! cmd_args.print_freqs) // greedo compatible 
319 | fprintf(out_file, "%lu %u\n", (long unsigned)model_metadata.type_count, cmd_args.num_classes); // Like output in word2vec 
320 | 
321 | for (word_id_t word_i = 0; word_i < model_metadata.type_count; word_i++) { 
322 | const word_count_t word_i_count = word_bigrams[word_i].headword_count; 
323 | float scores[cmd_args.num_classes]; // This doesn't need to be private in the OMP parallelization since each thread is writing to a different element in the array. 
We use a float here to be compatible with word2vec 324 | float score_min = FLT_MAX; // use this later for rescaling 325 | 326 | #pragma omp parallel for num_threads(cmd_args.num_threads) 327 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) { // class values range from 0 to cmd_args.num_classes-1 328 | scores[class] = sqrt( -(float)pex_move_word(cmd_args, word_i, word_i_count, class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, true)); 329 | if (scores[class] < score_min) 330 | score_min = scores[class]; 331 | } 332 | 333 | // Rescale vectors 334 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) { 335 | scores[class] -= score_min; 336 | } 337 | 338 | if (cmd_args.print_freqs) // greedo compatible 339 | fprintf(out_file, "%lu %s ", (long unsigned) word_i_count, word_list[word_i]); 340 | else // word2vec compatible 341 | fprintf(out_file, "%s ", word_list[word_i]); 342 | 343 | if (cmd_args.print_word_vectors == TEXT_VEC) 344 | fprint_arrayf(out_file, scores, cmd_args.num_classes, " "); 345 | else 346 | fwrite(scores, sizeof(float), cmd_args.num_classes, out_file); 347 | } 348 | 349 | free_count_arrays(cmd_args, count_arrays); 350 | free(count_arrays); 351 | free(entropy_terms); 352 | } 353 | 354 | void post_exchange_brown_cluster(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_arrays_t count_arrays) { 355 | 356 | // Build precomputed entropy terms 357 | float * restrict entropy_terms = malloc(ENTROPY_TERMS_MAX * sizeof(float)); 358 | build_entropy_terms(cmd_args, entropy_terms, ENTROPY_TERMS_MAX); 359 | 360 | // Convert word2class to an array of classes pointing to arrays of words, which will successively get merged together 361 | struct_class_listing class2words[cmd_args.num_classes]; 362 | memset(class2words, 0, sizeof(struct_class_listing) * cmd_args.num_classes); 363 | get_class_listing(cmd_args, model_metadata, word2class, class2words); // invert word2class array so that we know what words are associated with a given class 364 | 365 | // Loop through classes, finding best pair of classes to merge. Use pex_move_word() to find best pairs. Record merges separately to reduce overhead. 
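/* Added note: each pass below scores, for every ordered pair of classes
   (class_1 < class_2), the total log-likelihood delta of tentatively moving
   all of class_2's words into class_1 (summing pex_move_word() deltas), and
   keeps the best merge partner per class_1 in scores_1_which/scores_1_val. */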
366 | for (wclass_t total_merges = 0; total_merges < cmd_args.num_classes-1; total_merges++) { 367 | // The scores arrays don't need to be private in the OMP parallelization, since each thread is writing to different elements in the array 368 | wclass_t scores_1_which[cmd_args.num_classes]; 369 | double scores_1_val[cmd_args.num_classes]; 370 | memset(scores_1_which, 0, sizeof(wclass_t) * cmd_args.num_classes); 371 | memset(scores_1_val, 0, sizeof(double) * cmd_args.num_classes); 372 | 373 | #pragma omp parallel for num_threads(cmd_args.num_threads) 374 | for (wclass_t class_1 = 0; class_1 < cmd_args.num_classes-1; class_1++) { 375 | const size_t scores_2_length = cmd_args.num_classes - class_1; 376 | double scores_2[scores_2_length]; 377 | memset(scores_2, 0, sizeof(double) * scores_2_length); 378 | 379 | for (wclass_t class_2 = class_1+1; class_2 < cmd_args.num_classes; class_2++) { 380 | for (size_t word_offset = 0; word_offset < class2words[class_2].length; word_offset++) { // Sum of all words 381 | const word_id_t word = class2words[class_2].words[word_offset]; 382 | scores_2[class_2] += pex_move_word(cmd_args, word, word_bigrams[word].headword_count, class_1, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, true); 383 | } 384 | scores_1_which[class_1] = which_max(scores_2, scores_2_length); 385 | scores_1_val[class_1] = max(scores_2, scores_2_length); 386 | 387 | } 388 | //const double best_pairing_val = max(scores_1_val, cmd_args.num_classes); 389 | } 390 | } 391 | 392 | free_class_listing(cmd_args, class2words); 393 | free(entropy_terms); 394 | } 395 | 396 | 397 | void get_class_listing(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const wclass_t word2class[const], struct_class_listing * restrict class2words) { 398 | // Invert word2class array so that we know what words are associated with a given class 399 | 400 | // First pass through the word2class array to get counts of how many words are associated with a given class, then later allocate enough memory for these 401 | for (word_id_t word = 0; word < model_metadata.type_count; word++) { 402 | const wclass_t class = word2class[word]; 403 | class2words[class].length++; 404 | } 405 | 406 | // Allocate enough memory for all words in a given class, then zero-out length values, so that we know where next word should go 407 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) { 408 | class2words[class].words = malloc(sizeof(word_id_t) * class2words[class].length); 409 | class2words[class].length = 0; 410 | } 411 | 412 | // Now add each word to the word array, and increment local offset 413 | for (word_id_t word = 0; word < model_metadata.type_count; word++) { 414 | const wclass_t class = word2class[word]; 415 | class2words[class].words[class2words[class].length] = word; 416 | class2words[class].length++; // The final value of this should be the same as before we zeroed this value out 417 | } 418 | } 419 | 420 | void free_class_listing(const struct cmd_args cmd_args, struct_class_listing * restrict class2words) { 421 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) 422 | free(class2words[class].words); 423 | } 424 | 425 | void build_entropy_terms(const struct cmd_args cmd_args, float * restrict entropy_terms, const unsigned int entropy_terms_max) { 426 | entropy_terms[0] = 0.0; 427 | #pragma omp parallel for num_threads(cmd_args.num_threads) 428 | for (unsigned long i = 1; i < entropy_terms_max; i++) 429 | 
entropy_terms[i] = i * log2f(i); 430 | } 431 | -------------------------------------------------------------------------------- /src/clustercat-cluster.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_CC_CLUSTER_HEADER 2 | #define INCLUDE_CC_CLUSTER_HEADER 3 | 4 | #include "clustercat.h" 5 | 6 | typedef struct { // This is for an array pointing to this struct having a pointer to an array of word_id's all within the same class. We also keep track of the length of that array. 7 | word_id_t * words; 8 | unsigned int length; 9 | } struct_class_listing; 10 | 11 | void cluster(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const word_count_t word_counts[const], char * word_list[restrict], wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts); 12 | 13 | void print_words_and_vectors(FILE * out_file, const struct cmd_args cmd_args, const struct_model_metadata model_metadata, char * word_list[restrict], wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts); 14 | 15 | void post_exchange_brown_cluster(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_arrays_t count_arrays); 16 | 17 | void build_entropy_terms(const struct cmd_args cmd_args, float * restrict entropy_terms, const unsigned int entropy_terms_max); 18 | 19 | void get_class_listing(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const wclass_t word2class[const], struct_class_listing * restrict class2words); 20 | void free_class_listing(const struct cmd_args cmd_args, struct_class_listing * restrict class2words); 21 | #endif // INCLUDE_HEADER 22 | -------------------------------------------------------------------------------- /src/clustercat-data.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_CLUSTERCAT_DATA_HEADER 2 | #define INCLUDE_CLUSTERCAT_DATA_HEADER 3 | 4 | #include "clustercat-map.h" 5 | //#include "clustercat-tree.h" 6 | 7 | // Thanks Dipstick 8 | #define STR(x) #x 9 | #define SHOW_DEFINE(x) printf("%s=%s\n", #x, STR(x)) 10 | // SHOW_DEFINE(DATA_STRUCT_FLOAT_NAME); // for example 11 | 12 | // Default to storing word-word entries in hash table using uthash 13 | // You can change this by compiling with -DATA_STORE_TREE_LCRS or -DATA_STORE_TRIE 14 | #if defined ATA_STORE_KHASH // https://github.com/attractivechaos/klib 15 | #define DATA_STRUCT_FLOAT_HUMAN_NAME "khash_map" 16 | #define DATA_STRUCT_FLOAT_NAME word_word_float_khash 17 | #define DATA_STRUCT_FLOAT_ADDR 18 | #define DATA_STRUCT_FLOAT_TYPE kh_struct_khash_float_t 19 | #define DATA_STRUCT_FLOAT_TYPE_IN_STRUCT kh_struct_khash_float_t 20 | #define DATA_STRUCT_FLOAT_SIZE sizeof(kh_struct_khash_float_t) 21 | #define DECLARE_DATA_STRUCT_FLOAT KHASH_MAP_INIT_STR(DATA_STRUCT_FLOAT_TYPE, float); 22 | #define INIT_DATA_STRUCT_FLOAT khash_t(struct_khash_float) * DATA_STRUCT_FLOAT_NAME = kh_init(struct_khash_float); 23 | #define 
UPDATE_ENTRY_FLOAT(db,key,val) { \ 24 | int ret; \ 25 | khint_t k = kh_put(struct_khash_float, (&db), (key), &ret); \ 26 | if (!ret) kh_del(struct_khash_float, (&db), (k)); \ 27 | kh_value((&db), (k)) = (val); \ 28 | } 29 | #define FIND_ENTRY_FLOAT(db,key) ( kh_get(struct_khash_float, (db), (key))) 30 | //#define PRINT_ENTRIES_FLOAT(db, prefix, sep_char, min_count) ({ \ 31 | // unsigned long number_of_entries = 0; \ 32 | // for (khint_t k = kh_begin(db); k != kh_end(db); ++k) \ 33 | // if (kh_exist(db, k)) { \ 34 | // printf("foobar\n"); \ 35 | //// printf("%s%s%c%i\n", prefix, entry->key, sep_char, entry->count); 36 | // number_of_entries++; \ 37 | // } \ 38 | // return number_of_entries; \ 39 | //}) 40 | #define PRINT_ENTRIES_FLOAT(db, prefix, sep_char, min_count) (1) 41 | #endif 42 | 43 | typedef struct { 44 | struct_map_word *word_map; 45 | struct_map_word *word_word_map; 46 | struct_map_word *ngram_map; 47 | struct_map_word *class_map; 48 | char **unique_words; 49 | } struct_model_maps; 50 | 51 | 52 | #endif // INCLUDE_HEADER 53 | -------------------------------------------------------------------------------- /src/clustercat-dbg.c: -------------------------------------------------------------------------------- 1 | #include "clustercat-dbg.h" 2 | 3 | void print_word_class_counts(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const word_class_count_t * restrict word_class_counts) { 4 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) { 5 | printf("Class=%u Offsets=%u,%u,...%u:\n\t", class, class, class+cmd_args.num_classes, (model_metadata.type_count-1) * cmd_args.num_classes + class); 6 | for (word_id_t word = 0; word < model_metadata.type_count; word++) { 7 | printf("#(<%u,%hu>)=%u ", word, class, word_class_counts[word * cmd_args.num_classes + class]); 8 | } 9 | printf("\n"); 10 | } 11 | fflush(stdout); 12 | } 13 | 14 | void print_word_bigrams(const struct_model_metadata model_metadata, const struct_word_bigram_entry * restrict word_bigrams, char ** restrict word_list) { 15 | printf("word_bigrams:\n"); fflush(stdout); 16 | for (word_id_t word_i = 0; word_i < model_metadata.type_count; word_i++) { 17 | printf(" %18s=%u -> {%lu, [", word_list[word_i], word_i, word_bigrams[word_i].length); fflush(stdout); 18 | for (word_id_t word_j = 0; word_j < word_bigrams[word_i].length; word_j++) { 19 | if (word_j > 0) 20 | printf(", "); 21 | printf("%s=%u (%ux)", word_list[word_bigrams[word_i].predecessors[word_j]], word_bigrams[word_i].predecessors[word_j], word_bigrams[word_i].bigram_counts[word_j]); 22 | } 23 | printf("]}\n"); fflush(stdout); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/clustercat-dbg.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_CC_DBG_HEADER 2 | #define INCLUDE_CC_DBG_HEADER 3 | 4 | #include "clustercat.h" 5 | 6 | void print_word_class_counts(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const word_class_count_t * restrict word_class_counts); 7 | 8 | void print_word_bigrams(const struct_model_metadata model_metadata, const struct_word_bigram_entry * restrict word_bigrams, char ** restrict word_list); 9 | 10 | #endif // INCLUDE_HEADER 11 | -------------------------------------------------------------------------------- /src/clustercat-import-class-file.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 
"clustercat-import-class-file.h" 4 | #include "clustercat-map.h" 5 | 6 | // Parse TSV file input and overwrite relevant word mappings 7 | void import_class_file(struct_map_word **word_map, wclass_t word2class[restrict], const char * restrict class_file_name, const wclass_t num_classes) { 8 | char * restrict line_end; 9 | char * restrict line = calloc(MAX_WORD_LEN + 9, 1); 10 | const word_id_t unk_id = map_find_id(word_map, UNKNOWN_WORD, -1); 11 | 12 | FILE *file = fopen(class_file_name, "r"); 13 | if (!file) { 14 | fprintf(stderr, "%s: fopen of '%s' failed: %s.\n", argv_0_basename, class_file_name, strerror(errno)); 15 | exit(EXIT_FAILURE); 16 | } 17 | while (fgets(line, MAX_WORD_LEN + 8, file) != 0) { 18 | 19 | line_end = strchr(line, '\n'); 20 | *line_end = '\0'; 21 | 22 | // Parse each line 23 | unsigned int keylen = strcspn(line, PRIMARY_SEP_STRING); 24 | line[keylen] = '\0'; // Split key and count 25 | char * restrict key = line; 26 | wclass_t class = atoi(line + keylen + 1); 27 | if (num_classes <= class) { 28 | fprintf(stderr, " Error: Imported word classes from file \"%s\" must be in a range [0,%u-1]. Word \"%s\" has class %i. If --num-classes is unset, a value is automatically chosen. See --help\n", class_file_name, num_classes, key, class); fflush(stderr); 29 | exit(13); 30 | } 31 | //printf("keylen=%i, key=<<%s>>, class=<<%d>>\n", keylen, key, class); 32 | word_id_t key_int = map_find_id(word_map, key, unk_id); 33 | word2class[key_int] = class; 34 | } 35 | 36 | fclose(file); 37 | free(line); 38 | } 39 | -------------------------------------------------------------------------------- /src/clustercat-import-class-file.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_CLUSTERCAT_IMPORT_CLASS_FILE_HEADER 2 | #define INCLUDE_CLUSTERCAT_IMPORT_CLASS_FILE_HEADER 3 | 4 | #include "clustercat.h" // wclass_t 5 | 6 | void import_class_file(struct_map_word **word_map, wclass_t word2class[restrict], const char * restrict class_file_name, const wclass_t num_classes); 7 | 8 | #endif // INCLUDE_HEADER 9 | -------------------------------------------------------------------------------- /src/clustercat-io.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "clustercat.h" 4 | #include "clustercat-data.h" 5 | #include "clustercat-array.h" 6 | #include "clustercat-io.h" 7 | 8 | struct_model_metadata process_input(const struct cmd_args cmd_args, FILE *file, struct_map_word ** initial_word_map, struct_map_bigram ** initial_bigram_map, size_t *memusage) { 9 | struct_model_metadata model_metadata = {0}; 10 | map_update_count(initial_word_map, UNKNOWN_WORD, 0, 0); // initialize entry for , , and 11 | map_update_count(initial_word_map, "", 0, 1); 12 | map_update_count(initial_word_map, "", 0, 2); 13 | const word_id_t unk_id = map_find_id(initial_word_map, UNKNOWN_WORD, 0); 14 | const word_id_t start_id = map_find_id(initial_word_map, "", 1); 15 | const word_id_t end_id = map_find_id(initial_word_map, "", 2); 16 | const size_t sizeof_struct_map_word = sizeof(struct_map_word); 17 | const size_t sizeof_struct_map_bigram = sizeof(struct_map_bigram); 18 | model_metadata.type_count = 3; // start with , , and , and . 
19 | 
20 | // n-gram input 
21 | if (cmd_args.ngram_input) { 
22 | char line[STDIN_SENT_MAX_CHARS]; 
23 | register unsigned int strlen_line = 0; 
24 | register unsigned long line_num = 1; 
25 | register char * count_split_pos = NULL; 
26 | register char * word_split_pos = NULL; 
27 | register unsigned long count = 0; 
28 | 
29 | while (!feof(file)) { 
30 | if (! fgets(line, STDIN_SENT_MAX_CHARS, file)) 
31 | break; 
32 | if (*line == '\n') // ignore empty lines 
33 | continue; 
34 | strlen_line = strlen(line); 
35 | if (strlen_line == STDIN_SENT_MAX_CHARS-1) 
36 | fprintf(stderr, "\n%s: Warning: Input line too long, at buffer line %lu. The full line was:\n%s\n", argv_0_basename, line_num, line); 
37 | line[strlen_line-1] = '\0'; // rm newline 
38 | 
39 | // Split words from counts 
40 | count_split_pos = strchr(line, '\t'); 
41 | if (count_split_pos == NULL) { // check before dereferencing 
42 | fprintf(stderr, "\n%s: Warning: Malformed n-gram input line number %lu. The line was:\n%s\n", argv_0_basename, line_num, line); fflush(stderr); 
43 | } else { 
44 | *count_split_pos = '\0'; 
45 | count = strtoul(count_split_pos+1, NULL, 10); 
46 | } 
47 | 
48 | // Try to split word1 from word2 
49 | word_split_pos = strchr(line, ' '); 
50 | 
51 | if (word_split_pos) { // Line has bigrams 
52 | *word_split_pos = '\0'; 
53 | 
54 | // Lookup each word 
55 | const word_id_t w1 = map_find_id(initial_word_map, line, unk_id); 
56 | const word_id_t w2 = map_find_id(initial_word_map, word_split_pos+1, unk_id); 
57 | if (w1 == unk_id || w2 == unk_id) // Unseen word(s) in bigram 
58 | fprintf(stderr, "%s: Warning: Unseen word(s) in bigram '%s %s' on input line %lu will be assigned to '%s'. Otherwise, include in unigram counts first.\n", argv_0_basename, line, word_split_pos+1, line_num, UNKNOWN_WORD); 
59 | 
60 | // Form bigram 
61 | const struct_word_bigram bigram = {w1, w2}; 
62 | 
63 | // Update bigram count 
64 | if (map_update_bigram(initial_bigram_map, &bigram, count)) // update the w1+w2 bigram count in the bigram map 
65 | *memusage += sizeof_struct_map_bigram; 
66 | 
67 | } else { // Line has unigrams 
68 | if (model_metadata.type_count == map_update_count(initial_word_map, line, count, model_metadata.type_count)) { // <unk>'s word_id is set to 0. 
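/* Added note: map_update_count() returns the word's existing id, or the
   candidate id we passed in (type_count) for a previously unseen word, so
   equality here means a new vocabulary entry was just created. */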
69 | model_metadata.type_count++; 
70 | *memusage += sizeof_struct_map_word; 
71 | } 
72 | 
73 | } 
74 | 
75 | //if (word_split_pos) // line could be unigram count 
76 | // printf("w1=<<%s>>; w2=<<%s>>; count=<<%s>>==%lu\n", line, word_split_pos+1, count_split_pos+1, count); 
77 | //else 
78 | // printf("w1=<<%s>>; count=<<%s>>==%lu\n", line, count_split_pos+1, count); 
79 | //fflush(stdout); 
80 | 
81 | line_num++; 
82 | } 
83 | 
84 | 
85 | // Normal text input 
86 | } else { 
87 | char curr_word[MAX_WORD_LEN + 1]; curr_word[MAX_WORD_LEN] = '\0'; 
88 | register unsigned int chars_in_sent = 0; 
89 | register int ch = 0; 
90 | unsigned int curr_word_pos = 0; 
91 | unsigned int prev_word_id = start_id; 
92 | 
93 | while (!feof(file)) { 
94 | ch = getc(file); 
95 | chars_in_sent++; 
96 | //printf("«%c» ", ch); fflush(stdout); 
97 | if (ch == ' ' || ch == '\t' || ch == '\n') { // end of a word 
98 | 
99 | if (chars_in_sent > STDIN_SENT_MAX_CHARS) { // Line too long 
100 | curr_word_pos = 0; 
101 | curr_word[0] = '\0'; // truncate word 
102 | } else { 
103 | curr_word[curr_word_pos] = '\0'; // terminate word 
104 | } 
105 | 
106 | //printf("chars_in_sent=%u; max_chars=%u; curr_word=%s\n", chars_in_sent, STDIN_SENT_MAX_CHARS, curr_word); fflush(stdout); 
107 | 
108 | if (!strncmp(curr_word, "", 1)) { // ignore empty words, due to leading, trailing, and multiple spaces 
109 | //printf("skipping empty word; ch=«%c»\n", ch); fflush(stdout); 
110 | if (ch == '\n') { // trailing spaces require more stuff to do 
111 | const struct_word_bigram bigram = {prev_word_id, end_id}; 
112 | if (map_increment_bigram(initial_bigram_map, &bigram)) // increment previous+</s> bigram in bigram map 
113 | *memusage += sizeof_struct_map_bigram; 
114 | chars_in_sent = 0; 
115 | prev_word_id = start_id; 
116 | model_metadata.line_count++; 
117 | } 
118 | continue; 
119 | } 
120 | //printf("curr_word=%s, prev_id=%u\n", curr_word, prev_word_id); fflush(stdout); 
121 | model_metadata.token_count++; 
122 | curr_word_pos = 0; 
123 | // increment current word in word map 
124 | const word_id_t curr_word_id = map_increment_count(initial_word_map, curr_word, model_metadata.type_count); // <unk>'s word_id is set to 0. 
125 | 
126 | if (curr_word_id == model_metadata.type_count) { // previous call to map_increment_count() had a new word 
127 | model_metadata.type_count++; 
128 | *memusage += sizeof_struct_map_word; 
129 | } 
130 | 
131 | // increment previous+current bigram in bigram map 
132 | const struct_word_bigram bigram = {prev_word_id, curr_word_id}; 
133 | //printf("{%u,%u}\n", prev_word_id, curr_word_id); fflush(stdout); 
134 | if (map_increment_bigram(initial_bigram_map, &bigram)) // true if bigram is new 
135 | *memusage += sizeof_struct_map_bigram; 
136 | 
137 | //printf("process_input(): curr_word=<<%s>>; curr_word_id=%u, prev_word_id=%u\n", curr_word, curr_word_id, prev_word_id); fflush(stdout); 
138 | if (ch == '\n') { // end of line 
139 | const struct_word_bigram bigram = {curr_word_id, end_id}; 
140 | if (map_increment_bigram(initial_bigram_map, &bigram)) // increment current+</s> bigram in bigram map 
141 | *memusage += sizeof_struct_map_bigram; 
142 | chars_in_sent = 0; 
143 | prev_word_id = start_id; 
144 | model_metadata.line_count++; 
145 | } else { 
146 | prev_word_id = curr_word_id; 
147 | } 
148 | 
149 | } else { // normal character; within a word 
150 | if (curr_word_pos > MAX_WORD_LEN) // word is too long; do nothing until space or newline 
151 | continue; 
152 | else 
153 | curr_word[curr_word_pos++] = ch; 
154 | } 
155 | } 
156 | } 
157 | 
158 | // Set counts of <s> and </s> once, based on line_count 
159 | map_update_count(initial_word_map, "<s>", model_metadata.line_count, 1); 
160 | map_update_count(initial_word_map, "</s>", model_metadata.line_count, 2); 
161 | return model_metadata; 
162 | } 
163 | 
-------------------------------------------------------------------------------- /src/clustercat-io.h: -------------------------------------------------------------------------------- 
1 | #ifndef INCLUDE_CLUSTERCAT_IO 
2 | #define INCLUDE_CLUSTERCAT_IO 
3 | 
4 | #include "clustercat.h" 
5 | #include "clustercat-data.h" 
6 | 
7 | // Import 
8 | struct_model_metadata process_input(const struct cmd_args cmd_args, FILE *file, struct_map_word ** initial_word_map, struct_map_bigram ** initial_bigram_map, size_t *memusage); 
9 | 
10 | #endif // INCLUDE_HEADER 
11 | 
-------------------------------------------------------------------------------- /src/clustercat-map.c: -------------------------------------------------------------------------------- 
1 | #include "clustercat-map.h" 
2 | 
3 | inline bool map_increment_bigram(struct_map_bigram **map, const struct_word_bigram * bigram) { 
4 | struct_map_bigram *local_s; 
5 | HASH_FIND(hh, *map, bigram, sizeof(struct_word_bigram), local_s); // id already in the hash? 
6 | if (local_s == NULL) { 
7 | local_s = (struct_map_bigram *)malloc(sizeof(struct_map_bigram)); 
8 | //memcpy(local_s->key, bigram, sizeof(struct_word_bigram)); 
9 | local_s->key = *bigram; 
10 | local_s->count = 1; 
11 | HASH_ADD(hh, *map, key, sizeof(struct_word_bigram), local_s); 
12 | return true; 
13 | } else { 
14 | (local_s->count)++; 
15 | return false; 
16 | } 
17 | } 
18 | 
19 | inline bool map_update_bigram(struct_map_bigram **map, const struct_word_bigram * bigram, const word_bigram_count_t count) { 
20 | struct_map_bigram *local_s; 
21 | HASH_FIND(hh, *map, bigram, sizeof(struct_word_bigram), local_s); // id already in the hash? 
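/* Added note: these bigram maps are keyed on the entire struct_word_bigram
   value (a pair of word ids), so HASH_FIND/HASH_ADD are passed
   sizeof(struct_word_bigram) as the key length and compare keys bytewise. */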
22 | if (local_s == NULL) { 23 | local_s = (struct_map_bigram *)malloc(sizeof(struct_map_bigram)); 24 | //memcpy(local_s->key, bigram, sizeof(struct_word_bigram)); 25 | local_s->key = *bigram; 26 | local_s->count = count; 27 | HASH_ADD(hh, *map, key, sizeof(struct_word_bigram), local_s); 28 | return true; 29 | } else { 30 | local_s->count += count; 31 | return false; 32 | } 33 | } 34 | 35 | void map_print_bigrams(struct_map_bigram **bigram_map, char **word_list) { 36 | struct_map_bigram *entry, *tmp; 37 | struct_word_bigram bigram_key; 38 | word_id_t w_1, w_2; 39 | word_bigram_count_t count; 40 | 41 | printf("bigram_map:\n"); 42 | HASH_ITER(hh, *bigram_map, entry, tmp) { 43 | count = entry->count; 44 | bigram_key = entry->key; 45 | w_1 = bigram_key.word_1; 46 | w_2 = bigram_key.word_2; 47 | if (w_1 == (word_id_t)-1 || w_2 == (word_id_t)-1) // Don't print dummy values 48 | continue; 49 | printf(" {%s=%u, %s=%u}: #=%u\n", word_list[w_1], w_1, word_list[w_2], w_2, count); 50 | //printf(" {%u, %u}: #=%u\n", w_1, w_2, count); fflush(stdout); 51 | } 52 | printf("\n"); fflush(stdout); 53 | } 54 | 55 | void remap_and_rev_bigram_map(struct_map_bigram ** initial_bigram_map, struct_map_bigram ** new_bigram_map, struct_map_bigram ** new_bigram_map_rev, word_id_t * restrict word_id_remap, const word_id_t real_unk_id) { 56 | // Iterates through initial bigram hash map and builds a new hash map based on the mapping of old word id's to new ids. Alongside this, it also builds a reversed counterpart. 57 | struct_map_bigram *entry, *tmp; 58 | struct_word_bigram orig_bigram, new_bigram, new_bigram_rev; 59 | word_id_t w_1, w_2; 60 | word_bigram_count_t count; 61 | //printf("initial_bigram_map hash_count=%u\n", HASH_COUNT(initial_bigram_map)); 62 | //printf("word_id_remap71: [%u,%u,%u,%u,%u,%u,...]\n", word_id_remap[0], word_id_remap[1], word_id_remap[2], word_id_remap[3], word_id_remap[4], word_id_remap[5]); 63 | 64 | HASH_ITER(hh, *initial_bigram_map, entry, tmp) { 65 | count = entry->count; 66 | orig_bigram = entry->key; 67 | w_1 = word_id_remap[orig_bigram.word_1]; 68 | w_2 = word_id_remap[orig_bigram.word_2]; 69 | if (w_1 == (word_id_t) -1) // reassign temporary placeholder unk_id to final unk_id 70 | w_1 = real_unk_id; 71 | if (w_2 == (word_id_t) -1) 72 | w_2 = real_unk_id; 73 | new_bigram = (struct_word_bigram) {w_1, w_2}; 74 | new_bigram_rev = (struct_word_bigram) {w_2, w_1}; 75 | //printf("remap_and_rev_bigram_map: count=%u, orig_w_1=%u, new_w_1=%u, orig_w_2=%u, new_w_2=%u\n", count, orig_bigram.word_1, w_1, orig_bigram.word_2, w_2); fflush(stdout); 76 | 77 | //#pragma omp parallel sections // Both bigram listing and reverse bigram listing can be done in parallel 78 | { 79 | //#pragma omp section 80 | { map_update_bigram(new_bigram_map, &new_bigram, count); } 81 | //const word_bigram_count_t bigram_count = map_update_bigram(&new_bigram_map, &new_bigram, count); 82 | //printf("map_update_bigram: {%u,%u} += %u; now %u\n", new_bigram.word_1, new_bigram.word_2, count, bigram_count); 83 | //#pragma omp section 84 | { map_update_bigram(new_bigram_map_rev, &new_bigram_rev, count); } 85 | } 86 | } 87 | } 88 | 89 | inline void map_add_entry(struct_map_word **map, char * restrict entry_key, const word_count_t count) { // Based on uthash's docs 90 | struct_map_word *local_s; 91 | 92 | //HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash? 
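/* Contract sketch: with the HASH_FIND_STR existence check above commented
 * out, map_add_entry() assumes entry_key is not already in the map; adding a
 * duplicate key would leave two entries, only one of which HASH_FIND can
 * reach, so callers must guarantee the key is new. */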
93 | //if (local_s == NULL) { 94 | local_s = (struct_map_word *)malloc(sizeof(struct_map_word)); 95 | unsigned short strlen_entry_key = strlen(entry_key); 96 | local_s->key = malloc(strlen_entry_key + 1); 97 | strcpy(local_s->key, entry_key); 98 | HASH_ADD_KEYPTR(hh, *map, local_s->key, strlen_entry_key, local_s); 99 | //} 100 | local_s->count = count; 101 | } 102 | 103 | inline void map_add_class(struct_map_word_class **map, const char * restrict entry_key, const unsigned long word_count, const wclass_t entry_class) { 104 | struct_map_word_class *local_s; 105 | 106 | //HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash? 107 | //if (local_s == NULL) { 108 | local_s = (struct_map_word_class *)malloc(sizeof(struct_map_word_class)); 109 | strncpy(local_s->key, entry_key, KEYLEN-1); local_s->key[KEYLEN-1] = '\0'; // strncpy() doesn't terminate if entry_key fills the buffer 110 | HASH_ADD_STR(*map, key, local_s); 111 | //} 112 | local_s->word_count = word_count; 113 | local_s->class = entry_class; 114 | } 115 | 116 | inline void map_update_class(struct_map_word_class **map, const char * restrict entry_key, const unsigned short entry_class) { 117 | struct_map_word_class *local_s; 118 | 119 | HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash? 120 | if (local_s == NULL) { 121 | local_s = (struct_map_word_class *)malloc(sizeof(struct_map_word_class)); 122 | strncpy(local_s->key, entry_key, KEYLEN-1); local_s->key[KEYLEN-1] = '\0'; // strncpy() doesn't terminate if entry_key fills the buffer 123 | HASH_ADD_STR(*map, key, local_s); 124 | } 125 | local_s->class = entry_class; 126 | } 127 | 128 | inline void map_set_word_id(struct_map_word **map, const char * restrict entry_key, const word_id_t word_id) { 129 | struct_map_word *local_s; // local_s->word_id uninitialized here; assign value after filtering 130 | 131 | #pragma omp critical (map_set_word_id_lookup) 132 | { 133 | HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash? 134 | } 135 | if (local_s == NULL) { 136 | printf("Error: map_set_word_id(): word '%s' should already be in word_map\n", entry_key); // Shouldn't happen 137 | exit(5); 138 | } 139 | #pragma omp critical (map_set_word_id_assignment) 140 | { local_s->word_id = word_id; } 141 | } 142 | 143 | inline word_id_t map_increment_count(struct_map_word **map, const char * restrict entry_key, const word_id_t word_id) { // Based on uthash's docs 144 | struct_map_word *local_s; // local_s->word_id uninitialized here; assign value after filtering 145 | 146 | #pragma omp critical (map_increment_count_lookup) 147 | { 148 | HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash?
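/* Thread-safety note: the lookup above and the possible insertion below
 * share the one named critical section (map_increment_count_lookup), so two
 * threads cannot both miss on the same key and insert it twice; the count
 * increment afterwards is serialized by a separate named critical section. */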
149 | if (local_s == NULL) { 150 | local_s = (struct_map_word *)malloc(sizeof(struct_map_word)); 151 | local_s->count = 0; 152 | local_s->word_id = word_id; 153 | unsigned short strlen_entry_key = strlen(entry_key); 154 | local_s->key = malloc(strlen_entry_key + 1); 155 | strcpy(local_s->key, entry_key); 156 | HASH_ADD_KEYPTR(hh, *map, local_s->key, strlen_entry_key, local_s); 157 | } 158 | } 159 | #pragma omp critical (map_increment_count_increment) 160 | { ++local_s->count; } 161 | //printf("map: count of %s is now %u\n", entry_key, local_s->count); 162 | return local_s->word_id; 163 | } 164 | 165 | inline wclass_count_t map_increment_count_fixed_width(struct_map_class **map, const wclass_t entry_key[const]) { // Based on uthash's docs 166 | struct_map_class *local_s; 167 | size_t sizeof_key = sizeof(wclass_t) * CLASSLEN; 168 | //printf("map++: sizeof_key=%zu, CLASSLEN=%u, cls_entry=[%hu,%hu,%hu,%hu]\n", sizeof_key, CLASSLEN, entry_key[0], entry_key[1], entry_key[2], entry_key[3]); 169 | 170 | //#pragma omp critical // not needed since each thread gets its own class_map 171 | { 172 | //printf("***41***: sizeof_key=%zu, sizeof(wclass_t)=%zu, CLASSLEN=%u, key=<%u,%u,%u,%u>\n", sizeof_key, sizeof(wclass_t), CLASSLEN, entry_key[0], entry_key[1], entry_key[2], entry_key[3]); fflush(stdout); 173 | HASH_FIND(hh, *map, entry_key, sizeof_key, local_s); // id already in the hash? 174 | if (local_s == NULL) { 175 | local_s = (struct_map_class *)malloc(sizeof(struct_map_class)); 176 | local_s->count = 0; 177 | memcpy(local_s->key, entry_key, sizeof_key); 178 | HASH_ADD(hh, *map, key, sizeof_key, local_s); 179 | } 180 | //printf("\t***42***: count: %u\n", local_s->count); fflush(stdout); 181 | } 182 | #pragma omp critical (map_increment_count_fixed_width_increment) 183 | { ++local_s->count; } 184 | //printf("map: count of [%hu,%hu,%hu,%hu] is now %u\n", entry_key[0],entry_key[1],entry_key[2],entry_key[3], local_s->count); 185 | return local_s->count; 186 | } 187 | 188 | inline wclass_count_t map_find_count_fixed_width(struct_map_class *map[const], const wclass_t entry_key[const]) { // Based on uthash's docs 189 | struct_map_class *local_s; 190 | size_t sizeof_key = sizeof(wclass_t) * CLASSLEN; 191 | wclass_count_t local_count = 0; 192 | 193 | HASH_FIND(hh, *map, entry_key, sizeof_key, local_s); // id already in the hash? 194 | if (local_s != NULL) { // Deal with OOV 195 | local_count = local_s->count; 196 | } 197 | //printf("map: count=%u for cls_entry=[%hu,%hu,%hu,%hu]\n", local_count, entry_key[0], entry_key[1], entry_key[2], entry_key[3]); 198 | return local_count; 199 | } 200 | 201 | inline word_id_t map_update_count(struct_map_word **map, const char * restrict entry_key, const word_count_t count, const word_id_t word_id) { // Based on uthash's docs 202 | struct_map_word *local_s; 203 | 204 | #pragma omp critical 205 | { 206 | HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash? 
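/* Usage sketch (mirroring the calls in clustercat.c): the unknown-word and
 * sentence-boundary types are seeded with fixed ids before any input is read:
 *   map_update_count(&word_map, UNKNOWN_WORD, 0, 0);
 *   map_update_count(&word_map, "<s>", 0, 1);
 *   map_update_count(&word_map, "</s>", 0, 2);
 * A later call with the same key only adds to the stored count; the word_id
 * argument is ignored for keys that already exist. */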
207 | if (local_s == NULL) { 208 | local_s = (struct_map_word *)malloc(sizeof(struct_map_word)); 209 | local_s->count = count; 210 | local_s->word_id = word_id; 211 | unsigned short strlen_entry_key = strlen(entry_key); 212 | local_s->key = malloc(strlen_entry_key + 1); 213 | strcpy(local_s->key, entry_key); 214 | HASH_ADD_KEYPTR(hh, *map, local_s->key, strlen_entry_key, local_s); 215 | } else { 216 | local_s->count += count; 217 | } 218 | } 219 | return local_s->word_id; 220 | } 221 | 222 | inline word_count_t map_find_count(struct_map_word *map[const], const char * restrict entry_key) { // Based on uthash's docs 223 | struct_map_word *local_s; 224 | word_count_t local_count = 0; 225 | 226 | HASH_FIND_STR(*map, entry_key, local_s); // local_s: output pointer 227 | if (local_s != NULL) { // Deal with OOV 228 | local_count = local_s->count; 229 | } 230 | return local_count; 231 | } 232 | 233 | inline word_id_t map_find_id(struct_map_word *map[const], const char * restrict entry_key, const word_id_t unknown_id) { // Based on uthash's docs 234 | struct_map_word *local_s; 235 | word_id_t local_id = unknown_id; 236 | 237 | HASH_FIND_STR(*map, entry_key, local_s); 238 | if (local_s != NULL) { // Deal with OOV 239 | local_id = local_s->word_id; 240 | } 241 | return local_id; 242 | } 243 | 244 | struct_map_word map_find_entry(struct_map_word *map[const], const char * restrict entry_key) { // Based on uthash's docs 245 | struct_map_word *local_s; 246 | 247 | HASH_FIND_STR(*map, entry_key, local_s); 248 | return *local_s; 249 | } 250 | 251 | inline wclass_t get_class(struct_map_word_class *map[const], const char * restrict entry_key, const wclass_t unk) { 252 | struct_map_word_class *local_s; 253 | 254 | HASH_FIND_STR(*map, entry_key, local_s); // local_s: output pointer 255 | if (local_s != NULL) { // Word is found 256 | return local_s->class; 257 | } else { // Word is not found 258 | return unk; 259 | } 260 | } 261 | 262 | word_id_t get_keys(struct_map_word *map[const], char *keys[]) { 263 | struct_map_word *entry, *tmp; 264 | word_id_t number_of_keys = 0; 265 | 266 | HASH_ITER(hh, *map, entry, tmp) { 267 | // Build-up array of keys 268 | unsigned short wlen = strlen(entry->key); 269 | keys[number_of_keys] = (char *) malloc(wlen + 1); 270 | strcpy(keys[number_of_keys], entry->key); 271 | //printf("key=%s, i=%lu, count=%u\n", entry->key, (unsigned long)number_of_keys, entry->count); 272 | number_of_keys++; 273 | } 274 | return number_of_keys; 275 | } 276 | 277 | word_id_t get_ids(struct_map_word *map[const], word_id_t word_ids[restrict]) { // most useful if map is already sorted by count; then you can directly map from old id to new id. 
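/* Example (a sketch): if the map holds entries whose old word_id's are 5, 0, 3
 * in (count-sorted) iteration order, this fills word_ids[5]=0, word_ids[0]=1,
 * word_ids[3]=2; that is, word_ids[] maps each old id to its new, rank-based id. */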
278 | struct_map_word *entry, *tmp; 279 | word_id_t number_of_keys = 0; // 0-2 are reserved for <unk>, <s>, and </s> 280 | 281 | HASH_ITER(hh, *map, entry, tmp) { 282 | //word_ids[number_of_keys] = entry->word_id; // Build-up array of word_id's, from new id to old one 283 | const word_id_t word_id = entry->word_id; 284 | //if (word_id < 3) // don't change id's for <unk>, <s>, or </s> 285 | // continue; 286 | word_ids[word_id] = number_of_keys; // Build-up array of word_id's, from old id to new one 287 | //printf("get_ids: old_id=%u\n", word_id); fflush(stdout); 288 | number_of_keys++; 289 | } 290 | return number_of_keys; 291 | } 292 | 293 | void delete_entry(struct_map_word **map, struct_map_word *entry) { // Based on uthash's docs 294 | HASH_DEL(*map, entry); // entry: pointer to deletee 295 | free(entry->key); // key is a malloc'd string 296 | free(entry); 297 | } 298 | 299 | void delete_all(struct_map_word **map) { 300 | struct_map_word *current_entry, *tmp; 301 | 302 | HASH_ITER(hh, *map, current_entry, tmp) { // Based on uthash's docs 303 | HASH_DEL(*map, current_entry); // delete it (map advances to next) 304 | free(current_entry); // free it 305 | } 306 | } 307 | 308 | void delete_all_class(struct_map_class **map) { 309 | struct_map_class *current_entry, *tmp; 310 | 311 | HASH_ITER(hh, *map, current_entry, tmp) { // Based on uthash's docs 312 | HASH_DEL(*map, current_entry); // delete it (map advances to next) 313 | free(current_entry); // free it 314 | } 315 | } 316 | 317 | void delete_all_bigram(struct_map_bigram **map) { 318 | struct_map_bigram *current_entry, *tmp; 319 | 320 | HASH_ITER(hh, *map, current_entry, tmp) { // Based on uthash's docs 321 | HASH_DEL(*map, current_entry); // delete it (map advances to next) 322 | free(current_entry); // free it 323 | } 324 | } 325 | 326 | void print_words_and_classes(FILE * out_file, word_id_t type_count, char **word_list, const word_count_t word_counts[const], const wclass_t word2class[const], const int class_offset, const bool print_freqs) { 327 | struct_map_word_class *map = NULL; 328 | 329 | for (word_id_t word_id = 0; word_id < type_count; word_id++) { // Populate new word2class_map, so we can do fun stuff like primary- and secondary-sort easily 330 | //printf("adding %s=%hu to temp word2class_map\n", word_list[word_id], word2class[word_id]); fflush(stdout); 331 | map_add_class(&map, word_list[word_id], (unsigned long)word_counts[word_id], word2class[word_id]); 332 | } 333 | 334 | sort_by_key(&map); // Tertiary sort, alphabetically by key 335 | word_class_sort_by_count(&map); // Secondary sort, by count 336 | sort_by_class(&map); // Primary sort, numerically by class 337 | 338 | struct_map_word_class *s, *tmp; 339 | HASH_ITER(hh, map, s, tmp) { // Based on uthash's docs; tmp keeps the iteration safe while entries are deleted 340 | fprintf(out_file, "%s\t%li", s->key, (long)(s->class) + class_offset); 341 | if (print_freqs) 342 | fprintf(out_file, "\t%lu", (long unsigned)(s->word_count)); 343 | fprintf(out_file, "\n"); 344 | HASH_DEL(map, s); // delete it (map advances to next) 345 | free(s); // free the whole struct; its key is a fixed-width array inside it, not separately malloc'd 346 | //fprintf(stderr, "49.11: next=%zu\n", (struct_map_word_class *)(s->hh.next)); fflush(stderr); 347 | } 348 | } 349 | 350 | int count_sort(struct_map_word *a, struct_map_word *b) { // Based on uthash's docs 351 | return (b->count - a->count); // sort descending: most frequent to least frequent 352 | } 353 | 354 | void sort_by_count(struct_map_word **map) { // Based on uthash's docs 355 | HASH_SORT(*map, count_sort); 356 | } 357 | 358 | int id_sort(struct_map_word *a, struct_map_word *b) { 359 |
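/* Note on these subtraction-based comparators (applies to count_sort() above
 * as well): the unsigned operands wrap around and the result is then
 * converted to int, which orders correctly only while the two values differ
 * by less than INT_MAX. That holds for the id and count ranges used here. */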
return (a->word_id - b->word_id); // sort ascending 360 | } 361 | 362 | void sort_by_id(struct_map_word **map) { 363 | HASH_SORT(*map, id_sort); 364 | } 365 | 366 | int word_class_count_sort(struct_map_word_class *a, struct_map_word_class *b) { 367 | return (b->word_count - a->word_count); // sort descending: most frequent to least frequent 368 | } 369 | 370 | void word_class_sort_by_count(struct_map_word_class **map) { 371 | HASH_SORT(*map, word_class_count_sort); 372 | } 373 | 374 | int key_sort(struct_map_word_class *a, struct_map_word_class *b) { 375 | return strcmp(a->key, b->key); 376 | } 377 | 378 | void sort_by_key(struct_map_word_class **map) { 379 | HASH_SORT(*map, key_sort); 380 | } 381 | 382 | int class_sort(struct_map_word_class *a, struct_map_word_class *b) { // Based on uthash's docs 383 | return (a->class - b->class); 384 | } 385 | 386 | void sort_by_class(struct_map_word_class **map) { 387 | HASH_SORT(*map, class_sort); 388 | } 389 | 390 | inline int bigram_sort_word_1(struct_map_bigram *a, struct_map_bigram *b) { // Based on uthash's docs 391 | return ((a->key).word_1 - (b->key).word_1); 392 | } 393 | 394 | inline int bigram_sort_word_2(struct_map_bigram *a, struct_map_bigram *b) { // Based on uthash's docs 395 | return ((a->key).word_2 - (b->key).word_2); 396 | } 397 | 398 | void sort_bigrams(struct_map_bigram **map) { 399 | HASH_SORT(*map, bigram_sort_word_2); 400 | //HASH_SORT(*map, bigram_sort_word_1); 401 | } 402 | 403 | unsigned long map_count(struct_map_word *map[const]) { 404 | return HASH_COUNT(*map); 405 | } 406 | 407 | unsigned long map_print_entries(struct_map_word **map, const char * restrict prefix, const char sep_char, const word_count_t min_count) { 408 | struct_map_word *entry, *tmp; 409 | unsigned long number_of_entries = 0; 410 | 411 | HASH_ITER(hh, *map, entry, tmp) { 412 | if (entry->count >= min_count) { 413 | printf("%s%s%c%lu\n", prefix, entry->key, sep_char, (unsigned long)entry->count); 414 | number_of_entries++; 415 | } 416 | } 417 | return number_of_entries; 418 | } 419 | -------------------------------------------------------------------------------- /src/clustercat-map.h: 1 | #ifndef INCLUDE_CLUSTERCAT_MAP_HEADER 2 | #define INCLUDE_CLUSTERCAT_MAP_HEADER 3 | 4 | #include <stdio.h> // FILE 5 | #include <stdbool.h> // bool 6 | #include "uthash.h" 7 | 8 | #ifdef ATA_STORE_KHASH 9 | #include "khash.h" 10 | KHASH_MAP_INIT_STR(struct_khash_float, float); 11 | #endif 12 | 13 | // Defaults 14 | #define KEYLEN 80 15 | #define CLASSLEN 3 // Longest possible class ngram to store 16 | typedef unsigned short wclass_t; // Max number of word classes 17 | typedef unsigned int wclass_count_t; // Max count of a given word class 18 | typedef unsigned int word_id_t; // Max number of words 19 | typedef unsigned int word_count_t; // Max count of a given word 20 | typedef unsigned int word_bigram_count_t; // Max count of a given word bigram 21 | typedef unsigned int class_bigram_count_t; // Max count of a given class bigram 22 | typedef unsigned int word_class_count_t; // Max count of a given <word,class> tuple 23 | 24 | typedef struct { 25 | word_id_t word_1; 26 | word_id_t word_2; 27 | } struct_word_bigram; 28 | 29 | 30 | typedef struct { // We need an O(1) map that we can iterate over later 31 | struct_word_bigram key; 32 | word_bigram_count_t count; 33 | UT_hash_handle hh; // makes this structure hashable 34 | } struct_map_bigram; 35 | 36 | typedef struct { 37 | char * restrict key; 38 | word_count_t count; 39 | word_id_t
word_id; 40 | UT_hash_handle hh; // makes this structure hashable 41 | } struct_map_word; 42 | 43 | typedef struct { // Maps a class to its count 44 | wclass_t key[CLASSLEN]; 45 | wclass_count_t count; 46 | UT_hash_handle hh; // makes this structure hashable 47 | } struct_map_class; 48 | 49 | typedef struct { // Maps a word to its class 50 | char key[KEYLEN]; 51 | unsigned long word_count; 52 | wclass_t class; 53 | UT_hash_handle hh; // makes this structure hashable 54 | } struct_map_word_class; 55 | 56 | void map_add_entry(struct_map_word **map, char * restrict entry_key, const word_count_t count); 57 | 58 | void map_add_class(struct_map_word_class **map, const char * restrict entry_key, const unsigned long word_count, const wclass_t entry_class); 59 | 60 | void map_update_class(struct_map_word_class **map, const char * restrict entry_key, const wclass_t entry_class); 61 | 62 | void map_set_word_id(struct_map_word **map, const char * restrict entry_key, const word_id_t word_id); 63 | 64 | word_id_t map_increment_count(struct_map_word **map, const char * restrict entry_key, const word_id_t word_id); 65 | 66 | wclass_count_t map_increment_count_fixed_width(struct_map_class **map, const wclass_t entry_key[const]); 67 | 68 | bool map_increment_bigram(struct_map_bigram **map, const struct_word_bigram * bigram); 69 | bool map_update_bigram(struct_map_bigram **map, const struct_word_bigram * bigram, const word_bigram_count_t count); 70 | void map_print_bigrams(struct_map_bigram **map, char **word_list); 71 | void remap_and_rev_bigram_map(struct_map_bigram ** initial_bigram_map, struct_map_bigram ** new_bigram_map, struct_map_bigram ** new_bigram_map_rev, word_id_t * restrict word_id_remap, const word_id_t real_unk_id); 72 | 73 | word_id_t map_update_count(struct_map_word **map, const char * restrict entry_key, const word_count_t count, const word_id_t word_id); 74 | 75 | struct_map_word map_find_entry(struct_map_word *map[const], const char * restrict entry_key); 76 | word_count_t map_find_count(struct_map_word *map[const], const char * restrict entry_key); 77 | wclass_count_t map_find_count_fixed_width(struct_map_class *map[const], const wclass_t entry_key[const]); 78 | 79 | word_id_t map_find_id(struct_map_word *map[const], const char * restrict entry_key, const word_id_t unknown_id); 80 | 81 | wclass_t get_class(struct_map_word_class *map[const], const char * restrict entry_key, const wclass_t unk); 82 | 83 | word_id_t get_keys(struct_map_word *map[const], char *keys[]); 84 | word_id_t get_ids(struct_map_word *map[const], word_id_t word_ids[restrict]); 85 | 86 | void sort_by_class(struct_map_word_class **map); 87 | void sort_by_key(struct_map_word_class **map); 88 | void sort_by_id(struct_map_word **map); 89 | void sort_by_count(struct_map_word **map); 90 | void word_class_sort_by_count(struct_map_word_class **map); 91 | void sort_bigrams(struct_map_bigram **map); 92 | 93 | unsigned long map_count(struct_map_word *map[const]); 94 | 95 | unsigned long map_print_entries(struct_map_word **map, const char * restrict prefix, const char sep_char, const word_count_t min_count); 96 | void print_words_and_classes(FILE * out_file, word_id_t type_count, char **word_list, const word_count_t word_counts[const], const wclass_t word2class[const], const int class_offset, const bool print_freqs); 97 | 98 | void delete_all(struct_map_word **map); 99 | void delete_all_class(struct_map_class **map); 100 | void delete_all_bigram(struct_map_bigram **map); 101 | void delete_entry(struct_map_word **map, 
struct_map_word *entry); 102 | 103 | #endif // INCLUDE_HEADER 104 | -------------------------------------------------------------------------------- /src/clustercat-math.c: 1 | #include "clustercat.h" // Model importing/exporting functions 2 | #include "clustercat-math.h" 3 | 4 | double dot_product(const double probs[const], const double weights[const], int length) { 5 | double sum = 0; 6 | double sum_weights = 0; 7 | length--; 8 | 9 | for (; length >= 0; --length) { 10 | sum_weights += weights[length]; 11 | sum += probs[length] * weights[length]; 12 | //printf("dot_product: sum=%g += probs[%i]=%g * weights[%i]=%g; length=%i;\n", sum, length, probs[length], length, weights[length], length); 13 | } 14 | //printf("dot_product: final sum = %g = prob_sum=%g/weight_sum=%g\n", sum/sum_weights, sum, sum_weights); 15 | return sum_weights ? (sum / sum_weights) : 0.0; 16 | } 17 | 18 | float dot_productf(const float probs[const], const float weights[const], int length) { 19 | float sum = 0; 20 | float sum_weights = 0; 21 | length--; 22 | 23 | for (; length >= 0; --length) { 24 | sum_weights += weights[length]; 25 | sum += probs[length] * weights[length]; 26 | //printf("dot_product: sum=%g += probs[%i]=%g * weights[%i]=%g; length=%i;\n", sum, length, probs[length], length, weights[length], length); 27 | } 28 | //printf("dot_product: final sum = %g = prob_sum=%g/weight_sum=%g\n", sum/sum_weights, sum, sum_weights); 29 | return sum_weights ? (sum / sum_weights) : 0.0; 30 | } 31 | 32 | long int powi(long int base, long int exp) { // Integer exponentiation 33 | long int result = 1; 34 | while (exp--) 35 | result *= base; 36 | return result; 37 | } 38 | 39 | double perplexity(const double log_probs, const unsigned long num_words_queried) { 40 | // Assumes log_probs used log2() 41 | return pow(2, -log_probs / (double)num_words_queried); 42 | } 43 | 44 | -------------------------------------------------------------------------------- /src/clustercat-math.h: 1 | #ifndef INCLUDE_CLUSTERCAT_MATH 2 | #define INCLUDE_CLUSTERCAT_MATH 3 | 4 | double dot_product(const double probs[const], const double weights[const], int length); 5 | float dot_productf(const float probs[const], const float weights[const], int length); 6 | 7 | long int powi(long int base, long int exp); 8 | 9 | double perplexity(const double log_probs, const unsigned long num_words_queried); 10 | 11 | #endif // INCLUDE_HEADER 12 | -------------------------------------------------------------------------------- /src/clustercat-tokenize.c: 1 | #include <string.h> // strcspn(), strncpy() 2 | #include "clustercat-tokenize.h" 3 | 4 | // Simple threadsafe tokenization for plaintext, copying words into **sent_words 5 | // Remember to free using tokenize_simple_free() 6 | sentlen_t tokenize_simple(char * restrict sent_string, char * restrict * restrict sent_words) { 7 | sentlen_t i; 8 | char * restrict pch; 9 | 10 | sent_words[0] = "<s>"; 11 | 12 | for (i = 1, pch = sent_string; i < SENT_LEN_MAX ; i++) { 13 | sentlen_t toklen = strcspn(pch, " \n\t"); 14 | 15 | if (toklen == 0) { // End of sentence 16 | sent_words[i] = "</s>"; 17 | break; 18 | } 19 | 20 | sent_words[i] = malloc(toklen+1); 21 | strncpy(sent_words[i], pch, toklen); // Threadsafe copy doesn't touch original 22 | sent_words[i][toklen] = '\0'; 23 | 24 | pch += toklen+1; 25 | } 26 | 27 | return i; 28
| } 29 | 30 | void tokenize_simple_free(char ** restrict sent_words, sentlen_t length) { 31 | sentlen_t i = 1; 32 | for (; i < length-1; ++i) { // Assumes word_0 is <s> and word_sentlen is </s>, which weren't malloc'd 33 | free(sent_words[i]); 34 | } 35 | free(sent_words); 36 | } 37 | -------------------------------------------------------------------------------- /src/clustercat-tokenize.h: 1 | #ifndef INCLUDE_CLUSTERCAT_TOKENIZE 2 | #define INCLUDE_CLUSTERCAT_TOKENIZE 3 | 4 | #include "clustercat.h" 5 | 6 | sentlen_t tokenize_simple(char * restrict sent_string, char * restrict * restrict sent_words); 7 | void tokenize_simple_free(char ** restrict sent_words, sentlen_t length); 8 | 9 | #endif // INCLUDE_HEADER 10 | -------------------------------------------------------------------------------- /src/clustercat.c: 1 | /** Induces word categories 2 | * By Jon Dehdari, 2014-2016 3 | * Usage: ./clustercat [options] < corpus.tok.txt > classes.tsv 4 | **/ 5 | 6 | #include <limits.h> // UCHAR_MAX, UINT_MAX 7 | #include <float.h> // DBL_MAX, etc. 8 | #include <math.h> // isnan() 9 | #include <time.h> // clock_t, clock(), CLOCKS_PER_SEC 10 | #include <libgen.h> // basename() 11 | #include <locale.h> // OPTIONAL! Comment-out on non-Posix machines, and the function setlocale() in the first line of main() 12 | 13 | #include "clustercat.h" // Model importing/exporting functions 14 | #include "clustercat-array.h" // which_maxf() 15 | #include "clustercat-data.h" 16 | #include "clustercat-cluster.h" // cluster() 17 | #include "clustercat-dbg.h" // for printing out various complex data structures 18 | #include "clustercat-import-class-file.h" // import_class_file() 19 | #include "clustercat-io.h" // process_input() 20 | #include "clustercat-math.h" // perplexity(), powi() 21 | 22 | #define USAGE_LEN 10000 23 | #define LOG2ADD(a,b) (log2(a) + log2(1 + (b) / (a) )) 24 | 25 | // Declarations 26 | void get_usage_string(char * restrict usage_string, int usage_len); 27 | void parse_cmd_args(const int argc, char **argv, char * restrict usage, struct cmd_args *cmd_args); 28 | char * restrict class_algo = NULL; 29 | char * restrict in_train_file_string = NULL; 30 | char * restrict out_file_string = NULL; 31 | char * restrict initial_class_file = NULL; 32 | char * argv_0_basename = NULL; 33 | 34 | struct_map_word *word_map = NULL; // Must initialize to NULL 35 | struct_map_bigram *initial_bigram_map = NULL; // Must initialize to NULL 36 | struct_map_bigram *new_bigram_map = NULL; // Must initialize to NULL 37 | struct_map_bigram *new_bigram_map_rev = NULL; // Must initialize to NULL 38 | char usage[USAGE_LEN]; 39 | size_t memusage = 0; 40 | 41 | 42 | // Defaults 43 | struct cmd_args cmd_args = { 44 | .class_algo = EXCHANGE, 45 | .class_offset = 0, 46 | .forward_lambda = 0.55, 47 | .min_count = 3, // or max(2, floor(N^0.14 - 7)) 48 | .max_array = 2, 49 | .ngram_input = false, 50 | .num_threads = 8, 51 | .num_classes = 0, 52 | .print_freqs = false, 53 | .print_word_vectors = NO_VEC, 54 | .refine = 2, 55 | .rev_alternate = 3, 56 | .tune_cycles = 15, 57 | .unidirectional = false, 58 | .verbose = 0, 59 | }; 60 | 61 | 62 | 63 | int main(int argc, char **argv) { 64 | setlocale(LC_ALL, ""); // Comment-out on non-Posix systems 65 | clock_t time_start = clock(); 66 | time_t time_t_start; 67 | time(&time_t_start); 68 | argv_0_basename = basename(argv[0]); 69 | get_usage_string(usage, USAGE_LEN); // This is a big scary string, so build it elsewhere
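/* Overview (a sketch of the flow below): parse args; seed word_map with
 * <unk>/<s>/</s>; process_input() builds the word and bigram maps; rare words
 * are folded into <unk> and ids remapped; bigram and <word><class> count
 * structures are built; cluster() runs the clustering; the final word-to-class
 * mapping (or word vectors) is printed. */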
70 | 71 | //printf("sizeof(cmd_args)=%zd\n", sizeof(cmd_args)); 72 | parse_cmd_args(argc, argv, usage, &cmd_args); 73 | 74 | if (cmd_args.class_algo == EXCHANGE || cmd_args.class_algo == EXCHANGE_BROWN) 75 | memusage += sizeof(float) * ENTROPY_TERMS_MAX; // We'll build the precomputed entropy terms after reporting memusage 76 | 77 | struct_model_metadata global_metadata; 78 | 79 | // The list of unique words should always include <s>, unknown word, and </s> 80 | map_update_count(&word_map, UNKNOWN_WORD, 0, 0); // Should always be first 81 | map_update_count(&word_map, "<s>", 0, 1); 82 | map_update_count(&word_map, "</s>", 0, 2); 83 | 84 | // Open input 85 | FILE *in_train_file = stdin; 86 | if (in_train_file_string) 87 | in_train_file = fopen(in_train_file_string, "r"); 88 | if (in_train_file == NULL) { 89 | fprintf(stderr, "%s: Error: Unable to open input file %s\n", argv_0_basename, in_train_file_string); fflush(stderr); 90 | exit(15); 91 | } 92 | 93 | // Process input sentences 94 | size_t input_memusage = 0; 95 | const struct_model_metadata input_model_metadata = process_input(cmd_args, in_train_file, &word_map, &initial_bigram_map, &input_memusage); 96 | memusage += input_memusage; 97 | fclose(in_train_file); 98 | 99 | clock_t time_input_processed = clock(); 100 | if (cmd_args.verbose >= -1) { 101 | fprintf(stderr, "%s: Corpus processed in %'.2f CPU secs. %'lu lines, %'u types, %'lu tokens, current memusage: %'.1fMB\n", argv_0_basename, (double)(time_input_processed - time_start)/CLOCKS_PER_SEC, input_model_metadata.line_count, input_model_metadata.type_count, input_model_metadata.token_count, (double)memusage / 1048576); fflush(stderr); 102 | } 103 | 104 | global_metadata.token_count = input_model_metadata.token_count; 105 | global_metadata.type_count = map_count(&word_map); 106 | 107 | // Filter out infrequent words, reassign word_id's, and build a mapping from old word_id's to new word_id's 108 | sort_by_count(&word_map); 109 | word_id_t * restrict word_id_remap = calloc(sizeof(word_id_t), input_model_metadata.type_count); 110 | get_ids(&word_map, word_id_remap); 111 | word_id_t number_of_deleted_words = filter_infrequent_words(cmd_args, &global_metadata, &word_map, word_id_remap); 112 | 113 | // Get list of unique words 114 | char * * restrict word_list = (char **)malloc(sizeof(char*) * global_metadata.type_count); 115 | memusage += sizeof(char*) * global_metadata.type_count; 116 | reassign_word_ids(&word_map, word_list, word_id_remap); 117 | get_keys(&word_map, word_list); 118 | sort_by_id(&word_map); 119 | 120 | 121 | // Check or set number of classes 122 | if (cmd_args.num_classes >= global_metadata.type_count) { // User manually set a number of classes that is too high 123 | fprintf(stderr, "%s: Error: Number of classes (%u) is not less than vocabulary size (%u).
Decrease the value of --classes\n", argv_0_basename, cmd_args.num_classes, global_metadata.type_count); fflush(stderr); 124 | exit(3); 125 | } else if (cmd_args.num_classes == 0) { // User did not manually set number of classes at all 126 | cmd_args.num_classes = (wclass_t) (sqrt(global_metadata.type_count) * 1.2); 127 | } 128 | 129 | // Build array of word_counts 130 | word_count_t * restrict word_counts = malloc(sizeof(word_count_t) * global_metadata.type_count); 131 | memusage += sizeof(word_count_t) * global_metadata.type_count; 132 | build_word_count_array(&word_map, word_list, word_counts, global_metadata.type_count); 133 | 134 | // Initialize clusters, and possibly read-in external class file 135 | wclass_t * restrict word2class = malloc(sizeof(wclass_t) * global_metadata.type_count); 136 | memusage += sizeof(wclass_t) * global_metadata.type_count; 137 | init_clusters(cmd_args, global_metadata.type_count, word2class, word_counts, word_list); 138 | if (initial_class_file != NULL) 139 | import_class_file(&word_map, word2class, initial_class_file, cmd_args.num_classes); // Overwrite subset of word mappings, from user-provided initial_class_file 140 | 141 | // Remap word_id's in initial_bigram_map 142 | remap_and_rev_bigram_map(&initial_bigram_map, &new_bigram_map, &new_bigram_map_rev, word_id_remap, map_find_id(&word_map, UNKNOWN_WORD, -1)); 143 | global_metadata.start_sent_id = map_find_id(&word_map, "<s>", -1); // need this for tallying emission probs 144 | global_metadata.end_sent_id = map_find_id(&word_map, "</s>", -1); // need this for tallying emission probs 145 | global_metadata.line_count = map_find_count(&word_map, "<s>"); // Used for calculating perplexity 146 | 147 | if (global_metadata.line_count == 0) { 148 | fprintf(stderr, "%s: Warning: Number of lines is 0. Include <s> and </s> in your ngram counts, or perplexity values will be unreliable.\n", argv_0_basename); fflush(stderr); 149 | } 150 | 151 | //printf("init_bigram_map hash_count=%u\n", HASH_COUNT(initial_bigram_map)); fflush(stdout); 152 | //printf("new_bigram_map hash_count=%u\n", HASH_COUNT(new_bigram_map)); fflush(stdout); 153 | free(word_id_remap); 154 | memusage -= sizeof(word_id_t) * input_model_metadata.type_count; 155 | delete_all(&word_map); // static 156 | delete_all_bigram(&initial_bigram_map); // static 157 | memusage -= input_memusage; 158 | 159 | // Initialize and set word bigram listing 160 | clock_t time_bigram_start = clock(); 161 | size_t bigram_memusage = 0; size_t bigram_rev_memusage = 0; 162 | struct_word_bigram_entry * restrict word_bigrams = NULL; 163 | struct_word_bigram_entry * restrict word_bigrams_rev = NULL; 164 | 165 | if (cmd_args.verbose >= -1) { 166 | fprintf(stderr, "%s: Word bigram listing ... 
", argv_0_basename); fflush(stderr); 167 | } 168 | 169 | #pragma omp parallel sections // Both bigram listing and reverse bigram listing can be done in parallel 170 | { 171 | #pragma omp section 172 | { 173 | //sort_bigrams(&new_bigram_map); // speeds things up later 174 | word_bigrams = calloc(global_metadata.type_count, sizeof(struct_word_bigram_entry)); 175 | memusage += sizeof(struct_word_bigram_entry) * global_metadata.type_count; 176 | bigram_memusage = set_bigram_counts(word_bigrams, new_bigram_map); 177 | // Copy entries in word_counts to struct_word_bigram_entry.headword_count since that struct entry is already loaded when clustering 178 | for (word_id_t word = 0; word < global_metadata.type_count; word++) 179 | word_bigrams[word].headword_count = word_counts[word]; 180 | } 181 | 182 | // Initialize and set *reverse* word bigram listing 183 | #pragma omp section 184 | { 185 | if (cmd_args.rev_alternate) { // Don't bother building this if it won't be used 186 | //sort_bigrams(&new_bigram_map_rev); // speeds things up later 187 | word_bigrams_rev = calloc(global_metadata.type_count, sizeof(struct_word_bigram_entry)); 188 | memusage += sizeof(struct_word_bigram_entry) * global_metadata.type_count; 189 | bigram_rev_memusage = set_bigram_counts(word_bigrams_rev, new_bigram_map_rev); 190 | // Copy entries in word_counts to struct_word_bigram_entry.headword_count since that struct entry is already loaded when clustering 191 | for (word_id_t word = 0; word < global_metadata.type_count; word++) 192 | word_bigrams_rev[word].headword_count = word_counts[word]; 193 | } 194 | } 195 | } 196 | 197 | delete_all_bigram(&new_bigram_map); 198 | delete_all_bigram(&new_bigram_map_rev); 199 | memusage += bigram_memusage + bigram_rev_memusage; 200 | clock_t time_bigram_end = clock(); 201 | if (cmd_args.verbose >= -1) { 202 | fprintf(stderr, "in %'.2f CPU secs. Bigram memusage: %'.1f MB\n", (double)(time_bigram_end - time_bigram_start)/CLOCKS_PER_SEC, (bigram_memusage + bigram_rev_memusage)/(double)1048576); fflush(stderr); 203 | } 204 | 205 | //print_word_bigrams(global_metadata, word_bigrams, word_list); 206 | 207 | // Build counts, which consists of a word followed by a given class 208 | word_class_count_t * restrict word_class_counts = calloc(1 + cmd_args.num_classes * global_metadata.type_count , sizeof(word_class_count_t)); 209 | if (word_class_counts == NULL) { 210 | fprintf(stderr, "%s: Error: Unable to allocate enough memory for . %'.1f MB needed. Maybe increase --min-count\n", argv_0_basename, ((cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / (double)1048576 )); fflush(stderr); 211 | exit(13); 212 | } 213 | memusage += cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t); 214 | fprintf(stderr, "%s: Allocating %'.1f MB for word_class_counts: num_classes=%u x type_count=%u x sizeof(w-cl-count_t)=%zu\n", argv_0_basename, (double)(cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / 1048576 , cmd_args.num_classes, global_metadata.type_count, sizeof(word_class_count_t)); fflush(stderr); 215 | build_word_class_counts(cmd_args, word_class_counts, word2class, word_bigrams, global_metadata.type_count/*, word_list*/); 216 | //print_word_class_counts(cmd_args, global_metadata, word_class_counts); 217 | 218 | // Build reverse: counts: class followed by word. 
This and the normal one are both pretty fast, so no need to parallelize this 219 | word_class_count_t * restrict word_class_rev_counts = NULL; 220 | if (cmd_args.rev_alternate) { // Don't bother building this if it won't be used 221 | word_class_rev_counts = calloc(1 + cmd_args.num_classes * global_metadata.type_count , sizeof(word_class_count_t)); 222 | if (word_class_rev_counts == NULL) { 223 | fprintf(stderr, "%s: Warning: Unable to allocate enough memory for <class><word> counts. %'.1f MB needed. Falling back to --rev-alternate 0\n", argv_0_basename, ((cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / (double)1048576 )); fflush(stderr); 224 | cmd_args.rev_alternate = 0; 225 | } else { 226 | memusage += cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t); 227 | fprintf(stderr, "%s: Allocating %'.1f MB for word_class_rev_counts: num_classes=%u x type_count=%u x sizeof(w-cl-count_t)=%zu\n", argv_0_basename, (double)(cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / 1048576 , cmd_args.num_classes, global_metadata.type_count, sizeof(word_class_count_t)); fflush(stderr); 228 | build_word_class_counts(cmd_args, word_class_rev_counts, word2class, word_bigrams_rev, global_metadata.type_count/*, word_list*/); 229 | } 230 | 231 | } 232 | 233 | // Calculate memusage for count_arrays 234 | for (unsigned char i = 1; i <= cmd_args.max_array; i++) { 235 | memusage += 2 * (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)); 236 | //printf("11 memusage += %zu (now=%zu) count_arrays\n", 2 * (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)), memusage); fflush(stdout); 237 | } 238 | 239 | clock_t time_model_built = clock(); 240 | if (cmd_args.verbose >= -1) { 241 | fprintf(stderr, "%s: Finished loading %'lu tokens and %'u types (%'u filtered) from %'lu lines in %'.2f CPU secs\n", argv_0_basename, global_metadata.token_count, global_metadata.type_count, number_of_deleted_words, global_metadata.line_count, (double)(time_model_built - time_start)/CLOCKS_PER_SEC); fflush(stderr); 242 | } 243 | if (cmd_args.verbose >= -1) { 244 | fprintf(stderr, "%s: Approximate memory usage at clustering: %'.1fMB\n", argv_0_basename, (double)memusage / 1048576); fflush(stderr); 245 | } 246 | 247 | cluster(cmd_args, global_metadata, word_counts, word_list, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts); 248 | 249 | // Now print the final word2class mapping 250 | if (cmd_args.verbose >= 0) { 251 | FILE *out_file = stdout; 252 | if (out_file_string) 253 | out_file = fopen(out_file_string, "w"); 254 | if (out_file == NULL) { 255 | fprintf(stderr, "%s: Error: Unable to open output file %s\n", argv_0_basename, out_file_string); fflush(stderr); 256 | exit(16); 257 | } 258 | if (cmd_args.class_algo == EXCHANGE && (!cmd_args.print_word_vectors)) { 259 | print_words_and_classes(out_file, global_metadata.type_count, word_list, word_counts, word2class, (int)cmd_args.class_offset, cmd_args.print_freqs); 260 | } else if (cmd_args.class_algo == EXCHANGE && cmd_args.print_word_vectors) { 261 | print_words_and_vectors(out_file, cmd_args, global_metadata, word_list, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts); 262 | } 263 | fclose(out_file); 264 | } 265 | 266 | clock_t time_clustered = clock(); 267 | time_t time_t_end; 268 | time(&time_t_end); 269 | double time_secs_total = difftime(time_t_end, time_t_start); 270 | if (cmd_args.verbose >= -1) 271 | fprintf(stderr, "%s: Finished 
clustering in %'.2f CPU seconds. Total wall clock time was about %lim %lis\n", argv_0_basename, (double)(time_clustered - time_model_built)/CLOCKS_PER_SEC, (long)time_secs_total/60, ((long)time_secs_total % 60) ); 272 | 273 | free(word2class); 274 | free(word_bigrams); 275 | free(word_list); 276 | free(word_counts); 277 | exit(0); 278 | } 279 | 280 | 281 | void get_usage_string(char * restrict usage_string, int usage_len) { 282 | 283 | snprintf(usage_string, usage_len, "ClusterCat (c) 2014-2016 Jon Dehdari - LGPL v3 or Mozilla Public License v2\n\ 284 | \n\ 285 | Usage: clustercat [options] < corpus.tok.txt > classes.tsv \n\ 286 | \n\ 287 | Function: Induces word categories from plaintext\n\ 288 | \n\ 289 | Options:\n\ 290 | -c, --classes Set number of word classes (default: 1.2 * square root of vocabulary size)\n\ 291 | --class-file Initialize exchange word classes from an existing clustering tsv file (default: pseudo-random initialization\n\ 292 | for exchange). If you use this option, you probably can set --tune-cycles to 3 or so\n\ 293 | --class-offset Print final word classes starting at a given number (default: %d)\n\ 294 | --forward-lambda Set interpolation weight for forward bigram class model, in range of [0,1] (default: %g)\n\ 295 | -h, --help Print this usage\n\ 296 | --in Specify input training file (default: stdin)\n\ 297 | --ngram-input Input is a listing of n-grams and their counts. Otherwise input is a normal corpus\n\ 298 | --min-count Minimum count of entries in training set to consider (default: %d occurrences)\n\ 299 | --max-array Set maximum order of n-grams for which to use an array instead of a sparse hash map (default: %d-grams)\n\ 300 | --out Specify output file (default: stdout)\n\ 301 | --print-freqs Print word frequencies after words and classes in final clustering output (useful for visualization)\n\ 302 | -q, --quiet Print less output. Use additional -q for even less output\n\ 303 | --refine Set initial class refinement value (c==0 -> no refinement; otherwise 2^n. Default:c==2 -> 4 initial clusters)\n\ 304 | --rev-alternate How often to alternate using reverse predictive exchange. 0==never, 1==after every normal cycle (default: %u)\n\ 305 | -j, --threads Set number of threads to run simultaneously (default: %d threads)\n\ 306 | --tune-cycles Set max number of cycles to tune on (default: %d cycles)\n\ 307 | --unidirectional Disable simultaneous bidirectional predictive exchange. Results in faster cycles, but slower & worse convergence\n\ 308 | If you want to do basic predictive exchange, use: --rev-alternate 0 --unidirectional\n\ 309 | -v, --verbose Print additional info to stderr. Use additional -v for more verbosity\n\ 310 | --word-vectors Print word vectors (a.k.a. word embeddings) instead of discrete classes.\n\ 311 | Specify as either 'text' or 'binary'. The binary format is compatible with word2vec\n\ 312 | \n\ 313 | ", cmd_args.class_offset, cmd_args.forward_lambda, cmd_args.min_count, cmd_args.max_array, cmd_args.rev_alternate, cmd_args.num_threads, cmd_args.tune_cycles); 314 | } 315 | // --class-algo Set class-induction algorithm {brown,exchange,exchange-then-brown} (default: exchange)\n\ 316 | // -o, --order Maximum n-gram order in training set to consider (default: %d-grams)\n\ 317 | // -w, --weights 'f f ...' Set class interpolation weights for: 3-gram, 2-gram, 1-gram, rev 2-gram, rev 3-gram. 
(default: %s)\n\ 318 | 319 | void parse_cmd_args(int argc, char **argv, char * restrict usage, struct cmd_args *cmd_args) { 320 | for (int arg_i = 0; arg_i < argc; arg_i++) // Print command-line invocation, for reproducibility 321 | if (cmd_args->verbose >= -1) { 322 | fprintf(stderr, "%s ", argv[arg_i]); fflush(stderr); 323 | } 324 | if (cmd_args->verbose >= -1) { 325 | fprintf(stderr, "\n"); fflush(stderr); 326 | } 327 | 328 | for (int arg_i = 1; arg_i < argc; arg_i++) { 329 | if (!(strcmp(argv[arg_i], "-h") && strcmp(argv[arg_i], "--help"))) { 330 | printf("%s", usage); 331 | exit(0); 332 | } else if (!strcmp(argv[arg_i], "--class-algo")) { 333 | char * restrict class_algo_string = argv[arg_i+1]; 334 | arg_i++; 335 | if (!strcmp(class_algo_string, "brown")) 336 | cmd_args->class_algo = BROWN; 337 | else if (!strcmp(class_algo_string, "exchange")) 338 | cmd_args->class_algo = EXCHANGE; 339 | else if (!strcmp(class_algo_string, "exchange-then-brown")) 340 | cmd_args->class_algo = EXCHANGE_BROWN; 341 | else { printf("%s", usage); exit(1); } 342 | } else if (!strcmp(argv[arg_i], "--class-file")) { 343 | initial_class_file = argv[arg_i+1]; 344 | arg_i++; 345 | } else if (!strcmp(argv[arg_i], "--class-offset")) { 346 | cmd_args->class_offset = (signed char)atoi(argv[arg_i+1]); 347 | arg_i++; 348 | } else if (!strcmp(argv[arg_i], "--forward-lambda")) { 349 | cmd_args->forward_lambda = (float)atof(argv[arg_i+1]); 350 | arg_i++; 351 | } else if (!strcmp(argv[arg_i], "--in")) { 352 | in_train_file_string = argv[arg_i+1]; 353 | arg_i++; 354 | } else if (!(strcmp(argv[arg_i], "-j") && strcmp(argv[arg_i], "--threads") && strcmp(argv[arg_i], "--jobs"))) { 355 | cmd_args->num_threads = (unsigned int) atol(argv[arg_i+1]); 356 | arg_i++; 357 | } else if (!strcmp(argv[arg_i], "--min-count")) { 358 | cmd_args->min_count = (unsigned int) atol(argv[arg_i+1]); 359 | arg_i++; 360 | } else if (!strcmp(argv[arg_i], "--max-array")) { 361 | cmd_args->max_array = (unsigned char) atol(argv[arg_i+1]); 362 | if ((cmd_args->max_array) < 1 || (cmd_args->max_array > 3)) { 363 | printf("%s: --max-array value should be between 1-3\n", argv_0_basename); 364 | fflush(stderr); 365 | exit(10); 366 | } 367 | arg_i++; 368 | } else if (!(strcmp(argv[arg_i], "--ngram-input"))) { 369 | cmd_args->ngram_input = true; 370 | } else if (!(strcmp(argv[arg_i], "-c") && strcmp(argv[arg_i], "-n") && strcmp(argv[arg_i], "--classes") && strcmp(argv[arg_i], "--num-classes"))) { 371 | cmd_args->num_classes = (wclass_t) atol(argv[arg_i+1]); 372 | arg_i++; 373 | } else if (!strcmp(argv[arg_i], "--out")) { 374 | out_file_string = argv[arg_i+1]; 375 | arg_i++; 376 | } else if (!(strcmp(argv[arg_i], "--print-freqs"))) { 377 | cmd_args->print_freqs = true; 378 | } else if (!(strcmp(argv[arg_i], "-q") && strcmp(argv[arg_i], "--quiet"))) { 379 | cmd_args->verbose--; 380 | } else if (!(strcmp(argv[arg_i], "--refine"))) { 381 | cmd_args->refine = (unsigned char) atol(argv[arg_i+1]); 382 | arg_i++; 383 | } else if (!strcmp(argv[arg_i], "--rev-alternate")) { 384 | cmd_args->rev_alternate = (unsigned char) atoi(argv[arg_i+1]); 385 | arg_i++; 386 | } else if (!strcmp(argv[arg_i], "--tune-cycles")) { 387 | cmd_args->tune_cycles = (unsigned short) atol(argv[arg_i+1]); 388 | arg_i++; 389 | } else if (!(strcmp(argv[arg_i], "--unidirectional"))) { 390 | cmd_args->unidirectional = true; 391 | } else if (!(strcmp(argv[arg_i], "-v") && strcmp(argv[arg_i], "--verbose"))) { 392 | cmd_args->verbose++; 393 | } else if (!(strcmp(argv[arg_i], "--word-vectors"))) { 394 | 
char * restrict print_word_vectors_string = argv[arg_i+1]; 395 | arg_i++; 396 | if (!strcmp(print_word_vectors_string, "text")) 397 | cmd_args->print_word_vectors = TEXT_VEC; 398 | else if (!strcmp(print_word_vectors_string, "binary")) 399 | cmd_args->print_word_vectors = BINARY_VEC; 400 | else { printf("Error: Please specify either 'text' or 'binary' after the --word-vectors flag.\n\n%s", usage); exit(1); } 401 | } else if (!strncmp(argv[arg_i], "-", 1)) { // Unknown flag 402 | printf("%s: Unknown command-line argument: %s\n\n", argv_0_basename, argv[arg_i]); 403 | printf("%s", usage); fflush(stderr); 404 | exit(2); 405 | } 406 | } 407 | } 408 | 409 | void build_word_count_array(struct_map_word **word_map, char * restrict word_list[const], word_count_t word_counts[restrict], const word_id_t type_count) { 410 | for (word_id_t i = 0; i < type_count; i++) { 411 | word_counts[i] = map_find_count(word_map, word_list[i]); 412 | } 413 | } 414 | 415 | void populate_word_ids(struct_map_word **word_map, char * restrict word_list[const], const word_id_t type_count) { 416 | for (word_id_t i = 0; i < type_count; i++) { 417 | map_set_word_id(word_map, word_list[i], i); 418 | } 419 | } 420 | 421 | void reassign_word_ids(struct_map_word **word_map, char * restrict word_list[restrict], word_id_t * restrict word_id_remap) { 422 | sort_by_count(word_map); 423 | struct_map_word *entry, *tmp; 424 | word_id_t i = 0; 425 | 426 | HASH_ITER(hh, *word_map, entry, tmp) { 427 | const word_id_t word_id = entry->word_id; 428 | char * word = entry->key; 429 | word_id_remap[word_id] = i; // set remap 430 | word_list[i] = entry->key; 431 | //printf("reassigning w=%s %u -> %u; count=%u\n", entry->key, word_id, i, entry->count); fflush(stdout); 432 | map_set_word_id(word_map, word, i); // reset word_id in word_map 433 | i++; 434 | } 435 | } 436 | 437 | word_id_t filter_infrequent_words(const struct cmd_args cmd_args, struct_model_metadata * restrict model_metadata, struct_map_word ** word_map, word_id_t * restrict word_id_remap) { // word_map must already be sorted by word frequency! 
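/* Worked example (a sketch): with --min-count 3, a word seen twice is deleted
 * from word_map, its count is added to <unk> (word_id 0), its slot in
 * word_id_remap[] is set to the placeholder (word_id_t)-1 (resolved to the
 * real unk id later by remap_and_rev_bigram_map()), and type_count drops by
 * one; <unk>, <s>, and </s> themselves are never filtered, whatever their counts. */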
438 | 439 | unsigned long number_of_deleted_words = 0; 440 | unsigned long vocab_size = model_metadata->type_count; // Save this to separate variable since we'll modify model_metadata.type_count later 441 | // Get keys 442 | // Iterate over keys 443 | // If count of key_i < threshold, 444 | // increment count of <unk> by count of key_i, 445 | // decrement model_metadata.type_count by one 446 | // free & delete entry in map, 447 | 448 | char **local_word_list = (char **)malloc(model_metadata->type_count * sizeof(char*)); 449 | //char * local_word_list[model_metadata->type_count]; 450 | if (vocab_size != get_keys(word_map, local_word_list)) { 451 | printf("Error: model_metadata->type_count (%lu) != get_keys() (%lu)\n", (long unsigned) vocab_size, (long unsigned) get_keys(word_map, local_word_list) ); fflush(stderr); 452 | exit(4); 453 | } 454 | 455 | unsigned long new_id = 0; 456 | for (unsigned long word_i = 0; word_i < vocab_size; word_i++, new_id++) { 457 | char * word = local_word_list[word_i]; 458 | //if ((!strncmp(word, UNKNOWN_WORD, MAX_WORD_LEN)) || (!strncmp(word, "<s>", MAX_WORD_LEN)) || (!strncmp(word, "</s>", MAX_WORD_LEN))) { // Deal with <unk>, <s>, and </s> 459 | // //new_id--; 460 | // continue; 461 | //} 462 | 463 | unsigned long word_i_count = map_find_count(word_map, word); // We'll use this a couple times 464 | if ((word_i_count < cmd_args.min_count) && (strncmp(word, UNKNOWN_WORD, MAX_WORD_LEN)) && (strncmp(word, "<s>", MAX_WORD_LEN)) && (strncmp(word, "</s>", MAX_WORD_LEN))) { // Don't delete <unk>, <s>, or </s> 465 | number_of_deleted_words++; 466 | if (cmd_args.verbose > 3) { 467 | printf("Filtering-out word: %s (old id=%lu, new id=0) (%lu < %hu);\tcount(%s)=%lu\n", word, word_i, (unsigned long)word_i_count, cmd_args.min_count, UNKNOWN_WORD, (unsigned long)map_find_count(word_map, UNKNOWN_WORD)); fflush(stdout); 468 | } 469 | word_id_remap[map_find_id(word_map, word, (word_id_t) -1)] = (word_id_t) -1; // set value of dud word in remap to temporary unk, which is -1. This gets changed later 470 | map_update_count(word_map, UNKNOWN_WORD, word_i_count, 0); 471 | model_metadata->type_count--; 472 | struct_map_word *local_s; 473 | HASH_FIND_STR(*word_map, word, local_s); 474 | delete_entry(word_map, local_s); 475 | } else { // Keep word 476 | //printf("Keeping word: %s (old id=%u, new id=%lu) (%lu >= %hu);\tcount(%s)=%u\n", word, map_find_id(word_map, word, -1), new_id, word_i_count, cmd_args.min_count, UNKNOWN_WORD, map_find_count(word_map, UNKNOWN_WORD)); fflush(stdout); 477 | //map_set_word_id(word_map, word, new_id); // word_id's 0-2 are reserved for <unk>, <s>, and </s> 478 | //printf(" Kept word: %s (new map id=%u, new_id=%lu) (%lu >= %hu);\tcount(%s)=%u\n", word, map_find_id(word_map, word, -1), new_id, word_i_count, cmd_args.min_count, UNKNOWN_WORD, map_find_count(word_map, UNKNOWN_WORD)); fflush(stdout); 479 | } 480 | } 481 | //map_set_word_id(word_map, UNKNOWN_WORD, 0); // word_id's 0-2 are reserved for <unk>, <s>, and </s> 482 | //map_set_word_id(word_map, "<s>", 1); // word_id's 0-2 are reserved for <unk>, <s>, and </s> 483 | //map_set_word_id(word_map, "</s>", 2); // word_id's 0-2 are reserved for <unk>, <s>, and </s> 484 | 485 | free(local_word_list); 486 | return number_of_deleted_words; 487 | } 488 | 489 | void tally_class_ngram_counts(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const struct_word_bigram_entry word_bigrams[const], const wclass_t word2class[const], count_arrays_t count_arrays) { // Right now it's a drop-in replacement for tally_class_counts_in_store(), but it's not the best way of doing things (eg.
for unigram counts, tallying & querying in two separate steps, etc). So this will need to be modified after getting rid of the sent-store 490 | for (word_id_t word_id = 0; word_id < model_metadata.type_count; word_id++) { 491 | const wclass_t headword_class = word2class[word_id]; 492 | count_arrays[0][headword_class] += word_bigrams[word_id].headword_count; 493 | //printf("tally_class_ngram_counts: word=??, word_id=%u, type_count=%u, headword_class=%hu, headword_count=%u, class_count=%lu\n", word_id, model_metadata.type_count, headword_class, word_bigrams[word_id].headword_count, (unsigned long)count_arrays[0][headword_class]); fflush(stdout); 494 | for (unsigned int i = 0; i < word_bigrams[word_id].length; i++) { 495 | const word_id_t prev_word = word_bigrams[word_id].predecessors[i]; 496 | wclass_t prev_class = word2class[prev_word]; 497 | const size_t offset = prev_class + cmd_args.num_classes * headword_class; 498 | //printf(" tally_class_ngram_counts: prev_word=%u, prev_class=%hu, offset=%zu\n", prev_word, prev_class, offset); fflush(stdout); 499 | count_arrays[1][offset] += word_bigrams[word_id].bigram_counts[i]; 500 | } 501 | } 502 | } 503 | 504 | 505 | void init_clusters(const struct cmd_args cmd_args, word_id_t vocab_size, wclass_t word2class[restrict], const word_count_t word_counts[const], char * word_list[restrict]) { 506 | register unsigned long word_i = 0; 507 | 508 | if (cmd_args.class_algo == EXCHANGE || cmd_args.class_algo == EXCHANGE_BROWN) { // It doesn't really matter how you initialize word classes in exchange algo. This assigns words from the word list an incrementing class number from [0,num_classes-1]. So it's a simple pseudo-randomized initialization. 509 | register wclass_t class = 0; // [0,num_classes-1] 510 | for (; word_i < vocab_size; word_i++, class++) { 511 | if (class == cmd_args.num_classes) // reset 512 | class = 0; 513 | if (cmd_args.verbose > 3) 514 | printf("cls=%-4u w_i=%-8lu #(w)=%-8u str(w)=%-20s vocab_size=%u\n", class, word_i, word_counts[word_i], word_list[word_i], vocab_size); 515 | word2class[word_i] = class; 516 | } 517 | 518 | } else if (cmd_args.class_algo == BROWN) { // Really simple initialization: one class per word 519 | for (unsigned long class = 0; word_i < vocab_size; word_i++, class++) 520 | word2class[word_i] = class; 521 | } 522 | } 523 | 524 | size_t set_bigram_counts(struct_word_bigram_entry * restrict word_bigrams, struct_map_bigram * bigram_map) { 525 | 526 | // Build a hash map of bigrams, since we need random access when traversing the corpus. 527 | // Then we convert that to an array of linked lists, since we'll need sequential access during the clustering phase of predictive exchange clustering. 528 | 529 | sort_bigrams(&bigram_map); 530 | 531 | register size_t memusage = 0; 532 | register word_id_t word_2; 533 | register word_id_t word_2_last = 0; 534 | register unsigned int length = 0; 535 | word_id_t * word_buffer = malloc(sizeof(word_id_t) * MAX_WORD_PREDECESSORS); 536 | word_bigram_count_t * count_buffer = malloc(sizeof(word_bigram_count_t) * MAX_WORD_PREDECESSORS); 537 | 538 | // Add a dummy entry at the end of the hash map in order to simplify iterating through it, since it must track changes in head words. 
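/* Sentinel sketch: with the map sorted on word_2, e.g.
 *   {7,2}:4  {9,2}:1  {3,5}:2  {-1,-1}:0   (dummy appended below)
 * the loop flushes word 2's predecessor list {7,9} when word_2 changes to 5,
 * and the dummy's impossible ids force the final flush for word 5 without
 * special-casing the last real entry. */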
539 | struct_word_bigram dummy = {-1, -1}; // Make sure this bigram is new, so that it's appended to the end 540 | map_update_bigram(&bigram_map, &dummy, 0); 541 | 542 | // Iterate through the bigram map to get counts of word_2's, so we know how much to allocate for each predecessor list 543 | struct_map_bigram *entry, *tmp; 544 | HASH_ITER(hh, bigram_map, entry, tmp) { 545 | word_2 = (entry->key).word_2; 546 | //printf("\n[%u,%u]=%u, w2_last=%u, length=%u\n", (entry->key).word_1, (entry->key).word_2, entry->count, word_2_last, length); fflush(stdout); 547 | if (word_2 == word_2_last) { // Within successive entry; ie. 2nd entry or greater 548 | if (length >= MAX_WORD_PREDECESSORS) { // Check bounds before writing, so the buffers can't overflow 549 | fprintf(stderr, "Error: MAX_WORD_PREDECESSORS exceeded (%lu). Increase it in clustercat.h and recompile. Add the -B flag to 'make' to force recompilation.\n", (long unsigned int)MAX_WORD_PREDECESSORS); fflush(stderr); 550 | exit(14); 551 | } 552 | word_buffer[length] = (entry->key).word_1; 553 | count_buffer[length] = entry->count; 554 | length++; 555 | 556 | } else { // New entry; process previous entry 557 | word_bigrams[word_2_last].length = length; 558 | word_bigrams[word_2_last].predecessors = malloc(length * sizeof(word_id_t)); 559 | memcpy(word_bigrams[word_2_last].predecessors, word_buffer, length * sizeof(word_id_t)); 560 | memusage += length * sizeof(word_id_t); 561 | word_bigrams[word_2_last].bigram_counts = malloc(length * sizeof(word_bigram_count_t)); 562 | memcpy(word_bigrams[word_2_last].bigram_counts, count_buffer, length * sizeof(word_bigram_count_t)); 563 | memusage += length * sizeof(word_bigram_count_t); 564 | //printf("word_2_last=%u, length=%u word_1s: ", word_2_last, length); 565 | //for (unsigned int i = 0; i < length; i++) { 566 | // printf("<%u,%u> ", word_bigrams[word_2_last].predecessors[i], word_bigrams[word_2_last].bigram_counts[i]); 567 | //} 568 | //printf("\n"); 569 | 570 | word_2_last = word_2; 571 | word_buffer[0] = (entry->key).word_1; 572 | count_buffer[0] = entry->count; 573 | length = 1; 574 | } 575 | } 576 | 577 | free(word_buffer); 578 | free(count_buffer); 579 | //delete_all_bigram(&map_bigram); 580 | 581 | return memusage; 582 | } 583 | 584 | void build_word_class_counts(const struct cmd_args cmd_args, word_class_count_t * restrict word_class_counts, const wclass_t word2class[const], const struct_word_bigram_entry * const word_bigrams, const word_id_t type_count/*, char ** restrict word_list*/) { 585 | //long sum = 0; 586 | // set counts 587 | for (word_id_t word = 0; word < type_count; word++) { 588 | for (unsigned int i = 0; i < word_bigrams[word].length; i++) { 589 | word_id_t prev_word = word_bigrams[word].predecessors[i]; 590 | const wclass_t class_i = word2class[word]; 591 | word_class_counts[prev_word * cmd_args.num_classes + class_i] += word_bigrams[word].bigram_counts[i]; 592 | //printf("i=%hu, <%s,%s>=<%u,%u>, <prev_word,class_i>=<%u,%u>, num_classes=%u, offset=%u (%u * %u + %u), orig_val=%u\n", i, word_list[prev_word], word_list[word], prev_word, word, prev_word, class_i, cmd_args.num_classes, prev_word * cmd_args.num_classes + class_i, prev_word, cmd_args.num_classes, class_i, word_class_counts[prev_word * cmd_args.num_classes + class_i]); fflush(stdout); 593 | //sum += word_bigrams[word].bigram_counts[i]; 594 | //printf(" <%u,%u>=%u at pos %zu\n", prev_word, class_i, word_class_counts[prev_word * cmd_args.num_classes + class_i], ((size_t)prev_word * cmd_args.num_classes + class_i)); fflush(stdout); 595 | } 596 | } 597 | //printf(": sum: %lu; [%u,%u,%u,%u,%u,%u,%u,%u,%u,%u...]\n",
sum, word_class_counts[0], word_class_counts[1], word_class_counts[2], word_class_counts[3], word_class_counts[4], word_class_counts[5], word_class_counts[6], word_class_counts[7], word_class_counts[8], word_class_counts[9]); 598 | } 599 | 600 | double training_data_log_likelihood(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const count_arrays_t count_arrays, const word_count_t word_counts[const], const wclass_t word2class[const]) { 601 | const double backward_lambda = 1 - cmd_args.forward_lambda; 602 | 603 | // Transition Probs 604 | double transition_logprob = 0; 605 | // Bigrams 606 | #pragma omp parallel for num_threads(cmd_args.num_threads) reduction(+:transition_logprob) 607 | for (word_bigram_count_t ngram = 0; ngram < (powi(cmd_args.num_classes, 2)); ngram++) { 608 | const class_bigram_count_t bigram_count = count_arrays[1][ngram]; 609 | if (!bigram_count) // bigram doesn't exist in training set 610 | continue; 611 | const wclass_t c_1 = ngram % cmd_args.num_classes; 612 | const wclass_t c_2 = ngram / cmd_args.num_classes; 613 | const wclass_count_t c_1_count = count_arrays[0][c_1]; 614 | const wclass_count_t c_2_count = count_arrays[0][c_2]; 615 | const double a = cmd_args.forward_lambda * (bigram_count / (double)c_1_count); 616 | const double b = backward_lambda * (bigram_count / (double)c_2_count); 617 | transition_logprob += LOG2ADD(a,b) * bigram_count; 618 | //printf("ngram=%u, c_1=%u, #(c_1)=%lu, c_2=%u, #(c_2)=%lu, #(c_1,c_2)=%lu, trans_prob=%g\n", ngram, c_1, (unsigned long)c_1_count, c_2, (unsigned long)c_2_count, (unsigned long)bigram_count, transition_logprob); fflush(stdout); 619 | } 620 | 621 | // Emission Probs 622 | //long double emission_prob = 0; 623 | double emission_logprob = 0; 624 | //#pragma omp parallel for num_threads(cmd_args.num_threads) reduction(+:emission_logprob) 625 | for (word_id_t word = 0; word < model_metadata.type_count; word++) { 626 | //if (word == model_metadata.start_sent_id) // Don't tally emission prob for <s> 627 | // continue; 628 | const word_count_t word_count = word_counts[word]; 629 | if (!word_count) // Don't tally emission prob for <unk> if min-count is 1 630 | continue; 631 | const wclass_t class = word2class[word]; 632 | const wclass_count_t class_count = count_arrays[0][class]; 633 | emission_logprob += log2(word_count / (double)class_count) * word_count; 634 | //printf("word=%u, class=%u, emission_logprob=%g after += %g = log2(word_count=%lu / class_count=%u) * word_count=%lu\n", word, (unsigned int)class, emission_logprob, log2(word_count / (double)class_count) * word_count, (unsigned long)word_count, class_count, (unsigned long)word_count); fflush(stdout); 635 | } 636 | 637 | //printf("emission_logprob=%g, transition_logprob=%g, LL=%g\n", emission_logprob, transition_logprob, emission_logprob + transition_logprob); 638 | return emission_logprob + transition_logprob; 639 | } 640 | 641 | void init_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays) { 642 | for (unsigned char i = 1; i <= cmd_args.max_array; i++) { // Start with unigrams in count_arrays[0], ... 643 | count_arrays[i-1] = calloc(powi(cmd_args.num_classes, i), sizeof(wclass_count_t)); // powi() is in clustercat-math.c 644 | if (count_arrays[i-1] == NULL) { 645 | fprintf(stderr, "%s: Error: Unable to allocate enough memory for %u-grams. I tried to allocate %zu MB per thread (%zuB * %u^%u).
Reduce the number of desired classes using --classes (current value: %u)\n", argv_0_basename, i, sizeof(wclass_count_t) * powi(cmd_args.num_classes, i) / 1048576, sizeof(wclass_count_t), cmd_args.num_classes, i, cmd_args.num_classes ); fflush(stderr); 646 | exit(12); 647 | } 648 | //printf("Allocating %zu B (cmd_args.num_classes=%u^i=%u * sizeof(uint)=%zu)\n", (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)), cmd_args.num_classes, i, sizeof(wclass_count_t)); 649 | } 650 | } 651 | 652 | void clear_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays) { 653 | for (unsigned char i = 1; i <= cmd_args.max_array; i++) { // Start with unigrams in count_arrays[0], ... 654 | memset(count_arrays[i-1], 0, powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)); // powi() is in clustercat-math.c 655 | } 656 | } 657 | 658 | void free_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays) { 659 | for (unsigned char i = 1; i <= cmd_args.max_array; i++) { // Start with unigrams in count_arrays[0], ... 660 | free(count_arrays[i-1]); 661 | } 662 | } 663 | -------------------------------------------------------------------------------- /src/clustercat.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_CLUSTERCAT_HEADER 2 | #define INCLUDE_CLUSTERCAT_HEADER 3 | 4 | #include <stdio.h> 5 | #include <stdlib.h> 6 | #include <string.h> 7 | #include <stdbool.h> 8 | #include <math.h> // log(), exp(), pow() 9 | #include <libgen.h> // basename() 10 | #include <limits.h> // USHRT_MAX, UINT_MAX 11 | #include <stdint.h> 12 | #include "clustercat-math.h" // powi() 13 | 14 | // Defaults 15 | #define PRIMARY_SEP_CHAR '\t' 16 | #define PRIMARY_SEP_STRING "\t" 17 | #define SECONDARY_SEP_CHAR ' ' 18 | #define SECONDARY_SEP_STRING " " 19 | #define TOK_CHARS " \t\n" 20 | #define UNKNOWN_WORD "<unk>" 21 | // Number of characters to read-in for each line 22 | #define STDIN_SENT_MAX_CHARS 8000 23 | #define MAX_WORD_LEN 128 24 | #define MAX_WORD_PREDECESSORS 20000000 25 | #define ENTROPY_TERMS_MAX 10000000 26 | 27 | enum class_algos {EXCHANGE, BROWN, EXCHANGE_BROWN}; 28 | enum print_word_vectors {NO_VEC, TEXT_VEC, BINARY_VEC}; 29 | 30 | #include "clustercat-data.h" // bad.
chicken-and-egg typedef deps 31 | 32 | typedef unsigned short sentlen_t; // Number of words in a sentence 33 | #define SENT_LEN_MAX USHRT_MAX 34 | //typedef unsigned short wclass_t; // Defined in clustercat-map.h 35 | //typedef unsigned int word_id_t; // Defined in clustercat-map.h 36 | typedef word_count_t * * restrict count_arrays_t; 37 | typedef word_count_t * restrict count_array_t; 38 | 39 | typedef struct { 40 | unsigned long token_count; 41 | unsigned long line_count; 42 | word_id_t type_count; 43 | word_id_t start_sent_id; // need this for tallying emission probs 44 | word_id_t end_sent_id; // need this for tallying emission probs 45 | } struct_model_metadata; 46 | 47 | // typedef {...} struct_word_bigram; // see clustercat-map.h 48 | 49 | typedef struct { // Each entry holds an array of the predecessors of a given word, a parallel array of their bigram counts, and the length of those arrays 50 | word_id_t * predecessors; 51 | word_bigram_count_t * bigram_counts; 52 | unsigned long length; 53 | word_count_t headword_count; 54 | } struct_word_bigram_entry; 55 | 56 | extern char *argv_0_basename; // Allow for global access to filename 57 | 58 | struct cmd_args { 59 | float forward_lambda; 60 | wclass_t num_classes; 61 | unsigned short min_count : 12; 62 | signed char verbose : 4; // Negative values increasingly suppress normal output 63 | unsigned short tune_cycles : 8; 64 | unsigned char refine; // 0=no refinement; otherwise 2^n 65 | signed char class_offset: 4; 66 | unsigned short num_threads : 8; 67 | unsigned char rev_alternate: 3; // How often to alternate using reverse pex. 0 == never, 1 == after every normal pex cycle, ... 68 | unsigned char max_array : 2; 69 | unsigned char class_algo : 2; // enum class_algos 70 | unsigned char print_word_vectors : 2; // enum print_word_vectors 71 | bool ngram_input; 72 | bool print_freqs; 73 | bool unidirectional; 74 | }; 75 | 76 | void populate_word_ids(struct_map_word **ngram_map, char * restrict unique_words[const], const word_id_t type_count); 77 | void reassign_word_ids(struct_map_word **word_map, char * restrict word_list[restrict], word_id_t * restrict word_id_remap); 78 | void build_word_count_array(struct_map_word **ngram_map, char * restrict unique_words[const], word_count_t word_counts[restrict], const word_id_t type_count); 79 | 80 | void tally_class_ngram_counts(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const struct_word_bigram_entry word_bigrams[const], const wclass_t word2class[const], count_arrays_t count_arrays); 81 | word_id_t filter_infrequent_words(const struct cmd_args cmd_args, struct_model_metadata * restrict model_metadata, struct_map_word ** ngram_map, word_id_t * restrict word_id_remap); 82 | void init_clusters(const struct cmd_args cmd_args, word_id_t vocab_size, wclass_t word2class[restrict], const word_count_t word_counts[const], char * word_list[restrict]); 83 | size_t set_bigram_counts(struct_word_bigram_entry * restrict word_bigrams, struct_map_bigram * bigram_map); 84 | void build_word_class_counts(const struct cmd_args cmd_args, word_class_count_t * restrict word_class_counts, const wclass_t word2class[const], const struct_word_bigram_entry * const word_bigrams, const word_id_t type_count/*, char ** restrict word_list*/); 85 | double training_data_log_likelihood(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const count_arrays_t count_arrays, const word_count_t word_counts[const], const wclass_t word2class[const]); 86 | 87 | void
init_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays); 88 | void clear_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays); 89 | void free_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays); 90 | 91 | // Like atoi/strtol, but each position can hold any value in [0,num_classes) rather than just the ascii digits 0-9, and the least-significant position comes first ("little endian"). Hence, with num_classes=256, [104,101] ("he") -> 25960 (ie. 104 + (101*256)), and [3,7,11] -> 722691 (ie. 3 + (7*256) + (11*256*256)). 92 | // Using a class n-gram array is fast, at the expense of memory usage for lots of unattested ngrams, especially for higher-order n-grams. 93 | // Trigrams are probably the highest order you'd want to use as an array, since the memory usage would be: sizeof(wclass_count_t) * |C|^3 where |C| is the number of word classes. 94 | // |C| can be represented using an unsigned short (16 bits == 65k classes) for exchange clustering, but probably should be an unsigned int (32 bits == 4 billion classes) for Brown clustering, since initially every word type is its own class. 95 | inline size_t array_offset(wclass_t * pointer, const unsigned int max, const wclass_t num_classes) { 96 | register uint_fast8_t ptr_i = 1; 97 | register size_t total_offset = (*pointer); 98 | 99 | for (; ptr_i < max; ptr_i++) { // little endian 100 | //printf("1: atosize_t: pointer=%p; all vals: [%hu,%hu,%hu]; total_offset=%zu; max=%u\n", pointer, *pointer, *(pointer+1), *(pointer+2), total_offset, max); fflush(stdout); 101 | total_offset += (pointer[ptr_i]) * powi(num_classes, ptr_i); 102 | //printf("2: adding ((pointer[%u]=%u)* powi(%hu, %u)=%lu)=%lu\n", ptr_i, pointer[ptr_i], num_classes, ptr_i, powi(num_classes, ptr_i), pointer[ptr_i] * powi(num_classes, ptr_i)); fflush(stdout); 103 | } 104 | //printf("3: atosize_t: pointer=%p; val0=%hu; total_offset=%zu; max=%u\n\n", pointer, *pointer, total_offset, max); fflush(stdout); 105 | return total_offset; 106 | } 107 | 108 | 109 | 110 | #endif // INCLUDE_CLUSTERCAT_HEADER 111 | -------------------------------------------------------------------------------- /src/ext/uthash/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2005-2014, Troy D. Hanson http://troydhanson.github.com/uthash/ 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 11 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 12 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 13 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 14 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 15 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 16 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 17 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 18 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 19 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 20 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21 | 22 | -------------------------------------------------------------------------------- /src/ext/uthash/README.md: -------------------------------------------------------------------------------- 1 | 2 | Documentation for uthash is available at: 3 | 4 | http://troydhanson.github.com/uthash/ 5 | 6 | 7 | -------------------------------------------------------------------------------- /src/ext/word2vec/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /src/ext/word2vec/README.txt: -------------------------------------------------------------------------------- 1 | Tools for computing distributed representation of words 2 | ------------------------------------------------------ 3 | 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts.
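For orientation, a representative invocation of the upstream word2vec trainer might look like the following (the trainer binary itself is not bundled in this subdirectory, which carries only distance.c and word-analogy.c, so this is illustrative; corpus.txt and vectors.bin are placeholder names). Each flag corresponds to one of the options enumerated in the list that follows:

    ./word2vec -train corpus.txt -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 12 -binary 1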
5 | 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous 7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should specify the following: 8 | - desired vector dimensionality 9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model 10 | - training algorithm: hierarchical softmax and/or negative sampling 11 | - threshold for downsampling the frequent words 12 | - number of threads to use 13 | - the format of the output word vector file (text or binary) 14 | 15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 16 | 17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training 18 | is finished, the user can interactively explore the similarity of the words. 19 | 20 | More information about the scripts is provided at https://code.google.com/p/word2vec/ 21 | 22 | -------------------------------------------------------------------------------- /src/ext/word2vec/distance.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
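// Editorial note (not part of the upstream file): a minimal session with this
// tool, assuming "vectors.bin" (hypothetical name) holds vectors in the binary
// format produced by the word2vec trainer:
//   $ make distance && ./distance vectors.bin
//   Enter word or sentence (EXIT or CTRL-d to break): paris
// The program then prints the N (= 40) vocabulary words whose unit-normalized
// vectors have the largest dot product (cosine similarity) with the query.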
14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char *bestw[N]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | float *M; 32 | char *vocab; 33 | if (argc < 2) { 34 | printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n"); 35 | return 0; 36 | } 37 | strcpy(file_name, argv[1]); 38 | f = fopen(file_name, "rb"); 39 | if (f == NULL) { 40 | printf("Input file not found\n"); 41 | return -1; 42 | } 43 | fscanf(f, "%lld", &words); 44 | fscanf(f, "%lld", &size); 45 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 46 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 47 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 48 | if (M == NULL) { 49 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 50 | return -1; 51 | } 52 | for (b = 0; b < words; b++) { 53 | a = 0; 54 | while (1) { 55 | vocab[b * max_w + a] = fgetc(f); 56 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 57 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 58 | } 59 | vocab[b * max_w + a] = 0; 60 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 61 | len = 0; 62 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 63 | len = sqrt(len); 64 | for (a = 0; a < size; a++) M[a + b * size] /= len; 65 | } 66 | fclose(f); 67 | while (1) { 68 | for (a = 0; a < N; a++) bestd[a] = 0; 69 | for (a = 0; a < N; a++) bestw[a][0] = 0; 70 | printf("Enter word or sentence (EXIT or CTRL-d to break): "); 71 | a = 0; 72 | while (1) { 73 | st1[a] = fgetc(stdin); 74 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 75 | st1[a] = 0; 76 | break; 77 | } 78 | a++; 79 | } 80 | if ((!strcmp(st1, "EXIT")) || st1[0] == -1) { 81 | printf("\n"); 82 | break; 83 | } 84 | cn = 0; 85 | b = 0; 86 | c = 0; 87 | while (1) { 88 | st[cn][b] = st1[c]; 89 | b++; 90 | c++; 91 | st[cn][b] = 0; 92 | if (st1[c] == 0) break; 93 | if (st1[c] == ' ') { 94 | cn++; 95 | b = 0; 96 | c++; 97 | } 98 | } 99 | cn++; 100 | for (a = 0; a < cn; a++) { 101 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 102 | if (b == words) b = -1; 103 | bi[a] = b; 104 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 105 | if (b == -1) { 106 | printf("Out of dictionary word!\n"); 107 | break; 108 | } 109 | } 110 | if (b == -1) continue; 111 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 112 | for (a = 0; a < size; a++) vec[a] = 0; 113 | for (b = 0; b < cn; b++) { 114 | if (bi[b] == -1) continue; 115 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 116 | } 117 | len = 0; 118 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 119 | len = sqrt(len); 120 | for (a = 0; a < size; a++) vec[a] /= len; 121 | for (a = 0; a < N; a++) bestd[a] = -1; 122 | for (a = 0; a < N; a++) bestw[a][0] = 0; 123 | for (c = 0; c < words; c++) { 124 | a = 0; 125 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 126 | if (a == 1) continue; 127 | dist = 0; 128 | for (a = 0; a <
size; a++) dist += vec[a] * M[a + c * size]; 129 | for (a = 0; a < N; a++) { 130 | if (dist > bestd[a]) { 131 | for (d = N - 1; d > a; d--) { 132 | bestd[d] = bestd[d - 1]; 133 | strcpy(bestw[d], bestw[d - 1]); 134 | } 135 | bestd[a] = dist; 136 | strcpy(bestw[a], &vocab[c * max_w]); 137 | break; 138 | } 139 | } 140 | } 141 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 142 | } 143 | return 0; 144 | } 145 | -------------------------------------------------------------------------------- /src/ext/word2vec/makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result 4 | 5 | all: distance word-analogy 6 | 7 | distance : distance.c 8 | $(CC) distance.c -o distance $(CFLAGS) 9 | word-analogy : word-analogy.c 10 | $(CC) word-analogy.c -o word-analogy $(CFLAGS) 11 | 12 | clean: 13 | rm -rf distance word-analogy 14 | -------------------------------------------------------------------------------- /src/ext/word2vec/word-analogy.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
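// Editorial note (not part of the upstream file): usage parallels distance.c,
// except that three words are read per query ("vectors.bin" is again a
// hypothetical file name):
//   $ make word-analogy && ./word-analogy vectors.bin
//   Enter three words (EXIT or CTRL-d to break): paris france berlin
// The code below then ranks words by cosine similarity to the offset vector
// vec = M[bi[1]] - M[bi[0]] + M[bi[2]], ie. france - paris + berlin.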
14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char bestw[N][max_size]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | float *M; 32 | char *vocab; 33 | if (argc < 2) { 34 | printf("Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n"); 35 | return 0; 36 | } 37 | strcpy(file_name, argv[1]); 38 | f = fopen(file_name, "rb"); 39 | if (f == NULL) { 40 | printf("Input file not found\n"); 41 | return -1; 42 | } 43 | fscanf(f, "%lld", &words); 44 | fscanf(f, "%lld", &size); 45 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 46 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 47 | if (M == NULL) { 48 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 49 | return -1; 50 | } 51 | for (b = 0; b < words; b++) { 52 | a = 0; 53 | while (1) { 54 | vocab[b * max_w + a] = fgetc(f); 55 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 56 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 57 | } 58 | vocab[b * max_w + a] = 0; 59 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 60 | len = 0; 61 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 62 | len = sqrt(len); 63 | for (a = 0; a < size; a++) M[a + b * size] /= len; 64 | } 65 | fclose(f); 66 | while (1) { 67 | for (a = 0; a < N; a++) bestd[a] = 0; 68 | for (a = 0; a < N; a++) bestw[a][0] = 0; 69 | printf("Enter three words (EXIT or CTRL-d to break): "); 70 | a = 0; 71 | while (1) { 72 | st1[a] = fgetc(stdin); 73 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 74 | st1[a] = 0; 75 | break; 76 | } 77 | a++; 78 | } 79 | if ((!strcmp(st1, "EXIT")) || st1[0] == -1) { 80 | printf("\n"); 81 | break; 82 | } 83 | cn = 0; 84 | b = 0; 85 | c = 0; 86 | while (1) { 87 | st[cn][b] = st1[c]; 88 | b++; 89 | c++; 90 | st[cn][b] = 0; 91 | if (st1[c] == 0) break; 92 | if (st1[c] == ' ') { 93 | cn++; 94 | b = 0; 95 | c++; 96 | } 97 | } 98 | cn++; 99 | if (cn < 3) { 100 | printf("Only %lld words were entered..
three words are needed at the input to perform the calculation\n", cn); 101 | continue; 102 | } 103 | for (a = 0; a < cn; a++) { 104 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 105 | if (b == words) b = 0; 106 | bi[a] = b; 107 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 108 | if (b == 0) { 109 | printf("Out of dictionary word!\n"); 110 | break; 111 | } 112 | } 113 | if (b == 0) continue; 114 | printf("\n Word Distance\n------------------------------------------------------------------------\n"); 115 | for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size]; 116 | len = 0; 117 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 118 | len = sqrt(len); 119 | for (a = 0; a < size; a++) vec[a] /= len; 120 | for (a = 0; a < N; a++) bestd[a] = 0; 121 | for (a = 0; a < N; a++) bestw[a][0] = 0; 122 | for (c = 0; c < words; c++) { 123 | if (c == bi[0]) continue; 124 | if (c == bi[1]) continue; 125 | if (c == bi[2]) continue; 126 | a = 0; 127 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 128 | if (a == 1) continue; 129 | dist = 0; 130 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 131 | for (a = 0; a < N; a++) { 132 | if (dist > bestd[a]) { 133 | for (d = N - 1; d > a; d--) { 134 | bestd[d] = bestd[d - 1]; 135 | strcpy(bestw[d], bestw[d - 1]); 136 | } 137 | bestd[a] = dist; 138 | strcpy(bestw[a], &vocab[c * max_w]); 139 | break; 140 | } 141 | } 142 | } 143 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 144 | } 145 | return 0; 146 | } 147 | -------------------------------------------------------------------------------- /visualization/d3/basque_cluster_thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonsafari/clustercat/e6f618a5f70fe6de5f7c620ccaec22364f954aef/visualization/d3/basque_cluster_thumbnail.png -------------------------------------------------------------------------------- /visualization/d3/french_cluster_thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonsafari/clustercat/e6f618a5f70fe6de5f7c620ccaec22364f954aef/visualization/d3/french_cluster_thumbnail.png -------------------------------------------------------------------------------- /visualization/d3/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 38 | 39 |

Word Clusters 40 | Made using ClusterCat 41 | Click to zoom in/out 42 | 43 | 125 | 126 | Uses D3 127 | Download json data 128 | 129 | -------------------------------------------------------------------------------- /visualization/d3/russian_cluster_thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonsafari/clustercat/e6f618a5f70fe6de5f7c620ccaec22364f954aef/visualization/d3/russian_cluster_thumbnail.png --------------------------------------------------------------------------------
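The array_offset() helper in src/clustercat.h above reads a class n-gram as a little-endian mixed-radix number: position i contributes pointer[i] * num_classes^i. A self-contained sketch of the same arithmetic (hypothetical values, not part of the sources above):

#include <stdio.h>
#include <stddef.h>

int main(void) {
	const size_t num_classes = 256;             // |C|; hypothetical value
	const unsigned short ngram[3] = {3, 7, 11}; // class trigram, least-significant position first
	size_t offset = 0, radix = 1;
	for (int i = 0; i < 3; i++, radix *= num_classes)
		offset += ngram[i] * radix;             // mirrors total_offset += pointer[ptr_i] * powi(num_classes, ptr_i)
	printf("offset = %zu\n", offset);           // 3 + 7*256 + 11*256*256 = 722691
	return 0;
}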