├── .gitignore
├── .travis.yml
├── LICENSE.txt
├── Makefile
├── README.md
├── bin
│   ├── digit_conflate.pl
│   ├── flat_clusters2json.pl
│   ├── hier2flat_no_freqs.sh
│   ├── hier2flat_with_freqs.sh
│   ├── lowercase.pl
│   ├── mkcls
│   ├── mkcls4brown
│   ├── mkcls4word2vec
│   └── ngram_counts.py
├── python
│   ├── README.md
│   └── clustercat.py
├── src
│   ├── clustercat-array.c
│   ├── clustercat-array.h
│   ├── clustercat-cluster.c
│   ├── clustercat-cluster.h
│   ├── clustercat-data.h
│   ├── clustercat-dbg.c
│   ├── clustercat-dbg.h
│   ├── clustercat-import-class-file.c
│   ├── clustercat-import-class-file.h
│   ├── clustercat-io.c
│   ├── clustercat-io.h
│   ├── clustercat-map.c
│   ├── clustercat-map.h
│   ├── clustercat-math.c
│   ├── clustercat-math.h
│   ├── clustercat-tokenize.c
│   ├── clustercat-tokenize.h
│   ├── clustercat.c
│   ├── clustercat.h
│   └── ext
│       ├── uthash
│       │   ├── LICENSE
│       │   ├── README.md
│       │   └── src
│       │       └── uthash.h
│       └── word2vec
│           ├── LICENSE
│           ├── README.txt
│           ├── distance.c
│           ├── makefile
│           └── word-analogy.c
└── visualization
    └── d3
        ├── basque_cluster_thumbnail.png
        ├── french_cluster_thumbnail.png
        ├── index.html
        └── russian_cluster_thumbnail.png
/.gitignore:
--------------------------------------------------------------------------------
1 | bin/clustercat
2 | src/ext/word2vec/distance
3 | src/ext/word2vec/word-analogy
4 | *.[oa~]
5 | .*.sw[op]
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: c
2 | cache: brew
3 | compiler:
4 | - clang
5 | - gcc
6 | os:
7 | - linux
8 | #- osx
9 | #before_install:
10 | # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
11 | # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew tap homebrew/versions; fi
12 | # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
13 | # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install llvm38; fi
14 | script:
15 | #- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then make CC=clang-omp; fi
16 | #- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then make CC=/usr/local/bin/clang-3.8 CFLAGS="$CFLAGS -I/usr/local/opt/llvm38/lib/llvm-3.8/include/" LDFLAGS="$LDFLAGS -L/usr/local/opt/llvm38/lib/llvm-3.8/lib" ; fi
17 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make; fi
18 | notifications:
19 | email: false
20 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | This software is licensed under either the GNU LGPL version 3 or the Mozilla
2 | Public License version 2.0 . Both licenses are listed below.
3 |
4 |
5 |
6 |
7 |
8 |
9 | GNU LESSER GENERAL PUBLIC LICENSE
10 | Version 3, 29 June 2007
11 |
12 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
13 | Everyone is permitted to copy and distribute verbatim copies
14 | of this license document, but changing it is not allowed.
15 |
16 |
17 | This version of the GNU Lesser General Public License incorporates
18 | the terms and conditions of version 3 of the GNU General Public
19 | License, supplemented by the additional permissions listed below.
20 |
21 | 0. Additional Definitions.
22 |
23 | As used herein, "this License" refers to version 3 of the GNU Lesser
24 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
25 | General Public License.
26 |
27 | "The Library" refers to a covered work governed by this License,
28 | other than an Application or a Combined Work as defined below.
29 |
30 | An "Application" is any work that makes use of an interface provided
31 | by the Library, but which is not otherwise based on the Library.
32 | Defining a subclass of a class defined by the Library is deemed a mode
33 | of using an interface provided by the Library.
34 |
35 | A "Combined Work" is a work produced by combining or linking an
36 | Application with the Library. The particular version of the Library
37 | with which the Combined Work was made is also called the "Linked
38 | Version".
39 |
40 | The "Minimal Corresponding Source" for a Combined Work means the
41 | Corresponding Source for the Combined Work, excluding any source code
42 | for portions of the Combined Work that, considered in isolation, are
43 | based on the Application, and not on the Linked Version.
44 |
45 | The "Corresponding Application Code" for a Combined Work means the
46 | object code and/or source code for the Application, including any data
47 | and utility programs needed for reproducing the Combined Work from the
48 | Application, but excluding the System Libraries of the Combined Work.
49 |
50 | 1. Exception to Section 3 of the GNU GPL.
51 |
52 | You may convey a covered work under sections 3 and 4 of this License
53 | without being bound by section 3 of the GNU GPL.
54 |
55 | 2. Conveying Modified Versions.
56 |
57 | If you modify a copy of the Library, and, in your modifications, a
58 | facility refers to a function or data to be supplied by an Application
59 | that uses the facility (other than as an argument passed when the
60 | facility is invoked), then you may convey a copy of the modified
61 | version:
62 |
63 | a) under this License, provided that you make a good faith effort to
64 | ensure that, in the event an Application does not supply the
65 | function or data, the facility still operates, and performs
66 | whatever part of its purpose remains meaningful, or
67 |
68 | b) under the GNU GPL, with none of the additional permissions of
69 | this License applicable to that copy.
70 |
71 | 3. Object Code Incorporating Material from Library Header Files.
72 |
73 | The object code form of an Application may incorporate material from
74 | a header file that is part of the Library. You may convey such object
75 | code under terms of your choice, provided that, if the incorporated
76 | material is not limited to numerical parameters, data structure
77 | layouts and accessors, or small macros, inline functions and templates
78 | (ten or fewer lines in length), you do both of the following:
79 |
80 | a) Give prominent notice with each copy of the object code that the
81 | Library is used in it and that the Library and its use are
82 | covered by this License.
83 |
84 | b) Accompany the object code with a copy of the GNU GPL and this license
85 | document.
86 |
87 | 4. Combined Works.
88 |
89 | You may convey a Combined Work under terms of your choice that,
90 | taken together, effectively do not restrict modification of the
91 | portions of the Library contained in the Combined Work and reverse
92 | engineering for debugging such modifications, if you also do each of
93 | the following:
94 |
95 | a) Give prominent notice with each copy of the Combined Work that
96 | the Library is used in it and that the Library and its use are
97 | covered by this License.
98 |
99 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
100 | document.
101 |
102 | c) For a Combined Work that displays copyright notices during
103 | execution, include the copyright notice for the Library among
104 | these notices, as well as a reference directing the user to the
105 | copies of the GNU GPL and this license document.
106 |
107 | d) Do one of the following:
108 |
109 | 0) Convey the Minimal Corresponding Source under the terms of this
110 | License, and the Corresponding Application Code in a form
111 | suitable for, and under terms that permit, the user to
112 | recombine or relink the Application with a modified version of
113 | the Linked Version to produce a modified Combined Work, in the
114 | manner specified by section 6 of the GNU GPL for conveying
115 | Corresponding Source.
116 |
117 | 1) Use a suitable shared library mechanism for linking with the
118 | Library. A suitable mechanism is one that (a) uses at run time
119 | a copy of the Library already present on the user's computer
120 | system, and (b) will operate properly with a modified version
121 | of the Library that is interface-compatible with the Linked
122 | Version.
123 |
124 | e) Provide Installation Information, but only if you would otherwise
125 | be required to provide such information under section 6 of the
126 | GNU GPL, and only to the extent that such information is
127 | necessary to install and execute a modified version of the
128 | Combined Work produced by recombining or relinking the
129 | Application with a modified version of the Linked Version. (If
130 | you use option 4d0, the Installation Information must accompany
131 | the Minimal Corresponding Source and Corresponding Application
132 | Code. If you use option 4d1, you must provide the Installation
133 | Information in the manner specified by section 6 of the GNU GPL
134 | for conveying Corresponding Source.)
135 |
136 | 5. Combined Libraries.
137 |
138 | You may place library facilities that are a work based on the
139 | Library side by side in a single library together with other library
140 | facilities that are not Applications and are not covered by this
141 | License, and convey such a combined library under terms of your
142 | choice, if you do both of the following:
143 |
144 | a) Accompany the combined library with a copy of the same work based
145 | on the Library, uncombined with any other library facilities,
146 | conveyed under the terms of this License.
147 |
148 | b) Give prominent notice with the combined library that part of it
149 | is a work based on the Library, and explaining where to find the
150 | accompanying uncombined form of the same work.
151 |
152 | 6. Revised Versions of the GNU Lesser General Public License.
153 |
154 | The Free Software Foundation may publish revised and/or new versions
155 | of the GNU Lesser General Public License from time to time. Such new
156 | versions will be similar in spirit to the present version, but may
157 | differ in detail to address new problems or concerns.
158 |
159 | Each version is given a distinguishing version number. If the
160 | Library as you received it specifies that a certain numbered version
161 | of the GNU Lesser General Public License "or any later version"
162 | applies to it, you have the option of following the terms and
163 | conditions either of that published version or of any later version
164 | published by the Free Software Foundation. If the Library as you
165 | received it does not specify a version number of the GNU Lesser
166 | General Public License, you may choose any version of the GNU Lesser
167 | General Public License ever published by the Free Software Foundation.
168 |
169 | If the Library as you received it specifies that a proxy can decide
170 | whether future versions of the GNU Lesser General Public License shall
171 | apply, that proxy's public statement of acceptance of any version is
172 | permanent authorization for you to choose that version for the
173 | Library.
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 | Mozilla Public License Version 2.0
203 | ==================================
204 |
205 | 1. Definitions
206 | --------------
207 |
208 | 1.1. "Contributor"
209 | means each individual or legal entity that creates, contributes to
210 | the creation of, or owns Covered Software.
211 |
212 | 1.2. "Contributor Version"
213 | means the combination of the Contributions of others (if any) used
214 | by a Contributor and that particular Contributor's Contribution.
215 |
216 | 1.3. "Contribution"
217 | means Covered Software of a particular Contributor.
218 |
219 | 1.4. "Covered Software"
220 | means Source Code Form to which the initial Contributor has attached
221 | the notice in Exhibit A, the Executable Form of such Source Code
222 | Form, and Modifications of such Source Code Form, in each case
223 | including portions thereof.
224 |
225 | 1.5. "Incompatible With Secondary Licenses"
226 | means
227 |
228 | (a) that the initial Contributor has attached the notice described
229 | in Exhibit B to the Covered Software; or
230 |
231 | (b) that the Covered Software was made available under the terms of
232 | version 1.1 or earlier of the License, but not also under the
233 | terms of a Secondary License.
234 |
235 | 1.6. "Executable Form"
236 | means any form of the work other than Source Code Form.
237 |
238 | 1.7. "Larger Work"
239 | means a work that combines Covered Software with other material, in
240 | a separate file or files, that is not Covered Software.
241 |
242 | 1.8. "License"
243 | means this document.
244 |
245 | 1.9. "Licensable"
246 | means having the right to grant, to the maximum extent possible,
247 | whether at the time of the initial grant or subsequently, any and
248 | all of the rights conveyed by this License.
249 |
250 | 1.10. "Modifications"
251 | means any of the following:
252 |
253 | (a) any file in Source Code Form that results from an addition to,
254 | deletion from, or modification of the contents of Covered
255 | Software; or
256 |
257 | (b) any new file in Source Code Form that contains any Covered
258 | Software.
259 |
260 | 1.11. "Patent Claims" of a Contributor
261 | means any patent claim(s), including without limitation, method,
262 | process, and apparatus claims, in any patent Licensable by such
263 | Contributor that would be infringed, but for the grant of the
264 | License, by the making, using, selling, offering for sale, having
265 | made, import, or transfer of either its Contributions or its
266 | Contributor Version.
267 |
268 | 1.12. "Secondary License"
269 | means either the GNU General Public License, Version 2.0, the GNU
270 | Lesser General Public License, Version 2.1, the GNU Affero General
271 | Public License, Version 3.0, or any later versions of those
272 | licenses.
273 |
274 | 1.13. "Source Code Form"
275 | means the form of the work preferred for making modifications.
276 |
277 | 1.14. "You" (or "Your")
278 | means an individual or a legal entity exercising rights under this
279 | License. For legal entities, "You" includes any entity that
280 | controls, is controlled by, or is under common control with You. For
281 | purposes of this definition, "control" means (a) the power, direct
282 | or indirect, to cause the direction or management of such entity,
283 | whether by contract or otherwise, or (b) ownership of more than
284 | fifty percent (50%) of the outstanding shares or beneficial
285 | ownership of such entity.
286 |
287 | 2. License Grants and Conditions
288 | --------------------------------
289 |
290 | 2.1. Grants
291 |
292 | Each Contributor hereby grants You a world-wide, royalty-free,
293 | non-exclusive license:
294 |
295 | (a) under intellectual property rights (other than patent or trademark)
296 | Licensable by such Contributor to use, reproduce, make available,
297 | modify, display, perform, distribute, and otherwise exploit its
298 | Contributions, either on an unmodified basis, with Modifications, or
299 | as part of a Larger Work; and
300 |
301 | (b) under Patent Claims of such Contributor to make, use, sell, offer
302 | for sale, have made, import, and otherwise transfer either its
303 | Contributions or its Contributor Version.
304 |
305 | 2.2. Effective Date
306 |
307 | The licenses granted in Section 2.1 with respect to any Contribution
308 | become effective for each Contribution on the date the Contributor first
309 | distributes such Contribution.
310 |
311 | 2.3. Limitations on Grant Scope
312 |
313 | The licenses granted in this Section 2 are the only rights granted under
314 | this License. No additional rights or licenses will be implied from the
315 | distribution or licensing of Covered Software under this License.
316 | Notwithstanding Section 2.1(b) above, no patent license is granted by a
317 | Contributor:
318 |
319 | (a) for any code that a Contributor has removed from Covered Software;
320 | or
321 |
322 | (b) for infringements caused by: (i) Your and any other third party's
323 | modifications of Covered Software, or (ii) the combination of its
324 | Contributions with other software (except as part of its Contributor
325 | Version); or
326 |
327 | (c) under Patent Claims infringed by Covered Software in the absence of
328 | its Contributions.
329 |
330 | This License does not grant any rights in the trademarks, service marks,
331 | or logos of any Contributor (except as may be necessary to comply with
332 | the notice requirements in Section 3.4).
333 |
334 | 2.4. Subsequent Licenses
335 |
336 | No Contributor makes additional grants as a result of Your choice to
337 | distribute the Covered Software under a subsequent version of this
338 | License (see Section 10.2) or under the terms of a Secondary License (if
339 | permitted under the terms of Section 3.3).
340 |
341 | 2.5. Representation
342 |
343 | Each Contributor represents that the Contributor believes its
344 | Contributions are its original creation(s) or it has sufficient rights
345 | to grant the rights to its Contributions conveyed by this License.
346 |
347 | 2.6. Fair Use
348 |
349 | This License is not intended to limit any rights You have under
350 | applicable copyright doctrines of fair use, fair dealing, or other
351 | equivalents.
352 |
353 | 2.7. Conditions
354 |
355 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
356 | in Section 2.1.
357 |
358 | 3. Responsibilities
359 | -------------------
360 |
361 | 3.1. Distribution of Source Form
362 |
363 | All distribution of Covered Software in Source Code Form, including any
364 | Modifications that You create or to which You contribute, must be under
365 | the terms of this License. You must inform recipients that the Source
366 | Code Form of the Covered Software is governed by the terms of this
367 | License, and how they can obtain a copy of this License. You may not
368 | attempt to alter or restrict the recipients' rights in the Source Code
369 | Form.
370 |
371 | 3.2. Distribution of Executable Form
372 |
373 | If You distribute Covered Software in Executable Form then:
374 |
375 | (a) such Covered Software must also be made available in Source Code
376 | Form, as described in Section 3.1, and You must inform recipients of
377 | the Executable Form how they can obtain a copy of such Source Code
378 | Form by reasonable means in a timely manner, at a charge no more
379 | than the cost of distribution to the recipient; and
380 |
381 | (b) You may distribute such Executable Form under the terms of this
382 | License, or sublicense it under different terms, provided that the
383 | license for the Executable Form does not attempt to limit or alter
384 | the recipients' rights in the Source Code Form under this License.
385 |
386 | 3.3. Distribution of a Larger Work
387 |
388 | You may create and distribute a Larger Work under terms of Your choice,
389 | provided that You also comply with the requirements of this License for
390 | the Covered Software. If the Larger Work is a combination of Covered
391 | Software with a work governed by one or more Secondary Licenses, and the
392 | Covered Software is not Incompatible With Secondary Licenses, this
393 | License permits You to additionally distribute such Covered Software
394 | under the terms of such Secondary License(s), so that the recipient of
395 | the Larger Work may, at their option, further distribute the Covered
396 | Software under the terms of either this License or such Secondary
397 | License(s).
398 |
399 | 3.4. Notices
400 |
401 | You may not remove or alter the substance of any license notices
402 | (including copyright notices, patent notices, disclaimers of warranty,
403 | or limitations of liability) contained within the Source Code Form of
404 | the Covered Software, except that You may alter any license notices to
405 | the extent required to remedy known factual inaccuracies.
406 |
407 | 3.5. Application of Additional Terms
408 |
409 | You may choose to offer, and to charge a fee for, warranty, support,
410 | indemnity or liability obligations to one or more recipients of Covered
411 | Software. However, You may do so only on Your own behalf, and not on
412 | behalf of any Contributor. You must make it absolutely clear that any
413 | such warranty, support, indemnity, or liability obligation is offered by
414 | You alone, and You hereby agree to indemnify every Contributor for any
415 | liability incurred by such Contributor as a result of warranty, support,
416 | indemnity or liability terms You offer. You may include additional
417 | disclaimers of warranty and limitations of liability specific to any
418 | jurisdiction.
419 |
420 | 4. Inability to Comply Due to Statute or Regulation
421 | ---------------------------------------------------
422 |
423 | If it is impossible for You to comply with any of the terms of this
424 | License with respect to some or all of the Covered Software due to
425 | statute, judicial order, or regulation then You must: (a) comply with
426 | the terms of this License to the maximum extent possible; and (b)
427 | describe the limitations and the code they affect. Such description must
428 | be placed in a text file included with all distributions of the Covered
429 | Software under this License. Except to the extent prohibited by statute
430 | or regulation, such description must be sufficiently detailed for a
431 | recipient of ordinary skill to be able to understand it.
432 |
433 | 5. Termination
434 | --------------
435 |
436 | 5.1. The rights granted under this License will terminate automatically
437 | if You fail to comply with any of its terms. However, if You become
438 | compliant, then the rights granted under this License from a particular
439 | Contributor are reinstated (a) provisionally, unless and until such
440 | Contributor explicitly and finally terminates Your grants, and (b) on an
441 | ongoing basis, if such Contributor fails to notify You of the
442 | non-compliance by some reasonable means prior to 60 days after You have
443 | come back into compliance. Moreover, Your grants from a particular
444 | Contributor are reinstated on an ongoing basis if such Contributor
445 | notifies You of the non-compliance by some reasonable means, this is the
446 | first time You have received notice of non-compliance with this License
447 | from such Contributor, and You become compliant prior to 30 days after
448 | Your receipt of the notice.
449 |
450 | 5.2. If You initiate litigation against any entity by asserting a patent
451 | infringement claim (excluding declaratory judgment actions,
452 | counter-claims, and cross-claims) alleging that a Contributor Version
453 | directly or indirectly infringes any patent, then the rights granted to
454 | You by any and all Contributors for the Covered Software under Section
455 | 2.1 of this License shall terminate.
456 |
457 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all
458 | end user license agreements (excluding distributors and resellers) which
459 | have been validly granted by You or Your distributors under this License
460 | prior to termination shall survive termination.
461 |
462 | ************************************************************************
463 | * *
464 | * 6. Disclaimer of Warranty *
465 | * ------------------------- *
466 | * *
467 | * Covered Software is provided under this License on an "as is" *
468 | * basis, without warranty of any kind, either expressed, implied, or *
469 | * statutory, including, without limitation, warranties that the *
470 | * Covered Software is free of defects, merchantable, fit for a *
471 | * particular purpose or non-infringing. The entire risk as to the *
472 | * quality and performance of the Covered Software is with You. *
473 | * Should any Covered Software prove defective in any respect, You *
474 | * (not any Contributor) assume the cost of any necessary servicing, *
475 | * repair, or correction. This disclaimer of warranty constitutes an *
476 | * essential part of this License. No use of any Covered Software is *
477 | * authorized under this License except under this disclaimer. *
478 | * *
479 | ************************************************************************
480 |
481 | ************************************************************************
482 | * *
483 | * 7. Limitation of Liability *
484 | * -------------------------- *
485 | * *
486 | * Under no circumstances and under no legal theory, whether tort *
487 | * (including negligence), contract, or otherwise, shall any *
488 | * Contributor, or anyone who distributes Covered Software as *
489 | * permitted above, be liable to You for any direct, indirect, *
490 | * special, incidental, or consequential damages of any character *
491 | * including, without limitation, damages for lost profits, loss of *
492 | * goodwill, work stoppage, computer failure or malfunction, or any *
493 | * and all other commercial damages or losses, even if such party *
494 | * shall have been informed of the possibility of such damages. This *
495 | * limitation of liability shall not apply to liability for death or *
496 | * personal injury resulting from such party's negligence to the *
497 | * extent applicable law prohibits such limitation. Some *
498 | * jurisdictions do not allow the exclusion or limitation of *
499 | * incidental or consequential damages, so this exclusion and *
500 | * limitation may not apply to You. *
501 | * *
502 | ************************************************************************
503 |
504 | 8. Litigation
505 | -------------
506 |
507 | Any litigation relating to this License may be brought only in the
508 | courts of a jurisdiction where the defendant maintains its principal
509 | place of business and such litigation shall be governed by laws of that
510 | jurisdiction, without reference to its conflict-of-law provisions.
511 | Nothing in this Section shall prevent a party's ability to bring
512 | cross-claims or counter-claims.
513 |
514 | 9. Miscellaneous
515 | ----------------
516 |
517 | This License represents the complete agreement concerning the subject
518 | matter hereof. If any provision of this License is held to be
519 | unenforceable, such provision shall be reformed only to the extent
520 | necessary to make it enforceable. Any law or regulation which provides
521 | that the language of a contract shall be construed against the drafter
522 | shall not be used to construe this License against a Contributor.
523 |
524 | 10. Versions of the License
525 | ---------------------------
526 |
527 | 10.1. New Versions
528 |
529 | Mozilla Foundation is the license steward. Except as provided in Section
530 | 10.3, no one other than the license steward has the right to modify or
531 | publish new versions of this License. Each version will be given a
532 | distinguishing version number.
533 |
534 | 10.2. Effect of New Versions
535 |
536 | You may distribute the Covered Software under the terms of the version
537 | of the License under which You originally received the Covered Software,
538 | or under the terms of any subsequent version published by the license
539 | steward.
540 |
541 | 10.3. Modified Versions
542 |
543 | If you create software not governed by this License, and you want to
544 | create a new license for such software, you may create and use a
545 | modified version of this License if you rename the license and remove
546 | any references to the name of the license steward (except to note that
547 | such modified license differs from this License).
548 |
549 | 10.4. Distributing Source Code Form that is Incompatible With Secondary
550 | Licenses
551 |
552 | If You choose to distribute Source Code Form that is Incompatible With
553 | Secondary Licenses under the terms of this version of the License, the
554 | notice described in Exhibit B of this License must be attached.
555 |
556 | Exhibit A - Source Code Form License Notice
557 | -------------------------------------------
558 |
559 | This Source Code Form is subject to the terms of the Mozilla Public
560 | License, v. 2.0. If a copy of the MPL was not distributed with this
561 | file, You can obtain one at http://mozilla.org/MPL/2.0/.
562 |
563 | If it is not possible or desirable to put the notice in a particular
564 | file, then You may include the notice in a location (such as a LICENSE
565 | file in a relevant directory) where a recipient would be likely to look
566 | for such a notice.
567 |
568 | You may add additional accurate notices of copyright ownership.
569 |
570 | Exhibit B - "Incompatible With Secondary Licenses" Notice
571 | ---------------------------------------------------------
572 |
573 | This Source Code Form is "Incompatible With Secondary Licenses", as
574 | defined by the Mozilla Public License, v. 2.0.
575 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env make
2 |
3 | CC=cc
4 | INCLUDE=-I ./src/ext/uthash/src/
5 | ## * For -march info on your platform, type: gcc -march=native -Q --help=target (or just compile with -march=native )
6 | ## * We include the argument -Wno-unknown-pragmas to suppress clang's lack of support for openmp
7 | ## Since we use the gnuism 'override', you don't need to modify this makefile; you can just run: make -j4 CFLAGS=-DATA_STORE_TRIE_LCRS
8 | override CFLAGS += -march=native -std=c99 -O3 -fopenmp -finline-functions -fno-math-errno -fstrict-aliasing -DHASH_FUNCTION=HASH_SAX -DHASH_BLOOM=25 -Wall -Wextra -Winline -Wstrict-aliasing -Wno-unknown-pragmas -Wno-comment -Wno-missing-field-initializers ${INCLUDE}
9 | LDLIBS=-lm -fopenmp #-ltcmalloc_minimal
10 | BIN=bin/
11 | SRC=src/
12 | OBJS=${SRC}/clustercat-array.o ${SRC}/clustercat-cluster.o ${SRC}/clustercat-dbg.o ${SRC}/clustercat-io.o ${SRC}/clustercat-import-class-file.o ${SRC}/clustercat-map.o ${SRC}/clustercat-math.o ${SRC}/clustercat-tokenize.o
13 | includes=${SRC}/$(wildcard *.h)
14 | date:=$(shell date +%F)
15 | machine_type:=$(shell uname -m)
16 |
17 | all: ${BIN}/clustercat
18 | .PHONY : all install tar clean
19 |
20 | clustercat.h: ${SRC}/clustercat-array.h ${SRC}/clustercat-data.h ${SRC}/clustercat-map.h
21 |
22 |
23 | ${BIN}/clustercat: ${SRC}/clustercat.c ${OBJS}
24 | ${CC} -Wl,-s $^ -o $@ ${CFLAGS} ${LDLIBS}
25 |
26 | clustercat.c: ${SRC}/clustercat.h ${SRC}/clustercat-cluster.h ${SRC}/clustercat-dbg.h ${SRC}/clustercat-io.h ${SRC}/clustercat-import-class-file.h ${SRC}/clustercat-math.h ${SRC}/clustercat-tokenize.h
27 |
28 | install: ${BIN}/clustercat
29 | cp -p ${BIN}/clustercat /usr/bin/ 2>/dev/null || \
30 | mkdir --parents ${HOME}/bin/ && \
31 | cp -p ${BIN}/clustercat ${HOME}/bin/
32 |
33 | tar: ${BIN}/clustercat
34 | mkdir clustercat-${date} && \
35 | mkdir clustercat-${date}/bin && \
36 | mkdir clustercat-${date}/src && \
37 | mkdir --parents clustercat-${date}/src/ext/uthash/src && \
38 | cp -a ${BIN}/clustercat clustercat-${date}/bin/ && \
39 | cp -a ${BIN}/clustercat clustercat-${date}/bin/clustercat.${machine_type} && \
40 | cp -a ${SRC}/*.c ${SRC}/*.h clustercat-${date}/src/ && \
41 | 	cp -a Makefile README.md LICENSE.txt clustercat-${date}/ && \
42 | cp -a ${SRC}/ext/uthash/src/uthash.h clustercat-${date}/src/ext/uthash/src/ && \
43 | tar -cf clustercat-${date}.tar clustercat-${date}/ && \
44 | gzip -9 clustercat-${date}.tar && \
45 | rm -rf clustercat-${date}/
46 |
47 | clean:
48 | \rm -f ${BIN}/clustercat ${SRC}/*.o
49 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ClusterCat: Fast, Flexible Word Clustering Software
2 |
3 | [Build Status](https://travis-ci.org/jonsafari/clustercat)
4 | [License: LGPL v3](http://www.gnu.org/licenses/lgpl-3.0)
5 | [License: MPL v2](https://opensource.org/licenses/MPL-2.0)
6 |
7 |
8 | ## Overview
9 |
10 | ClusterCat induces word classes from unannotated text.
11 | It is programmed in modern C, with no external libraries.
12 | A Python wrapper is also provided.
13 |
14 | Word classes are unsupervised part-of-speech tags, requiring no manually-annotated corpus.
15 | Words that share syntactic and semantic similarities are grouped together.
16 | Word classes are used in dozens of applications across natural language processing, machine translation, neural-net training, and related fields.
17 |
18 |
19 | ## Installation
20 | ### Linux
21 | You can use either GCC 4.6+ or Clang 3.7+, but GCC is usually faster.
22 |
23 | sudo apt update && sudo apt install gcc make libomp-dev
24 | make -j
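
After the build, the `clustercat` binary lands in `bin/`; a quick smoke test:

    bin/clustercat --help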
25 |
26 | ### macOS / OSX
27 | The current version of Clang in Xcode doesn't fully support [OpenMP][], so instead install GCC from [Homebrew][]:
28 |
29 | brew update && brew install gcc@9 libomp && xcode-select --install
30 | make -j CC=/opt/homebrew/bin/gcc-9
31 |
32 |
33 | ## Commands
34 | The binary program `clustercat` gets compiled into the `bin` directory.
35 |
36 | **Clustering** preprocessed text (already tokenized, normalized, etc.) is pretty simple:
37 |
38 | bin/clustercat [options] < train.tok.txt > clusters.tsv
39 |
40 | The word classes are induced with a bidirectional [predictive][] [exchange algorithm][].
41 | Each line of the output class file consists of `word`*TAB*`class` -- a word type, then a tab, then its class ID.
42 |
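For example, a few lines of (hypothetical) output might look like this:

    the	0
    a	0
    walk	27
    run	27
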
43 | Command-line argument **usage** may be obtained by running the program with the **`--help`** flag:
44 |
45 | bin/clustercat --help
46 |
47 |
48 | ## Python
49 | Installation and usage details for the Python module are described in a separate [readme](python/README.md).
50 |
51 |
52 | ## Features
53 | - Print **[word vectors][]** (a.k.a. word embeddings) using the `--word-vectors` flag. The binary format is compatible with word2vec's tools.
54 | - Start training using an **existing word cluster mapping** from other clustering software (e.g. mkcls) using the `--class-file` flag.
55 | - Adjust the number of **threads** to use with the `--threads` flag. The default is 8.
56 | - Adjust the **number of clusters** or vector dimensions using the `--classes` flag. The default is approximately the square root of the vocabulary size.
57 | - Includes a **compatibility wrapper script `bin/mkcls`** that can be run just like mkcls -- see the example after this list. You can use more classes now :-)
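
For instance, a typical mkcls invocation carries over unchanged -- the wrapper maps mkcls's `-p`, `-V`, `-c`, and `-m` flags onto the corresponding ClusterCat flags (file names and values here are illustrative):

    bin/mkcls -ptrain.tok.txt -Vclusters.tsv -c800 -m2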
58 |
59 |
60 | ## Comparison
61 | | Training Set | [Brown][] | ClusterCat | [mkcls][] | [Phrasal][] | [word2vec][] |
62 | | ------------ | --------- | ---------- | --------- | ----------- | ------------ |
63 | | 1 Billion English tokens, 800 clusters | 12.5 hr | **1.4** hr | 48.8 hr | 5.1 hr | 20.6 hr |
64 | | 1 Billion English tokens, 1200 clusters | 25.5 hr | **1.7** hr | 68.8 hr | 6.2 hr | 33.7 hr |
65 | | 550 Million Russian tokens, 800 clusters | 14.6 hr | **1.5** hr | 75.0 hr | 5.5 hr | 12.0 hr |
66 |
67 |
68 | ## Visualization
69 | See [bl.ocks.org][] for nice data visualizations of the clusters for various languages, including English, German, Persian, Hindi, Czech, Catalan, Tajik, Basque, Russian, French, and Maltese.
70 |
71 | For example:
72 |
73 | ![Basque clusters](visualization/d3/basque_cluster_thumbnail.png)
74 | ![French clusters](visualization/d3/french_cluster_thumbnail.png)
75 | ![Russian clusters](visualization/d3/russian_cluster_thumbnail.png)
76 |
77 | You can generate your own graphics from ClusterCat's output.
78 | Add the flag `--print-freqs` to ClusterCat, then type the command:
79 |
80 | bin/flat_clusters2json.pl --word-labels < clusters.tsv > visualization/d3/clusters.json
81 |
82 | You can either upload the [JSON][] file to [gist.github.com][], following instructions on the [bl.ocks.org](http://bl.ocks.org) front page, or you can view the graphic locally by running a minimal webserver in the `visualization/d3` directory:
83 |
84 | python3 -m http.server 8116 2>/dev/null &  # with Python 2: python -m SimpleHTTPServer 8116
85 |
86 | Then open a tab in your browser to [localhost:8116](http://localhost:8116).
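
Putting the visualization pipeline together, assuming a tokenized corpus `train.tok.txt` (the file names are illustrative):

    bin/clustercat --print-freqs < train.tok.txt > clusters.tsv
    bin/flat_clusters2json.pl --word-labels < clusters.tsv > visualization/d3/clusters.json
    cd visualization/d3 && python3 -m http.server 8116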
87 |
88 | The default settings are sensible for normal usage, but for visualization you probably want far fewer word types and clusters -- fewer than 10,000 word types and 120 clusters.
89 | Your browser will thank you.
90 |
91 |
92 | ## Perplexity
93 | The perplexity that ClusterCat reports comes from a bidirectional bigram class language model, which is richer than the unidirectional bigram models underlying the perplexities reported by most other software.
94 | Richer models provide a better evaluation of the quality of clusters, having more sensitivity (power) to detect improvements.
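For reference, a standard unidirectional class bigram model factors as `p(w_i | w_i-1) ≈ p(w_i | c_i) · p(c_i | c_i-1)`, where `c_i` is the class of word `w_i`; a bidirectional model conditions on the following word's class as well (this is the textbook factorization, not necessarily ClusterCat's exact parameterization).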
95 | If you want to directly compare the quality of clusters with a different program's output, you have a few options:
96 |
97 | 1. Load another clustering using `--class-file`, and see what that clustering's initial bidirectional bigram perplexity is before any words get exchanged (see the sketch after this list).
98 | 2. Use an external class-based language model. These are usually two-sided (unlexicalized) models, so they favor two-sided clusterers.
99 | 3. Evaluate on a downstream task. This is best.
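
A minimal sketch of option 1, assuming the rival clustering is in `other_clusters.tsv` and both were trained on `train.tok.txt` (names illustrative); the perplexity printed before the first exchange cycle is the comparable number:

    bin/clustercat --class-file other_clusters.tsv < train.tok.txt > reclustered.tsv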
100 |
101 |
102 | ## Contributions
103 | Contributions are welcome, via [pull requests][].
104 |
105 |
106 | ## Citation
107 | If you use this software, please cite the following:
108 |
109 | Dehdari, Jon, Liling Tan, and Josef van Genabith. 2016. [BIRA: Improved Predictive Exchange Word Clustering](http://www.aclweb.org/anthology/N16-1139.pdf).
110 | In *Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL)*, pages 1169–1174, San Diego, CA, USA. Association for Computational Linguistics.
111 |
112 | @inproceedings{dehdari-etal2016,
113 | author = {Dehdari, Jon and Tan, Liling and van Genabith, Josef},
114 | title = {{BIRA}: Improved Predictive Exchange Word Clustering},
115 | booktitle = {Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL)},
116 | month = {June},
117 | year = {2016},
118 | address = {San Diego, CA, USA},
119 | publisher = {Association for Computational Linguistics},
120 | pages = {1169--1174},
121 | url = {http://www.aclweb.org/anthology/N16-1139.pdf}
122 | }
123 |
124 | [lgpl3]: https://www.gnu.org/copyleft/lesser.html
125 | [mpl2]: https://www.mozilla.org/MPL/2.0
126 | [c99]: https://en.wikipedia.org/wiki/C99
127 | [homebrew]: http://brew.sh
128 | [openmp]: https://en.wikipedia.org/wiki/OpenMP
129 | [predictive]: https://www.aclweb.org/anthology/P/P08/P08-1086.pdf
130 | [exchange algorithm]: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.53.2354
131 | [brown]: https://github.com/percyliang/brown-cluster
132 | [mkcls]: https://github.com/moses-smt/mgiza
133 | [phrasal]: https://github.com/stanfordnlp/phrasal
134 | [word2vec]: https://code.google.com/archive/p/word2vec/
135 | [word vectors]: https://en.wikipedia.org/wiki/Word_embedding
136 | [bl.ocks.org]: http://bl.ocks.org/jonsafari
137 | [JSON]: https://en.wikipedia.org/wiki/JSON
138 | [gist.github.com]: https://gist.github.com
139 | [pull requests]: https://help.github.com/articles/creating-a-pull-request
140 |
--------------------------------------------------------------------------------
/bin/digit_conflate.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | ## By Jon Dehdari 2013
3 | ## Conflates all digits to the same digit
4 | ## Usage: perl digit_conflate.pl [options] < in > out
5 |
6 | use strict;
7 | use Getopt::Long;
8 |
9 | ## Defaults
10 | my $digit = 5;
11 |
12 | my $usage = <<"END_OF_USAGE";
13 | digit_conflate.pl (c) 2013 Jon Dehdari - LGPL v3
14 |
15 | Usage: perl $0 [options] < in > out
16 |
17 | Function: Conflates all digits to the same digit
18 | For example, "12,629.24" -> "55,555.55"
19 |
20 | Options:
21 | -h, --help Print this usage
22 | -d, --digit Set output digit to (default: $digit)
23 |
24 | END_OF_USAGE
25 |
26 | GetOptions(
27 | 'h|help|?' => sub { print $usage; exit; },
28 | 'd|digit=i' => \$digit,
29 | ) or die $usage;
30 |
31 |
32 | while (<>) {
33 | s/\d/$digit/g;
34 | print;
35 | }
36 |
--------------------------------------------------------------------------------
/bin/flat_clusters2json.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | ## By Jon Dehdari 2015
3 | ## Converts boring flat tsv clustering format to json for visualization
4 | ## Usage: perl clusters2json.pl [options] < in > out
5 |
6 | use strict;
7 | use Getopt::Long;
8 |
9 | my $word_labels = undef;
10 |
11 | my $usage = <<"END_OF_USAGE";
12 | clusters2json.pl (c) 2015 Jon Dehdari - LGPL v3 or Mozilla Public License v2
13 |
14 | Usage: perl $0 [options] < in > out
15 |
16 | Function: Converts tsv clustering format to json for visualization
17 |
18 | Options:
19 | -h, --help Print this usage
20 | --word-labels Use the first word in a cluster series as the cluster label.
21 | This option is useful if the input is already sorted by frequency.
22 |
23 | END_OF_USAGE
24 |
25 | GetOptions(
26 | 'h|help|?' => sub { print $usage; exit; },
27 | 'word-labels' => \$word_labels,
28 | ) or die $usage;
29 |
30 | my ($word, $cluster, $freq) = undef;
31 | my $last_cluster = -1;
32 |
33 | print <<"EOT";
34 | {
35 |  "name": "clusters",
36 |  "children": [
37 | EOT
38 | 
39 | ## Read "word TAB cluster [TAB freq]" lines
40 | while (<>) {
41 | chomp;
42 | ($word, $cluster, $freq) = split;
43 | $freq or $freq = 1; # if word frequencies aren't provided
44 |
45 | $word =~ s/(["\/])/\\$1/g; # escape problematic characters
46 | #$word =~ s/</&lt;/g;
47 | #$word =~ s/>/&gt;/g;
48 |
49 | if ($cluster != $last_cluster) { # We've reached a new cluster
50 |
51 | if ($last_cluster != -1) { # end cluster's children (ie words), then start new cluster
52 | print <<"EOT";
53 | 
54 |    ]
55 |   },
56 | EOT
57 | }
58 | 
59 | ## Start a new cluster object, optionally labeled with its first word
60 | my $label = $word_labels ? $word : $cluster;
61 | print <<"EOT";
62 |   {
63 |    "name": "$label",
64 |    "children": [
65 | EOT
66 | }
67 | else { # separate words within a cluster
68 | print ",\n";
69 | }
70 | 
71 | print qq(    {"name": "$word", "size": $freq});
72 | $last_cluster = $cluster;
73 | } # end while (<>) loop
74 | 
75 | print <<"EOT";
76 | 
77 |    ]
78 |   }
79 |  ]
80 | }
81 | EOT
82 | 
--------------------------------------------------------------------------------
/bin/lowercase.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | ## Lowercases text
3 | ## Usage: perl lowercase.pl < input > output
4 | 
5 | binmode(STDIN, ":utf8");
6 | binmode(STDOUT, ":utf8");
7 | 
8 | print lc while <>;
9 |
--------------------------------------------------------------------------------
/bin/mkcls:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | ## By Jon Dehdari, 2015, public domain
3 | ## Compatibility wrapper for clustercat, using mkcls's command-line arguments
4 | ## If you find an error in the interpretation of mkcls's arcane command-line arguments, please let me know
5 |
6 | mkcls_cmd_args='
7 | mkcls command-line arguments:
8 |
9 | -p training input text file (default: train)
10 | -V cluster output file
11 | -c number of word clusters (default: 100)
12 | -m minimum word count (default: 1)
13 | -v verbose mode
14 |
15 | Ignored arguments:
16 | -a set stochastic optimization algorithm {rrt,ta,gda,sa,hc} (default: ta == Threshold Annealing)
17 | -e set stochastic optimization parameter (for gamma, nu, alpha)
18 | -h set hapax init name
19 | -i set initialization value {ran,aio,gda,freq,other} (default: ran)
20 | -k set category selection {det,ran,best} (default: best)
21 | -l use LO, and set rho
22 | -M maximum number of optimization steps
23 | -n number of optimization runs (default: 1)
24 | -N set optimize parameter count (default: 10)
25 | -o graph output
26 | -O set one-with-hapas (default: 1)
27 | -P training ngram file
28 | -r set random seed (default: 532567487)
29 | -s set maximum runtime seconds
30 | -w set word selection {det,ran,incr} (default: det)
31 | -y use special criterion, and set sigma distortion (default: 5.0)
32 | '
33 |
34 | ## Set defaults to be like mkcls, unless they're overwritten later by manually specifying them
35 | cmd_string="$(dirname $0)/clustercat --min-count 1 --num-classes 100 --in train "
36 |
37 |
38 | while [ $# -gt 0 ]; do
39 |
40 | ## Let me know if you actually use the original -h argument (hapax init name), and I'll change this
41 | if [ $1 = '--help' ] || [ $1 = '-h' ]; then
42 | echo "$mkcls_cmd_args"
43 | exit
44 | fi
45 |
46 | ## Ugh. Use a space between flags and their values
47 | flag=$(echo "$1" | grep -o '^-.')
48 | arg=${1#-?}
49 | case $flag in
50 | -p)
51 | cmd_string="$cmd_string --in $arg "
52 | shift
53 | ;;
54 | -V)
55 | cmd_string="$cmd_string --out $arg "
56 | shift
57 | ;;
58 | -c)
59 | cmd_string="$cmd_string --num-classes $arg "
60 | shift
61 | ;;
62 | -m)
63 | cmd_string="$cmd_string --min-count $arg "
64 | shift
65 | ;;
66 | -v)
67 | cmd_string="$cmd_string --verbose "
68 | shift
69 | ;;
70 | *)
71 | shift
72 | ;;
73 | esac
74 | done
75 |
76 | echo 'Executing:' >&2
77 | echo "$cmd_string" >&2
78 | eval "$cmd_string"
79 |
--------------------------------------------------------------------------------
/bin/mkcls4brown:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | ## By Jon Dehdari, 2015, public domain
3 | ## Compatibility wrapper for brown-cluster, using mkcls's command-line arguments
4 | ## If you find an error in the interpretation of mkcls's arcane command-line arguments, please let me know
5 |
6 | mkcls_cmd_args='
7 | mkcls command-line arguments:
8 |
9 | -p training input text file (default: train)
10 | -V cluster output file
11 | -c number of word clusters (default: 100)
12 | -m minimum word count (default: 1)
13 |
14 | Ignored arguments:
15 | -a set stochastic optimization algorithm {rrt,ta,gda,sa,hc} (default: ta == Threshold Annealing)
16 | -e set stochastic optimization parameter (for gamma, nu, alpha)
17 | -h set hapax init name
18 | -i set initialization value {ran,aio,gda,freq,other} (default: ran)
19 | -k set category selection {det,ran,best} (default: best)
20 | -l use LO, and set rho
21 | -M maximum number of optimization steps
22 | -n number of optimization runs (default: 1)
23 | -N set optimize parameter count (default: 10)
24 | -o graph output
25 | -O set one-with-hapas (default: 1)
26 | -P training ngram file
27 | -r set random seed (default: 532567487)
28 | -s set maximum runtime seconds
29 | -v verbose mode
30 | -w set word selection {det,ran,incr} (default: det)
31 | -y use special criterion, and set sigma distortion (default: 5.0)
32 | '
33 |
34 | ## Set defaults to be like mkcls, unless they're overwritten later by manually specifying them
35 | min_count=1
36 | classes=100
37 | in_file='train'
38 |
39 |
40 | while [ $# -gt 0 ]; do
41 |
42 | ## Let me know if you actually use the original -h argument (hapax init name), and I'll change this
43 | if [ $1 = '--help' ] || [ $1 = '-h' ]; then
44 | echo "$mkcls_cmd_args"
45 | exit
46 | fi
47 |
48 | ## Ugh. Use a space between flags and their values
49 | flag=$(echo "$1" | grep -o '^-.')
50 | arg=${1#-?}
51 | case $flag in
52 | -p)
53 | in_file="$arg"
54 | shift
55 | ;;
56 | -V)
57 | out_file="$arg"
58 | shift
59 | ;;
60 | -c)
61 | classes="$arg"
62 | shift
63 | ;;
64 | -m)
65 | min_count="$arg"
66 | shift
67 | ;;
68 | *)
69 | shift
70 | ;;
71 | esac
72 | done
73 |
74 | cmd_string="$(dirname $0)/wcluster --threads 4 --min-occur $min_count --c $classes --text $in_file --output_dir ${out_file}_brown_dir "
75 |
76 | echo 'Executing:' >&2
77 | echo "$cmd_string" >&2
78 | eval "$cmd_string" && \
79 | $(dirname $0)/hier2flat_no_freqs.sh < ${out_file}_brown_dir/paths > $out_file && \
80 | \rm ${out_file}_brown_dir/log # really verbose for large corpora
81 |
--------------------------------------------------------------------------------
/bin/mkcls4word2vec:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | ## By Jon Dehdari, 2015, public domain
3 | ## Compatibility wrapper for word2vec, using mkcls's command-line arguments
4 | ## If you find an error in the interpretation of mkcls's arcane command-line arguments, please let me know
5 |
6 | mkcls_cmd_args='
7 | mkcls command-line arguments:
8 |
9 | -p training input text file (default: train)
10 | -V cluster output file
11 | -c number of word clusters (default: 100)
12 | -m minimum word count (default: 1)
13 |
14 | Ignored arguments:
15 | -a set stochastic optimization algorithm {rrt,ta,gda,sa,hc} (default: ta == Threshold Annealing)
16 | -e set stochastic optimization parameter (for gamma, nu, alpha)
17 | -h set hapax init name
18 | -i set initialization value {ran,aio,gda,freq,other} (default: ran)
19 | -k set category selection {det,ran,best} (default: best)
20 | -l use LO, and set rho
21 | -M maximum number of optimization steps
22 | -n number of optimization runs (default: 1)
23 | -N set optimize parameter count (default: 10)
24 | -o graph output
25 | -O set one-with-hapas (default: 1)
26 | -P training ngram file
27 | -r set random seed (default: 532567487)
28 | -s set maximum runtime seconds
29 | -v verbose mode
30 | -w set word selection {det,ran,incr} (default: det)
31 | -y use special criterion, and set sigma distortion (default: 5.0)
32 | '
33 |
34 | ## Set defaults to be like mkcls, unless they're overwritten later by manually specifying them
35 | min_count=1
36 | classes=100
37 | in_file='train'
38 |
39 |
40 | while [ $# -gt 0 ]; do
41 |
42 | ## Let me know if you actually use the original -h argument (hapax init name), and I'll change this
43 | if [ $1 = '--help' ] || [ $1 = '-h' ]; then
44 | echo "$mkcls_cmd_args"
45 | exit
46 | fi
47 |
48 | ## Ugh. Use a space between flags and their values
49 | flag=$(echo "$1" | grep -o '^-.')
50 | arg=${1#-?}
51 | case $flag in
52 | -p)
53 | in_file="$arg"
54 | shift
55 | ;;
56 | -V)
57 | out_file="$arg"
58 | shift
59 | ;;
60 | -c)
61 | classes="$arg"
62 | shift
63 | ;;
64 | -m)
65 | min_count="$arg"
66 | shift
67 | ;;
68 | *)
69 | shift
70 | ;;
71 | esac
72 | done
73 |
74 | cmd_string="$(dirname $0)/word2vec -min-count $min_count -classes $classes -size $classes -train $in_file -output $out_file "
75 |
76 | echo 'Executing:' >&2
77 | echo "$cmd_string" >&2
78 | eval "$cmd_string" && \
79 | perl -p -i -e 's/ /\t/g' $out_file
80 |
--------------------------------------------------------------------------------
/bin/ngram_counts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | ## By Jon Dehdari, 2015, public domain
3 | ## Counts ngrams, including joined ngrams, from text corpus
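## Usage (file names illustrative; reads stdin, writes stdout): python3 ngram_counts.py < corpus.tok.txt > counts.txt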
4 |
5 | import sys
6 |
7 | ngram_order = 4
8 | ngrams = []
9 | for i in range(ngram_order):
10 | ngrams.append({})
11 |
12 | for line in sys.stdin:
13 | line = line.rstrip()
14 | tokens = line.split()
15 | #tokens.insert(0, "<s>")
16 | #tokens.append("</s>")
17 | #print(tokens)
18 | len_tokens = len(tokens)
19 |
20 | for i in range(len_tokens):
21 |
22 | # i := leftmost position
23 | # j := rightmost position of current sub-ngram
24 | # k := rightmost position of all sub-ngrams
25 |
26 | k = len_tokens if i+ngram_order >= len_tokens else i + ngram_order
27 | #print("i=",i, "k=", k, tokens[i:k])
28 |
29 | # Build-up joined ngrams
30 | for j in range(i+1,k+1):
31 | joined_ngram = '_'.join(tokens[i:j])
32 | if (j+1 < k):
33 | if joined_ngram in ngrams[0]:
34 | ngrams[0][joined_ngram] += 1
35 | else :
36 | ngrams[0][joined_ngram] = 1
37 |
38 | #print(" j=",j, joined_ngram)
39 |
40 | # Process sub-ngrams
41 | num_subcuts = j - (i+1)
42 | while (num_subcuts >= 1):
43 | if ( (j == k) and (num_subcuts % 2)): # skip imbalanced subcuts
44 | num_subcuts -= 1
45 | continue
46 | subcut = ' '.join([ '_'.join(tokens[i:i+num_subcuts]), '_'.join(tokens[i+num_subcuts:j]) ])
47 | if (subcut in ngrams[1]):
48 | ngrams[1][subcut] +=1
49 | else :
50 | ngrams[1][subcut] = 1
51 |
52 | #print(" num_subcuts=", num_subcuts, "subcut=<<",subcut, ">>")
53 | num_subcuts -= 1
54 |
55 | for i in range(ngram_order):
56 | print()
57 | for k, v in sorted(ngrams[i].items()):
58 | print(k, "\t", v, sep='')
59 |
--------------------------------------------------------------------------------
/python/README.md:
--------------------------------------------------------------------------------
1 | # Python ClusterCat
2 |
3 |
4 | ## Installation
5 | First follow the [installation instructions](../README.md) in the above directory.
6 | After that, you normally don't need to install anything here. You can load the module `clustercat` using either Python 2 or 3.
7 |
8 | cd python
9 | python3
10 | >>> import clustercat as cc
11 | >>> clustering = cc.cluster(text=['this is a test', 'that is only a test', 'bye'], min_count=1)
12 | >>> print(clustering)
13 |
14 | If you get an error message saying that it is unable to access the clustercat binary, follow the instructions in the error message.
15 | You'll need more text input than the toy example above to produce useful clusters.
16 |
17 | To import this module from a different directory, you can add the module's directory to `$PYTHONPATH`:
18 |
19 | cd python
20 | echo "export PYTHONPATH=\$PYTHONPATH:`pwd`" >> ~/.bashrc
21 | source ~/.bashrc
22 |
23 | ## Python ClusterCat Functions
24 | ### `cluster(text=None, in_file=None, ...)`
25 | Produce a clustering, given a textual input. There is one required argument (the training input text), and many optional arguments. The required argument is **either** `text` **or** `in_file`. The argument `text` is a list of Python strings. The argument `in_file` is a path to a text file consisting of preprocessed (e.g. tokenized) one-sentence-per-line text. Passing `text` is probably not a good idea for large corpora.
26 |
27 | ```Python
28 | cc.cluster(text=['this is a test', 'that is only a test', 'bye'], min_count=1)
29 | cc.cluster(in_file='/tmp/corpus.tok.txt', min_count=3)
30 | ```
31 |
32 | The other optional arguments are described by running the compiled clustercat binary with the `--help` argument, except that the leading `--` from the shell argument is removed, and `-` is replaced with `_`. So, for example, instead of `--tune-cycles 15`, the Python function argument would be `tune_cycles=15`.
33 | 
34 | Returns a dictionary of the form `{ word : cluster_id }`.
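
For instance, a hypothetical call combining several translated options (the path and values are only illustrative):

```Python
clustering = cc.cluster(in_file='/tmp/corpus.tok.txt', classes=500,
                        tune_cycles=15, threads=4, quiet=True)
```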
35 |
36 |
37 | ### `save(mapping, out, format='tsv')`
38 | Save a clustering (dictionary) to file. By default the output file is a tab-separated listing of words and their cluster ID.
39 |
40 | ```Python
41 | cc.save(clustering, 'clusters.tsv')
42 | ```
43 |
44 |
45 | ### `load(in_file, format="tsv")`
46 | Load a clustering from a file. By default the input file is a tab-separated listing of words and their cluster ID.
47 | Returns a dictionary of the clustering.
48 |
49 | ```Python
50 | clustering = cc.load('clusters.tsv')
51 | ```
52 |
53 |
54 | ### `tag_string(mapping, text, unk="<unk>")`
55 | Tag a string with the corresponding cluster IDs. If a word is not found in the clustering, use `unk`.
56 | Returns a string.
57 |
58 | ```Python
59 | tagged_sent = cc.tag_string(clustering, "this is a test")
60 | ```
61 |
62 | ### `tag_stdin(mapping, unk="<unk>")`
63 | This calls `tag_string()` for each line in `stdin`, and prints the result to `stdout`.
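
A short sketch (the clusters file name is illustrative):

```Python
import clustercat as cc

clustering = cc.load('clusters.tsv')
cc.tag_stdin(clustering)  # tags each line of stdin, printing the tag sequences to stdout
```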
64 |
--------------------------------------------------------------------------------
/python/clustercat.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # By Jon Dehdari, 2016
3 | # MIT License
4 | """ Fast, flexible word clusters """
5 |
6 | import sys
7 | import os
8 | import subprocess
9 | import distutils.spawn
10 |
11 | unk = '<unk>'
12 |
13 | def load(in_file=None, format='tsv'):
14 | """ Load a clustering from a file. By default the input file is a
15 | tab-separated listing of words and their cluster ID. Returns a dictionary of
16 | the clustering.
17 |
18 | Args:
19 | in_file (string): path to input file
20 | format (string): input file format (default: tsv)
21 |
22 | Returns:
23 | dict: word-to-tag mapping
24 | """
25 |
26 | mapping = {}
27 | if format == 'tsv':
28 | with open(in_file) as f:
30 | for line in f:
31 | # Keep the full split line instead of key, val to allow for
32 | # counts in optional third column
33 | tokens = line.split()
34 | mapping[tokens[0]] = int(tokens[1])
35 |
36 | return mapping
37 |
38 |
39 | def save(mapping=None, out=None, format='tsv'):
40 | """ Save a clustering (dictionary) to file. By default the output file is
41 | a tab-separated listing of words and their cluster ID.
42 |
43 | Args:
44 | mapping (dict): word-to-tag mapping
45 | out (string): path to output file
46 | format (string): output file format (default: tsv)
47 | """
48 |
49 | if format == 'tsv':
50 | with open(out, 'w') as outfile:
51 | # Primary sort by value (cluster ID), secondary sort by key (word)
52 | for key in sorted(sorted(mapping), key=mapping.get):
53 | line = str(key) + '\t' + str(mapping[key]) + '\n'
54 | outfile.write(line)
55 |
56 |
57 | def tag_string(mapping=None, text=None, unk=unk):
58 | """Tag a string with the corresponding cluster IDs. If a word is not
59 | found in the clustering, use unk.
60 |
61 | Args:
62 | mapping (dict): word-to-tag mapping
63 | text (string): the string to be tagged
64 | unk (string): what to label unknown/unseen words that are not in
65 | mapping (default: <unk>)
66 |
67 | Returns:
68 | string: sequence of tags
69 | """
70 |
71 | newsent = ""
72 | for word in text.split():
73 | if word in mapping:
74 | newsent += ' ' + str(mapping[word])
75 | elif unk in mapping:
76 | newsent += ' ' + str(mapping[unk])
77 | else:
78 | newsent += ' ' + "<unk>"
79 | return newsent.lstrip()
80 |
81 |
82 | def tag_stdin(mapping=None, unk=unk):
83 | """ This calls tag_string() for each line in stdin, and prints the
84 | result to stdout.
85 |
86 | Args:
87 | mapping (dict): word-to-tag mapping
88 | unk (string): what to label unknown/unseen words that are not in
89 | mapping (default: <unk>)
90 | """
91 |
92 | for line in sys.stdin:
93 | print(tag_string(mapping=mapping, text=line, unk=unk))
94 |
95 |
96 | def cluster(text=None, in_file=None, classes=None, class_file=None,
97 | class_offset=None, forward_lambda=None, ngram_input=None,
98 | min_count=None, out=None, print_freqs=None, quiet=None,
99 | refine=None, rev_alternate=None, threads=None, tune_cycles=None,
100 | unidirectional=None, verbose=None, word_vectors=None):
101 | """
102 | Produce a clustering, given a textual input. There is one required argument
103 | (the training input text), and many optional arguments. The one required
104 | argument is either text or in_file. The argument text is a list of Python
105 | strings. The argument in_file is a path to a text file, consisting of
106 | preprocessed (e.g. tokenized) one-sentence-per-line text. The use of text
107 | is probably not a good idea for large corpora.
108 |
109 | The other optional arguments are described by running the compiled
110 | clustercat binary with the --help argument, except that the
111 | leading -- from the shell argument is removed, and - is replaced with _.
112 | So for example, instead of --tune-cycles 15, the Python function argument
113 | would be tune_cycles=15.
114 |
115 | Returns a dictionary of the form { word : cluster_id }.
116 | """
117 |
118 | # First check to see if we can access clustercat binary relative to this
119 | # module. If not, try $PATH. If not, :-(
120 | # Python 2 doesn't return absolute path in __file__
121 | cc_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
122 | cc_bin = os.path.join(cc_dir, 'bin', 'clustercat')
123 | if os.path.isfile(cc_bin):
124 | cmd_str = [cc_bin]
125 | elif shutil.which("clustercat"):  # look for the binary on $PATH
126 | cmd_str = ["clustercat"]
127 | else:
128 | print("Error: Unable to access clustercat binary from either ", cc_dir, " or $PATH. In the parent directory, first run 'make install', and then add $HOME/bin/ to your $PATH, by typing the following command:\necho 'PATH=$PATH:$HOME/bin' >> $HOME/.bashrc && source $HOME/.bashrc")
129 | exit(1)
130 |
131 |
132 | # Now translate function arguments to command-line arguments
133 | clustercat_params = {"in_file": "--in", "out": "--out",
134 | "classes": "--classes",
135 | "class_file": "--class-file",
136 | "class_offset": "--class-offset",
137 | "forward_lambda": "--forward-lambda",
138 | "ngram_input": "--ngram-input",
139 | "min_count": "--min-count",
140 | "refine": "--refine",
141 | "rev_alternate": "--rev-alternate",
142 | "threads": "--threads",
143 | "tune_cycles": "--tune-cycles",
144 | "word_vectors": "--word-vectors"
145 | }
146 |
147 | boolean_params = {"print_freqs": "--print-freqs",
148 | "quiet": "--quiet",
149 | "unidirectional": "--unidirectional",
150 | "verbose": "--verbose"
151 | }
152 |
153 | for arg, value in locals().items():
154 | # Check for boolean parameters
155 | if arg in boolean_params and value is True:
156 | cmd_str.append(boolean_params[arg])
157 | # Other non-boolean parameters that are not None
158 | elif arg in clustercat_params and value is not None:
159 | cmd_str.append(clustercat_params[arg])
160 | cmd_str.append(str(value))
161 |
162 | #print(cmd_str, file=sys.stderr) # Use Python 3 interpreter
163 |
164 | cmd_out = ''
165 | if text and not in_file:
166 | # Feed the text directly to the subprocess's stdin; routing it through
167 | # printf would misinterpret '%' and backslash sequences as format directives
168 | p1 = subprocess.Popen(cmd_str, stdin=subprocess.PIPE,
169 | stdout=subprocess.PIPE, universal_newlines=True)
170 | # The final line is newline-terminated so the last sentence isn't dropped
171 | cmd_out = p1.communicate("\n".join(text) + "\n")[0]
172 | elif in_file and not text:
173 | cmd_out = subprocess.check_output(cmd_str, universal_newlines=True)
174 | else:
175 | print("Error: supply either text or in_file argument to clustercat.cluster(), but not both")
176 |
177 | clusters = {}
178 | for line in cmd_out.split("\n"):
179 | split_line = line.split("\t")
180 | try:
181 | clusters[split_line[0]] = int(split_line[1])
182 | except (IndexError, ValueError):  # skip blank or malformed lines
183 | pass
184 | return clusters
185 |
186 |
187 | def main():
188 | """ No real reason to use this as a standalone script. Just invoke the
189 | C-compiled binary for standalone applications. But here you
190 | go, anyway.
191 | """
192 | import argparse
193 | parser = argparse.ArgumentParser(description='Clusters words, or tags them')
194 |
195 | parser.add_argument('-i', '--in', help="Load input training file")
196 | parser.add_argument('-o', '--out', help="Save final mapping to file")
197 | parser.add_argument('-t', '--tag', help="Tag stdin input, using clustering in supplied argument")
198 | args = parser.parse_args()
199 |
200 | if args.tag:
201 | mapping = load(in_file=args.tag)
202 | tag_stdin(mapping=mapping)
203 | else:
204 | # 'in' is a Python keyword, so the --in value must be fetched via getattr()
205 | in_file = getattr(args, 'in')
206 | if in_file:
207 | mapping = cluster(in_file=in_file)
208 | else:
209 | mapping = cluster(text=sys.stdin.read().splitlines())
210 | if args.out:
211 | save(mapping=mapping, out=args.out)
212 | else:
213 | print(mapping)
214 |
215 | if __name__ == '__main__':
216 | main()
217 |
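218 | # Example module usage (an editorial sketch; 'corpus.tok.txt' is a
219 | # hypothetical tokenized, one-sentence-per-line file, and the clustercat
220 | # binary must already be built):
221 | #
222 | #   import clustercat as cc
223 | #   clustering = cc.cluster(in_file='corpus.tok.txt', classes=120)
224 | #   cc.save(clustering, 'clusters.tsv')
225 | #   print(cc.tag_string(clustering, 'this is a test'))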
--------------------------------------------------------------------------------
/src/clustercat-array.c:
--------------------------------------------------------------------------------
1 | #include <stdarg.h> // variadic functions for arrncat
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 | #include "clustercat.h" // macros
6 |
7 | // Returns 0 if all values in array are 0.0; returns 1 otherwise
8 | int anyf(const float array[], unsigned int arr_len) {
9 | while (arr_len--) {
10 | if (array[arr_len])
11 | return 1;
12 | }
13 | return 0;
14 | }
15 |
16 | // Returns 0 if all values in array are 0.0; returns 1 otherwise
17 | int any(const double array[], unsigned int arr_len) {
18 | while (arr_len--) {
19 | if (array[arr_len])
20 | return 1;
21 | }
22 | return 0;
23 | }
24 |
25 | // Returns 1 if all values in array are non-zero; returns 0 otherwise
26 | int allf(const float array[], unsigned int arr_len) {
27 | while (arr_len--) {
28 | if (!array[arr_len])
29 | return 0;
30 | }
31 | return 1;
32 | }
33 |
34 | // Returns 1 if all values in array are non-zero; returns 0 otherwise
35 | int all(const double array[], unsigned int arr_len) {
36 | while (arr_len--) {
37 | if (!array[arr_len])
38 | return 0;
39 | }
40 | return 1;
41 | }
42 |
43 | float sumf(const float array[], unsigned int arr_len) {
44 | float sum = 0.0;
45 | while (arr_len--) {
46 | sum += array[arr_len];
47 | }
48 | return sum;
49 | }
50 |
51 | double sum(const double array[], unsigned int arr_len) {
52 | double sum = 0.0;
53 | while (arr_len--) {
54 | sum += array[arr_len];
55 | }
56 | return sum;
57 | }
58 |
59 | float productf(const float array[], unsigned int arr_len) {
60 | float product = 1.0;
61 | while (arr_len--) {
62 | product *= array[arr_len];
63 | }
64 | return product;
65 | }
66 |
67 | double product(const double array[], unsigned int arr_len) {
68 | double product = 1.0;
69 | while (arr_len--) {
70 | product *= array[arr_len];
71 | }
72 | return product;
73 | }
74 |
75 | float minf(const float array[], unsigned int arr_len) {
76 | arr_len--;
77 | float min = array[arr_len];
78 | while (1) {
79 | //printf("min=%g, arr_len=%u, val=%g\n", min, arr_len, array[arr_len]); fflush(stdout);
80 | if (array[arr_len] < min)
81 | min = array[arr_len];
82 | if (arr_len == 0)
83 | break;
84 | arr_len--;
85 | }
86 | return min;
87 | }
88 |
89 | double min(const double array[], unsigned int arr_len) {
90 | arr_len--;
91 | double min = array[arr_len];
92 | while (1) {
93 | //printf("min=%g, arr_len=%u, val=%g\n", min, arr_len, array[arr_len]); fflush(stdout);
94 | if (array[arr_len] < min)
95 | min = array[arr_len];
96 | if (arr_len == 0)
97 | break;
98 | arr_len--;
99 | }
100 | return min;
101 | }
102 |
103 | float maxf(const float array[], unsigned int arr_len) {
104 | arr_len--;
105 | float max = array[arr_len];
106 | while (1) {
107 | if (array[arr_len] > max)
108 | max = array[arr_len];
109 | if (arr_len == 0)
110 | break;
111 | arr_len--;
112 | }
113 | return max;
114 | }
115 |
116 | double max(const double array[], unsigned int arr_len) {
117 | arr_len--;
118 | double max = array[arr_len];
119 | while (1) {
120 | if (array[arr_len] > max)
121 | max = array[arr_len];
122 | if (arr_len == 0)
123 | break;
124 | arr_len--;
125 | }
126 | return max;
127 | }
128 |
129 | unsigned int which_minf(const float array[], const unsigned int arr_len) {
130 | unsigned int which_min = 0;
131 | float min = array[0];
132 |
133 | unsigned int i = 1;
134 | for (; i < arr_len; i++) {
135 | if (array[i] < min) {
136 | which_min = i;
137 | min = array[i];
138 | }
139 | }
140 | return which_min;
141 | }
142 |
143 | unsigned int which_min(const double array[], const unsigned int arr_len) {
144 | unsigned int which_min = 0;
145 | double min = array[0];
146 |
147 | unsigned int i = 1;
148 | for (; i < arr_len; i++) {
149 | if (array[i] < min) {
150 | which_min = i;
151 | min = array[i];
152 | }
153 | }
154 | return which_min;
155 | }
156 |
157 | unsigned int which_maxf(const float array[], const unsigned int arr_len) {
158 | unsigned int which_max = 0;
159 | float max = array[0];
160 |
161 | unsigned int i = 1;
162 | for (; i < arr_len; i++) {
163 | if (array[i] > max) {
164 | which_max = i;
165 | max = array[i];
166 | }
167 | }
168 | return which_max;
169 | }
170 |
171 | unsigned int which_max(const double array[], const unsigned int arr_len) {
172 | unsigned int which_max = 0;
173 | double max = array[0];
174 |
175 | unsigned int i = 1;
176 | for (; i < arr_len; i++) {
177 | if (array[i] > max) {
178 | which_max = i;
179 | max = array[i];
180 | }
181 | }
182 | return which_max;
183 | }
184 |
185 | void fprint_array(FILE *stream, const double array[const], const unsigned int arr_len, char * restrict sep) {
186 | //fputs("{ ", stream);
187 | unsigned int i = 0;
188 | for (; i < arr_len-1; i++)
189 | fprintf(stream, "%g%s", array[i], sep);
190 | fprintf(stream, "%g\n", array[arr_len-1]);
191 | }
192 |
193 | void fprint_arrayf(FILE *stream, const float array[const], const unsigned int arr_len, char * restrict sep) {
194 | //fputs("{ ", stream);
195 | unsigned int i = 0;
196 | for (; i < arr_len-1; i++)
197 | fprintf(stream, "%g%s", array[i], sep);
198 | fprintf(stream, "%g\n", array[arr_len-1]);
199 | }
200 |
201 | unsigned int scan_array_of_doubles(FILE *stream, double array[], char * restrict sep) {
202 | char line[STDIN_SENT_MAX_CHARS];
203 | if (fgets(line, sizeof(line), stream) == NULL) // Get line
204 | return 0;
205 | int elems = 0;
206 | char * restrict token;
207 | if ((token = strtok(line, sep)) == NULL)
208 | return 0;
209 | while (token) {
210 | array[elems] = atof(token);
211 | elems++;
212 | token = strtok(NULL, sep);
213 | }
214 |
215 | return elems;
216 | }
217 |
218 |
219 | // Analogous to strncat(), but with variable number of arguments
220 | void arrncat(double full_array[], const unsigned int full_array_len, ...) {
221 | va_list argptr;
222 | va_start(argptr, full_array_len);
223 |
224 | double * restrict offset = full_array;
225 | double * restrict full_array_last = full_array + full_array_len;
226 | //printf("30: full_array=%p, offset=%p, full_array_len=%u, sizeof(double)=%lu, len*size=%lu, full_array_last=%p, diff=%li\n", full_array, offset, full_array_len, sizeof(double), full_array_len*sizeof(double), full_array_last, full_array_last - full_array);
227 |
228 | while (offset < full_array_last) {
229 | double * restrict arr = va_arg(argptr, double*);
230 | //printf("31\n");
231 | unsigned int arr_len = va_arg(argptr, unsigned int);
232 | //printf("32: arr_len=%u\n", arr_len);
233 | unsigned int arr_len_bytes = arr_len * sizeof(double);
234 | //printf("33: full_array=%p, offset=%p, *<-=%g, *+1=%g, full_array_len=%u, arr_len=%u, arr_len_bytes=%u, arr[0]=%g, arr[1]=%g, arr_last=%g\n", full_array, offset, *offset, *(offset+1), full_array_len, arr_len, arr_len_bytes, arr[0], arr[1], arr[arr_len-1]); fflush(stdout);
235 | memcpy(offset, arr, arr_len_bytes);
236 | //printf("34: offset=%p, *<-=%g, *+1=%g, *-1=%g, full_array_last=%p arr_len_bytes=%u\n", offset, *offset, *(offset+1), *(offset-1), full_array_last, arr_len_bytes); fflush(stdout);
237 | offset += arr_len;
238 | //printf("35: full_array=%p, offset=%p, full_array_last=%p arr_len_bytes=%u\n", full_array, offset, full_array_last, arr_len_bytes); fflush(stdout);
239 | //printf("36: Full array: "); fprint_array(stdout, full_array, full_array_len, ", "); printf("\n");
240 | }
241 | va_end(argptr);
242 | //printf("37: Full array: "); fprint_array(stdout, full_array, full_array_len, ", "); printf("\n");
243 | }
244 |
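245 | /* Example use of arrncat() (an editorial sketch): each source array is passed
246 |  * as a pointer followed by its unsigned int length, and the destination must
247 |  * be at least full_array_len doubles long:
248 |  *
249 |  *   double a[] = {1.0, 2.0}, b[] = {3.0, 4.0, 5.0}, out[5];
250 |  *   arrncat(out, 5, a, 2u, b, 3u);
251 |  */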
--------------------------------------------------------------------------------
/src/clustercat-array.h:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDE_DKLM_ARRAY_HEADER
2 | #define INCLUDE_DKLM_ARRAY_HEADER
3 |
4 | int any(const double array[], unsigned int arr_len);
5 | int anyf(const float array[], unsigned int arr_len);
6 | int all(const double array[], unsigned int arr_len);
7 | int allf(const float array[], unsigned int arr_len);
8 |
9 | double sum(const double array[], unsigned int arr_len);
10 | float sumf(const float array[], unsigned int arr_len);
11 | double product(const double array[], unsigned int arr_len);
12 | float productf(const float array[], unsigned int arr_len);
13 |
14 | double min(const double array[], unsigned int arr_len);
15 | float minf(const float array[], unsigned int arr_len);
16 | double max(const double array[], unsigned int arr_len);
17 | float maxf(const float array[], unsigned int arr_len);
18 |
19 | unsigned int which_min(const double array[], const unsigned int arr_len);
20 | unsigned int which_minf(const float array[], const unsigned int arr_len);
21 | unsigned int which_max(const double array[], const unsigned int arr_len);
22 | unsigned int which_maxf(const float array[], const unsigned int arr_len);
23 |
24 | void fprint_array(FILE *stream, const double array[], const unsigned int arr_len, char * restrict sep);
25 | void fprint_arrayf(FILE *stream, const float array[], const unsigned int arr_len, char * restrict sep);
26 |
27 | unsigned int scan_array_of_doubles(FILE *stream, double array[], char * restrict sep);
28 |
29 | void arrncat(double full_array[], const unsigned int full_array_len, ...);
30 |
31 | #endif // INCLUDE_HEADER
32 |
--------------------------------------------------------------------------------
/src/clustercat-cluster.c:
--------------------------------------------------------------------------------
1 | #include <time.h> // clock_t, clock(), CLOCKS_PER_SEC, etc.
2 | #include <float.h> // FLT_MAX, etc.
3 | #include "clustercat-cluster.h"
4 | #include "clustercat-array.h"
5 | #include "clustercat-math.h"
6 |
7 | float entropy_term(const float entropy_terms[const], const unsigned int i);
8 | double pex_remove_word(const struct cmd_args cmd_args, const word_id_t word, const word_count_t word_count, const wclass_t from_class, const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_array_t count_array, const float entropy_terms[const], const bool is_tentative_move);
9 | double pex_move_word(const struct cmd_args cmd_args, const word_id_t word, const word_count_t word_count, const wclass_t to_class, const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_array_t count_array, const float entropy_terms[const], const bool is_tentative_move);
10 |
11 | inline float entropy_term(const float entropy_terms[const], const unsigned int i) {
12 | if (i < ENTROPY_TERMS_MAX)
13 | return entropy_terms[i];
14 | else
15 | return i * log2f(i);
16 | }
17 |
18 | inline double pex_remove_word(const struct cmd_args cmd_args, const word_id_t word, const word_count_t word_count, const wclass_t from_class, const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_array_t count_array, const float entropy_terms[const], const bool is_tentative_move) {
19 | // See Procedure MoveWord on page 758 of Uszkoreit & Brants (2008): https://www.aclweb.org/anthology/P/P08/P08-1086.pdf
20 | register double delta = 0.0;
21 | const unsigned int count_class = count_array[from_class];
22 | if (count_class > 1)
23 | delta = entropy_term(entropy_terms, count_class);
24 | const unsigned int new_count_class = count_class - word_count;
25 | if (new_count_class > 1)
26 | delta -= entropy_term(entropy_terms, new_count_class);
27 | //printf("rm42: word=%u, word_count=%u, from_class=%u, count_class=%u, new_count_class=%u (count_class - word_count), delta=%g\n", word, word_count, from_class, count_class, new_count_class, delta); fflush(stdout);
28 |
29 | if (! is_tentative_move)
30 | count_array[from_class] = new_count_class;
31 |
32 | for (unsigned int i = 0; i < word_bigrams[word].length; i++) {
33 | word_id_t prev_word = word_bigrams[word].predecessors[i];
34 | //printf(" rm43: i=%u, len=%u, word=%u, offset=%u (prev_word=%u + num_classes=%u * from_class=%u)\n", i, word_bigrams[word].length, word, (prev_word * cmd_args.num_classes + from_class), prev_word, cmd_args.num_classes, from_class); fflush(stdout);
35 | const unsigned int word_class_count = word_class_counts[prev_word * cmd_args.num_classes + from_class];
36 | if (word_class_count > 1) // Can't do log(0); no need for 1
37 | delta -= entropy_term(entropy_terms, word_class_count);
38 | const unsigned int new_word_class_count = word_class_count - word_bigrams[word].bigram_counts[i];
39 | delta += entropy_term(entropy_terms, new_word_class_count);
40 | //printf(" rm45: word=%u (#=%u), prev_word=%u, #()=%u, from_class=%u, i=%u, count_class=%u, new_count_class=%u, =<%u,%u>, #()=%u, new_#()=%u (w-c - %u), delta=%g\n", word, word_count, prev_word, word_bigrams[word].bigram_counts[i], from_class, i, count_class, new_count_class, prev_word, from_class, word_class_count, new_word_class_count, word_bigrams[word].bigram_counts[i], delta); fflush(stdout);
41 | //print_word_class_counts(cmd_args, model_metadata, word_class_counts);
42 | if (! is_tentative_move)
43 | word_class_counts[prev_word * cmd_args.num_classes + from_class] = new_word_class_count;
44 |
45 | }
46 |
47 | if (cmd_args.rev_alternate && (!is_tentative_move)) { // also update reversed word-class counts
48 | for (unsigned int i = 0; i < word_bigrams_rev[word].length; i++) {
49 | const word_id_t next_word = word_bigrams_rev[word].predecessors[i];
50 | const unsigned int word_class_rev_count = word_class_rev_counts[next_word * cmd_args.num_classes + from_class];
51 | const unsigned int new_word_class_rev_count = word_class_rev_count - word_bigrams_rev[word].bigram_counts[i];
52 | //printf(" rm47: rev_next_word=%u, rev_#()=%u, rev_new_#()=%u\n", next_word, word_class_rev_count, new_word_class_rev_count); fflush(stdout);
53 | //print_word_class_counts(cmd_args, model_metadata, word_class_rev_counts);
54 | word_class_rev_counts[next_word * cmd_args.num_classes + from_class] = new_word_class_rev_count;
55 | }
56 | }
57 |
58 | return delta;
59 | }
60 |
61 | inline double pex_move_word(const struct cmd_args cmd_args, const word_id_t word, const word_count_t word_count, const wclass_t to_class, const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_array_t count_array, const float entropy_terms[const], const bool is_tentative_move) {
62 | // See Procedure MoveWord on page 758 of Uszkoreit & Brants (2008): https://www.aclweb.org/anthology/P/P08/P08-1086.pdf
63 | unsigned int count_class = count_array[to_class];
64 | if (!count_class) // class is empty
65 | count_class = 1;
66 | const unsigned int new_count_class = count_class + word_count; // Differs from paper: replace "-" with "+"
67 | register double delta = entropy_term(entropy_terms, count_class) - entropy_term(entropy_terms, new_count_class);
68 | //printf("mv42: word=%u, word_count=%u, to_class=%u, count_class=%u, new_count_class=%u, delta=%g, is_tentative_move=%d\n", word, word_count, to_class, count_class, new_count_class, delta, is_tentative_move); fflush(stdout);
69 | const float backward_lambda = 1 - cmd_args.forward_lambda;
70 |
71 | if (! is_tentative_move)
72 | count_array[to_class] = new_count_class;
73 |
74 | for (unsigned int i = 0; i < word_bigrams[word].length; i++) {
75 | word_id_t prev_word = word_bigrams[word].predecessors[i];
76 | //printf(" mv43: i=%u, len=%u, word=%u, offset=%u (prev_word=%u + num_classes=%u * to_class=%u)\n", i, word_bigrams[word].length, word, (prev_word * cmd_args.num_classes + to_class), prev_word, cmd_args.num_classes, to_class); fflush(stdout);
77 | const unsigned int word_class_count = word_class_counts[prev_word * cmd_args.num_classes + to_class];
78 | if (word_class_count > 1) { // Can't do log(0); no need for 1
79 | if (cmd_args.unidirectional) {
80 | delta -= entropy_term(entropy_terms, word_class_count);
81 | } else {
82 | delta -= entropy_term(entropy_terms, word_class_count) * cmd_args.forward_lambda;
83 | }
84 | }
85 | const unsigned int new_word_class_count = word_class_count + word_bigrams[word].bigram_counts[i]; // Differs from paper: replace "-" with "+"
86 | if (new_word_class_count > 1) { // Can't do log(0)
87 | if (cmd_args.unidirectional) {
88 | delta += entropy_term(entropy_terms, new_word_class_count);
89 | } else {
90 | delta += entropy_term(entropy_terms, new_word_class_count) * cmd_args.forward_lambda;
91 | }
92 | }
93 | //printf(" mv45: word=%u; prev_word=%u, to_class=%u, i=%u, word_count=%u, count_class=%u, new_count_class=%u, =<%u,%hu>, #()=%u, new_#()=%u, delta=%g\n", word, prev_word, to_class, i, word_count, count_class, new_count_class, prev_word, to_class, word_class_count, new_word_class_count, delta); fflush(stdout);
94 | if (! is_tentative_move)
95 | word_class_counts[prev_word * cmd_args.num_classes + to_class] = new_word_class_count;
96 |
97 | }
98 |
99 | if (cmd_args.rev_alternate) { // also update reversed word-class counts; reversed order of conditionals since the first clause here is more common in this function
100 | for (unsigned int i = 0; i < word_bigrams_rev[word].length; i++) {
101 | const word_id_t next_word = word_bigrams_rev[word].predecessors[i];
102 | const unsigned int word_class_rev_count = word_class_rev_counts[next_word * cmd_args.num_classes + to_class];
103 | if (word_class_rev_count > 1) // Can't do log(0); no need for 1
104 | if (!cmd_args.unidirectional)
105 | delta -= entropy_term(entropy_terms, word_class_rev_count) * backward_lambda;
106 |
107 | const unsigned int new_word_class_rev_count = word_class_rev_count + word_bigrams_rev[word].bigram_counts[i];
108 | if (new_word_class_rev_count > 1) // Can't do log(0); no need for 1
109 | if (!cmd_args.unidirectional)
110 | //delta += entropy_term(entropy_terms, word_class_rev_count) * backward_lambda;
111 | delta += entropy_term(entropy_terms, new_word_class_rev_count) * backward_lambda;
112 | //printf("word=%u, word_class_rev_count=%u, new_word_class_rev_count=%u, delta=%g\n", word, word_class_rev_count, new_word_class_rev_count, delta);
113 | if (!is_tentative_move)
114 | word_class_rev_counts[next_word * cmd_args.num_classes + to_class] = new_word_class_rev_count;
115 | }
116 | }
117 |
118 | return delta;
119 | }
120 |
121 | void cluster(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const word_count_t word_counts[const], char * word_list[restrict], wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts) {
122 | unsigned long steps = 0;
123 |
124 | if (cmd_args.class_algo == EXCHANGE || cmd_args.class_algo == EXCHANGE_BROWN) { // Exchange algorithm: See Sven Martin, Jörg Liermann, Hermann Ney. 1998. Algorithms For Bigram And Trigram Word Clustering. Speech Communication 24. 19-37. http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.53.2354
125 | // Get initial logprob
126 | count_arrays_t count_arrays = malloc(cmd_args.max_array * sizeof(void *));
127 | init_count_arrays(cmd_args, count_arrays);
128 | tally_class_ngram_counts(cmd_args, model_metadata, word_bigrams, word2class, count_arrays);
129 | unsigned int num_classes_current = (cmd_args.num_classes > 15) && (cmd_args.refine) ? powi(2,cmd_args.refine) : cmd_args.num_classes; // Don't bother with class refinement if the number of classes is really small. powi() is declared in clustercat-math.h
130 |
131 | // Build precomputed entropy terms
132 | float * restrict entropy_terms = malloc(ENTROPY_TERMS_MAX * sizeof(float));
133 | build_entropy_terms(cmd_args, entropy_terms, ENTROPY_TERMS_MAX);
134 |
135 | if (cmd_args.verbose > 3) {
136 | printf("cluster(): 42: "); long unsigned int class_sum=0; for (wclass_t i = 0; i < cmd_args.num_classes; i++) {
137 | printf("c_%u=%lu, ", i, (unsigned long)count_arrays[0][i]);
138 | class_sum += count_arrays[0][i];
139 | } printf("\nClass Sum=%lu; Corpus Tokens=%lu\n", class_sum, model_metadata.token_count); fflush(stdout);
140 | }
141 | double best_log_prob = training_data_log_likelihood(cmd_args, model_metadata, count_arrays, word_counts, word2class);
142 |
143 | if (cmd_args.verbose >= -1) {
144 | fprintf(stderr, "%s: Expected Steps: %'lu (%'u word types x %'u classes x %'u cycles); initial logprob=%g, PP=%g\n", argv_0_basename, (unsigned long)model_metadata.type_count * cmd_args.num_classes * cmd_args.tune_cycles, model_metadata.type_count, cmd_args.num_classes, cmd_args.tune_cycles, best_log_prob, perplexity(best_log_prob, (model_metadata.token_count + model_metadata.line_count))); fflush(stderr);
145 | }
146 |
147 | time_t time_start_cycles;
148 | time(&time_start_cycles);
149 | unsigned short cycle = 1; // Keep this around afterwards to print out number of actually-completed cycles
150 | word_id_t moved_count = 0;
151 | count_arrays_t temp_count_arrays = malloc(cmd_args.max_array * sizeof(void *));
152 | init_count_arrays(cmd_args, temp_count_arrays);
153 | for (; cycle <= cmd_args.tune_cycles; cycle++) {
154 | if (cmd_args.refine && (cycle == 4)) // Current setting forces bump to full cluster size after 3 iterations, but you can change this line and the next for a different schedule
155 | num_classes_current = cmd_args.num_classes;
156 | if ((num_classes_current != cmd_args.num_classes) && (num_classes_current > (cmd_args.num_classes / 4.0))) { // If the coarse cluster size is close to the final size, just go do the final size
157 | num_classes_current = cmd_args.num_classes;
158 | time(&time_start_cycles); // restart timer, when full clustering starts
159 | }
160 |
161 | const bool is_nonreversed_cycle = (cmd_args.rev_alternate == 0) || (cycle % (cmd_args.rev_alternate+1)); // Only do a reversed predictive exchange (using next-word class counts) after every cmd_args.rev_alternate normal cycles; if rev_alternate==0 then every cycle is a normal one
162 |
163 | clear_count_arrays(cmd_args, temp_count_arrays);
164 | double queried_log_prob = 0.0;
165 | if (model_metadata.token_count < 5e8 || cycle == cmd_args.tune_cycles || cycle == 2 || cycle == 3) { // For large training sets, only calculate PP on the interesting iterations
166 | tally_class_ngram_counts(cmd_args, model_metadata, word_bigrams, word2class, temp_count_arrays);
167 | queried_log_prob = training_data_log_likelihood(cmd_args, model_metadata, temp_count_arrays, word_counts, word2class);
168 | }
169 |
170 | // ETA stuff
171 | const time_t time_this_cycle = time(NULL);
172 | const double time_elapsed = difftime(time_this_cycle, time_start_cycles) + 7.0; // a little is added since time predictions in early cycles tend to be too optimistic
173 | const double time_avg_per_cycle = (time_elapsed / ((double)cycle-1));
174 | const unsigned int remaining_cycles = cmd_args.tune_cycles - cycle + 1;
175 | const double time_remaining = ( time_avg_per_cycle * remaining_cycles);
176 | const time_t eta = time_this_cycle + time_remaining;
177 |
178 | if (cmd_args.verbose >= -1) {
179 | if (is_nonreversed_cycle)
180 | fprintf(stderr, "ccat: Normal cycle %-2u", cycle);
181 | else
182 | fprintf(stderr, "ccat: Rev cycle %-2u", cycle);
183 | fprintf(stderr, " C=%-3u", num_classes_current);
184 | if (cycle > 1) {
185 | fprintf(stderr, " Words moved last cycle: %.2g%% (%u/%u).", (100 * (moved_count / (float)model_metadata.type_count)), moved_count, model_metadata.type_count);
186 | if (cycle > 4) {
187 | char eta_string[300];
188 | strftime(eta_string, 300, "%x %X", localtime(&eta));
189 | fprintf(stderr, " Time left: %lim %lis. ETA: %s", (long)time_remaining/60, ((long)time_remaining % 60), eta_string);
190 | }
191 | if (queried_log_prob) {
192 | if (cmd_args.ngram_input) {
193 | fprintf(stderr, " LL=%g", queried_log_prob); // can't get reliable PP if input is ngram counts
194 | } else {
195 | fprintf(stderr, " LL=%.3g PP=%g", queried_log_prob, perplexity(queried_log_prob,(model_metadata.token_count + model_metadata.line_count)));
196 | }
197 | }
198 | fprintf(stderr, "\n");
199 | }
200 | else if ( cmd_args.refine)
201 | fprintf(stderr, " Starting with %u coarse classes, for the first few cycles\n", num_classes_current);
202 | else
203 | fprintf(stderr, "\n");
204 | fflush(stderr);
205 | }
206 | moved_count = 0;
207 |
208 | //#pragma omp parallel for num_threads(cmd_args.num_threads) reduction(+:steps) // non-determinism
209 | for (word_id_t word_i = 0; word_i < model_metadata.type_count; word_i++) {
210 | //for (word_id_t word_i = model_metadata.type_count-1; word_i != -1; word_i--) {
211 | if (cycle < 3 && word_i < num_classes_current) // don't move high-frequency words in the first (few) iteration(s)
212 | continue;
213 | const word_count_t word_i_count = word_bigrams[word_i].headword_count;
214 | const wclass_t old_class = word2class[word_i];
215 | double scores[cmd_args.num_classes]; // This doesn't need to be private in the OMP parallelization since each thread writes to a different element in the array
216 | memset(scores, 0, sizeof(double) * cmd_args.num_classes);
217 | //const double delta_remove_word = pex_remove_word(cmd_args, word_i, word_i_count, old_class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays, true);
218 | //const double delta_remove_word = 0.0; // Not really necessary
219 | //const double delta_remove_word_rev = 0.0; // Not really necessary
220 |
221 | //printf("cluster(): 43: "); long unsigned int class_sum=0; for (wclass_t i = 0; i < cmd_args.num_classes; i++) {
222 | // printf("c_%u=%u, ", i, count_arrays[0][i]);
223 | // class_sum += count_arrays[0][i];
224 | //} printf("\nClass Sum=%lu; Corpus Tokens=%lu\n", class_sum, model_metadata.token_count); fflush(stdout);
225 |
226 | #pragma omp parallel for num_threads(cmd_args.num_threads) reduction(+:steps)
227 | for (wclass_t class = 0; class < num_classes_current; class++) { // class values range from 0 to num_classes_current-1
228 | if (is_nonreversed_cycle) {
229 | scores[class] = pex_move_word(cmd_args, word_i, word_i_count, class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, true);
230 | } else { // This is the reversed one
231 | scores[class] = pex_move_word(cmd_args, word_i, word_i_count, class, word_bigrams_rev, word_bigrams, word_class_rev_counts, word_class_counts, count_arrays[0], entropy_terms, true);
232 | }
233 | steps++;
234 | }
235 | //scores[old_class] -= 0.80 / cycle; // TA
236 |
237 | const wclass_t best_hypothesis_class = which_max(scores, num_classes_current);
238 | const double best_hypothesis_score = max(scores, num_classes_current);
239 |
240 | if (cmd_args.verbose > 1) {
241 | printf("Orig score for word w_«%u» using class «%hu» is %g; Hypos %u-%u: ", word_i, old_class, scores[old_class], 1, num_classes_current);
242 | fprint_array(stdout, scores, num_classes_current, ","); fflush(stdout);
243 | //if (best_hypothesis_score > 0) { // Shouldn't happen
244 | // fprintf(stderr, "Error: best_hypothesis_score=%g for class %hu > 0\n", best_hypothesis_score, best_hypothesis_class); fflush(stderr);
245 | // exit(9);
246 | //}
247 | }
248 |
249 | if (old_class != best_hypothesis_class) { // We've improved
250 | moved_count++;
251 |
252 | if (cmd_args.verbose > 0) {
253 | fprintf(stderr, " Moving id=%-7u count=%-7lu %-18s %u -> %u\t(%g -> %g)\n", word_i, (unsigned long)word_bigrams[word_i].headword_count, word_list[word_i], old_class, best_hypothesis_class, scores[old_class], best_hypothesis_score); fflush(stderr);
254 | }
255 | //word2class[word_i] = best_hypothesis_class;
256 | word2class[word_i] = best_hypothesis_class;
257 | if (isnan(best_hypothesis_score)) { // shouldn't happen
258 | fprintf(stderr, "Error: best_hypothesis_score=%g :-(\n", best_hypothesis_score); fflush(stderr);
259 | exit(5);
260 | } else {
261 | best_log_prob += best_hypothesis_score;
262 | }
263 |
264 | if (is_nonreversed_cycle) {
265 | pex_remove_word(cmd_args, word_i, word_i_count, old_class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, false);
266 | pex_move_word(cmd_args, word_i, word_i_count, best_hypothesis_class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, false);
267 | } else { // This is the reversed one
268 | pex_remove_word(cmd_args, word_i, word_i_count, old_class, word_bigrams_rev, word_bigrams, word_class_rev_counts, word_class_counts, count_arrays[0], entropy_terms, false);
269 | pex_move_word(cmd_args, word_i, word_i_count, best_hypothesis_class, word_bigrams_rev, word_bigrams, word_class_rev_counts, word_class_counts, count_arrays[0], entropy_terms, false);
270 | }
271 | }
272 | }
273 |
274 | //if (!moved_count) // Nothing moved in last cycle, so that's it
275 | // break;
276 | }
277 |
278 | if (cmd_args.verbose >= -1) {
279 | fprintf(stderr, "%s: Completed steps: %'lu\n", argv_0_basename, steps); fflush(stderr);
280 | }
281 | //fprintf(stderr, "%s: Completed steps: %'lu (%'u word types x %'u classes x %'u cycles); best logprob=%g, PP=%g\n", argv_0_basename, steps, model_metadata.type_count, num_classes_current, cycle-1, best_log_prob, perplexity(best_log_prob,(model_metadata.token_count - model_metadata.line_count))); fflush(stderr);
282 |
283 | if (cmd_args.class_algo == EXCHANGE_BROWN)
284 | post_exchange_brown_cluster(cmd_args, model_metadata, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays);
285 |
286 | free_count_arrays(cmd_args, temp_count_arrays);
287 | free(temp_count_arrays);
288 | free_count_arrays(cmd_args, count_arrays);
289 | free(count_arrays);
290 | free(entropy_terms);
291 |
292 | } else if (cmd_args.class_algo == BROWN) { // Agglomerative clustering. Stops when the number of current clusters is equal to the desired number in cmd_args.num_classes
293 | // "Things equal to nothing else are equal to each other." --Anon
294 | for (unsigned long current_num_classes = model_metadata.type_count; current_num_classes > cmd_args.num_classes; current_num_classes--) {
295 | for (word_id_t word_i = 0; word_i < model_metadata.type_count; word_i++) {
296 | float log_probs[cmd_args.num_classes];
297 | //#pragma omp parallel for num_threads(cmd_args.num_threads)
298 | for (wclass_t class = 0; class < cmd_args.num_classes; class++, steps++) {
299 | // Get log prob
300 | log_probs[class] = -1 * (class+1); // Dummy predicate
301 | }
302 | wclass_t best_class = which_maxf(log_probs, cmd_args.num_classes);
303 | printf("Moving w_%u to class %u\n", word_i, best_class);
304 | }
305 | }
306 | }
307 | }
308 |
309 | void print_words_and_vectors(FILE * out_file, const struct cmd_args cmd_args, const struct_model_metadata model_metadata, char * word_list[restrict], wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts) {
310 | count_arrays_t count_arrays = malloc(cmd_args.max_array * sizeof(void *));
311 | init_count_arrays(cmd_args, count_arrays);
312 | tally_class_ngram_counts(cmd_args, model_metadata, word_bigrams, word2class, count_arrays);
313 |
314 | // Build precomputed entropy terms
315 | float * restrict entropy_terms = malloc(ENTROPY_TERMS_MAX * sizeof(float));
316 | build_entropy_terms(cmd_args, entropy_terms, ENTROPY_TERMS_MAX);
317 |
318 | if ( ! cmd_args.print_freqs) // word2vec compatible (the greedo-compatible branch below prints freqs instead of this header)
319 | fprintf(out_file, "%lu %u\n", (long unsigned)model_metadata.type_count, cmd_args.num_classes); // Like output in word2vec
320 |
321 | for (word_id_t word_i = 0; word_i < model_metadata.type_count; word_i++) {
322 | const word_count_t word_i_count = word_bigrams[word_i].headword_count;
323 | float scores[cmd_args.num_classes]; // This doesn't need to be private in the OMP parallelization since each thread writes to a different element in the array. We use a float here to be compatible with word2vec
324 | float score_min = FLT_MAX; // use this later for rescaling
325 |
326 | #pragma omp parallel for num_threads(cmd_args.num_threads)
327 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) { // class values range from 0 to cmd_args.num_classes-1
328 | scores[class] = sqrt( -(float)pex_move_word(cmd_args, word_i, word_i_count, class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, true));
329 | if (scores[class] < score_min)
330 | score_min = scores[class];
331 | }
332 |
333 | // Rescale vectors
334 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) {
335 | scores[class] -= score_min;
336 | }
337 |
338 | if (cmd_args.print_freqs) // greedo compatible
339 | fprintf(out_file, "%lu %s ", (long unsigned) word_i_count, word_list[word_i]);
340 | else // word2vec compatible
341 | fprintf(out_file, "%s ", word_list[word_i]);
342 |
343 | if (cmd_args.print_word_vectors == TEXT_VEC)
344 | fprint_arrayf(out_file, scores, cmd_args.num_classes, " ");
345 | else
346 | fwrite(scores, sizeof(float), cmd_args.num_classes, out_file);
347 | }
348 |
349 | free_count_arrays(cmd_args, count_arrays);
350 | free(count_arrays);
351 | free(entropy_terms);
352 | }
353 |
354 | void post_exchange_brown_cluster(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_arrays_t count_arrays) {
355 |
356 | // Build precomputed entropy terms
357 | float * restrict entropy_terms = malloc(ENTROPY_TERMS_MAX * sizeof(float));
358 | build_entropy_terms(cmd_args, entropy_terms, ENTROPY_TERMS_MAX);
359 |
360 | // Convert word2class to an array of classes pointing to arrays of words, which will successively get merged together
361 | struct_class_listing class2words[cmd_args.num_classes];
362 | memset(class2words, 0, sizeof(struct_class_listing) * cmd_args.num_classes);
363 | get_class_listing(cmd_args, model_metadata, word2class, class2words); // invert word2class array so that we know what words are associated with a given class
364 |
365 | // Loop through classes, finding best pair of classes to merge. Use pex_move_word() to find best pairs. Record merges separately to reduce overhead.
366 | for (wclass_t total_merges = 0; total_merges < cmd_args.num_classes-1; total_merges++) {
367 | // The scores arrays don't need to be private in the OMP parallelization, since each thread is writing to different elements in the array
368 | wclass_t scores_1_which[cmd_args.num_classes];
369 | double scores_1_val[cmd_args.num_classes];
370 | memset(scores_1_which, 0, sizeof(wclass_t) * cmd_args.num_classes);
371 | memset(scores_1_val, 0, sizeof(double) * cmd_args.num_classes);
372 |
373 | #pragma omp parallel for num_threads(cmd_args.num_threads)
374 | for (wclass_t class_1 = 0; class_1 < cmd_args.num_classes-1; class_1++) {
375 | const size_t scores_2_length = cmd_args.num_classes - class_1;
376 | double scores_2[scores_2_length];
377 | memset(scores_2, 0, sizeof(double) * scores_2_length);
378 |
379 | for (wclass_t class_2 = class_1+1; class_2 < cmd_args.num_classes; class_2++) {
380 | for (size_t word_offset = 0; word_offset < class2words[class_2].length; word_offset++) { // Sum over all words in class_2
381 | const word_id_t word = class2words[class_2].words[word_offset];
382 | scores_2[class_2 - class_1] += pex_move_word(cmd_args, word, word_bigrams[word].headword_count, class_1, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, true); // index by the offset from class_1 so the write stays within scores_2's bounds
383 | }
384 | scores_1_which[class_1] = class_1 + which_max(scores_2, scores_2_length); // convert the offset back to an absolute class id
385 | scores_1_val[class_1] = max(scores_2, scores_2_length);
386 |
387 | }
388 | //const double best_pairing_val = max(scores_1_val, cmd_args.num_classes);
389 | }
390 | }
391 |
392 | free_class_listing(cmd_args, class2words);
393 | free(entropy_terms);
394 | }
395 |
396 |
397 | void get_class_listing(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const wclass_t word2class[const], struct_class_listing * restrict class2words) {
398 | // Invert word2class array so that we know what words are associated with a given class
399 |
400 | // First pass through the word2class array to get counts of how many words are associated with a given class, then later allocate enough memory for these
401 | for (word_id_t word = 0; word < model_metadata.type_count; word++) {
402 | const wclass_t class = word2class[word];
403 | class2words[class].length++;
404 | }
405 |
406 | // Allocate enough memory for all words in a given class, then zero-out length values, so that we know where next word should go
407 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) {
408 | class2words[class].words = malloc(sizeof(word_id_t) * class2words[class].length);
409 | class2words[class].length = 0;
410 | }
411 |
412 | // Now add each word to the word array, and increment local offset
413 | for (word_id_t word = 0; word < model_metadata.type_count; word++) {
414 | const wclass_t class = word2class[word];
415 | class2words[class].words[class2words[class].length] = word;
416 | class2words[class].length++; // The final value of this should be the same as before we zeroed this value out
417 | }
418 | }
419 |
420 | void free_class_listing(const struct cmd_args cmd_args, struct_class_listing * restrict class2words) {
421 | for (wclass_t class = 0; class < cmd_args.num_classes; class++)
422 | free(class2words[class].words);
423 | }
424 |
425 | void build_entropy_terms(const struct cmd_args cmd_args, float * restrict entropy_terms, const unsigned int entropy_terms_max) {
426 | entropy_terms[0] = 0.0;
427 | #pragma omp parallel for num_threads(cmd_args.num_threads)
428 | for (unsigned long i = 1; i < entropy_terms_max; i++)
429 | entropy_terms[i] = i * log2f(i);
430 | }
431 |
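432 | /* Editorial note: the exchange deltas above are sums of n*log2(n) terms, so
433 |  * build_entropy_terms() precomputes i*log2f(i) for all i < ENTROPY_TERMS_MAX;
434 |  * entropy_term() then serves most queries with an O(1) table lookup and only
435 |  * computes i*log2f(i) directly for counts beyond the table. */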
--------------------------------------------------------------------------------
/src/clustercat-cluster.h:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDE_CC_CLUSTER_HEADER
2 | #define INCLUDE_CC_CLUSTER_HEADER
3 |
4 | #include "clustercat.h"
5 |
6 | typedef struct { // One element of a per-class array: a pointer to an array of the word_ids belonging to one class, together with the length of that array
7 | word_id_t * words;
8 | unsigned int length;
9 | } struct_class_listing;
10 |
11 | void cluster(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const word_count_t word_counts[const], char * word_list[restrict], wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts);
12 |
13 | void print_words_and_vectors(FILE * out_file, const struct cmd_args cmd_args, const struct_model_metadata model_metadata, char * word_list[restrict], wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts);
14 |
15 | void post_exchange_brown_cluster(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_arrays_t count_arrays);
16 |
17 | void build_entropy_terms(const struct cmd_args cmd_args, float * restrict entropy_terms, const unsigned int entropy_terms_max);
18 |
19 | void get_class_listing(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const wclass_t word2class[const], struct_class_listing * restrict class2words);
20 | void free_class_listing(const struct cmd_args cmd_args, struct_class_listing * restrict class2words);
21 | #endif // INCLUDE_HEADER
22 |
--------------------------------------------------------------------------------
/src/clustercat-data.h:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDE_CLUSTERCAT_DATA_HEADER
2 | #define INCLUDE_CLUSTERCAT_DATA_HEADER
3 |
4 | #include "clustercat-map.h"
5 | //#include "clustercat-tree.h"
6 |
7 | // Thanks Dipstick
8 | #define STR(x) #x
9 | #define SHOW_DEFINE(x) printf("%s=%s\n", #x, STR(x))
10 | // SHOW_DEFINE(DATA_STRUCT_FLOAT_NAME); // for example
11 |
12 | // Default to storing word-word entries in hash table using uthash
13 | // You can change this by compiling with -DATA_STORE_TREE_LCRS or -DATA_STORE_TRIE
14 | #if defined ATA_STORE_KHASH // https://github.com/attractivechaos/klib
15 | #define DATA_STRUCT_FLOAT_HUMAN_NAME "khash_map"
16 | #define DATA_STRUCT_FLOAT_NAME word_word_float_khash
17 | #define DATA_STRUCT_FLOAT_ADDR
18 | #define DATA_STRUCT_FLOAT_TYPE kh_struct_khash_float_t
19 | #define DATA_STRUCT_FLOAT_TYPE_IN_STRUCT kh_struct_khash_float_t
20 | #define DATA_STRUCT_FLOAT_SIZE sizeof(kh_struct_khash_float_t)
21 | #define DECLARE_DATA_STRUCT_FLOAT KHASH_MAP_INIT_STR(DATA_STRUCT_FLOAT_TYPE, float);
22 | #define INIT_DATA_STRUCT_FLOAT khash_t(struct_khash_float) * DATA_STRUCT_FLOAT_NAME = kh_init(struct_khash_float);
23 | #define UPDATE_ENTRY_FLOAT(db,key,val) { \
24 | int ret; \
25 | khint_t k = kh_put(struct_khash_float, (&db), (key), &ret); \
26 | if (!ret) kh_del(struct_khash_float, (&db), (k)); \
27 | kh_value((&db), (k)) = (val); \
28 | }
29 | #define FIND_ENTRY_FLOAT(db,key) ( kh_get(struct_khash_float, (db), (key)))
30 | //#define PRINT_ENTRIES_FLOAT(db, prefix, sep_char, min_count) ({ \
31 | // unsigned long number_of_entries = 0; \
32 | // for (khint_t k = kh_begin(db); k != kh_end(db); ++k) \
33 | // if (kh_exist(db, k)) { \
34 | // printf("foobar\n"); \
35 | //// printf("%s%s%c%i\n", prefix, entry->key, sep_char, entry->count);
36 | // number_of_entries++; \
37 | // } \
38 | // return number_of_entries; \
39 | //})
40 | #define PRINT_ENTRIES_FLOAT(db, prefix, sep_char, min_count) (1)
41 | #endif
42 |
43 | typedef struct {
44 | struct_map_word *word_map;
45 | struct_map_word *word_word_map;
46 | struct_map_word *ngram_map;
47 | struct_map_word *class_map;
48 | char **unique_words;
49 | } struct_model_maps;
50 |
51 |
52 | #endif // INCLUDE_HEADER
53 |
--------------------------------------------------------------------------------
/src/clustercat-dbg.c:
--------------------------------------------------------------------------------
1 | #include "clustercat-dbg.h"
2 |
3 | void print_word_class_counts(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const word_class_count_t * restrict word_class_counts) {
4 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) {
5 | printf("Class=%u Offsets=%u,%u,...%u:\n\t", class, class, class+cmd_args.num_classes, (model_metadata.type_count-1) * cmd_args.num_classes + class);
6 | for (word_id_t word = 0; word < model_metadata.type_count; word++) {
7 | printf("#(<%u,%hu>)=%u ", word, class, word_class_counts[word * cmd_args.num_classes + class]);
8 | }
9 | printf("\n");
10 | }
11 | fflush(stdout);
12 | }
13 |
14 | void print_word_bigrams(const struct_model_metadata model_metadata, const struct_word_bigram_entry * restrict word_bigrams, char ** restrict word_list) {
15 | printf("word_bigrams:\n"); fflush(stdout);
16 | for (word_id_t word_i = 0; word_i < model_metadata.type_count; word_i++) {
17 | printf(" %18s=%u -> {%lu, [", word_list[word_i], word_i, word_bigrams[word_i].length); fflush(stdout);
18 | for (word_id_t word_j = 0; word_j < word_bigrams[word_i].length; word_j++) {
19 | if (word_j > 0)
20 | printf(", ");
21 | printf("%s=%u (%ux)", word_list[word_bigrams[word_i].predecessors[word_j]], word_bigrams[word_i].predecessors[word_j], word_bigrams[word_i].bigram_counts[word_j]);
22 | }
23 | printf("]}\n"); fflush(stdout);
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/src/clustercat-dbg.h:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDE_CC_DBG_HEADER
2 | #define INCLUDE_CC_DBG_HEADER
3 |
4 | #include "clustercat.h"
5 |
6 | void print_word_class_counts(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const word_class_count_t * restrict word_class_counts);
7 |
8 | void print_word_bigrams(const struct_model_metadata model_metadata, const struct_word_bigram_entry * restrict word_bigrams, char ** restrict word_list);
9 |
10 | #endif // INCLUDE_HEADER
11 |
--------------------------------------------------------------------------------
/src/clustercat-import-class-file.c:
--------------------------------------------------------------------------------
1 | #include <errno.h>
2 | #include <string.h>
3 | #include "clustercat-import-class-file.h"
4 | #include "clustercat-map.h"
5 |
6 | // Parse TSV file input and overwrite relevant word mappings
7 | void import_class_file(struct_map_word **word_map, wclass_t word2class[restrict], const char * restrict class_file_name, const wclass_t num_classes) {
8 | char * restrict line_end;
9 | char * restrict line = calloc(MAX_WORD_LEN + 9, 1);
10 | const word_id_t unk_id = map_find_id(word_map, UNKNOWN_WORD, -1);
11 |
12 | FILE *file = fopen(class_file_name, "r");
13 | if (!file) {
14 | fprintf(stderr, "%s: fopen of '%s' failed: %s.\n", argv_0_basename, class_file_name, strerror(errno));
15 | exit(EXIT_FAILURE);
16 | }
17 | while (fgets(line, MAX_WORD_LEN + 8, file) != 0) {
18 |
19 | line_end = strchr(line, '\n');
20 | if (line_end != NULL) // guard against a final line lacking a newline
21 | *line_end = '\0';
22 | // Parse each line
23 | unsigned int keylen = strcspn(line, PRIMARY_SEP_STRING);
24 | line[keylen] = '\0'; // Split key and count
25 | char * restrict key = line;
26 | wclass_t class = atoi(line + keylen + 1);
27 | if (num_classes <= class) {
28 | fprintf(stderr, " Error: Imported word classes from file \"%s\" must be in the range [0,%u-1]. Word \"%s\" has class %i. If --num-classes is unset, a value is automatically chosen. See --help\n", class_file_name, num_classes, key, class); fflush(stderr);
29 | exit(13);
30 | }
31 | //printf("keylen=%i, key=<<%s>>, class=<<%d>>\n", keylen, key, class);
32 | word_id_t key_int = map_find_id(word_map, key, unk_id);
33 | word2class[key_int] = class;
34 | }
35 |
36 | fclose(file);
37 | free(line);
38 | }
39 |
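40 | /* Example class-file line (an editorial illustration of the TSV format parsed
41 |  * above): "the\t17" assigns the word "the" to class 17; every imported class
42 |  * must lie in the range [0, num_classes-1]. */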
--------------------------------------------------------------------------------
/src/clustercat-import-class-file.h:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDE_CLUSTERCAT_IMPORT_CLASS_FILE_HEADER
2 | #define INCLUDE_CLUSTERCAT_IMPORT_CLASS_FILE_HEADER
3 |
4 | #include "clustercat.h" // wclass_t
5 |
6 | void import_class_file(struct_map_word **word_map, wclass_t word2class[restrict], const char * restrict class_file_name, const wclass_t num_classes);
7 |
8 | #endif // INCLUDE_HEADER
9 |
--------------------------------------------------------------------------------
/src/clustercat-io.c:
--------------------------------------------------------------------------------
1 | #include <stdlib.h>
2 | #include <string.h>
3 | #include "clustercat.h"
4 | #include "clustercat-data.h"
5 | #include "clustercat-array.h"
6 | #include "clustercat-io.h"
7 |
8 | struct_model_metadata process_input(const struct cmd_args cmd_args, FILE *file, struct_map_word ** initial_word_map, struct_map_bigram ** initial_bigram_map, size_t *memusage) {
9 | struct_model_metadata model_metadata = {0};
10 | map_update_count(initial_word_map, UNKNOWN_WORD, 0, 0); // initialize entries for <unk>, <s>, and </s>
11 | map_update_count(initial_word_map, "<s>", 0, 1);
12 | map_update_count(initial_word_map, "</s>", 0, 2);
13 | const word_id_t unk_id = map_find_id(initial_word_map, UNKNOWN_WORD, 0);
14 | const word_id_t start_id = map_find_id(initial_word_map, "<s>", 1);
15 | const word_id_t end_id = map_find_id(initial_word_map, "</s>", 2);
16 | const size_t sizeof_struct_map_word = sizeof(struct_map_word);
17 | const size_t sizeof_struct_map_bigram = sizeof(struct_map_bigram);
18 | model_metadata.type_count = 3; // start with <unk>, <s>, and </s>
19 |
20 | // n-gram input
21 | if (cmd_args.ngram_input) {
22 | char line[STDIN_SENT_MAX_CHARS];
23 | register unsigned int strlen_line = 0;
24 | register unsigned long line_num = 1;
25 | register char * count_split_pos = NULL;
26 | register char * word_split_pos = NULL;
27 | register unsigned long count = 0;
28 |
29 | while (!feof(file)) {
30 | if (! fgets(line, STDIN_SENT_MAX_CHARS, file))
31 | break;
32 | if (*line == '\n') // ignore empty lines
33 | continue;
34 | strlen_line = strlen(line);
35 | if (strlen_line == STDIN_SENT_MAX_CHARS-1)
36 | fprintf(stderr, "\n%s: Warning: Input line too long, at buffer line %lu. The full line was:\n%s\n", argv_0_basename, line_num, line);
37 | line[strlen_line-1] = '\0'; // rm newline
38 |
39 | // Split words from counts
40 | count_split_pos = strchr(line, '\t');
41 | if (count_split_pos == NULL) { // must check for the tab before dereferencing
42 | fprintf(stderr, "\n%s: Warning: Malformed n-gram input line number %lu. The line was:\n%s\n", argv_0_basename, line_num, line); fflush(stderr);
43 | line_num++;
44 | continue; // no tab-separated count on this line, so skip it
45 | }
46 | *count_split_pos = '\0'; // Split words from count
47 | count = strtoul(count_split_pos+1, NULL, 10);
48 | // Try to split word1 from word2
49 | word_split_pos = strchr(line, ' ');
50 |
51 | if (word_split_pos) { // Line has bigrams
52 | *word_split_pos = '\0';
53 |
54 | // Lookup each word
55 | const word_id_t w1 = map_find_id(initial_word_map, line, unk_id);
56 | const word_id_t w2 = map_find_id(initial_word_map, word_split_pos+1, unk_id);
57 | if (w1 == unk_id || w2 == unk_id) // Unseen word(s) in bigram
58 | fprintf(stderr, "%s: Warning: Unseen word(s) in bigram '%s %s' on input line %lu will be assigned to '%s'. To avoid this, include the word(s) in the unigram counts first.\n", argv_0_basename, line, word_split_pos+1, line_num, UNKNOWN_WORD);
59 |
60 | // Form bigram
61 | const struct_word_bigram bigram = {w1, w2};
62 |
63 | // Update bigram count
64 | if (map_update_bigram(initial_bigram_map, &bigram, count)) // update bigram count; true if bigram is new
65 | *memusage += sizeof_struct_map_bigram;
66 |
67 | } else { // Line has unigrams
68 | if (model_metadata.type_count == map_update_count(initial_word_map, line, count, model_metadata.type_count)) { // <unk>'s word_id is set to 0.
69 | model_metadata.type_count++;
70 | *memusage += sizeof_struct_map_word;
71 | }
72 |
73 | }
74 |
75 | //if (word_split_pos) // line could be unigram count
76 | // printf("w1=<<%s>>; w2=<<%s>>; count=<<%s>>==%lu\n", line, word_split_pos+1, count_split_pos+1, count);
77 | //else
78 | // printf("w1=<<%s>>; count=<<%s>>==%lu\n", line, count_split_pos+1, count);
79 | //fflush(stdout);
80 |
81 | line_num++;
82 | }
83 |
84 |
85 | // Normal text input
86 | } else {
87 | char curr_word[MAX_WORD_LEN + 1]; curr_word[MAX_WORD_LEN] = '\0';
88 | register unsigned int chars_in_sent = 0;
89 | register int ch = 0;
90 | unsigned int curr_word_pos = 0;
91 | unsigned int prev_word_id = start_id;
92 |
93 | while (!feof(file)) {
94 | ch = getc(file);
95 | chars_in_sent++;
96 | //printf("«%c» ", ch); fflush(stdout);
97 | if (ch == ' ' || ch == '\t' || ch == '\n') { // end of a word
98 |
99 | if (chars_in_sent > STDIN_SENT_MAX_CHARS) { // Line too long
100 | curr_word_pos = 0;
101 | curr_word[0] = '\0'; // truncate word
102 | } else {
103 | curr_word[curr_word_pos] = '\0'; // terminate word
104 | }
105 |
106 | //printf("chars_in_sent=%u; max_chars=%u; curr_word=%s\n", chars_in_sent, STDIN_SENT_MAX_CHARS, curr_word); fflush(stdout);
107 |
108 | if (curr_word[0] == '\0') { // ignore empty words, due to leading, trailing, and multiple spaces
109 | //printf("skipping empty word; ch=«%c»\n", ch); fflush(stdout);
110 | if (ch == '\n') { // a trailing space before the newline still requires closing the sentence
111 | const struct_word_bigram bigram = {prev_word_id, end_id};
112 | if (map_increment_bigram(initial_bigram_map, &bigram)) // increment previous+</s> bigram in bigram map
113 | *memusage += sizeof_struct_map_bigram;
114 | chars_in_sent = 0;
115 | prev_word_id = start_id;
116 | model_metadata.line_count++;
117 | }
118 | continue;
119 | }
120 | //printf("curr_word=%s, prev_id=%u\n", curr_word, prev_word_id); fflush(stdout);
121 | model_metadata.token_count++;
122 | curr_word_pos = 0;
123 | // increment current word in word map
124 | const word_id_t curr_word_id = map_increment_count(initial_word_map, curr_word, model_metadata.type_count); // <unk>'s word_id is set to 0.
125 |
126 | if (curr_word_id == model_metadata.type_count) { // previous call to map_increment_count() had a new word
127 | model_metadata.type_count++;
128 | *memusage += sizeof_struct_map_word;
129 | }
130 |
131 | // increment previous+current bigram in bigram map
132 | const struct_word_bigram bigram = {prev_word_id, curr_word_id};
133 | //printf("{%u,%u}\n", prev_word_id, curr_word_id); fflush(stdout);
134 | if (map_increment_bigram(initial_bigram_map, &bigram)) // true if bigram is new
135 | *memusage += sizeof_struct_map_bigram;
136 |
137 | //printf("process_input(): curr_word=<<%s>>; curr_word_id=%u, prev_word_id=%u\n", curr_word, curr_word_id, prev_word_id); fflush(stdout);
138 | if (ch == '\n') { // end of line
139 | const struct_word_bigram bigram = {curr_word_id, end_id};
140 | if (map_increment_bigram(initial_bigram_map, &bigram)) // increment current+</s> bigram in bigram map
141 | *memusage += sizeof_struct_map_bigram;
142 | chars_in_sent = 0;
143 | prev_word_id = start_id;
144 | model_metadata.line_count++;
145 | } else {
146 | prev_word_id = curr_word_id;
147 | }
148 |
149 | } else { // normal character; within a word
150 | if (curr_word_pos >= MAX_WORD_LEN) // word is too long; do nothing until space or newline (>= keeps the terminating '\0' within the buffer)
151 | continue;
152 | else
153 | curr_word[curr_word_pos++] = ch;
154 | }
155 | }
156 | }
157 |
158 | // Set counts of <s> and </s> once, based on line_count
159 | map_update_count(initial_word_map, "<s>", model_metadata.line_count, 1);
160 | map_update_count(initial_word_map, "</s>", model_metadata.line_count, 2);
161 | return model_metadata;
162 | }
163 |
--------------------------------------------------------------------------------
/src/clustercat-io.h:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDE_CLUSTERCAT_IO
2 | #define INCLUDE_CLUSTERCAT_IO
3 |
4 | #include "clustercat.h"
5 | #include "clustercat-data.h"
6 |
7 | // Import
8 | struct_model_metadata process_input(const struct cmd_args cmd_args, FILE *file, struct_map_word ** initial_word_map, struct_map_bigram ** initial_bigram_map, size_t *memusage);
9 |
10 | #endif // INCLUDE_HEADER
11 |
--------------------------------------------------------------------------------
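A minimal sketch of the --ngram-input listing that process_input() parses, with made-up counts for illustration: each line holds an n-gram, a single tab (shown here as ⇥), and a count. Bigram lines separate their two words with one space, and unigram lines should come first, so the words are already in the word map when their bigrams arrive:

<s>⇥5210
</s>⇥5210
the⇥14827
cat⇥203
<s> the⇥5210
the cat⇥61
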
/src/clustercat-map.c:
--------------------------------------------------------------------------------
1 | #include "clustercat-map.h"
2 |
3 | inline bool map_increment_bigram(struct_map_bigram **map, const struct_word_bigram * bigram) {
4 | struct_map_bigram *local_s;
5 | HASH_FIND(hh, *map, bigram, sizeof(struct_word_bigram), local_s); // id already in the hash?
6 | if (local_s == NULL) {
7 | local_s = (struct_map_bigram *)malloc(sizeof(struct_map_bigram));
8 | //memcpy(local_s->key, bigram, sizeof(struct_word_bigram));
9 | local_s->key = *bigram;
10 | local_s->count = 1;
11 | HASH_ADD(hh, *map, key, sizeof(struct_word_bigram), local_s);
12 | return true;
13 | } else {
14 | (local_s->count)++;
15 | return false;
16 | }
17 | }
18 |
19 | inline bool map_update_bigram(struct_map_bigram **map, const struct_word_bigram * bigram, const word_bigram_count_t count) {
20 | struct_map_bigram *local_s;
21 | HASH_FIND(hh, *map, bigram, sizeof(struct_word_bigram), local_s); // id already in the hash?
22 | if (local_s == NULL) {
23 | local_s = (struct_map_bigram *)malloc(sizeof(struct_map_bigram));
24 | //memcpy(local_s->key, bigram, sizeof(struct_word_bigram));
25 | local_s->key = *bigram;
26 | local_s->count = count;
27 | HASH_ADD(hh, *map, key, sizeof(struct_word_bigram), local_s);
28 | return true;
29 | } else {
30 | local_s->count += count;
31 | return false;
32 | }
33 | }
34 |
35 | void map_print_bigrams(struct_map_bigram **bigram_map, char **word_list) {
36 | struct_map_bigram *entry, *tmp;
37 | struct_word_bigram bigram_key;
38 | word_id_t w_1, w_2;
39 | word_bigram_count_t count;
40 |
41 | printf("bigram_map:\n");
42 | HASH_ITER(hh, *bigram_map, entry, tmp) {
43 | count = entry->count;
44 | bigram_key = entry->key;
45 | w_1 = bigram_key.word_1;
46 | w_2 = bigram_key.word_2;
47 | if (w_1 == (word_id_t)-1 || w_2 == (word_id_t)-1) // Don't print dummy values
48 | continue;
49 | printf(" {%s=%u, %s=%u}: #=%u\n", word_list[w_1], w_1, word_list[w_2], w_2, count);
50 | //printf(" {%u, %u}: #=%u\n", w_1, w_2, count); fflush(stdout);
51 | }
52 | printf("\n"); fflush(stdout);
53 | }
54 |
55 | void remap_and_rev_bigram_map(struct_map_bigram ** initial_bigram_map, struct_map_bigram ** new_bigram_map, struct_map_bigram ** new_bigram_map_rev, word_id_t * restrict word_id_remap, const word_id_t real_unk_id) {
56 | // Iterates through initial bigram hash map and builds a new hash map based on the mapping of old word id's to new ids. Alongside this, it also builds a reversed counterpart.
57 | struct_map_bigram *entry, *tmp;
58 | struct_word_bigram orig_bigram, new_bigram, new_bigram_rev;
59 | word_id_t w_1, w_2;
60 | word_bigram_count_t count;
61 | //printf("initial_bigram_map hash_count=%u\n", HASH_COUNT(initial_bigram_map));
62 | //printf("word_id_remap71: [%u,%u,%u,%u,%u,%u,...]\n", word_id_remap[0], word_id_remap[1], word_id_remap[2], word_id_remap[3], word_id_remap[4], word_id_remap[5]);
63 |
64 | HASH_ITER(hh, *initial_bigram_map, entry, tmp) {
65 | count = entry->count;
66 | orig_bigram = entry->key;
67 | w_1 = word_id_remap[orig_bigram.word_1];
68 | w_2 = word_id_remap[orig_bigram.word_2];
69 | if (w_1 == (word_id_t) -1) // reassign temporary placeholder unk_id to final unk_id
70 | w_1 = real_unk_id;
71 | if (w_2 == (word_id_t) -1)
72 | w_2 = real_unk_id;
73 | new_bigram = (struct_word_bigram) {w_1, w_2};
74 | new_bigram_rev = (struct_word_bigram) {w_2, w_1};
75 | //printf("remap_and_rev_bigram_map: count=%u, orig_w_1=%u, new_w_1=%u, orig_w_2=%u, new_w_2=%u\n", count, orig_bigram.word_1, w_1, orig_bigram.word_2, w_2); fflush(stdout);
76 |
77 | //#pragma omp parallel sections // Both bigram listing and reverse bigram listing can be done in parallel
78 | {
79 | //#pragma omp section
80 | { map_update_bigram(new_bigram_map, &new_bigram, count); }
81 | //const word_bigram_count_t bigram_count = map_update_bigram(&new_bigram_map, &new_bigram, count);
82 | //printf("map_update_bigram: {%u,%u} += %u; now %u\n", new_bigram.word_1, new_bigram.word_2, count, bigram_count);
83 | //#pragma omp section
84 | { map_update_bigram(new_bigram_map_rev, &new_bigram_rev, count); }
85 | }
86 | }
87 | }
88 |
89 | inline void map_add_entry(struct_map_word **map, char * restrict entry_key, const word_count_t count) { // Based on uthash's docs
90 | struct_map_word *local_s;
91 |
92 | //HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash?
93 | //if (local_s == NULL) {
94 | local_s = (struct_map_word *)malloc(sizeof(struct_map_word));
95 | unsigned short strlen_entry_key = strlen(entry_key);
96 | local_s->key = malloc(strlen_entry_key + 1);
97 | strcpy(local_s->key, entry_key);
98 | HASH_ADD_KEYPTR(hh, *map, local_s->key, strlen_entry_key, local_s);
99 | //}
100 | local_s->count = count;
101 | }
102 |
103 | inline void map_add_class(struct_map_word_class **map, const char * restrict entry_key, const unsigned long word_count, const wclass_t entry_class) {
104 | struct_map_word_class *local_s;
105 |
106 | //HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash?
107 | //if (local_s == NULL) {
108 | local_s = (struct_map_word_class *)malloc(sizeof(struct_map_word_class));
109 | strncpy(local_s->key, entry_key, KEYLEN-1); local_s->key[KEYLEN-1] = '\0'; // strncpy() alone doesn't terminate overlong keys
110 | HASH_ADD_STR(*map, key, local_s);
111 | //}
112 | local_s->word_count = word_count;
113 | local_s->class = entry_class;
114 | }
115 |
116 | inline void map_update_class(struct_map_word_class **map, const char * restrict entry_key, const wclass_t entry_class) { // wclass_t, to match the prototype in clustercat-map.h
117 | struct_map_word_class *local_s;
118 |
119 | HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash?
120 | if (local_s == NULL) {
121 | local_s = (struct_map_word_class *)malloc(sizeof(struct_map_word_class));
122 | strncpy(local_s->key, entry_key, KEYLEN-1); local_s->key[KEYLEN-1] = '\0'; // strncpy() alone doesn't terminate overlong keys
123 | HASH_ADD_STR(*map, key, local_s);
124 | }
125 | local_s->class = entry_class;
126 | }
127 |
128 | inline void map_set_word_id(struct_map_word **map, const char * restrict entry_key, const word_id_t word_id) {
129 | struct_map_word *local_s; // local_s->word_id uninitialized here; assign value after filtering
130 |
131 | #pragma omp critical (map_set_word_id_lookup)
132 | {
133 | HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash?
134 | }
135 | if (local_s == NULL) {
136 | printf("Error: map_set_word_id(): word '%s' should already be in word_map\n", entry_key); // Shouldn't happen
137 | exit(5);
138 | }
139 | #pragma omp critical (map_set_word_id_assignment)
140 | { local_s->word_id = word_id; }
141 | }
142 |
143 | inline word_id_t map_increment_count(struct_map_word **map, const char * restrict entry_key, const word_id_t word_id) { // Based on uthash's docs
144 | struct_map_word *local_s; // local_s->word_id uninitialized here; assign value after filtering
145 |
146 | #pragma omp critical (map_increment_count_lookup)
147 | {
148 | HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash?
149 | if (local_s == NULL) {
150 | local_s = (struct_map_word *)malloc(sizeof(struct_map_word));
151 | local_s->count = 0;
152 | local_s->word_id = word_id;
153 | unsigned short strlen_entry_key = strlen(entry_key);
154 | local_s->key = malloc(strlen_entry_key + 1);
155 | strcpy(local_s->key, entry_key);
156 | HASH_ADD_KEYPTR(hh, *map, local_s->key, strlen_entry_key, local_s);
157 | }
158 | }
159 | #pragma omp critical (map_increment_count_increment)
160 | { ++local_s->count; }
161 | //printf("map: count of %s is now %u\n", entry_key, local_s->count);
162 | return local_s->word_id;
163 | }
164 |
165 | inline wclass_count_t map_increment_count_fixed_width(struct_map_class **map, const wclass_t entry_key[const]) { // Based on uthash's docs
166 | struct_map_class *local_s;
167 | size_t sizeof_key = sizeof(wclass_t) * CLASSLEN;
168 | //printf("map++: sizeof_key=%zu, CLASSLEN=%u, cls_entry=[%hu,%hu,%hu,%hu]\n", sizeof_key, CLASSLEN, entry_key[0], entry_key[1], entry_key[2], entry_key[3]);
169 |
170 | //#pragma omp critical // not needed since each thread gets its own class_map
171 | {
172 | //printf("***41***: sizeof_key=%zu, sizeof(wclass_t)=%zu, CLASSLEN=%u, key=<%u,%u,%u,%u>\n", sizeof_key, sizeof(wclass_t), CLASSLEN, entry_key[0], entry_key[1], entry_key[2], entry_key[3]); fflush(stdout);
173 | HASH_FIND(hh, *map, entry_key, sizeof_key, local_s); // id already in the hash?
174 | if (local_s == NULL) {
175 | local_s = (struct_map_class *)malloc(sizeof(struct_map_class));
176 | local_s->count = 0;
177 | memcpy(local_s->key, entry_key, sizeof_key);
178 | HASH_ADD(hh, *map, key, sizeof_key, local_s);
179 | }
180 | //printf("\t***42***: count: %u\n", local_s->count); fflush(stdout);
181 | }
182 | #pragma omp critical (map_increment_count_fixed_width_increment)
183 | { ++local_s->count; }
184 | //printf("map: count of [%hu,%hu,%hu,%hu] is now %u\n", entry_key[0],entry_key[1],entry_key[2],entry_key[3], local_s->count);
185 | return local_s->count;
186 | }
187 |
188 | inline wclass_count_t map_find_count_fixed_width(struct_map_class *map[const], const wclass_t entry_key[const]) { // Based on uthash's docs
189 | struct_map_class *local_s;
190 | size_t sizeof_key = sizeof(wclass_t) * CLASSLEN;
191 | wclass_count_t local_count = 0;
192 |
193 | HASH_FIND(hh, *map, entry_key, sizeof_key, local_s); // id already in the hash?
194 | if (local_s != NULL) { // Deal with OOV
195 | local_count = local_s->count;
196 | }
197 | //printf("map: count=%u for cls_entry=[%hu,%hu,%hu,%hu]\n", local_count, entry_key[0], entry_key[1], entry_key[2], entry_key[3]);
198 | return local_count;
199 | }
200 |
201 | inline word_id_t map_update_count(struct_map_word **map, const char * restrict entry_key, const word_count_t count, const word_id_t word_id) { // Based on uthash's docs
202 | struct_map_word *local_s;
203 |
204 | #pragma omp critical
205 | {
206 | HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash?
207 | if (local_s == NULL) {
208 | local_s = (struct_map_word *)malloc(sizeof(struct_map_word));
209 | local_s->count = count;
210 | local_s->word_id = word_id;
211 | unsigned short strlen_entry_key = strlen(entry_key);
212 | local_s->key = malloc(strlen_entry_key + 1);
213 | strcpy(local_s->key, entry_key);
214 | HASH_ADD_KEYPTR(hh, *map, local_s->key, strlen_entry_key, local_s);
215 | } else {
216 | local_s->count += count;
217 | }
218 | }
219 | return local_s->word_id;
220 | }
221 |
222 | inline word_count_t map_find_count(struct_map_word *map[const], const char * restrict entry_key) { // Based on uthash's docs
223 | struct_map_word *local_s;
224 | word_count_t local_count = 0;
225 |
226 | HASH_FIND_STR(*map, entry_key, local_s); // local_s: output pointer
227 | if (local_s != NULL) { // Deal with OOV
228 | local_count = local_s->count;
229 | }
230 | return local_count;
231 | }
232 |
233 | inline word_id_t map_find_id(struct_map_word *map[const], const char * restrict entry_key, const word_id_t unknown_id) { // Based on uthash's docs
234 | struct_map_word *local_s;
235 | word_id_t local_id = unknown_id;
236 |
237 | HASH_FIND_STR(*map, entry_key, local_s);
238 | if (local_s != NULL) { // Deal with OOV
239 | local_id = local_s->word_id;
240 | }
241 | return local_id;
242 | }
243 |
244 | struct_map_word map_find_entry(struct_map_word *map[const], const char * restrict entry_key) { // Based on uthash's docs
245 | struct_map_word *local_s;
246 |
247 | HASH_FIND_STR(*map, entry_key, local_s);
248 | return *local_s; // assumes entry_key is present; a failed lookup would dereference NULL here
249 | }
250 |
251 | inline wclass_t get_class(struct_map_word_class *map[const], const char * restrict entry_key, const wclass_t unk) {
252 | struct_map_word_class *local_s;
253 |
254 | HASH_FIND_STR(*map, entry_key, local_s); // local_s: output pointer
255 | if (local_s != NULL) { // Word is found
256 | return local_s->class;
257 | } else { // Word is not found
258 | return unk;
259 | }
260 | }
261 |
262 | word_id_t get_keys(struct_map_word *map[const], char *keys[]) {
263 | struct_map_word *entry, *tmp;
264 | word_id_t number_of_keys = 0;
265 |
266 | HASH_ITER(hh, *map, entry, tmp) {
267 | // Build-up array of keys
268 | unsigned short wlen = strlen(entry->key);
269 | keys[number_of_keys] = (char *) malloc(wlen + 1);
270 | strcpy(keys[number_of_keys], entry->key);
271 | //printf("key=%s, i=%lu, count=%u\n", entry->key, (unsigned long)number_of_keys, entry->count);
272 | number_of_keys++;
273 | }
274 | return number_of_keys;
275 | }
276 |
277 | word_id_t get_ids(struct_map_word *map[const], word_id_t word_ids[restrict]) { // most useful if map is already sorted by count; then you can directly map from old id to new id.
278 | struct_map_word *entry, *tmp;
279 | word_id_t number_of_keys = 0; // 0-2 are reserved for <unk>, <s>, and </s>
280 |
281 | HASH_ITER(hh, *map, entry, tmp) {
282 | //word_ids[number_of_keys] = entry->word_id; // Build-up array of word_id's, from new id to old one
283 | const word_id_t word_id = entry->word_id;
284 | //if (word_id < 3) // don't change id's for <unk>, <s>, or </s>
285 | // continue;
286 | word_ids[word_id] = number_of_keys; // Build-up array of word_id's, from old id to new one
287 | //printf("get_ids: old_id=%u\n", word_id); fflush(stdout);
288 | number_of_keys++;
289 | }
290 | return number_of_keys;
291 | }
292 |
293 | void delete_entry(struct_map_word **map, struct_map_word *entry) { // Based on uthash's docs
294 | HASH_DEL(*map, entry); // entry: pointer to deletee
295 | free(entry->key); // key is a malloc'd string
296 | free(entry);
297 | }
298 |
299 | void delete_all(struct_map_word **map) { // Note: keys are deliberately not freed here; word_list may still point to them
300 | struct_map_word *current_entry, *tmp;
301 |
302 | HASH_ITER(hh, *map, current_entry, tmp) { // Based on uthash's docs
303 | HASH_DEL(*map, current_entry); // delete it (map advances to next)
304 | free(current_entry); // free it
305 | }
306 | }
307 |
308 | void delete_all_class(struct_map_class **map) {
309 | struct_map_class *current_entry, *tmp;
310 |
311 | HASH_ITER(hh, *map, current_entry, tmp) { // Based on uthash's docs
312 | HASH_DEL(*map, current_entry); // delete it (map advances to next)
313 | free(current_entry); // free it
314 | }
315 | }
316 |
317 | void delete_all_bigram(struct_map_bigram **map) {
318 | struct_map_bigram *current_entry, *tmp;
319 |
320 | HASH_ITER(hh, *map, current_entry, tmp) { // Based on uthash's docs
321 | HASH_DEL(*map, current_entry); // delete it (map advances to next)
322 | free(current_entry); // free it
323 | }
324 | }
325 |
326 | void print_words_and_classes(FILE * out_file, word_id_t type_count, char **word_list, const word_count_t word_counts[const], const wclass_t word2class[const], const int class_offset, const bool print_freqs) {
327 | struct_map_word_class *map = NULL;
328 |
329 | for (word_id_t word_id = 0; word_id < type_count; word_id++) { // Populate new word2class_map, so we can do fun stuff like primary- and secondary-sort easily
330 | //printf("adding %s=%hu to temp word2class_map\n", word_list[word_id], word2class[word_id]); fflush(stdout);
331 | map_add_class(&map, word_list[word_id], (unsigned long)word_counts[word_id], word2class[word_id]);
332 | }
333 |
334 | sort_by_key(&map); // Tertiary sort, alphabetically by key
335 | word_class_sort_by_count(&map); // Secondary sort, by count
336 | sort_by_class(&map); // Primary sort, numerically by class
337 |
338 | struct_map_word_class *s, *tmp;
339 | HASH_ITER(hh, map, s, tmp) { // safe iteration: tmp saves the next pointer before s is freed
340 | fprintf(out_file, "%s\t%li", s->key, (long)(s->class) + class_offset);
341 | if (print_freqs)
342 | fprintf(out_file, "\t%lu", (long unsigned)(s->word_count));
343 | fprintf(out_file, "\n");
344 | HASH_DEL(map, s); // delete it (iteration continues via tmp)
345 | free(s); // key is an in-struct array (char key[KEYLEN]), so freeing the entry frees it too
346 | //fprintf(stderr, "49.11: next=%zu\n", (size_t)tmp); fflush(stderr);
347 | }
348 | }
349 |
350 | int count_sort(struct_map_word *a, struct_map_word *b) { // Based on uthash's docs
351 | return (b->count > a->count) - (b->count < a->count); // sort descending: most frequent to least frequent; avoids unsigned-subtraction wraparound
352 | }
353 |
354 | void sort_by_count(struct_map_word **map) { // Based on uthash's docs
355 | HASH_SORT(*map, count_sort);
356 | }
357 |
358 | int id_sort(struct_map_word *a, struct_map_word *b) {
359 | return (a->word_id > b->word_id) - (a->word_id < b->word_id); // sort ascending; avoids unsigned-subtraction wraparound
360 | }
361 |
362 | void sort_by_id(struct_map_word **map) {
363 | HASH_SORT(*map, id_sort);
364 | }
365 |
366 | int word_class_count_sort(struct_map_word_class *a, struct_map_word_class *b) {
367 | return (b->word_count > a->word_count) - (b->word_count < a->word_count); // sort descending: most frequent to least frequent
368 | }
369 |
370 | void word_class_sort_by_count(struct_map_word_class **map) {
371 | HASH_SORT(*map, word_class_count_sort);
372 | }
373 |
374 | int key_sort(struct_map_word_class *a, struct_map_word_class *b) {
375 | return strcmp(a->key, b->key);
376 | }
377 |
378 | void sort_by_key(struct_map_word_class **map) {
379 | HASH_SORT(*map, key_sort);
380 | }
381 |
382 | int class_sort(struct_map_word_class *a, struct_map_word_class *b) { // Based on uthash's docs
383 | return (a->class - b->class);
384 | }
385 |
386 | void sort_by_class(struct_map_word_class **map) {
387 | HASH_SORT(*map, class_sort);
388 | }
389 |
390 | inline int bigram_sort_word_1(struct_map_bigram *a, struct_map_bigram *b) { // Based on uthash's docs
391 | return ((a->key).word_1 > (b->key).word_1) - ((a->key).word_1 < (b->key).word_1); // avoids unsigned wraparound
392 | }
393 |
394 | inline int bigram_sort_word_2(struct_map_bigram *a, struct_map_bigram *b) { // Based on uthash's docs
395 | return ((a->key).word_2 > (b->key).word_2) - ((a->key).word_2 < (b->key).word_2); // avoids unsigned wraparound
396 | }
397 |
398 | void sort_bigrams(struct_map_bigram **map) {
399 | HASH_SORT(*map, bigram_sort_word_2);
400 | //HASH_SORT(*map, bigram_sort_word_1);
401 | }
402 |
403 | unsigned long map_count(struct_map_word *map[const]) {
404 | return HASH_COUNT(*map);
405 | }
406 |
407 | unsigned long map_print_entries(struct_map_word **map, const char * restrict prefix, const char sep_char, const word_count_t min_count) {
408 | struct_map_word *entry, *tmp;
409 | unsigned long number_of_entries = 0;
410 |
411 | HASH_ITER(hh, *map, entry, tmp) {
412 | if (entry->count >= min_count) {
413 | printf("%s%s%c%lu\n", prefix, entry->key, sep_char, (unsigned long)entry->count);
414 | number_of_entries++;
415 | }
416 | }
417 | return number_of_entries;
418 | }
419 |
--------------------------------------------------------------------------------
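A minimal usage sketch for the word-map functions above; it is not part of the repository, and assumes clustercat-map.h is on the include path with the file above linked in. The literal "<unk>" stands in for the UNKNOWN_WORD macro defined elsewhere in the repo:

#include <stdio.h>
#include "clustercat-map.h"

int main(void) {
	struct_map_word *word_map = NULL;           // uthash maps must start out NULL
	map_update_count(&word_map, "<unk>", 0, 0); // the three reserved entries get word_id's 0-2
	map_update_count(&word_map, "<s>", 0, 1);
	map_update_count(&word_map, "</s>", 0, 2);
	map_increment_count(&word_map, "cat", 3);   // new word: added with the next free id (3)
	map_increment_count(&word_map, "cat", 4);   // existing word: id unchanged, count becomes 2
	printf("count(cat)=%u, id(cat)=%u\n",       // prints count(cat)=2, id(cat)=3
	       map_find_count(&word_map, "cat"),
	       map_find_id(&word_map, "cat", 0));
	delete_all(&word_map);
	return 0;
}
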
/src/clustercat-map.h:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDE_CLUSTERCAT_MAP_HEADER
2 | #define INCLUDE_CLUSTERCAT_MAP_HEADER
3 |
4 | #include <stdio.h>
5 | #include <stdbool.h>
6 | #include "uthash.h"
7 |
8 | #ifdef ATA_STORE_KHASH
9 | #include "khash.h"
10 | KHASH_MAP_INIT_STR(struct_khash_float, float);
11 | #endif
12 |
13 | // Defaults
14 | #define KEYLEN 80
15 | #define CLASSLEN 3 // Longest possible class ngram to store
16 | typedef unsigned short wclass_t; // Max number of word classes
17 | typedef unsigned int wclass_count_t; // Max count of a given word class
18 | typedef unsigned int word_id_t; // Max number of words
19 | typedef unsigned int word_count_t; // Max count of a given word
20 | typedef unsigned int word_bigram_count_t; // Max count of a given word bigram
21 | typedef unsigned int class_bigram_count_t; // Max count of a given class bigram
22 | typedef unsigned int word_class_count_t; // Max count of a given <word, class> tuple
23 |
24 | typedef struct {
25 | word_id_t word_1;
26 | word_id_t word_2;
27 | } struct_word_bigram;
28 |
29 |
30 | typedef struct { // We need an O(1) map that we can iterate over later
31 | struct_word_bigram key;
32 | word_bigram_count_t count;
33 | UT_hash_handle hh; // makes this structure hashable
34 | } struct_map_bigram;
35 |
36 | typedef struct {
37 | char * restrict key;
38 | word_count_t count;
39 | word_id_t word_id;
40 | UT_hash_handle hh; // makes this structure hashable
41 | } struct_map_word;
42 |
43 | typedef struct { // Maps a class n-gram (up to CLASSLEN classes) to its count
44 | wclass_t key[CLASSLEN];
45 | wclass_count_t count;
46 | UT_hash_handle hh; // makes this structure hashable
47 | } struct_map_class;
48 |
49 | typedef struct { // Maps a word to its class
50 | char key[KEYLEN];
51 | unsigned long word_count;
52 | wclass_t class;
53 | UT_hash_handle hh; // makes this structure hashable
54 | } struct_map_word_class;
55 |
56 | void map_add_entry(struct_map_word **map, char * restrict entry_key, const word_count_t count);
57 |
58 | void map_add_class(struct_map_word_class **map, const char * restrict entry_key, const unsigned long word_count, const wclass_t entry_class);
59 |
60 | void map_update_class(struct_map_word_class **map, const char * restrict entry_key, const wclass_t entry_class);
61 |
62 | void map_set_word_id(struct_map_word **map, const char * restrict entry_key, const word_id_t word_id);
63 |
64 | word_id_t map_increment_count(struct_map_word **map, const char * restrict entry_key, const word_id_t word_id);
65 |
66 | wclass_count_t map_increment_count_fixed_width(struct_map_class **map, const wclass_t entry_key[const]);
67 |
68 | bool map_increment_bigram(struct_map_bigram **map, const struct_word_bigram * bigram);
69 | bool map_update_bigram(struct_map_bigram **map, const struct_word_bigram * bigram, const word_bigram_count_t count);
70 | void map_print_bigrams(struct_map_bigram **map, char **word_list);
71 | void remap_and_rev_bigram_map(struct_map_bigram ** initial_bigram_map, struct_map_bigram ** new_bigram_map, struct_map_bigram ** new_bigram_map_rev, word_id_t * restrict word_id_remap, const word_id_t real_unk_id);
72 |
73 | word_id_t map_update_count(struct_map_word **map, const char * restrict entry_key, const word_count_t count, const word_id_t word_id);
74 |
75 | struct_map_word map_find_entry(struct_map_word *map[const], const char * restrict entry_key);
76 | word_count_t map_find_count(struct_map_word *map[const], const char * restrict entry_key);
77 | wclass_count_t map_find_count_fixed_width(struct_map_class *map[const], const wclass_t entry_key[const]);
78 |
79 | word_id_t map_find_id(struct_map_word *map[const], const char * restrict entry_key, const word_id_t unknown_id);
80 |
81 | wclass_t get_class(struct_map_word_class *map[const], const char * restrict entry_key, const wclass_t unk);
82 |
83 | word_id_t get_keys(struct_map_word *map[const], char *keys[]);
84 | word_id_t get_ids(struct_map_word *map[const], word_id_t word_ids[restrict]);
85 |
86 | void sort_by_class(struct_map_word_class **map);
87 | void sort_by_key(struct_map_word_class **map);
88 | void sort_by_id(struct_map_word **map);
89 | void sort_by_count(struct_map_word **map);
90 | void word_class_sort_by_count(struct_map_word_class **map);
91 | void sort_bigrams(struct_map_bigram **map);
92 |
93 | unsigned long map_count(struct_map_word *map[const]);
94 |
95 | unsigned long map_print_entries(struct_map_word **map, const char * restrict prefix, const char sep_char, const word_count_t min_count);
96 | void print_words_and_classes(FILE * out_file, word_id_t type_count, char **word_list, const word_count_t word_counts[const], const wclass_t word2class[const], const int class_offset, const bool print_freqs);
97 |
98 | void delete_all(struct_map_word **map);
99 | void delete_all_class(struct_map_class **map);
100 | void delete_all_bigram(struct_map_bigram **map);
101 | void delete_entry(struct_map_word **map, struct_map_word *entry);
102 |
103 | #endif // INCLUDE_HEADER
104 |
--------------------------------------------------------------------------------
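A companion sketch for the bigram map declared above, under the same assumptions (not part of the repository):

#include <stdio.h>
#include "clustercat-map.h"

int main(void) {
	struct_map_bigram *bigram_map = NULL;  // must start out NULL for uthash
	const struct_word_bigram bg = {1, 2};  // set both members: HASH_FIND compares the struct key bytewise
	const bool was_new = map_increment_bigram(&bigram_map, &bg); // true: first sighting, count = 1
	map_update_bigram(&bigram_map, &bg, 5);                      // false: existing entry, count is now 6
	printf("first call added a new entry? %s\n", was_new ? "yes" : "no");
	delete_all_bigram(&bigram_map);
	return 0;
}
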
/src/clustercat-math.c:
--------------------------------------------------------------------------------
1 | #include "clustercat.h" // Model importing/exporting functions
2 | #include "clustercat-math.h"
3 |
4 | double dot_product(const double probs[const], const double weights[const], int length) { // weight-normalized dot product (i.e. a weighted average)
5 | double sum = 0;
6 | double sum_weights = 0;
7 | length--;
8 |
9 | for (; length >= 0; --length) {
10 | sum_weights += weights[length];
11 | sum += probs[length] * weights[length];
12 | //printf("dot_product: sum=%g += probs[%i]=%g * weights[%i]=%g; length=%i;\n", sum, length, probs[length], length, weights[length], length);
13 | }
14 | //printf("dot_product: final sum = %g = prob_sum=%g/weight_sum=%g\n", sum/sum_weights, sum, sum_weights);
15 | return sum_weights ? (sum / sum_weights) : 0.0;
16 | }
17 |
18 | float dot_productf(const float probs[const], const float weights[const], int length) { // weight-normalized dot product (i.e. a weighted average)
19 | float sum = 0;
20 | float sum_weights = 0;
21 | length--;
22 |
23 | for (; length >= 0; --length) {
24 | sum_weights += weights[length];
25 | sum += probs[length] * weights[length];
26 | //printf("dot_product: sum=%g += probs[%i]=%g * weights[%i]=%g; length=%i;\n", sum, length, probs[length], length, weights[length], length);
27 | }
28 | //printf("dot_product: final sum = %g = prob_sum=%g/weight_sum=%g\n", sum/sum_weights, sum, sum_weights);
29 | return sum_weights ? (sum / sum_weights) : 0.0;
30 | }
31 |
32 | long int powi(long int base, long int exp) { // Integer exponentiation
33 | long int result = 1;
34 | while (exp--)
35 | result *= base;
36 | return result;
37 | }
38 |
39 | double perplexity(const double log_probs, const unsigned long num_words_queried) {
40 | // Assumes log_probs used log2()
41 | return pow(2, -log_probs / (double)num_words_queried);
42 | }
43 |
44 |
--------------------------------------------------------------------------------
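A worked example for perplexity() and powi() above, with values chosen for easy mental arithmetic (not part of the repository; compile with -lm, since perplexity() calls pow()): three words at probability 0.25 each give a summed log2 probability of -6, so the perplexity is 2^(6/3) = 4.

#include <stdio.h>
#include "clustercat-math.h"

int main(void) {
	const double log2_prob_sum = -2.0 + -2.0 + -2.0;     // log2(0.25) = -2, for each of 3 words
	printf("ppl  = %g\n", perplexity(log2_prob_sum, 3)); // 2^(-(-6)/3) = 4
	printf("2^10 = %ld\n", powi(2, 10));                 // integer exponentiation: 1024
	return 0;
}
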
/src/clustercat-math.h:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDE_CLUSTERCAT_MATH
2 | #define INCLUDE_CLUSTERCAT_MATH
3 |
4 | double dot_product(const double probs[const], const double weights[const], int length);
5 | float dot_productf(const float probs[const], const float weights[const], int length);
6 |
7 | long int powi(long int base, long int exp);
8 |
9 | double perplexity(const double log_probs, const unsigned long num_words_queried);
10 |
11 | #endif // INCLUDE_HEADER
12 |
--------------------------------------------------------------------------------
/src/clustercat-tokenize.c:
--------------------------------------------------------------------------------
1 | #include <string.h> // strcspn(), strncpy()
2 | #include "clustercat-tokenize.h"
3 |
4 | // Simple threadsafe tokenization for plaintext, copying words into **sent_words
5 | // Remember to free using tokenize_simple_free()
6 | sentlen_t tokenize_simple(char * restrict sent_string, char * restrict * restrict sent_words) {
7 | sentlen_t i;
8 | char * restrict pch;
9 |
10 | sent_words[0] = "<s>";
11 |
12 | for (i = 1, pch = sent_string; i < SENT_LEN_MAX ; i++) {
13 | sentlen_t toklen = strcspn(pch, " \n\t");
14 |
15 | if (toklen == 0) { // End of sentence
16 | sent_words[i] = "</s>";
17 | break;
18 | }
19 |
20 | sent_words[i] = malloc(toklen+1);
21 | strncpy(sent_words[i], pch, toklen); // Threadsafe copy doesn't touch original
22 | sent_words[i][toklen] = '\0';
23 |
24 | pch += toklen+1;
25 | }
26 |
27 | return i;
28 | }
29 |
30 | void tokenize_simple_free(char ** restrict sent_words, sentlen_t length) {
31 | sentlen_t i = 1;
32 | for (; i < length-1; ++i) { // Assumes word_0 is <s> and word_sentlen is </s>, which weren't malloc'd
33 | free(sent_words[i]);
34 | }
35 | free(sent_words);
36 | }
37 |
--------------------------------------------------------------------------------
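A minimal usage sketch for tokenize_simple() above (not part of the repository; sentlen_t and SENT_LEN_MAX come from clustercat.h via the header). The input should end in whitespace, as a line read from a corpus does, since the scanner advances one character past each token:

#include <stdio.h>
#include <stdlib.h>
#include "clustercat-tokenize.h"

int main(void) {
	char sent[] = "the cat sat\n"; // ends in whitespace, like a corpus line
	char **words = malloc(sizeof(char*) * SENT_LEN_MAX);
	const sentlen_t last = tokenize_simple(sent, words); // words[0] is "<s>"; words[last] is "</s>"
	for (sentlen_t i = 0; i <= last; i++)
		printf("[%s] ", words[i]); // [<s>] [the] [cat] [sat] [</s>]
	printf("\n");
	tokenize_simple_free(words, last+1); // frees words[1..last-1], then the array itself
	return 0;
}
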
/src/clustercat-tokenize.h:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDE_CLUSTERCAT_TOKENIZE
2 | #define INCLUDE_CLUSTERCAT_TOKENIZE
3 |
4 | #include "clustercat.h"
5 |
6 | sentlen_t tokenize_simple(char * restrict sent_string, char * restrict * restrict sent_words);
7 | void tokenize_simple_free(char ** restrict sent_words, sentlen_t length);
8 |
9 | #endif // INCLUDE_HEADER
10 |
--------------------------------------------------------------------------------
/src/clustercat.c:
--------------------------------------------------------------------------------
1 | /** Induces word categories
2 | * By Jon Dehdari, 2014-2016
3 | * Usage: ./clustercat [options] < corpus.tok.txt > classes.tsv
4 | **/
5 |
6 | #include <limits.h> // UCHAR_MAX, UINT_MAX
7 | #include <float.h> // DBL_MAX, etc.
8 | #include <math.h> // isnan()
9 | #include <time.h> // clock_t, clock(), CLOCKS_PER_SEC
10 | #include <libgen.h> // basename()
11 | #include <locale.h> // OPTIONAL! Comment-out on non-Posix machines, and the function setlocale() in the first line of main()
12 |
13 | #include "clustercat.h" // Model importing/exporting functions
14 | #include "clustercat-array.h" // which_maxf()
15 | #include "clustercat-data.h"
16 | #include "clustercat-cluster.h" // cluster()
17 | #include "clustercat-dbg.h" // for printing out various complex data structures
18 | #include "clustercat-import-class-file.h" // import_class_file()
19 | #include "clustercat-io.h" // process_input()
20 | #include "clustercat-math.h" // perplexity(), powi()
21 |
22 | #define USAGE_LEN 10000
23 | #define LOG2ADD(a,b) (log2(a) + log2(1 + (b) / (a) ))
24 |
25 | // Declarations
26 | void get_usage_string(char * restrict usage_string, int usage_len);
27 | void parse_cmd_args(const int argc, char **argv, char * restrict usage, struct cmd_args *cmd_args);
28 | char * restrict class_algo = NULL;
29 | char * restrict in_train_file_string = NULL;
30 | char * restrict out_file_string = NULL;
31 | char * restrict initial_class_file = NULL;
32 | char * argv_0_basename = NULL;
33 |
34 | struct_map_word *word_map = NULL; // Must initialize to NULL
35 | struct_map_bigram *initial_bigram_map = NULL; // Must initialize to NULL
36 | struct_map_bigram *new_bigram_map = NULL; // Must initialize to NULL
37 | struct_map_bigram *new_bigram_map_rev = NULL; // Must initialize to NULL
38 | char usage[USAGE_LEN];
39 | size_t memusage = 0;
40 |
41 |
42 | // Defaults
43 | struct cmd_args cmd_args = {
44 | .class_algo = EXCHANGE,
45 | .class_offset = 0,
46 | .forward_lambda = 0.55,
47 | .min_count = 3, // or max(2, floor(N^0.14 - 7))
48 | .max_array = 2,
49 | .ngram_input = false,
50 | .num_threads = 8,
51 | .num_classes = 0,
52 | .print_freqs = false,
53 | .print_word_vectors = NO_VEC,
54 | .refine = 2,
55 | .rev_alternate = 3,
56 | .tune_cycles = 15,
57 | .unidirectional = false,
58 | .verbose = 0,
59 | };
60 |
61 |
62 |
63 | int main(int argc, char **argv) {
64 | setlocale(LC_ALL, ""); // Comment-out on non-Posix systems
65 | clock_t time_start = clock();
66 | time_t time_t_start;
67 | time(&time_t_start);
68 | argv_0_basename = basename(argv[0]);
69 | get_usage_string(usage, USAGE_LEN); // This is a big scary string, so build it elsewhere
70 |
71 | //printf("sizeof(cmd_args)=%zd\n", sizeof(cmd_args));
72 | parse_cmd_args(argc, argv, usage, &cmd_args);
73 |
74 | if (cmd_args.class_algo == EXCHANGE || cmd_args.class_algo == EXCHANGE_BROWN)
75 | memusage += sizeof(float) * ENTROPY_TERMS_MAX; // We'll build the precomputed entropy terms after reporting memusage
76 |
77 | struct_model_metadata global_metadata;
78 |
79 | // The list of unique words should always include <unk>, <s>, and </s>
80 | map_update_count(&word_map, UNKNOWN_WORD, 0, 0); // Should always be first
81 | map_update_count(&word_map, "<s>", 0, 1);
82 | map_update_count(&word_map, "</s>", 0, 2);
83 |
84 | // Open input
85 | FILE *in_train_file = stdin;
86 | if (in_train_file_string)
87 | in_train_file = fopen(in_train_file_string, "r");
88 | if (in_train_file == NULL) {
89 | fprintf(stderr, "%s: Error: Unable to open input file %s\n", argv_0_basename, in_train_file_string); fflush(stderr);
90 | exit(15);
91 | }
92 |
93 | // Process input sentences
94 | size_t input_memusage = 0;
95 | const struct_model_metadata input_model_metadata = process_input(cmd_args, in_train_file, &word_map, &initial_bigram_map, &input_memusage);
96 | memusage += input_memusage;
97 | fclose(in_train_file);
98 |
99 | clock_t time_input_processed = clock();
100 | if (cmd_args.verbose >= -1) {
101 | fprintf(stderr, "%s: Corpus processed in %'.2f CPU secs. %'lu lines, %'u types, %'lu tokens, current memusage: %'.1fMB\n", argv_0_basename, (double)(time_input_processed - time_start)/CLOCKS_PER_SEC, input_model_metadata.line_count, input_model_metadata.type_count, input_model_metadata.token_count, (double)memusage / 1048576); fflush(stderr);
102 | }
103 |
104 | global_metadata.token_count = input_model_metadata.token_count;
105 | global_metadata.type_count = map_count(&word_map);
106 |
107 | // Filter out infrequent words, reassign word_id's, and build a mapping from old word_id's to new word_id's
108 | sort_by_count(&word_map);
109 | word_id_t * restrict word_id_remap = calloc(sizeof(word_id_t), input_model_metadata.type_count);
110 | get_ids(&word_map, word_id_remap);
111 | word_id_t number_of_deleted_words = filter_infrequent_words(cmd_args, &global_metadata, &word_map, word_id_remap);
112 |
113 | // Get list of unique words
114 | char * * restrict word_list = (char **)malloc(sizeof(char*) * global_metadata.type_count);
115 | memusage += sizeof(char*) * global_metadata.type_count;
116 | reassign_word_ids(&word_map, word_list, word_id_remap);
117 | get_keys(&word_map, word_list);
118 | sort_by_id(&word_map);
119 |
120 |
121 | // Check or set number of classes
122 | if (cmd_args.num_classes >= global_metadata.type_count) { // User manually set number of classes is too low
123 | fprintf(stderr, "%s: Error: Number of classes (%u) is not less than vocabulary size (%u). Decrease the value of --classes\n", argv_0_basename, cmd_args.num_classes, global_metadata.type_count); fflush(stderr);
124 | exit(3);
125 | } else if (cmd_args.num_classes == 0) { // User did not manually set number of classes at all
126 | cmd_args.num_classes = (wclass_t) (sqrt(global_metadata.type_count) * 1.2);
127 | }
128 |
129 | // Build array of word_counts
130 | word_count_t * restrict word_counts = malloc(sizeof(word_count_t) * global_metadata.type_count);
131 | memusage += sizeof(word_count_t) * global_metadata.type_count;
132 | build_word_count_array(&word_map, word_list, word_counts, global_metadata.type_count);
133 |
134 | // Initialize clusters, and possibly read-in external class file
135 | wclass_t * restrict word2class = malloc(sizeof(wclass_t) * global_metadata.type_count);
136 | memusage += sizeof(wclass_t) * global_metadata.type_count;
137 | init_clusters(cmd_args, global_metadata.type_count, word2class, word_counts, word_list);
138 | if (initial_class_file != NULL)
139 | import_class_file(&word_map, word2class, initial_class_file, cmd_args.num_classes); // Overwrite subset of word mappings, from user-provided initial_class_file
140 |
141 | // Remap word_id's in initial_bigram_map
142 | remap_and_rev_bigram_map(&initial_bigram_map, &new_bigram_map, &new_bigram_map_rev, word_id_remap, map_find_id(&word_map, UNKNOWN_WORD, -1));
143 | global_metadata.start_sent_id = map_find_id(&word_map, "<s>", -1); // need this for tallying emission probs
144 | global_metadata.end_sent_id = map_find_id(&word_map, "</s>", -1); // need this for tallying emission probs
145 | global_metadata.line_count = map_find_count(&word_map, "</s>"); // Used for calculating perplexity
146 |
147 | if (global_metadata.line_count == 0) {
148 | fprintf(stderr, "%s: Warning: Number of lines is 0. Include <s> and </s> in your n-gram counts, or perplexity values will be unreliable.\n", argv_0_basename); fflush(stderr);
149 | }
150 |
151 | //printf("init_bigram_map hash_count=%u\n", HASH_COUNT(initial_bigram_map)); fflush(stdout);
152 | //printf("new_bigram_map hash_count=%u\n", HASH_COUNT(new_bigram_map)); fflush(stdout);
153 | free(word_id_remap);
154 | memusage -= sizeof(word_id_t) * input_model_metadata.type_count;
155 | delete_all(&word_map); // static
156 | delete_all_bigram(&initial_bigram_map); // static
157 | memusage -= input_memusage;
158 |
159 | // Initialize and set word bigram listing
160 | clock_t time_bigram_start = clock();
161 | size_t bigram_memusage = 0; size_t bigram_rev_memusage = 0;
162 | struct_word_bigram_entry * restrict word_bigrams = NULL;
163 | struct_word_bigram_entry * restrict word_bigrams_rev = NULL;
164 |
165 | if (cmd_args.verbose >= -1) {
166 | fprintf(stderr, "%s: Word bigram listing ... ", argv_0_basename); fflush(stderr);
167 | }
168 |
169 | #pragma omp parallel sections // Both bigram listing and reverse bigram listing can be done in parallel
170 | {
171 | #pragma omp section
172 | {
173 | //sort_bigrams(&new_bigram_map); // speeds things up later
174 | word_bigrams = calloc(global_metadata.type_count, sizeof(struct_word_bigram_entry));
175 | memusage += sizeof(struct_word_bigram_entry) * global_metadata.type_count;
176 | bigram_memusage = set_bigram_counts(word_bigrams, new_bigram_map);
177 | // Copy entries in word_counts to struct_word_bigram_entry.headword_count since that struct entry is already loaded when clustering
178 | for (word_id_t word = 0; word < global_metadata.type_count; word++)
179 | word_bigrams[word].headword_count = word_counts[word];
180 | }
181 |
182 | // Initialize and set *reverse* word bigram listing
183 | #pragma omp section
184 | {
185 | if (cmd_args.rev_alternate) { // Don't bother building this if it won't be used
186 | //sort_bigrams(&new_bigram_map_rev); // speeds things up later
187 | word_bigrams_rev = calloc(global_metadata.type_count, sizeof(struct_word_bigram_entry));
188 | memusage += sizeof(struct_word_bigram_entry) * global_metadata.type_count;
189 | bigram_rev_memusage = set_bigram_counts(word_bigrams_rev, new_bigram_map_rev);
190 | // Copy entries in word_counts to struct_word_bigram_entry.headword_count since that struct entry is already loaded when clustering
191 | for (word_id_t word = 0; word < global_metadata.type_count; word++)
192 | word_bigrams_rev[word].headword_count = word_counts[word];
193 | }
194 | }
195 | }
196 |
197 | delete_all_bigram(&new_bigram_map);
198 | delete_all_bigram(&new_bigram_map_rev);
199 | memusage += bigram_memusage + bigram_rev_memusage;
200 | clock_t time_bigram_end = clock();
201 | if (cmd_args.verbose >= -1) {
202 | fprintf(stderr, "in %'.2f CPU secs. Bigram memusage: %'.1f MB\n", (double)(time_bigram_end - time_bigram_start)/CLOCKS_PER_SEC, (bigram_memusage + bigram_rev_memusage)/(double)1048576); fflush(stderr);
203 | }
204 |
205 | //print_word_bigrams(global_metadata, word_bigrams, word_list);
206 |
207 | // Build <word, class> counts, which consist of a word followed by a given class
208 | word_class_count_t * restrict word_class_counts = calloc(1 + cmd_args.num_classes * global_metadata.type_count , sizeof(word_class_count_t));
209 | if (word_class_counts == NULL) {
210 | fprintf(stderr, "%s: Error: Unable to allocate enough memory for <word, class> counts. %'.1f MB needed. Maybe increase --min-count\n", argv_0_basename, ((cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / (double)1048576 )); fflush(stderr);
211 | exit(13);
212 | }
213 | memusage += cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t);
214 | fprintf(stderr, "%s: Allocating %'.1f MB for word_class_counts: num_classes=%u x type_count=%u x sizeof(w-cl-count_t)=%zu\n", argv_0_basename, (double)(cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / 1048576 , cmd_args.num_classes, global_metadata.type_count, sizeof(word_class_count_t)); fflush(stderr);
215 | build_word_class_counts(cmd_args, word_class_counts, word2class, word_bigrams, global_metadata.type_count/*, word_list*/);
216 | //print_word_class_counts(cmd_args, global_metadata, word_class_counts);
217 |
218 | // Build reverse <class, word> counts: a class followed by a word. This and the forward one are both pretty fast, so no need to parallelize this
219 | word_class_count_t * restrict word_class_rev_counts = NULL;
220 | if (cmd_args.rev_alternate) { // Don't bother building this if it won't be used
221 | word_class_rev_counts = calloc(1 + cmd_args.num_classes * global_metadata.type_count , sizeof(word_class_count_t));
222 | if (word_class_rev_counts == NULL) {
223 | fprintf(stderr, "%s: Warning: Unable to allocate enough memory for <class, word> counts. %'.1f MB needed. Falling back to --rev-alternate 0\n", argv_0_basename, ((cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / (double)1048576 )); fflush(stderr);
224 | cmd_args.rev_alternate = 0;
225 | } else {
226 | memusage += cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t);
227 | fprintf(stderr, "%s: Allocating %'.1f MB for word_class_rev_counts: num_classes=%u x type_count=%u x sizeof(w-cl-count_t)=%zu\n", argv_0_basename, (double)(cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / 1048576 , cmd_args.num_classes, global_metadata.type_count, sizeof(word_class_count_t)); fflush(stderr);
228 | build_word_class_counts(cmd_args, word_class_rev_counts, word2class, word_bigrams_rev, global_metadata.type_count/*, word_list*/);
229 | }
230 |
231 | }
232 |
233 | // Calculate memusage for count_arrays
234 | for (unsigned char i = 1; i <= cmd_args.max_array; i++) {
235 | memusage += 2 * (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t));
236 | //printf("11 memusage += %zu (now=%zu) count_arrays\n", 2 * (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)), memusage); fflush(stdout);
237 | }
238 |
239 | clock_t time_model_built = clock();
240 | if (cmd_args.verbose >= -1) {
241 | fprintf(stderr, "%s: Finished loading %'lu tokens and %'u types (%'u filtered) from %'lu lines in %'.2f CPU secs\n", argv_0_basename, global_metadata.token_count, global_metadata.type_count, number_of_deleted_words, global_metadata.line_count, (double)(time_model_built - time_start)/CLOCKS_PER_SEC); fflush(stderr);
242 | }
243 | if (cmd_args.verbose >= -1) {
244 | fprintf(stderr, "%s: Approximate memory usage at clustering: %'.1fMB\n", argv_0_basename, (double)memusage / 1048576); fflush(stderr);
245 | }
246 |
247 | cluster(cmd_args, global_metadata, word_counts, word_list, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts);
248 |
249 | // Now print the final word2class mapping
250 | if (cmd_args.verbose >= 0) {
251 | FILE *out_file = stdout;
252 | if (out_file_string)
253 | out_file = fopen(out_file_string, "w");
254 | if (out_file == NULL) {
255 | fprintf(stderr, "%s: Error: Unable to open output file %s\n", argv_0_basename, out_file_string); fflush(stderr);
256 | exit(16);
257 | }
258 | if (cmd_args.class_algo == EXCHANGE && (!cmd_args.print_word_vectors)) {
259 | print_words_and_classes(out_file, global_metadata.type_count, word_list, word_counts, word2class, (int)cmd_args.class_offset, cmd_args.print_freqs);
260 | } else if (cmd_args.class_algo == EXCHANGE && cmd_args.print_word_vectors) {
261 | print_words_and_vectors(out_file, cmd_args, global_metadata, word_list, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts);
262 | }
263 | fclose(out_file);
264 | }
265 |
266 | clock_t time_clustered = clock();
267 | time_t time_t_end;
268 | time(&time_t_end);
269 | double time_secs_total = difftime(time_t_end, time_t_start);
270 | if (cmd_args.verbose >= -1)
271 | fprintf(stderr, "%s: Finished clustering in %'.2f CPU seconds. Total wall clock time was about %lim %lis\n", argv_0_basename, (double)(time_clustered - time_model_built)/CLOCKS_PER_SEC, (long)time_secs_total/60, ((long)time_secs_total % 60) );
272 |
273 | free(word2class);
274 | free(word_bigrams);
275 | free(word_list);
276 | free(word_counts);
277 | exit(0);
278 | }
279 |
280 |
281 | void get_usage_string(char * restrict usage_string, int usage_len) {
282 |
283 | snprintf(usage_string, usage_len, "ClusterCat (c) 2014-2016 Jon Dehdari - LGPL v3 or Mozilla Public License v2\n\
284 | \n\
285 | Usage: clustercat [options] < corpus.tok.txt > classes.tsv \n\
286 | \n\
287 | Function: Induces word categories from plaintext\n\
288 | \n\
289 | Options:\n\
290 | -c, --classes Set number of word classes (default: 1.2 * square root of vocabulary size)\n\
291 | --class-file Initialize exchange word classes from an existing clustering tsv file (default: pseudo-random initialization\n\
292 | for exchange). If you use this option, you can probably set --tune-cycles to 3 or so\n\
293 | --class-offset Print final word classes starting at a given number (default: %d)\n\
294 | --forward-lambda Set interpolation weight for forward bigram class model, in range of [0,1] (default: %g)\n\
295 | -h, --help Print this usage\n\
296 | --in Specify input training file (default: stdin)\n\
297 | --ngram-input Input is a listing of n-grams and their counts. Otherwise input is a normal corpus\n\
298 | --min-count Minimum count of entries in training set to consider (default: %d occurrences)\n\
299 | --max-array Set maximum order of n-grams for which to use an array instead of a sparse hash map (default: %d-grams)\n\
300 | --out Specify output file (default: stdout)\n\
301 | --print-freqs Print word frequencies after words and classes in final clustering output (useful for visualization)\n\
302 | -q, --quiet Print less output. Use additional -q for even less output\n\
303 | --refine Set initial class refinement value (c==0 -> no refinement; otherwise 2^c initial clusters. Default: c==2 -> 4 initial clusters)\n\
304 | --rev-alternate How often to alternate using reverse predictive exchange. 0==never, 1==after every normal cycle (default: %u)\n\
305 | -j, --threads Set number of threads to run simultaneously (default: %d threads)\n\
306 | --tune-cycles Set max number of cycles to tune on (default: %d cycles)\n\
307 | --unidirectional Disable simultaneous bidirectional predictive exchange. Results in faster cycles, but slower & worse convergence\n\
308 | If you want to do basic predictive exchange, use: --rev-alternate 0 --unidirectional\n\
309 | -v, --verbose Print additional info to stderr. Use additional -v for more verbosity\n\
310 | --word-vectors Print word vectors (a.k.a. word embeddings) instead of discrete classes.\n\
311 | Specify as either 'text' or 'binary'. The binary format is compatible with word2vec\n\
312 | \n\
313 | ", cmd_args.class_offset, cmd_args.forward_lambda, cmd_args.min_count, cmd_args.max_array, cmd_args.rev_alternate, cmd_args.num_threads, cmd_args.tune_cycles);
314 | }
315 | // --class-algo Set class-induction algorithm {brown,exchange,exchange-then-brown} (default: exchange)\n\
316 | // -o, --order Maximum n-gram order in training set to consider (default: %d-grams)\n\
317 | // -w, --weights 'f f ...' Set class interpolation weights for: 3-gram, 2-gram, 1-gram, rev 2-gram, rev 3-gram. (default: %s)\n\
318 |
319 | void parse_cmd_args(int argc, char **argv, char * restrict usage, struct cmd_args *cmd_args) {
320 | for (int arg_i = 0; arg_i < argc; arg_i++) // Print command-line invocation, for reproducibility
321 | if (cmd_args->verbose >= -1) {
322 | fprintf(stderr, "%s ", argv[arg_i]); fflush(stderr);
323 | }
324 | if (cmd_args->verbose >= -1) {
325 | fprintf(stderr, "\n"); fflush(stderr);
326 | }
327 |
328 | for (int arg_i = 1; arg_i < argc; arg_i++) {
329 | if (!(strcmp(argv[arg_i], "-h") && strcmp(argv[arg_i], "--help"))) {
330 | printf("%s", usage);
331 | exit(0);
332 | } else if (!strcmp(argv[arg_i], "--class-algo")) {
333 | char * restrict class_algo_string = argv[arg_i+1];
334 | arg_i++;
335 | if (!strcmp(class_algo_string, "brown"))
336 | cmd_args->class_algo = BROWN;
337 | else if (!strcmp(class_algo_string, "exchange"))
338 | cmd_args->class_algo = EXCHANGE;
339 | else if (!strcmp(class_algo_string, "exchange-then-brown"))
340 | cmd_args->class_algo = EXCHANGE_BROWN;
341 | else { printf("%s", usage); exit(1); }
342 | } else if (!strcmp(argv[arg_i], "--class-file")) {
343 | initial_class_file = argv[arg_i+1];
344 | arg_i++;
345 | } else if (!strcmp(argv[arg_i], "--class-offset")) {
346 | cmd_args->class_offset = (signed char)atoi(argv[arg_i+1]);
347 | arg_i++;
348 | } else if (!strcmp(argv[arg_i], "--forward-lambda")) {
349 | cmd_args->forward_lambda = (float)atof(argv[arg_i+1]);
350 | arg_i++;
351 | } else if (!strcmp(argv[arg_i], "--in")) {
352 | in_train_file_string = argv[arg_i+1];
353 | arg_i++;
354 | } else if (!(strcmp(argv[arg_i], "-j") && strcmp(argv[arg_i], "--threads") && strcmp(argv[arg_i], "--jobs"))) {
355 | cmd_args->num_threads = (unsigned int) atol(argv[arg_i+1]);
356 | arg_i++;
357 | } else if (!strcmp(argv[arg_i], "--min-count")) {
358 | cmd_args->min_count = (unsigned int) atol(argv[arg_i+1]);
359 | arg_i++;
360 | } else if (!strcmp(argv[arg_i], "--max-array")) {
361 | cmd_args->max_array = (unsigned char) atol(argv[arg_i+1]);
362 | if ((cmd_args->max_array) < 1 || (cmd_args->max_array > 3)) {
363 | printf("%s: --max-array value should be between 1-3\n", argv_0_basename);
364 | fflush(stderr);
365 | exit(10);
366 | }
367 | arg_i++;
368 | } else if (!(strcmp(argv[arg_i], "--ngram-input"))) {
369 | cmd_args->ngram_input = true;
370 | } else if (!(strcmp(argv[arg_i], "-c") && strcmp(argv[arg_i], "-n") && strcmp(argv[arg_i], "--classes") && strcmp(argv[arg_i], "--num-classes"))) {
371 | cmd_args->num_classes = (wclass_t) atol(argv[arg_i+1]);
372 | arg_i++;
373 | } else if (!strcmp(argv[arg_i], "--out")) {
374 | out_file_string = argv[arg_i+1];
375 | arg_i++;
376 | } else if (!(strcmp(argv[arg_i], "--print-freqs"))) {
377 | cmd_args->print_freqs = true;
378 | } else if (!(strcmp(argv[arg_i], "-q") && strcmp(argv[arg_i], "--quiet"))) {
379 | cmd_args->verbose--;
380 | } else if (!(strcmp(argv[arg_i], "--refine"))) {
381 | cmd_args->refine = (unsigned char) atol(argv[arg_i+1]);
382 | arg_i++;
383 | } else if (!strcmp(argv[arg_i], "--rev-alternate")) {
384 | cmd_args->rev_alternate = (unsigned char) atoi(argv[arg_i+1]);
385 | arg_i++;
386 | } else if (!strcmp(argv[arg_i], "--tune-cycles")) {
387 | cmd_args->tune_cycles = (unsigned short) atol(argv[arg_i+1]);
388 | arg_i++;
389 | } else if (!(strcmp(argv[arg_i], "--unidirectional"))) {
390 | cmd_args->unidirectional = true;
391 | } else if (!(strcmp(argv[arg_i], "-v") && strcmp(argv[arg_i], "--verbose"))) {
392 | cmd_args->verbose++;
393 | } else if (!(strcmp(argv[arg_i], "--word-vectors"))) {
394 | char * restrict print_word_vectors_string = argv[arg_i+1];
395 | arg_i++;
396 | if (!strcmp(print_word_vectors_string, "text"))
397 | cmd_args->print_word_vectors = TEXT_VEC;
398 | else if (!strcmp(print_word_vectors_string, "binary"))
399 | cmd_args->print_word_vectors = BINARY_VEC;
400 | else { printf("Error: Please specify either 'text' or 'binary' after the --word-vectors flag.\n\n%s", usage); exit(1); }
401 | } else if (!strncmp(argv[arg_i], "-", 1)) { // Unknown flag
402 | printf("%s: Unknown command-line argument: %s\n\n", argv_0_basename, argv[arg_i]);
403 | printf("%s", usage); fflush(stderr);
404 | exit(2);
405 | }
406 | }
407 | }
408 |
409 | void build_word_count_array(struct_map_word **word_map, char * restrict word_list[const], word_count_t word_counts[restrict], const word_id_t type_count) {
410 | for (word_id_t i = 0; i < type_count; i++) {
411 | word_counts[i] = map_find_count(word_map, word_list[i]);
412 | }
413 | }
414 |
415 | void populate_word_ids(struct_map_word **word_map, char * restrict word_list[const], const word_id_t type_count) {
416 | for (word_id_t i = 0; i < type_count; i++) {
417 | map_set_word_id(word_map, word_list[i], i);
418 | }
419 | }
420 |
421 | void reassign_word_ids(struct_map_word **word_map, char * restrict word_list[restrict], word_id_t * restrict word_id_remap) {
422 | sort_by_count(word_map);
423 | struct_map_word *entry, *tmp;
424 | word_id_t i = 0;
425 |
426 | HASH_ITER(hh, *word_map, entry, tmp) {
427 | const word_id_t word_id = entry->word_id;
428 | char * word = entry->key;
429 | word_id_remap[word_id] = i; // set remap
430 | word_list[i] = entry->key;
431 | //printf("reassigning w=%s %u -> %u; count=%u\n", entry->key, word_id, i, entry->count); fflush(stdout);
432 | map_set_word_id(word_map, word, i); // reset word_id in word_map
433 | i++;
434 | }
435 | }
436 |
437 | word_id_t filter_infrequent_words(const struct cmd_args cmd_args, struct_model_metadata * restrict model_metadata, struct_map_word ** word_map, word_id_t * restrict word_id_remap) { // word_map must already be sorted by word frequency!
438 |
439 | unsigned long number_of_deleted_words = 0;
440 | unsigned long vocab_size = model_metadata->type_count; // Save this to separate variable since we'll modify model_metadata.type_count later
441 | // Get keys
442 | // Iterate over keys
443 | // If count of key_i < threshold,
444 | // increment count of <unk> by count of key_i,
445 | // decrement model_metadata.type_count by one
446 | // free & delete entry in map,
447 |
448 | char **local_word_list = (char **)malloc(model_metadata->type_count * sizeof(char*));
449 | //char * local_word_list[model_metadata->type_count];
450 | const word_id_t num_keys = get_keys(word_map, local_word_list); // call get_keys() once; it mallocs a copy of each key
451 | if (vocab_size != num_keys) {
452 | fprintf(stderr, "Error: model_metadata->type_count (%lu) != get_keys() (%lu)\n", (long unsigned) vocab_size, (long unsigned) num_keys); fflush(stderr); exit(4);
453 | }
454 |
455 | unsigned long new_id = 0;
456 | for (unsigned long word_i = 0; word_i < vocab_size; word_i++, new_id++) {
457 | char * word = local_word_list[word_i];
458 | //if ((!strncmp(word, UNKNOWN_WORD, MAX_WORD_LEN)) || (!strncmp(word, "<s>", MAX_WORD_LEN)) || (!strncmp(word, "</s>", MAX_WORD_LEN))) { // Deal with <unk>, <s>, and </s>
459 | // //new_id--;
460 | // continue;
461 | //}
462 |
463 | unsigned long word_i_count = map_find_count(word_map, word); // We'll use this a couple times
464 | if ((word_i_count < cmd_args.min_count) && (strncmp(word, UNKNOWN_WORD, MAX_WORD_LEN)) && (strncmp(word, "<s>", MAX_WORD_LEN)) && (strncmp(word, "</s>", MAX_WORD_LEN))) { // Don't delete <unk>, <s>, or </s>
465 | number_of_deleted_words++;
466 | if (cmd_args.verbose > 3) {
467 | printf("Filtering-out word: %s (old id=%lu, new id=0) (%lu < %hu);\tcount(%s)=%lu\n", word, word_i, (unsigned long)word_i_count, cmd_args.min_count, UNKNOWN_WORD, (unsigned long)map_find_count(word_map, UNKNOWN_WORD)); fflush(stdout);
468 | }
469 | word_id_remap[map_find_id(word_map, word, (word_id_t) -1)] = (word_id_t) -1; // set value of dud word in remap to temporary unk, which is -1. This gets changed later
470 | map_update_count(word_map, UNKNOWN_WORD, word_i_count, 0);
471 | model_metadata->type_count--;
472 | struct_map_word *local_s;
473 | HASH_FIND_STR(*word_map, word, local_s);
474 | delete_entry(word_map, local_s);
475 | } else { // Keep word
476 | //printf("Keeping word: %s (old id=%u, new id=%lu) (%lu >= %hu);\tcount(%s)=%u\n", word, map_find_id(word_map, word, -1), new_id, word_i_count, cmd_args.min_count, UNKNOWN_WORD, map_find_count(word_map, UNKNOWN_WORD)); fflush(stdout);
477 | 			//map_set_word_id(word_map, word, new_id); // word_id's 0-2 are reserved for <unk>, <s>, and </s>
478 | //printf(" Kept word: %s (new map id=%u, new_id=%lu) (%lu >= %hu);\tcount(%s)=%u\n", word, map_find_id(word_map, word, -1), new_id, word_i_count, cmd_args.min_count, UNKNOWN_WORD, map_find_count(word_map, UNKNOWN_WORD)); fflush(stdout);
479 | }
480 | }
481 | 	//map_set_word_id(word_map, UNKNOWN_WORD, 0); // word_id's 0-2 are reserved for <unk>, <s>, and </s>
482 | 	//map_set_word_id(word_map, "<s>", 1); // word_id's 0-2 are reserved for <unk>, <s>, and </s>
483 | 	//map_set_word_id(word_map, "</s>", 2); // word_id's 0-2 are reserved for <unk>, <s>, and </s>
484 |
485 | free(local_word_list);
486 | return number_of_deleted_words;
487 | }
488 |
489 | void tally_class_ngram_counts(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const struct_word_bigram_entry word_bigrams[const], const wclass_t word2class[const], count_arrays_t count_arrays) { // For now this is a drop-in replacement for tally_class_counts_in_store(), but it's not the best way of doing things (e.g. for unigram counts, tallying & querying in two separate steps), so it will need to be modified after getting rid of the sent-store
490 | for (word_id_t word_id = 0; word_id < model_metadata.type_count; word_id++) {
491 | const wclass_t headword_class = word2class[word_id];
492 | count_arrays[0][headword_class] += word_bigrams[word_id].headword_count;
493 | //printf("tally_class_ngram_counts: word=??, word_id=%u, type_count=%u, headword_class=%hu, headword_count=%u, class_count=%lu\n", word_id, model_metadata.type_count, headword_class, word_bigrams[word_id].headword_count, (unsigned long)count_arrays[0][headword_class]); fflush(stdout);
494 | for (unsigned int i = 0; i < word_bigrams[word_id].length; i++) {
495 | const word_id_t prev_word = word_bigrams[word_id].predecessors[i];
496 | wclass_t prev_class = word2class[prev_word];
497 | const size_t offset = prev_class + cmd_args.num_classes * headword_class;
498 | //printf(" tally_class_ngram_counts: prev_word=%u, prev_class=%hu, offset=%zu\n", prev_word, prev_class, offset); fflush(stdout);
499 | count_arrays[1][offset] += word_bigrams[word_id].bigram_counts[i];
500 | }
501 | }
502 | }
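// Worked example of the bigram offset above (assumed values): with
// cmd_args.num_classes == 100, a bigram whose predecessor falls in class 7
// and whose head word falls in class 3 is tallied at
// count_arrays[1][7 + 100*3] == count_arrays[1][307]; i.e. the flat bigram
// array is indexed as [headword_class][prev_class] in row-major order.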
503 |
504 |
505 | void init_clusters(const struct cmd_args cmd_args, word_id_t vocab_size, wclass_t word2class[restrict], const word_count_t word_counts[const], char * word_list[restrict]) {
506 | register unsigned long word_i = 0;
507 |
508 | 	if (cmd_args.class_algo == EXCHANGE || cmd_args.class_algo == EXCHANGE_BROWN) { // It doesn't really matter how you initialize word classes in the exchange algorithm. This assigns each word in the word list an incrementing class number in [0,num_classes-1], so it's a simple pseudo-randomized initialization.
509 | register wclass_t class = 0; // [0,num_classes-1]
510 | for (; word_i < vocab_size; word_i++, class++) {
511 | if (class == cmd_args.num_classes) // reset
512 | class = 0;
513 | if (cmd_args.verbose > 3)
514 | printf("cls=%-4u w_i=%-8lu #(w)=%-8u str(w)=%-20s vocab_size=%u\n", class, word_i, word_counts[word_i], word_list[word_i], vocab_size);
515 | word2class[word_i] = class;
516 | }
517 |
518 | } else if (cmd_args.class_algo == BROWN) { // Really simple initialization: one class per word
519 | for (unsigned long class = 0; word_i < vocab_size; word_i++, class++)
520 | word2class[word_i] = class;
521 | }
522 | }
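// Example of the exchange initialization (illustrative): with
// cmd_args.num_classes == 3 and a frequency-sorted word list
// {the, of, and, to, in, ...}, words are assigned round-robin:
// the->0, of->1, and->2, to->0, in->1, ... so each class starts with
// roughly vocab_size/num_classes words spanning all frequency ranges.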
523 |
524 | size_t set_bigram_counts(struct_word_bigram_entry * restrict word_bigrams, struct_map_bigram * bigram_map) {
525 |
526 | 	// Build a hash map of bigrams, since we need random access while traversing the corpus.
527 | 	// Then convert it to an array of per-word predecessor arrays, since we'll need sequential access during the clustering phase of predictive exchange clustering.
528 |
529 | sort_bigrams(&bigram_map);
530 |
531 | register size_t memusage = 0;
532 | register word_id_t word_2;
533 | register word_id_t word_2_last = 0;
534 | register unsigned int length = 0;
535 | word_id_t * word_buffer = malloc(sizeof(word_id_t) * MAX_WORD_PREDECESSORS);
536 | word_bigram_count_t * count_buffer = malloc(sizeof(word_bigram_count_t) * MAX_WORD_PREDECESSORS);
537 |
538 | // Add a dummy entry at the end of the hash map in order to simplify iterating through it, since it must track changes in head words.
539 | struct_word_bigram dummy = {-1, -1}; // Make sure this bigram is new, so that it's appended to end
540 | map_update_bigram(&bigram_map, &dummy, 0);
541 |
542 | // Iterate through bigram map to get counts of word_2's, so we know how much to allocate for each predecessor list
543 | struct_map_bigram *entry, *tmp;
544 | HASH_ITER(hh, bigram_map, entry, tmp) {
545 | word_2 = (entry->key).word_2;
546 | //printf("\n[%u,%u]=%u, w2_last=%u, length=%u\n", (entry->key).word_1, (entry->key).word_2, entry->count, word_2_last, length); fflush(stdout);
547 | 		if (word_2 == word_2_last) { // Same head word as the previous entry, i.e. a 2nd-or-later predecessor of this word_2
548 | 			if (length >= MAX_WORD_PREDECESSORS) { // Check bounds *before* writing, so the buffers can't overflow
549 | 				printf("Error: MAX_WORD_PREDECESSORS exceeded (%lu). Increase it in clustercat.h and recompile. Add the -B flag to 'make' to force recompilation.\n", (long unsigned int)MAX_WORD_PREDECESSORS); fflush(stdout);
550 | 				exit(14);
551 | 			}
552 | 			word_buffer[length] = (entry->key).word_1;
553 | 			count_buffer[length] = entry->count;
554 | 			length++;
555 |
556 | } else { // New entry; process previous entry
557 | word_bigrams[word_2_last].length = length;
558 | word_bigrams[word_2_last].predecessors = malloc(length * sizeof(word_id_t));
559 | memcpy(word_bigrams[word_2_last].predecessors, word_buffer, length * sizeof(word_id_t));
560 | memusage += length * sizeof(word_id_t);
561 | word_bigrams[word_2_last].bigram_counts = malloc(length * sizeof(word_bigram_count_t));
562 | memcpy(word_bigrams[word_2_last].bigram_counts, count_buffer , length * sizeof(word_bigram_count_t));
563 | memusage += length * sizeof(word_bigram_count_t);
564 | //printf("word_2_last=%u, length=%u word_1s: ", word_2_last, length);
565 | //for (unsigned int i = 0; i < length; i++) {
566 | // printf("<%u,%u> ", word_bigrams[word_2_last].predecessors[i], word_bigrams[word_2_last].bigram_counts[i]);
567 | //}
568 | //printf("\n");
569 |
570 | word_2_last = word_2;
571 | word_buffer[0] = (entry->key).word_1;
572 | count_buffer[0] = entry->count;
573 | length = 1;
574 | }
575 | }
576 |
577 | free(word_buffer);
578 | free(count_buffer);
579 | //delete_all_bigram(&map_bigram);
580 |
581 | return memusage;
582 | }
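// Toy illustration of the resulting layout (assumed counts): given bigram
// counts <word_1=5, word_2=9>:3 and <word_1=8, word_2=9>:1, the entry
// word_bigrams[9] ends up with length == 2, predecessors == {5, 8}, and
// bigram_counts == {3, 1}, so the exchange loop can scan all predecessors
// of word 9 sequentially instead of probing the hash map.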
583 |
584 | void build_word_class_counts(const struct cmd_args cmd_args, word_class_count_t * restrict word_class_counts, const wclass_t word2class[const], const struct_word_bigram_entry * const word_bigrams, const word_id_t type_count/*, char ** restrict word_list*/) {
585 | //long sum = 0;
586 | // set counts
587 | for (word_id_t word = 0; word < type_count; word++) {
588 | for (unsigned int i = 0; i < word_bigrams[word].length; i++) {
589 | word_id_t prev_word = word_bigrams[word].predecessors[i];
590 | const wclass_t class_i = word2class[word];
591 | word_class_counts[prev_word * cmd_args.num_classes + class_i] += word_bigrams[word].bigram_counts[i];
592 | //printf("i=%hu, <%s,%s>=<%u,%u>, =<%u,%u>, num_classes=%u, offset=%u (%u * %u + %u), orig_val=%u\n", i, word_list[prev_word], word_list[word], prev_word, word, prev_word, class_i, cmd_args.num_classes, prev_word * cmd_args.num_classes + class_i, prev_word, cmd_args.num_classes, class_i, word_class_counts[prev_word * cmd_args.num_classes + class_i]); fflush(stdout);
593 | //sum += word_bigrams[word].bigram_counts[i];
594 | //printf(" <%u,%u>=%u at pos %zu\n", prev_word, class_i, word_class_counts[prev_word * cmd_args.num_classes + class_i], ((size_t)prev_word * cmd_args.num_classes + class_i)); fflush(stdout);
595 | }
596 | }
597 | //printf(": sum: %lu; [%u,%u,%u,%u,%u,%u,%u,%u,%u,%u...]\n", sum, word_class_counts[0], word_class_counts[1], word_class_counts[2], word_class_counts[3], word_class_counts[4], word_class_counts[5], word_class_counts[6], word_class_counts[7], word_class_counts[8], word_class_counts[9]);
598 | }
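// Worked example of the indexing above (assumed values): with
// cmd_args.num_classes == 100, prev_word == 42, and class(word) == 7, the
// bigram count is accumulated at word_class_counts[42*100 + 7], i.e. the
// flat array is laid out as [prev_word][class] in row-major order.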
599 |
600 | double training_data_log_likelihood(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const count_arrays_t count_arrays, const word_count_t word_counts[const], const wclass_t word2class[const]) {
601 | const double backward_lambda = 1 - cmd_args.forward_lambda;
602 |
603 | // Transition Probs
604 | double transition_logprob = 0;
605 | // Bigrams
606 | #pragma omp parallel for num_threads(cmd_args.num_threads) reduction(+:transition_logprob)
607 | for (word_bigram_count_t ngram = 0; ngram < (powi(cmd_args.num_classes, 2)); ngram++) {
608 | const class_bigram_count_t bigram_count = count_arrays[1][ngram];
609 | if (!bigram_count) // bigram doesn't exist in training set
610 | continue;
611 | const wclass_t c_1 = ngram % cmd_args.num_classes;
612 | const wclass_t c_2 = ngram / cmd_args.num_classes;
613 | const wclass_count_t c_1_count = count_arrays[0][c_1];
614 | const wclass_count_t c_2_count = count_arrays[0][c_2];
615 | const double a = cmd_args.forward_lambda * (bigram_count / (double)c_1_count);
616 | const double b = backward_lambda * (bigram_count / (double)c_2_count);
617 | transition_logprob += LOG2ADD(a,b) * bigram_count;
618 | //printf("ngram=%u, c_1=%u, #(c_1)=%lu, c_2=%u, #(c_2)=%lu, #(c_1,c_2)=%lu, trans_prob=%g\n", ngram, c_1, (unsigned long)c_1_count, c_2, (unsigned long)c_2_count, (unsigned long)bigram_count, transition_logprob); fflush(stdout);
619 | }
620 |
621 | // Emission Probs
622 | //long double emission_prob = 0;
623 | double emission_logprob = 0;
624 | //#pragma omp parallel for num_threads(cmd_args.num_threads) reduction(+:emission_logprob)
625 | for (word_id_t word = 0; word < model_metadata.type_count; word++) {
626 | 		//if (word == model_metadata.start_sent_id) // Don't tally emission prob for <s>
627 | // continue;
628 | const word_count_t word_count = word_counts[word];
629 | 		if (!word_count) // Don't tally emission prob for <unk> if min-count is 1, since then its count is 0
630 | continue;
631 | const wclass_t class = word2class[word];
632 | const wclass_count_t class_count = count_arrays[0][class];
633 | emission_logprob += log2(word_count / (double)class_count) * word_count;
634 | //printf("word=%u, class=%u, emission_logprob=%g after += %g = log2(word_count=%lu / class_count=%u) * word_count=%lu\n", word, (unsigned int)class, emission_logprob, log2(word_count / (double)class_count) * word_count, (unsigned long)word_count, class_count, (unsigned long)word_count); fflush(stdout);
635 | }
636 |
637 | //printf("emission_logprob=%g, transition_logprob=%g, LL=%g\n", emission_logprob, transition_logprob, emission_logprob + transition_logprob);
638 | return emission_logprob + transition_logprob;
639 | }
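// The returned value is the training-set log-likelihood of the two-sided
// class bigram model, assuming LOG2ADD(a,b) computes log2(a+b):
//
//   LL = sum_{c1,c2} #(c1,c2) * log2( lambda*#(c1,c2)/#(c1) + (1-lambda)*#(c1,c2)/#(c2) )
//      + sum_{w}     #(w)     * log2( #(w) / #(class(w)) )
//
// where lambda == cmd_args.forward_lambda, the first sum ranges over
// attested class bigrams (transition probs), and the second over word
// types (emission probs).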
640 |
641 | void init_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays) {
642 | for (unsigned char i = 1; i <= cmd_args.max_array; i++) { // Start with unigrams in count_arrays[0], ...
643 | count_arrays[i-1] = calloc(powi(cmd_args.num_classes, i), sizeof(wclass_count_t)); // powi() is in clustercat-math.c
644 | if (count_arrays[i-1] == NULL) {
645 | fprintf(stderr, "%s: Error: Unable to allocate enough memory for %u-grams. I tried to allocate %zu MB per thread (%zuB * %u^%u). Reduce the number of desired classes using --classes (current value: %u)\n", argv_0_basename, i, sizeof(wclass_count_t) * powi(cmd_args.num_classes, i) / 1048576, sizeof(wclass_count_t), cmd_args.num_classes, i, cmd_args.num_classes ); fflush(stderr);
646 | exit(12);
647 | }
648 | //printf("Allocating %zu B (cmd_args.num_classes=%u^i=%u * sizeof(uint)=%zu)\n", (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)), cmd_args.num_classes, i, sizeof(wclass_count_t));
649 | }
650 | }
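// Example allocation sizes (assuming sizeof(wclass_count_t) == 4): with
// --classes 1000 and cmd_args.max_array == 2, count_arrays[0] holds 1000
// unigram cells (~4 KB) and count_arrays[1] holds 1000^2 bigram cells
// (~4 MB); each additional order multiplies the size by num_classes.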
651 |
652 | void clear_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays) {
653 | for (unsigned char i = 1; i <= cmd_args.max_array; i++) { // Start with unigrams in count_arrays[0], ...
654 | memset(count_arrays[i-1], 0, powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)); // powi() is in clustercat-math.c
655 | }
656 | }
657 |
658 | void free_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays) {
659 | for (unsigned char i = 1; i <= cmd_args.max_array; i++) { // Start with unigrams in count_arrays[0], ...
660 | free(count_arrays[i-1]);
661 | }
662 | }
663 |
--------------------------------------------------------------------------------
/src/clustercat.h:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDE_CLUSTERCAT_HEADER
2 | #define INCLUDE_CLUSTERCAT_HEADER
3 |
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <string.h>
7 | #include <stdbool.h>
8 | #include <math.h>    // log(), exp(), pow()
9 | #include <libgen.h>  // basename()
10 | #include <limits.h>  // USHRT_MAX, UINT_MAX
11 | #include <stdint.h>
12 | #include "clustercat-math.h" // powi()
13 |
14 | // Defaults
15 | #define PRIMARY_SEP_CHAR '\t'
16 | #define PRIMARY_SEP_STRING "\t"
17 | #define SECONDARY_SEP_CHAR ' '
18 | #define SECONDARY_SEP_STRING " "
19 | #define TOK_CHARS " \t\n"
20 | #define UNKNOWN_WORD "<unk>"
21 | // Number of characters to read-in for each line
22 | #define STDIN_SENT_MAX_CHARS 8000
23 | #define MAX_WORD_LEN 128
24 | #define MAX_WORD_PREDECESSORS 20000000
25 | #define ENTROPY_TERMS_MAX 10000000
26 |
27 | enum class_algos {EXCHANGE, BROWN, EXCHANGE_BROWN};
28 | enum print_word_vectors {NO_VEC, TEXT_VEC, BINARY_VEC};
29 |
30 | #include "clustercat-data.h" // bad. chicken-and-egg typedef deps
31 |
32 | typedef unsigned short sentlen_t; // Number of words in a sentence
33 | #define SENT_LEN_MAX USHRT_MAX
34 | //typedef unsigned short wclass_t; // Defined in clustercat-map.h
35 | //typedef unsigned int word_id_t; // Defined in clustercat-map.h
36 | typedef word_count_t * * restrict count_arrays_t;
37 | typedef word_count_t * restrict count_array_t;
38 |
39 | typedef struct {
40 | unsigned long token_count;
41 | unsigned long line_count;
42 | word_id_t type_count;
43 | word_id_t start_sent_id; // need this for tallying emission probs
44 | word_id_t end_sent_id; // need this for tallying emission probs
45 | } struct_model_metadata;
46 |
47 | // typedef {...} struct_word_bigram; // see clustercat-map.h
48 |
49 | typedef struct { // Each element of an array of these structs holds a pointer to the array of predecessors of a given word, those predecessors' bigram counts, and the length of those arrays
50 | word_id_t * predecessors;
51 | word_bigram_count_t * bigram_counts;
52 | unsigned long length;
53 | word_count_t headword_count;
54 | } struct_word_bigram_entry;
55 |
56 | extern char *argv_0_basename; // Allow for global access to filename
57 |
58 | struct cmd_args {
59 | float forward_lambda;
60 | wclass_t num_classes;
61 | unsigned short min_count : 12;
62 | signed char verbose : 4; // Negative values increasingly suppress normal output
63 | unsigned short tune_cycles : 8;
64 | unsigned char refine; // 0=no refinement; otherwise 2^n
65 | signed char class_offset: 4;
66 | unsigned short num_threads : 8;
67 | 	unsigned char rev_alternate: 3; // How often to alternate using reverse pex. 0 == never, 1 == after every normal pex cycle, ...
68 | unsigned char max_array : 2;
69 | unsigned char class_algo : 2; // enum class_algos
70 | unsigned char print_word_vectors : 2; // enum print_word_vectors
71 | bool ngram_input;
72 | bool print_freqs;
73 | bool unidirectional;
74 | };
75 |
76 | void populate_word_ids(struct_map_word **ngram_map, char * restrict unique_words[const], const word_id_t type_count);
77 | void reassign_word_ids(struct_map_word **word_map, char * restrict word_list[restrict], word_id_t * restrict word_id_remap);
78 | void build_word_count_array(struct_map_word **ngram_map, char * restrict unique_words[const], word_count_t word_counts[restrict], const word_id_t type_count);
79 |
80 | void tally_class_ngram_counts(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const struct_word_bigram_entry word_bigrams[const], const wclass_t word2class[const], count_arrays_t count_arrays);
81 | word_id_t filter_infrequent_words(const struct cmd_args cmd_args, struct_model_metadata * restrict model_metadata, struct_map_word ** ngram_map, word_id_t * restrict word_id_remap);
82 | void init_clusters(const struct cmd_args cmd_args, word_id_t vocab_size, wclass_t word2class[restrict], const word_count_t word_counts[const], char * word_list[restrict]);
83 | size_t set_bigram_counts(struct_word_bigram_entry * restrict word_bigrams, struct_map_bigram * bigram_map);
84 | void build_word_class_counts(const struct cmd_args cmd_args, word_class_count_t * restrict word_class_counts, const wclass_t word2class[const], const struct_word_bigram_entry * const word_bigrams, const word_id_t type_count/*, char ** restrict word_list*/);
85 | double training_data_log_likelihood(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const count_arrays_t count_arrays, const word_count_t word_counts[const], const wclass_t word2class[const]);
86 |
87 | void init_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays);
88 | void clear_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays);
89 | void free_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays);
90 |
91 | // Like atoi/strtol, but doesn't interpret each char's ascii value 0..9. Hence [104,101] ("he") -> 26725 (i.e. (104*256)+101), and [3,7,11] -> 198411 ((3*256*256) + (7*256) + 11)
92 | // Using a class n-gram array is fast, at the expense of memory usage for lots of unattested ngrams, especially for higher-order n-grams.
93 | // Trigrams are probably the highest order you'd want to use as an array, since the memory usage would be: sizeof(wclass_t) * |C|^3 where |C| is the number of word classes.
94 | // |C| can be represented using an unsigned short (16 bits == 65k classes) for exchange clustering, but probably should be an unsigned int (32 bit == 4 billion classes) for Brown clustering, since initially every word type is its own class.
95 | inline size_t array_offset(wclass_t * pointer, const unsigned int max, const wclass_t num_classes) {
96 | register uint_fast8_t ptr_i = 1;
97 | register size_t total_offset = (*pointer);
98 |
99 | for (; ptr_i < max; ptr_i++) { // little endian
100 | //printf("1: atosize_t: pointer=%p; all vals: [%hu,%hu,%hu]; total_offset=%zu; max=%u\n", pointer, *pointer, *(pointer+1), *(pointer+2), total_offset, max); fflush(stdout);
101 | total_offset += (pointer[ptr_i]) * powi(num_classes, ptr_i);
102 | //printf("2: adding ((pointer[%u]=%u)* powi(%hu, %u)=%lu)=%lu\n", ptr_i, pointer[ptr_i], num_classes, ptr_i, powi(num_classes, ptr_i), pointer[ptr_i] * powi(num_classes, ptr_i)); fflush(stdout);
103 | }
104 | //printf("3: atosize_t: pointer=%p; val0=%hu; total_offset=%zu; max=%u\n\n", pointer, *pointer, total_offset, max); fflush(stdout);
105 | return total_offset;
106 | }
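// Worked example (illustrative): with num_classes == 10 and max == 3, the
// class sequence {4, 2, 7} (little endian, so pointer[0] is the
// least-significant digit) maps to offset 4 + 2*10 + 7*100 == 724 in the
// flat trigram count array.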
107 |
108 |
109 |
110 | #endif // INCLUDE_CLUSTERCAT_HEADER
111 |
--------------------------------------------------------------------------------
/src/ext/uthash/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2005-2014, Troy D. Hanson http://troydhanson.github.com/uthash/
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright
8 | notice, this list of conditions and the following disclaimer.
9 |
10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
11 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
12 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
13 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
14 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
15 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
16 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
17 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
18 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
19 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
20 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21 |
22 |
--------------------------------------------------------------------------------
/src/ext/uthash/README.md:
--------------------------------------------------------------------------------
1 |
2 | Documentation for uthash is available at:
3 |
4 | http://troydhanson.github.com/uthash/
5 |
6 |
7 |
--------------------------------------------------------------------------------
/src/ext/word2vec/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/src/ext/word2vec/README.txt:
--------------------------------------------------------------------------------
1 | Tools for computing distributed representation of words
2 | ------------------------------------------------------
3 |
4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts.
5 |
6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous
7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should specify the following:
8 | - desired vector dimensionality
9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model
10 | - training algorithm: hierarchical softmax and / or negative sampling
11 | - threshold for downsampling the frequent words
12 | - number of threads to use
13 | - the format of the output word vector file (text or binary)
14 |
15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets.
16 |
17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training
18 | is finished, the user can interactively explore the similarity of the words.
19 |
20 | More information about the scripts is provided at https://code.google.com/p/word2vec/
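An example invocation of the upstream word2vec trainer (parameter values are illustrative, not recommendations):

  ./word2vec -train corpus.txt -output vectors.bin -cbow 1 -size 100 -window 5 -negative 5 -hs 0 -sample 1e-4 -threads 4 -binary 1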
21 |
22 |
--------------------------------------------------------------------------------
/src/ext/word2vec/distance.c:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | #include <malloc.h>
19 |
20 | const long long max_size = 2000; // max length of strings
21 | const long long N = 40; // number of closest words that will be shown
22 | const long long max_w = 50; // max length of vocabulary entries
23 |
24 | int main(int argc, char **argv) {
25 | FILE *f;
26 | char st1[max_size];
27 | char *bestw[N];
28 | char file_name[max_size], st[100][max_size];
29 | float dist, len, bestd[N], vec[max_size];
30 | long long words, size, a, b, c, d, cn, bi[100];
31 | float *M;
32 | char *vocab;
33 | if (argc < 2) {
34 | printf("Usage: ./distance \nwhere FILE contains word projections in the BINARY FORMAT\n");
35 | return 0;
36 | }
37 | strcpy(file_name, argv[1]);
38 | f = fopen(file_name, "rb");
39 | if (f == NULL) {
40 | printf("Input file not found\n");
41 | return -1;
42 | }
43 | fscanf(f, "%lld", &words);
44 | fscanf(f, "%lld", &size);
45 | vocab = (char *)malloc((long long)words * max_w * sizeof(char));
46 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
47 | M = (float *)malloc((long long)words * (long long)size * sizeof(float));
48 | if (M == NULL) {
49 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
50 | return -1;
51 | }
52 | for (b = 0; b < words; b++) {
53 | a = 0;
54 | while (1) {
55 | vocab[b * max_w + a] = fgetc(f);
56 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
57 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
58 | }
59 | vocab[b * max_w + a] = 0;
60 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
61 | len = 0;
62 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
63 | len = sqrt(len);
64 | for (a = 0; a < size; a++) M[a + b * size] /= len;
65 | }
66 | fclose(f);
67 | while (1) {
68 | for (a = 0; a < N; a++) bestd[a] = 0;
69 | for (a = 0; a < N; a++) bestw[a][0] = 0;
70 | printf("Enter word or sentence (EXIT or CTRL-d to break): ");
71 | a = 0;
72 | while (1) {
73 | st1[a] = fgetc(stdin);
74 | if ((st1[a] == '\n') || (a >= max_size - 1)) {
75 | st1[a] = 0;
76 | break;
77 | }
78 | a++;
79 | }
80 | if ((!strcmp(st1, "EXIT")) || st1[0] == -1) {
81 | printf("\n");
82 | break;
83 | }
84 | cn = 0;
85 | b = 0;
86 | c = 0;
87 | while (1) {
88 | st[cn][b] = st1[c];
89 | b++;
90 | c++;
91 | st[cn][b] = 0;
92 | if (st1[c] == 0) break;
93 | if (st1[c] == ' ') {
94 | cn++;
95 | b = 0;
96 | c++;
97 | }
98 | }
99 | cn++;
100 | for (a = 0; a < cn; a++) {
101 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
102 | if (b == words) b = -1;
103 | bi[a] = b;
104 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]);
105 | if (b == -1) {
106 | printf("Out of dictionary word!\n");
107 | break;
108 | }
109 | }
110 | if (b == -1) continue;
111 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n");
112 | for (a = 0; a < size; a++) vec[a] = 0;
113 | for (b = 0; b < cn; b++) {
114 | if (bi[b] == -1) continue;
115 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
116 | }
117 | len = 0;
118 | for (a = 0; a < size; a++) len += vec[a] * vec[a];
119 | len = sqrt(len);
120 | for (a = 0; a < size; a++) vec[a] /= len;
121 | for (a = 0; a < N; a++) bestd[a] = -1;
122 | for (a = 0; a < N; a++) bestw[a][0] = 0;
123 | for (c = 0; c < words; c++) {
124 | a = 0;
125 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
126 | if (a == 1) continue;
127 | dist = 0;
128 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
129 | for (a = 0; a < N; a++) {
130 | if (dist > bestd[a]) {
131 | for (d = N - 1; d > a; d--) {
132 | bestd[d] = bestd[d - 1];
133 | strcpy(bestw[d], bestw[d - 1]);
134 | }
135 | bestd[a] = dist;
136 | strcpy(bestw[a], &vocab[c * max_w]);
137 | break;
138 | }
139 | }
140 | }
141 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
142 | }
143 | return 0;
144 | }
145 |
--------------------------------------------------------------------------------
/src/ext/word2vec/makefile:
--------------------------------------------------------------------------------
1 | CC = gcc
2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
4 |
5 | all: distance word-analogy
6 |
7 | distance : distance.c
8 | $(CC) distance.c -o distance $(CFLAGS)
9 | word-analogy : word-analogy.c
10 | $(CC) word-analogy.c -o word-analogy $(CFLAGS)
11 |
12 | clean:
13 | rm -rf distance word-analogy
14 |
--------------------------------------------------------------------------------
/src/ext/word2vec/word-analogy.c:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | #include <malloc.h>
19 |
20 | const long long max_size = 2000; // max length of strings
21 | const long long N = 40; // number of closest words that will be shown
22 | const long long max_w = 50; // max length of vocabulary entries
23 |
24 | int main(int argc, char **argv) {
25 | FILE *f;
26 | char st1[max_size];
27 | char bestw[N][max_size];
28 | char file_name[max_size], st[100][max_size];
29 | float dist, len, bestd[N], vec[max_size];
30 | long long words, size, a, b, c, d, cn, bi[100];
31 | float *M;
32 | char *vocab;
33 | if (argc < 2) {
34 | printf("Usage: ./word-analogy \nwhere FILE contains word projections in the BINARY FORMAT\n");
35 | return 0;
36 | }
37 | strcpy(file_name, argv[1]);
38 | f = fopen(file_name, "rb");
39 | if (f == NULL) {
40 | printf("Input file not found\n");
41 | return -1;
42 | }
43 | fscanf(f, "%lld", &words);
44 | fscanf(f, "%lld", &size);
45 | vocab = (char *)malloc((long long)words * max_w * sizeof(char));
46 | M = (float *)malloc((long long)words * (long long)size * sizeof(float));
47 | if (M == NULL) {
48 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
49 | return -1;
50 | }
51 | for (b = 0; b < words; b++) {
52 | a = 0;
53 | while (1) {
54 | vocab[b * max_w + a] = fgetc(f);
55 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
56 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
57 | }
58 | vocab[b * max_w + a] = 0;
59 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
60 | len = 0;
61 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
62 | len = sqrt(len);
63 | for (a = 0; a < size; a++) M[a + b * size] /= len;
64 | }
65 | fclose(f);
66 | while (1) {
67 | for (a = 0; a < N; a++) bestd[a] = 0;
68 | for (a = 0; a < N; a++) bestw[a][0] = 0;
69 | printf("Enter three words (EXIT or CTRL-d to break): ");
70 | a = 0;
71 | while (1) {
72 | st1[a] = fgetc(stdin);
73 | if ((st1[a] == '\n') || (a >= max_size - 1)) {
74 | st1[a] = 0;
75 | break;
76 | }
77 | a++;
78 | }
79 | if ((!strcmp(st1, "EXIT")) || st1[0] == -1) {
80 | printf("\n");
81 | break;
82 | }
83 | cn = 0;
84 | b = 0;
85 | c = 0;
86 | while (1) {
87 | st[cn][b] = st1[c];
88 | b++;
89 | c++;
90 | st[cn][b] = 0;
91 | if (st1[c] == 0) break;
92 | if (st1[c] == ' ') {
93 | cn++;
94 | b = 0;
95 | c++;
96 | }
97 | }
98 | cn++;
99 | if (cn < 3) {
100 | printf("Only %lld words were entered.. three words are needed at the input to perform the calculation\n", cn);
101 | continue;
102 | }
103 | for (a = 0; a < cn; a++) {
104 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
105 | if (b == words) b = 0;
106 | bi[a] = b;
107 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]);
108 | if (b == 0) {
109 | printf("Out of dictionary word!\n");
110 | break;
111 | }
112 | }
113 | if (b == 0) continue;
114 | printf("\n Word Distance\n------------------------------------------------------------------------\n");
115 | for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size];
116 | len = 0;
117 | for (a = 0; a < size; a++) len += vec[a] * vec[a];
118 | len = sqrt(len);
119 | for (a = 0; a < size; a++) vec[a] /= len;
120 | for (a = 0; a < N; a++) bestd[a] = 0;
121 | for (a = 0; a < N; a++) bestw[a][0] = 0;
122 | for (c = 0; c < words; c++) {
123 | if (c == bi[0]) continue;
124 | if (c == bi[1]) continue;
125 | if (c == bi[2]) continue;
126 | a = 0;
127 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
128 | if (a == 1) continue;
129 | dist = 0;
130 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
131 | for (a = 0; a < N; a++) {
132 | if (dist > bestd[a]) {
133 | for (d = N - 1; d > a; d--) {
134 | bestd[d] = bestd[d - 1];
135 | strcpy(bestw[d], bestw[d - 1]);
136 | }
137 | bestd[a] = dist;
138 | strcpy(bestw[a], &vocab[c * max_w]);
139 | break;
140 | }
141 | }
142 | }
143 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
144 | }
145 | return 0;
146 | }
147 |
--------------------------------------------------------------------------------
/visualization/d3/basque_cluster_thumbnail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jonsafari/clustercat/e6f618a5f70fe6de5f7c620ccaec22364f954aef/visualization/d3/basque_cluster_thumbnail.png
--------------------------------------------------------------------------------
/visualization/d3/french_cluster_thumbnail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jonsafari/clustercat/e6f618a5f70fe6de5f7c620ccaec22364f954aef/visualization/d3/french_cluster_thumbnail.png
--------------------------------------------------------------------------------
/visualization/d3/index.html:
--------------------------------------------------------------------------------
1 | <!-- HTML/CSS/JavaScript stripped during extraction. Recoverable page text: heading "Word Clusters"; caption "Click to zoom in/out"; footer links "Uses D3" and "Download json data". The page renders a zoomable D3 visualization of word clusters. -->
--------------------------------------------------------------------------------
/visualization/d3/russian_cluster_thumbnail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jonsafari/clustercat/e6f618a5f70fe6de5f7c620ccaec22364f954aef/visualization/d3/russian_cluster_thumbnail.png
--------------------------------------------------------------------------------