├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── Makefile ├── README.md ├── bin ├── digit_conflate.pl ├── flat_clusters2json.pl ├── hier2flat_no_freqs.sh ├── hier2flat_with_freqs.sh ├── lowercase.pl ├── mkcls ├── mkcls4brown ├── mkcls4word2vec └── ngram_counts.py ├── python ├── README.md └── clustercat.py ├── src ├── clustercat-array.c ├── clustercat-array.h ├── clustercat-cluster.c ├── clustercat-cluster.h ├── clustercat-data.h ├── clustercat-dbg.c ├── clustercat-dbg.h ├── clustercat-import-class-file.c ├── clustercat-import-class-file.h ├── clustercat-io.c ├── clustercat-io.h ├── clustercat-map.c ├── clustercat-map.h ├── clustercat-math.c ├── clustercat-math.h ├── clustercat-tokenize.c ├── clustercat-tokenize.h ├── clustercat.c ├── clustercat.h └── ext │ ├── uthash │ ├── LICENSE │ ├── README.md │ └── src │ │ └── uthash.h │ └── word2vec │ ├── LICENSE │ ├── README.txt │ ├── distance.c │ ├── makefile │ └── word-analogy.c └── visualization └── d3 ├── basque_cluster_thumbnail.png ├── french_cluster_thumbnail.png ├── index.html └── russian_cluster_thumbnail.png /.gitignore: -------------------------------------------------------------------------------- 1 | bin/clustercat 2 | src/ext/word2vec/distance 3 | src/ext/word2vec/word-analogy 4 | *.[oa~] 5 | .*.sw[op] 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | cache: brew 3 | compiler: 4 | - clang 5 | - gcc 6 | os: 7 | - linux 8 | #- osx 9 | #before_install: 10 | # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi 11 | # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew tap homebrew/versions; fi 12 | # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi 13 | # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install llvm38; fi 14 | script: 15 | #- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then make CC=clang-omp; fi 16 | #- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then make CC=/usr/local/bin/clang-3.8 CFLAGS="$CFLAGS -I/usr/local/opt/llvm38/lib/llvm-3.8/include/" LDFLAGS="$LDFLAGS -L/usr/local/opt/llvm38/lib/llvm-3.8/lib" ; fi 17 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make; fi 18 | notifications: 19 | email: false 20 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | This software is licensed under either the GNU LGPL version 3 or the Mozilla 2 | Public License version 2.0 . Both licenses are listed below. 3 | 4 | 5 | 6 | 7 | 8 | 9 | GNU LESSER GENERAL PUBLIC LICENSE 10 | Version 3, 29 June 2007 11 | 12 | Copyright (C) 2007 Free Software Foundation, Inc. 13 | Everyone is permitted to copy and distribute verbatim copies 14 | of this license document, but changing it is not allowed. 15 | 16 | 17 | This version of the GNU Lesser General Public License incorporates 18 | the terms and conditions of version 3 of the GNU General Public 19 | License, supplemented by the additional permissions listed below. 20 | 21 | 0. Additional Definitions. 22 | 23 | As used herein, "this License" refers to version 3 of the GNU Lesser 24 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 25 | General Public License. 26 | 27 | "The Library" refers to a covered work governed by this License, 28 | other than an Application or a Combined Work as defined below. 
29 | 30 | An "Application" is any work that makes use of an interface provided 31 | by the Library, but which is not otherwise based on the Library. 32 | Defining a subclass of a class defined by the Library is deemed a mode 33 | of using an interface provided by the Library. 34 | 35 | A "Combined Work" is a work produced by combining or linking an 36 | Application with the Library. The particular version of the Library 37 | with which the Combined Work was made is also called the "Linked 38 | Version". 39 | 40 | The "Minimal Corresponding Source" for a Combined Work means the 41 | Corresponding Source for the Combined Work, excluding any source code 42 | for portions of the Combined Work that, considered in isolation, are 43 | based on the Application, and not on the Linked Version. 44 | 45 | The "Corresponding Application Code" for a Combined Work means the 46 | object code and/or source code for the Application, including any data 47 | and utility programs needed for reproducing the Combined Work from the 48 | Application, but excluding the System Libraries of the Combined Work. 49 | 50 | 1. Exception to Section 3 of the GNU GPL. 51 | 52 | You may convey a covered work under sections 3 and 4 of this License 53 | without being bound by section 3 of the GNU GPL. 54 | 55 | 2. Conveying Modified Versions. 56 | 57 | If you modify a copy of the Library, and, in your modifications, a 58 | facility refers to a function or data to be supplied by an Application 59 | that uses the facility (other than as an argument passed when the 60 | facility is invoked), then you may convey a copy of the modified 61 | version: 62 | 63 | a) under this License, provided that you make a good faith effort to 64 | ensure that, in the event an Application does not supply the 65 | function or data, the facility still operates, and performs 66 | whatever part of its purpose remains meaningful, or 67 | 68 | b) under the GNU GPL, with none of the additional permissions of 69 | this License applicable to that copy. 70 | 71 | 3. Object Code Incorporating Material from Library Header Files. 72 | 73 | The object code form of an Application may incorporate material from 74 | a header file that is part of the Library. You may convey such object 75 | code under terms of your choice, provided that, if the incorporated 76 | material is not limited to numerical parameters, data structure 77 | layouts and accessors, or small macros, inline functions and templates 78 | (ten or fewer lines in length), you do both of the following: 79 | 80 | a) Give prominent notice with each copy of the object code that the 81 | Library is used in it and that the Library and its use are 82 | covered by this License. 83 | 84 | b) Accompany the object code with a copy of the GNU GPL and this license 85 | document. 86 | 87 | 4. Combined Works. 88 | 89 | You may convey a Combined Work under terms of your choice that, 90 | taken together, effectively do not restrict modification of the 91 | portions of the Library contained in the Combined Work and reverse 92 | engineering for debugging such modifications, if you also do each of 93 | the following: 94 | 95 | a) Give prominent notice with each copy of the Combined Work that 96 | the Library is used in it and that the Library and its use are 97 | covered by this License. 98 | 99 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 100 | document. 
101 | 102 | c) For a Combined Work that displays copyright notices during 103 | execution, include the copyright notice for the Library among 104 | these notices, as well as a reference directing the user to the 105 | copies of the GNU GPL and this license document. 106 | 107 | d) Do one of the following: 108 | 109 | 0) Convey the Minimal Corresponding Source under the terms of this 110 | License, and the Corresponding Application Code in a form 111 | suitable for, and under terms that permit, the user to 112 | recombine or relink the Application with a modified version of 113 | the Linked Version to produce a modified Combined Work, in the 114 | manner specified by section 6 of the GNU GPL for conveying 115 | Corresponding Source. 116 | 117 | 1) Use a suitable shared library mechanism for linking with the 118 | Library. A suitable mechanism is one that (a) uses at run time 119 | a copy of the Library already present on the user's computer 120 | system, and (b) will operate properly with a modified version 121 | of the Library that is interface-compatible with the Linked 122 | Version. 123 | 124 | e) Provide Installation Information, but only if you would otherwise 125 | be required to provide such information under section 6 of the 126 | GNU GPL, and only to the extent that such information is 127 | necessary to install and execute a modified version of the 128 | Combined Work produced by recombining or relinking the 129 | Application with a modified version of the Linked Version. (If 130 | you use option 4d0, the Installation Information must accompany 131 | the Minimal Corresponding Source and Corresponding Application 132 | Code. If you use option 4d1, you must provide the Installation 133 | Information in the manner specified by section 6 of the GNU GPL 134 | for conveying Corresponding Source.) 135 | 136 | 5. Combined Libraries. 137 | 138 | You may place library facilities that are a work based on the 139 | Library side by side in a single library together with other library 140 | facilities that are not Applications and are not covered by this 141 | License, and convey such a combined library under terms of your 142 | choice, if you do both of the following: 143 | 144 | a) Accompany the combined library with a copy of the same work based 145 | on the Library, uncombined with any other library facilities, 146 | conveyed under the terms of this License. 147 | 148 | b) Give prominent notice with the combined library that part of it 149 | is a work based on the Library, and explaining where to find the 150 | accompanying uncombined form of the same work. 151 | 152 | 6. Revised Versions of the GNU Lesser General Public License. 153 | 154 | The Free Software Foundation may publish revised and/or new versions 155 | of the GNU Lesser General Public License from time to time. Such new 156 | versions will be similar in spirit to the present version, but may 157 | differ in detail to address new problems or concerns. 158 | 159 | Each version is given a distinguishing version number. If the 160 | Library as you received it specifies that a certain numbered version 161 | of the GNU Lesser General Public License "or any later version" 162 | applies to it, you have the option of following the terms and 163 | conditions either of that published version or of any later version 164 | published by the Free Software Foundation. 
If the Library as you 165 | received it does not specify a version number of the GNU Lesser 166 | General Public License, you may choose any version of the GNU Lesser 167 | General Public License ever published by the Free Software Foundation. 168 | 169 | If the Library as you received it specifies that a proxy can decide 170 | whether future versions of the GNU Lesser General Public License shall 171 | apply, that proxy's public statement of acceptance of any version is 172 | permanent authorization for you to choose that version for the 173 | Library. 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | Mozilla Public License Version 2.0 203 | ================================== 204 | 205 | 1. Definitions 206 | -------------- 207 | 208 | 1.1. "Contributor" 209 | means each individual or legal entity that creates, contributes to 210 | the creation of, or owns Covered Software. 211 | 212 | 1.2. "Contributor Version" 213 | means the combination of the Contributions of others (if any) used 214 | by a Contributor and that particular Contributor's Contribution. 215 | 216 | 1.3. "Contribution" 217 | means Covered Software of a particular Contributor. 218 | 219 | 1.4. "Covered Software" 220 | means Source Code Form to which the initial Contributor has attached 221 | the notice in Exhibit A, the Executable Form of such Source Code 222 | Form, and Modifications of such Source Code Form, in each case 223 | including portions thereof. 224 | 225 | 1.5. "Incompatible With Secondary Licenses" 226 | means 227 | 228 | (a) that the initial Contributor has attached the notice described 229 | in Exhibit B to the Covered Software; or 230 | 231 | (b) that the Covered Software was made available under the terms of 232 | version 1.1 or earlier of the License, but not also under the 233 | terms of a Secondary License. 234 | 235 | 1.6. "Executable Form" 236 | means any form of the work other than Source Code Form. 237 | 238 | 1.7. "Larger Work" 239 | means a work that combines Covered Software with other material, in 240 | a separate file or files, that is not Covered Software. 241 | 242 | 1.8. "License" 243 | means this document. 244 | 245 | 1.9. "Licensable" 246 | means having the right to grant, to the maximum extent possible, 247 | whether at the time of the initial grant or subsequently, any and 248 | all of the rights conveyed by this License. 249 | 250 | 1.10. "Modifications" 251 | means any of the following: 252 | 253 | (a) any file in Source Code Form that results from an addition to, 254 | deletion from, or modification of the contents of Covered 255 | Software; or 256 | 257 | (b) any new file in Source Code Form that contains any Covered 258 | Software. 259 | 260 | 1.11. "Patent Claims" of a Contributor 261 | means any patent claim(s), including without limitation, method, 262 | process, and apparatus claims, in any patent Licensable by such 263 | Contributor that would be infringed, but for the grant of the 264 | License, by the making, using, selling, offering for sale, having 265 | made, import, or transfer of either its Contributions or its 266 | Contributor Version. 267 | 268 | 1.12. "Secondary License" 269 | means either the GNU General Public License, Version 2.0, the GNU 270 | Lesser General Public License, Version 2.1, the GNU Affero General 271 | Public License, Version 3.0, or any later versions of those 272 | licenses. 273 | 274 | 1.13. 
"Source Code Form" 275 | means the form of the work preferred for making modifications. 276 | 277 | 1.14. "You" (or "Your") 278 | means an individual or a legal entity exercising rights under this 279 | License. For legal entities, "You" includes any entity that 280 | controls, is controlled by, or is under common control with You. For 281 | purposes of this definition, "control" means (a) the power, direct 282 | or indirect, to cause the direction or management of such entity, 283 | whether by contract or otherwise, or (b) ownership of more than 284 | fifty percent (50%) of the outstanding shares or beneficial 285 | ownership of such entity. 286 | 287 | 2. License Grants and Conditions 288 | -------------------------------- 289 | 290 | 2.1. Grants 291 | 292 | Each Contributor hereby grants You a world-wide, royalty-free, 293 | non-exclusive license: 294 | 295 | (a) under intellectual property rights (other than patent or trademark) 296 | Licensable by such Contributor to use, reproduce, make available, 297 | modify, display, perform, distribute, and otherwise exploit its 298 | Contributions, either on an unmodified basis, with Modifications, or 299 | as part of a Larger Work; and 300 | 301 | (b) under Patent Claims of such Contributor to make, use, sell, offer 302 | for sale, have made, import, and otherwise transfer either its 303 | Contributions or its Contributor Version. 304 | 305 | 2.2. Effective Date 306 | 307 | The licenses granted in Section 2.1 with respect to any Contribution 308 | become effective for each Contribution on the date the Contributor first 309 | distributes such Contribution. 310 | 311 | 2.3. Limitations on Grant Scope 312 | 313 | The licenses granted in this Section 2 are the only rights granted under 314 | this License. No additional rights or licenses will be implied from the 315 | distribution or licensing of Covered Software under this License. 316 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 317 | Contributor: 318 | 319 | (a) for any code that a Contributor has removed from Covered Software; 320 | or 321 | 322 | (b) for infringements caused by: (i) Your and any other third party's 323 | modifications of Covered Software, or (ii) the combination of its 324 | Contributions with other software (except as part of its Contributor 325 | Version); or 326 | 327 | (c) under Patent Claims infringed by Covered Software in the absence of 328 | its Contributions. 329 | 330 | This License does not grant any rights in the trademarks, service marks, 331 | or logos of any Contributor (except as may be necessary to comply with 332 | the notice requirements in Section 3.4). 333 | 334 | 2.4. Subsequent Licenses 335 | 336 | No Contributor makes additional grants as a result of Your choice to 337 | distribute the Covered Software under a subsequent version of this 338 | License (see Section 10.2) or under the terms of a Secondary License (if 339 | permitted under the terms of Section 3.3). 340 | 341 | 2.5. Representation 342 | 343 | Each Contributor represents that the Contributor believes its 344 | Contributions are its original creation(s) or it has sufficient rights 345 | to grant the rights to its Contributions conveyed by this License. 346 | 347 | 2.6. Fair Use 348 | 349 | This License is not intended to limit any rights You have under 350 | applicable copyright doctrines of fair use, fair dealing, or other 351 | equivalents. 352 | 353 | 2.7. 
Conditions 354 | 355 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 356 | in Section 2.1. 357 | 358 | 3. Responsibilities 359 | ------------------- 360 | 361 | 3.1. Distribution of Source Form 362 | 363 | All distribution of Covered Software in Source Code Form, including any 364 | Modifications that You create or to which You contribute, must be under 365 | the terms of this License. You must inform recipients that the Source 366 | Code Form of the Covered Software is governed by the terms of this 367 | License, and how they can obtain a copy of this License. You may not 368 | attempt to alter or restrict the recipients' rights in the Source Code 369 | Form. 370 | 371 | 3.2. Distribution of Executable Form 372 | 373 | If You distribute Covered Software in Executable Form then: 374 | 375 | (a) such Covered Software must also be made available in Source Code 376 | Form, as described in Section 3.1, and You must inform recipients of 377 | the Executable Form how they can obtain a copy of such Source Code 378 | Form by reasonable means in a timely manner, at a charge no more 379 | than the cost of distribution to the recipient; and 380 | 381 | (b) You may distribute such Executable Form under the terms of this 382 | License, or sublicense it under different terms, provided that the 383 | license for the Executable Form does not attempt to limit or alter 384 | the recipients' rights in the Source Code Form under this License. 385 | 386 | 3.3. Distribution of a Larger Work 387 | 388 | You may create and distribute a Larger Work under terms of Your choice, 389 | provided that You also comply with the requirements of this License for 390 | the Covered Software. If the Larger Work is a combination of Covered 391 | Software with a work governed by one or more Secondary Licenses, and the 392 | Covered Software is not Incompatible With Secondary Licenses, this 393 | License permits You to additionally distribute such Covered Software 394 | under the terms of such Secondary License(s), so that the recipient of 395 | the Larger Work may, at their option, further distribute the Covered 396 | Software under the terms of either this License or such Secondary 397 | License(s). 398 | 399 | 3.4. Notices 400 | 401 | You may not remove or alter the substance of any license notices 402 | (including copyright notices, patent notices, disclaimers of warranty, 403 | or limitations of liability) contained within the Source Code Form of 404 | the Covered Software, except that You may alter any license notices to 405 | the extent required to remedy known factual inaccuracies. 406 | 407 | 3.5. Application of Additional Terms 408 | 409 | You may choose to offer, and to charge a fee for, warranty, support, 410 | indemnity or liability obligations to one or more recipients of Covered 411 | Software. However, You may do so only on Your own behalf, and not on 412 | behalf of any Contributor. You must make it absolutely clear that any 413 | such warranty, support, indemnity, or liability obligation is offered by 414 | You alone, and You hereby agree to indemnify every Contributor for any 415 | liability incurred by such Contributor as a result of warranty, support, 416 | indemnity or liability terms You offer. You may include additional 417 | disclaimers of warranty and limitations of liability specific to any 418 | jurisdiction. 419 | 420 | 4. 
Inability to Comply Due to Statute or Regulation 421 | --------------------------------------------------- 422 | 423 | If it is impossible for You to comply with any of the terms of this 424 | License with respect to some or all of the Covered Software due to 425 | statute, judicial order, or regulation then You must: (a) comply with 426 | the terms of this License to the maximum extent possible; and (b) 427 | describe the limitations and the code they affect. Such description must 428 | be placed in a text file included with all distributions of the Covered 429 | Software under this License. Except to the extent prohibited by statute 430 | or regulation, such description must be sufficiently detailed for a 431 | recipient of ordinary skill to be able to understand it. 432 | 433 | 5. Termination 434 | -------------- 435 | 436 | 5.1. The rights granted under this License will terminate automatically 437 | if You fail to comply with any of its terms. However, if You become 438 | compliant, then the rights granted under this License from a particular 439 | Contributor are reinstated (a) provisionally, unless and until such 440 | Contributor explicitly and finally terminates Your grants, and (b) on an 441 | ongoing basis, if such Contributor fails to notify You of the 442 | non-compliance by some reasonable means prior to 60 days after You have 443 | come back into compliance. Moreover, Your grants from a particular 444 | Contributor are reinstated on an ongoing basis if such Contributor 445 | notifies You of the non-compliance by some reasonable means, this is the 446 | first time You have received notice of non-compliance with this License 447 | from such Contributor, and You become compliant prior to 30 days after 448 | Your receipt of the notice. 449 | 450 | 5.2. If You initiate litigation against any entity by asserting a patent 451 | infringement claim (excluding declaratory judgment actions, 452 | counter-claims, and cross-claims) alleging that a Contributor Version 453 | directly or indirectly infringes any patent, then the rights granted to 454 | You by any and all Contributors for the Covered Software under Section 455 | 2.1 of this License shall terminate. 456 | 457 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 458 | end user license agreements (excluding distributors and resellers) which 459 | have been validly granted by You or Your distributors under this License 460 | prior to termination shall survive termination. 461 | 462 | ************************************************************************ 463 | * * 464 | * 6. Disclaimer of Warranty * 465 | * ------------------------- * 466 | * * 467 | * Covered Software is provided under this License on an "as is" * 468 | * basis, without warranty of any kind, either expressed, implied, or * 469 | * statutory, including, without limitation, warranties that the * 470 | * Covered Software is free of defects, merchantable, fit for a * 471 | * particular purpose or non-infringing. The entire risk as to the * 472 | * quality and performance of the Covered Software is with You. * 473 | * Should any Covered Software prove defective in any respect, You * 474 | * (not any Contributor) assume the cost of any necessary servicing, * 475 | * repair, or correction. This disclaimer of warranty constitutes an * 476 | * essential part of this License. No use of any Covered Software is * 477 | * authorized under this License except under this disclaimer. 
* 478 | * * 479 | ************************************************************************ 480 | 481 | ************************************************************************ 482 | * * 483 | * 7. Limitation of Liability * 484 | * -------------------------- * 485 | * * 486 | * Under no circumstances and under no legal theory, whether tort * 487 | * (including negligence), contract, or otherwise, shall any * 488 | * Contributor, or anyone who distributes Covered Software as * 489 | * permitted above, be liable to You for any direct, indirect, * 490 | * special, incidental, or consequential damages of any character * 491 | * including, without limitation, damages for lost profits, loss of * 492 | * goodwill, work stoppage, computer failure or malfunction, or any * 493 | * and all other commercial damages or losses, even if such party * 494 | * shall have been informed of the possibility of such damages. This * 495 | * limitation of liability shall not apply to liability for death or * 496 | * personal injury resulting from such party's negligence to the * 497 | * extent applicable law prohibits such limitation. Some * 498 | * jurisdictions do not allow the exclusion or limitation of * 499 | * incidental or consequential damages, so this exclusion and * 500 | * limitation may not apply to You. * 501 | * * 502 | ************************************************************************ 503 | 504 | 8. Litigation 505 | ------------- 506 | 507 | Any litigation relating to this License may be brought only in the 508 | courts of a jurisdiction where the defendant maintains its principal 509 | place of business and such litigation shall be governed by laws of that 510 | jurisdiction, without reference to its conflict-of-law provisions. 511 | Nothing in this Section shall prevent a party's ability to bring 512 | cross-claims or counter-claims. 513 | 514 | 9. Miscellaneous 515 | ---------------- 516 | 517 | This License represents the complete agreement concerning the subject 518 | matter hereof. If any provision of this License is held to be 519 | unenforceable, such provision shall be reformed only to the extent 520 | necessary to make it enforceable. Any law or regulation which provides 521 | that the language of a contract shall be construed against the drafter 522 | shall not be used to construe this License against a Contributor. 523 | 524 | 10. Versions of the License 525 | --------------------------- 526 | 527 | 10.1. New Versions 528 | 529 | Mozilla Foundation is the license steward. Except as provided in Section 530 | 10.3, no one other than the license steward has the right to modify or 531 | publish new versions of this License. Each version will be given a 532 | distinguishing version number. 533 | 534 | 10.2. Effect of New Versions 535 | 536 | You may distribute the Covered Software under the terms of the version 537 | of the License under which You originally received the Covered Software, 538 | or under the terms of any subsequent version published by the license 539 | steward. 540 | 541 | 10.3. Modified Versions 542 | 543 | If you create software not governed by this License, and you want to 544 | create a new license for such software, you may create and use a 545 | modified version of this License if you rename the license and remove 546 | any references to the name of the license steward (except to note that 547 | such modified license differs from this License). 548 | 549 | 10.4. 
Distributing Source Code Form that is Incompatible With Secondary 550 | Licenses 551 | 552 | If You choose to distribute Source Code Form that is Incompatible With 553 | Secondary Licenses under the terms of this version of the License, the 554 | notice described in Exhibit B of this License must be attached. 555 | 556 | Exhibit A - Source Code Form License Notice 557 | ------------------------------------------- 558 | 559 | This Source Code Form is subject to the terms of the Mozilla Public 560 | License, v. 2.0. If a copy of the MPL was not distributed with this 561 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 562 | 563 | If it is not possible or desirable to put the notice in a particular 564 | file, then You may include the notice in a location (such as a LICENSE 565 | file in a relevant directory) where a recipient would be likely to look 566 | for such a notice. 567 | 568 | You may add additional accurate notices of copyright ownership. 569 | 570 | Exhibit B - "Incompatible With Secondary Licenses" Notice 571 | --------------------------------------------------------- 572 | 573 | This Source Code Form is "Incompatible With Secondary Licenses", as 574 | defined by the Mozilla Public License, v. 2.0. 575 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env make 2 | 3 | CC=cc 4 | INCLUDE=-I ./src/ext/uthash/src/ 5 | ## * For -march info on your platform, type: gcc -march=native -Q --help=target (or just compile with -march=native ) 6 | ## * We include the argument -Wno-unknown-pragmas to suppress clang's lack of support for openmp 7 | ## Since we use the gnuism 'override', you don't need to modify this makefile; you can just run: make -j4 CFLAGS=-DATA_STORE_TRIE_LCRS 8 | override CFLAGS += -march=native -std=c99 -O3 -fopenmp -finline-functions -fno-math-errno -fstrict-aliasing -DHASH_FUNCTION=HASH_SAX -DHASH_BLOOM=25 -Wall -Wextra -Winline -Wstrict-aliasing -Wno-unknown-pragmas -Wno-comment -Wno-missing-field-initializers ${INCLUDE} 9 | LDLIBS=-lm -fopenmp #-ltcmalloc_minimal 10 | BIN=bin/ 11 | SRC=src/ 12 | OBJS=${SRC}/clustercat-array.o ${SRC}/clustercat-cluster.o ${SRC}/clustercat-dbg.o ${SRC}/clustercat-io.o ${SRC}/clustercat-import-class-file.o ${SRC}/clustercat-map.o ${SRC}/clustercat-math.o ${SRC}/clustercat-tokenize.o 13 | includes=${SRC}/$(wildcard *.h) 14 | date:=$(shell date +%F) 15 | machine_type:=$(shell uname -m) 16 | 17 | all: ${BIN}/clustercat 18 | .PHONY : all install tar clean 19 | 20 | clustercat.h: ${SRC}/clustercat-array.h ${SRC}/clustercat-data.h ${SRC}/clustercat-map.h 21 | 22 | 23 | ${BIN}/clustercat: ${SRC}/clustercat.c ${OBJS} 24 | ${CC} -Wl,-s $^ -o $@ ${CFLAGS} ${LDLIBS} 25 | 26 | clustercat.c: ${SRC}/clustercat.h ${SRC}/clustercat-cluster.h ${SRC}/clustercat-dbg.h ${SRC}/clustercat-io.h ${SRC}/clustercat-import-class-file.h ${SRC}/clustercat-math.h ${SRC}/clustercat-tokenize.h 27 | 28 | install: ${BIN}/clustercat 29 | cp -p ${BIN}/clustercat /usr/bin/ 2>/dev/null || \ 30 | mkdir --parents ${HOME}/bin/ && \ 31 | cp -p ${BIN}/clustercat ${HOME}/bin/ 32 | 33 | tar: ${BIN}/clustercat 34 | mkdir clustercat-${date} && \ 35 | mkdir clustercat-${date}/bin && \ 36 | mkdir clustercat-${date}/src && \ 37 | mkdir --parents clustercat-${date}/src/ext/uthash/src && \ 38 | cp -a ${BIN}/clustercat clustercat-${date}/bin/ && \ 39 | cp -a ${BIN}/clustercat clustercat-${date}/bin/clustercat.${machine_type} && \ 40 | cp -a 
${SRC}/*.c ${SRC}/*.h clustercat-${date}/src/ && \ 41 | cp -a Makefile README.md LICENSE.txt clustercat-${date}/ && \ 42 | cp -a ${SRC}/ext/uthash/src/uthash.h clustercat-${date}/src/ext/uthash/src/ && \ 43 | tar -cf clustercat-${date}.tar clustercat-${date}/ && \ 44 | gzip -9 clustercat-${date}.tar && \ 45 | rm -rf clustercat-${date}/ 46 | 47 | clean: 48 | \rm -f ${BIN}/clustercat ${SRC}/*.o 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ClusterCat: Fast, Flexible Word Clustering Software 2 | 3 | [![Build Status](https://travis-ci.org/jonsafari/clustercat.svg?branch=master)](https://travis-ci.org/jonsafari/clustercat) 4 | [![License: LGPL v3](https://img.shields.io/badge/License-LGPL%20v3-blue.svg)](http://www.gnu.org/licenses/lgpl-3.0) 5 | [![License: MPL 2.0](https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg)](https://opensource.org/licenses/MPL-2.0) 6 | 7 | 8 | ## Overview 9 | 10 | ClusterCat induces word classes from unannotated text. 11 | It is programmed in modern C, with no external libraries. 12 | A Python wrapper is also provided. 13 | 14 | Word classes are unsupervised part-of-speech tags, requiring no manually-annotated corpus. 15 | Words that share syntactic/semantic similarities are grouped together. 16 | They are used in many dozens of applications within natural language processing, machine translation, neural net training, and related fields. 17 | 18 | 19 | ## Installation 20 | ### Linux 21 | You can use either GCC 4.6+ or Clang 3.7+, but GCC is usually faster. 22 | 23 | sudo apt update && sudo apt install gcc make libomp-dev 24 | make -j 25 | 26 | ### macOS / OSX 27 | The current version of Clang in Xcode doesn't fully support [OpenMP][], so instead install GCC from [Homebrew][]: 28 | 29 | brew update && brew install gcc@9 libomp && xcode-select --install 30 | make -j CC=/opt/homebrew/bin/gcc-9 31 | 32 | 33 | ## Commands 34 | The binary program `clustercat` gets compiled into the `bin` directory. 35 | 36 | **Clustering** preprocessed text (already tokenized, normalized, etc.) is pretty simple: 37 | 38 | bin/clustercat [options] < train.tok.txt > clusters.tsv 39 | 40 | The word classes are induced by a bidirectional [predictive][] [exchange algorithm][]. 41 | The format of the output class file has each line consisting of `word`*TAB*`class` (a word type, then tab, then class). 42 | 43 | Command-line argument **usage** may be obtained by running the program with the **`--help`** flag: 44 | 45 | bin/clustercat --help 46 | 47 | 48 | ## Python 49 | Installation and usage details for the Python module are described in a separate [readme](python/README.md). 50 | 51 | 52 | ## Features 53 | - Print **[word vectors][]** (a.k.a. word embeddings) using the `--word-vectors` flag. The binary format is compatible with word2vec's tools. 54 | - Start training using an **existing word cluster mapping** from other clustering software (eg. mkcls) using the `--class-file` flag. 55 | - Adjust the number of **threads** to use with the `--threads` flag. The default is 8. 56 | - Adjust the **number of clusters** or vector dimensions using the `--classes` flag. The default is approximately the square root of the vocabulary size. 57 | - Includes **compatibility wrapper script `bin/mkcls`** that can be run just like mkcls.
You can use more classes now :-) 58 | 59 | 60 | ## Comparison 61 | | Training Set | [Brown][] | ClusterCat | [mkcls][] | [Phrasal][] | [word2vec][] | 62 | | ------------ | --------- | ---------- | --------- | ----------- | ------------ | 63 | | 1 Billion English tokens, 800 clusters | 12.5 hr | **1.4** hr | 48.8 hr | 5.1 hr | 20.6 hr | 64 | | 1 Billion English tokens, 1200 clusters | 25.5 hr | **1.7** hr | 68.8 hr | 6.2 hr | 33.7 hr | 65 | | 550 Million Russian tokens, 800 clusters | 14.6 hr | **1.5** hr | 75.0 hr | 5.5 hr | 12.0 hr | 66 | 67 | 68 | ## Visualization 69 | See [bl.ocks.org][] for nice data visualizations of the clusters for various languages, including English, German, Persian, Hindi, Czech, Catalan, Tajik, Basque, Russian, French, and Maltese. 70 | 71 | For example: 72 | 73 | ![French Clustering Thumbnail](visualization/d3/french_cluster_thumbnail.png) 74 | ![Russian Clustering Thumbnail](visualization/d3/russian_cluster_thumbnail.png) 75 | ![Basque Clustering Thumbnail](visualization/d3/basque_cluster_thumbnail.png) 76 | 77 | You can generate your own graphics from ClusterCat's output. 78 | Add the flag `--print-freqs` to ClusterCat, then type the command: 79 | 80 | bin/flat_clusters2json.pl --word-labels < clusters.tsv > visualization/d3/clusters.json 81 | 82 | You can either upload the [JSON][] file to [gist.github.com][], following instructions on the [bl.ocks.org](http://bl.ocks.org) front page, or you can view the graphic locally by running a minimal webserver in the `visualization/d3` directory (the command below is for Python 2; with Python 3, use `python3 -m http.server 8116`): 83 | 84 | python -m SimpleHTTPServer 8116 2>/dev/null & 85 | 86 | Then open a tab in your browser to [localhost:8116](http://localhost:8116). 87 | 88 | The default settings are sensible for normal usage, but for visualization you probably want far fewer word types and clusters -- fewer than 10,000 word types and 120 clusters. 89 | Your browser will thank you. 90 | 91 | 92 | ## Perplexity 93 | The perplexity that ClusterCat reports uses a bidirectional bigram class language model, which is richer than the unidirectional bigram-based perplexities reported by most other software. 94 | Richer models provide a better evaluation of the quality of clusters, having more sensitivity (power) to detect improvements. 95 | If you want to directly compare the quality of clusters with a different program's output, you have a few options: 96 | 97 | 1. Load another clustering using `--class-file`, and see what the other clustering's initial bidirectional bigram perplexity is before any words get exchanged. 98 | 2. Use an external class-based language model. These are usually two-sided (unlexicalized) models, so they favor two-sided clusterers. 99 | 3. Evaluate on a downstream task. This is best. 100 | 101 | 102 | ## Contributions 103 | Contributions are welcome, via [pull requests][]. 104 | 105 | 106 | ## Citation 107 | If you use this software, please cite the following: 108 | 109 | Dehdari, Jon, Liling Tan, and Josef van Genabith. 2016. [BIRA: Improved Predictive Exchange Word Clustering](http://www.aclweb.org/anthology/N16-1139.pdf). 110 | In *Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL)*, pages 1169–1174, San Diego, CA, USA. Association for Computational Linguistics.
111 | 112 | @inproceedings{dehdari-etal2016, 113 | author = {Dehdari, Jon and Tan, Liling and van Genabith, Josef}, 114 | title = {{BIRA}: Improved Predictive Exchange Word Clustering}, 115 | booktitle = {Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL)}, 116 | month = {June}, 117 | year = {2016}, 118 | address = {San Diego, CA, USA}, 119 | publisher = {Association for Computational Linguistics}, 120 | pages = {1169--1174}, 121 | url = {http://www.aclweb.org/anthology/N16-1139.pdf} 122 | } 123 | 124 | [lgpl3]: https://www.gnu.org/copyleft/lesser.html 125 | [mpl2]: https://www.mozilla.org/MPL/2.0 126 | [c99]: https://en.wikipedia.org/wiki/C99 127 | [homebrew]: http://brew.sh 128 | [openmp]: https://en.wikipedia.org/wiki/OpenMP 129 | [predictive]: https://www.aclweb.org/anthology/P/P08/P08-1086.pdf 130 | [exchange algorithm]: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.53.2354 131 | [brown]: https://github.com/percyliang/brown-cluster 132 | [mkcls]: https://github.com/moses-smt/mgiza 133 | [phrasal]: https://github.com/stanfordnlp/phrasal 134 | [word2vec]: https://code.google.com/archive/p/word2vec/ 135 | [word vectors]: https://en.wikipedia.org/wiki/Word_embedding 136 | [bl.ocks.org]: http://bl.ocks.org/jonsafari 137 | [JSON]: https://en.wikipedia.org/wiki/JSON 138 | [gist.github.com]: https://gist.github.com 139 | [pull requests]: https://help.github.com/articles/creating-a-pull-request 140 | -------------------------------------------------------------------------------- /bin/digit_conflate.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | ## By Jon Dehdari 2013 3 | ## Conflates all digits to the same digit 4 | ## Usage: perl digit_conflate.pl [options] < in > out 5 | 6 | use strict; 7 | use Getopt::Long; 8 | 9 | ## Defaults 10 | my $digit = 5; 11 | 12 | my $usage = <<"END_OF_USAGE"; 13 | digit_conflate.pl (c) 2013 Jon Dehdari - LGPL v3 14 | 15 | Usage: perl $0 [options] < in > out 16 | 17 | Function: Conflates all digits to the same digit 18 | For example, "12,629.24" -> "55,555.55" 19 | 20 | Options: 21 | -h, --help Print this usage 22 | -d, --digit Set output digit to (default: $digit) 23 | 24 | END_OF_USAGE 25 | 26 | GetOptions( 27 | 'h|help|?' => sub { print $usage; exit; }, 28 | 'd|digit=i' => \$digit, 29 | ) or die $usage; 30 | 31 | 32 | while (<>) { 33 | s/\d/$digit/g; 34 | print; 35 | } 36 | -------------------------------------------------------------------------------- /bin/flat_clusters2json.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | ## By Jon Dehdari 2015 3 | ## Converts boring flat tsv clustering format to json for visualization 4 | ## Usage: perl clusters2json.pl [options] < in > out 5 | 6 | use strict; 7 | use Getopt::Long; 8 | 9 | my $word_labels = undef; 10 | 11 | my $usage = <<"END_OF_USAGE"; 12 | clusters2json.pl (c) 2015 Jon Dehdari - LGPL v3 or Mozilla Public License v2 13 | 14 | Usage: perl $0 [options] < in > out 15 | 16 | Function: Converts tsv clustering format to json for visualization 17 | 18 | Options: 19 | -h, --help Print this usage 20 | --word-labels Use the first word in a cluster series as the cluster label. 21 | This option is useful if the input is already sorted by frequency. 22 | 23 | END_OF_USAGE 24 | 25 | GetOptions( 26 | 'h|help|?' 
=> sub { print $usage; exit; }, 27 | 'word-labels' => \$word_labels, 28 | ) or die $usage; 29 | 30 | my ($word, $cluster, $freq) = undef; 31 | my $last_cluster = -1; 32 | 33 | print < 40 | while (<>) { 41 | chomp; 42 | ($word, $cluster, $freq) = split; 43 | $freq or $freq = 1; # if word frequencies aren't provided 44 | 45 | $word =~ s/(["\/])/\\$1/g; # escape problematic characters 46 | #$word =~ s//>/g; 48 | 49 | if ($cluster != $last_cluster) { # We've reached a new cluster 50 | 51 | if ($last_cluster != -1) { # end cluster's children (ie words), then start new cluster 52 | print < 82 | } # end of the while (<>) loop 83 | 84 | print < -------------------------------------------------------------------------------- /bin/lowercase.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | ## Lowercases text 3 | ## Usage: perl lowercase.pl < input > output 4 | 5 | binmode(STDIN, ":utf8"); 6 | binmode(STDOUT, ":utf8"); 7 | 8 | print lc while <>; 9 | -------------------------------------------------------------------------------- /bin/mkcls: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | ## By Jon Dehdari, 2015, public domain 3 | ## Compatibility wrapper for clustercat, using mkcls's command-line arguments 4 | ## If you find an error in the interpretation of mkcls's arcane command-line arguments, please let me know 5 | 6 | mkcls_cmd_args=' 7 | mkcls command-line arguments: 8 | 9 | -p training input text file (default: train) 10 | -V cluster output file 11 | -c number of word clusters (default: 100) 12 | -m minimum word count (default: 1) 13 | -v verbose mode 14 | 15 | Ignored arguments: 16 | -a set stochastic optimization algorithm {rrt,ta,gda,sa,hc} (default: ta == Threshold Annealing)) 17 | -e set stochastic optimization parameter (for gamma, nu, alpha) 18 | -h set hapax init name 19 | -i set initialization value {ran,aio,gda,freq,other} (default: ran) 20 | -k set category selection {det,ran,best} (default: best) 21 | -l use LO, and set rho 22 | -M maximum number of optimization steps 23 | -n number of optimization runs (default: 1) 24 | -N set optimize parameter count (default: 10) 25 | -o graph output 26 | -O set one-with-hapas (default: 1) 27 | -P training ngram file 28 | -r set random seed (default: 532567487) 29 | -s set maximum runtime seconds 30 | -w set word selection {det,ran,incr} (default: det) 31 | -y use special criterion, and set sigma distortion (default: 5.0) 32 | ' 33 | 34 | ## Set defaults to be like mkcls, unless they're overwritten later by manually specifying them 35 | cmd_string="$(dirname $0)/clustercat --min-count 1 --num-classes 100 --in train " 36 | 37 | 38 | while [ $# -gt 0 ]; do 39 | 40 | ## Let me know if you actually use the original -h argument (hapax init name), and I'll change this 41 | if [ $1 = '--help' ] || [ $1 = '-h' ]; then 42 | echo "$mkcls_cmd_args" 43 | exit 44 | fi 45 | 46 | ## Ugh. Use a space between flags and their values
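## For example, a hypothetical invocation `-c100` is split by the next two
## lines into flag='-c' and arg='100', which the case statement below maps
## to `--num-classes 100`.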
47 | flag=$(echo "$1" | grep -o '^-.') 48 | arg=${1#-?} 49 | case $flag in 50 | -p) 51 | cmd_string="$cmd_string --in $arg " 52 | shift 53 | ;; 54 | -V) 55 | cmd_string="$cmd_string --out $arg " 56 | shift 57 | ;; 58 | -c) 59 | cmd_string="$cmd_string --num-classes $arg " 60 | shift 61 | ;; 62 | -m) 63 | cmd_string="$cmd_string --min-count $arg " 64 | shift 65 | ;; 66 | -v) 67 | cmd_string="$cmd_string --verbose " 68 | shift 69 | ;; 70 | *) 71 | shift 72 | ;; 73 | esac 74 | done 75 | 76 | echo 'Executing:' >&2 77 | echo "$cmd_string" >&2 78 | eval "$cmd_string" 79 | -------------------------------------------------------------------------------- /bin/mkcls4brown: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | ## By Jon Dehdari, 2015, public domain 3 | ## Compatibility wrapper for brown-cluster, using mkcls's command-line arguments 4 | ## If you find an error in the interpretation of mkcls's arcane command-line arguments, please let me know 5 | 6 | mkcls_cmd_args=' 7 | mkcls command-line arguments: 8 | 9 | -p training input text file (default: train) 10 | -V cluster output file 11 | -c number of word clusters (default: 100) 12 | -m minimum word count (default: 1) 13 | 14 | Ignored arguments: 15 | -a set stochastic optimization algorithm {rrt,ta,gda,sa,hc} (default: ta == Threshold Annealing)) 16 | -e set stochastic optimization parameter (for gamma, nu, alpha) 17 | -h set hapax init name 18 | -i set initialization value {ran,aio,gda,freq,other} (default: ran) 19 | -k set category selection {det,ran,best} (default: best) 20 | -l use LO, and set rho 21 | -M maximum number of optimization steps 22 | -n number of optimization runs (default: 1) 23 | -N set optimize parameter count (default: 10) 24 | -o graph output 25 | -O set one-with-hapas (default: 1) 26 | -P training ngram file 27 | -r set random seed (default: 532567487) 28 | -s set maximum runtime seconds 29 | -v verbose mode 30 | -w set word selection {det,ran,incr} (default: det) 31 | -y use special criterion, and set sigma distortion (default: 5.0) 32 | ' 33 | 34 | ## Set defaults to be like mkcls, unless they're overwritten later by manually specifying them 35 | min_count=1 36 | classes=100 37 | in_file='train' 38 | 39 | 40 | while [ $# -gt 0 ]; do 41 | 42 | ## Let me know if you actually use the original -h argument (hapax init name), and I'll change this 43 | if [ $1 = '--help' ] || [ $1 = '-h' ]; then 44 | echo "$mkcls_cmd_args" 45 | exit 46 | fi 47 | 48 | ## Ugh. Use a space between flags and their values
49 | flag=$(echo "$1" | grep -o '^-.') 50 | arg=${1#-?} 51 | case $flag in 52 | -p) 53 | in_file="$arg" 54 | shift 55 | ;; 56 | -V) 57 | out_file="$arg" 58 | shift 59 | ;; 60 | -c) 61 | classes="$arg" 62 | shift 63 | ;; 64 | -m) 65 | min_count="$arg" 66 | shift 67 | ;; 68 | *) 69 | shift 70 | ;; 71 | esac 72 | done 73 | 74 | cmd_string="$(dirname $0)/wcluster --threads 4 --min-occur $min_count --c $classes --text $in_file --output_dir ${out_file}_brown_dir " 75 | 76 | echo 'Executing:' >&2 77 | echo "$cmd_string" >&2 78 | eval "$cmd_string" && \ 79 | $(dirname $0)/hier2flat_no_freqs.sh < ${out_file}_brown_dir/paths > $out_file && \ 80 | \rm ${out_file}_brown_dir/log # really verbose for large corpora 81 | -------------------------------------------------------------------------------- /bin/mkcls4word2vec: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | ## By Jon Dehdari, 2015, public domain 3 | ## Compatibility wrapper for word2vec, using mkcls's command-line arguments 4 | ## If you find an error in the interpretation of mkcls's arcane command-line arguments, please let me know 5 | 6 | mkcls_cmd_args=' 7 | mkcls command-line arguments: 8 | 9 | -p training input text file (default: train) 10 | -V cluster output file 11 | -c number of word clusters (default: 100) 12 | -m minimum word count (default: 1) 13 | 14 | Ignored arguments: 15 | -a set stochastic optimization algorithm {rrt,ta,gda,sa,hc} (default: ta == Threshold Annealing)) 16 | -e set stochastic optimization parameter (for gamma, nu, alpha) 17 | -h set hapax init name 18 | -i set initialization value {ran,aio,gda,freq,other} (default: ran) 19 | -k set category selection {det,ran,best} (default: best) 20 | -l use LO, and set rho 21 | -M maximum number of optimization steps 22 | -n number of optimization runs (default: 1) 23 | -N set optimize parameter count (default: 10) 24 | -o graph output 25 | -O set one-with-hapas (default: 1) 26 | -P training ngram file 27 | -r set random seed (default: 532567487) 28 | -s set maximum runtime seconds 29 | -v verbose mode 30 | -w set word selection {det,ran,incr} (default: det) 31 | -y use special criterion, and set sigma distortion (default: 5.0) 32 | ' 33 | 34 | ## Set defaults to be like mkcls, unless they're overwritten later by manually specifying them 35 | min_count=1 36 | classes=100 37 | in_file='train' 38 | 39 | 40 | while [ $# -gt 0 ]; do 41 | 42 | ## Let me know if you actually use the original -h argument (hapax init name), and I'll change this 43 | if [ $1 = '--help' ] || [ $1 = '-h' ]; then 44 | echo "$mkcls_cmd_args" 45 | exit 46 | fi 47 | 48 | ## Ugh. Use a space between flags and their values 49 | flag=$(echo "$1" | grep -o '^-.') 50 | arg=${1#-?} 51 | case $flag in 52 | -p) 53 | in_file="$arg" 54 | shift 55 | ;; 56 | -V) 57 | out_file="$arg" 58 | shift 59 | ;; 60 | -c) 61 | classes="$arg" 62 | shift 63 | ;; 64 | -m) 65 | min_count="$arg" 66 | shift 67 | ;; 68 | *) 69 | shift 70 | ;; 71 | esac 72 | done 73 | 74 | cmd_string="$(dirname $0)/word2vec -min-count $min_count -classes $classes -size $classes -train $in_file -output $out_file "
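## Note: the class count doubles as the vector size above (`-size $classes`);
## that equivalence is a choice of this wrapper, not something mkcls or
## word2vec requires.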
75 | 76 | echo 'Executing:' >&2 77 | echo "$cmd_string" >&2 78 | eval "$cmd_string" && \ 79 | perl -p -i -e 's/ /\t/g' $out_file 80 | -------------------------------------------------------------------------------- /bin/ngram_counts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ## By Jon Dehdari, 2015, public domain 3 | ## Counts ngrams, including joined ngrams, from text corpus 4 | 5 | import sys 6 | 7 | ngram_order = 4 8 | ngrams = [] 9 | for i in range(ngram_order): 10 | ngrams.append({}) 11 | 12 | for line in sys.stdin: 13 | line = line.rstrip() 14 | tokens = line.split() 15 | #tokens.insert(0, "<s>") 16 | #tokens.append("</s>") 17 | #print(tokens) 18 | len_tokens = len(tokens) 19 | 20 | for i in range(len_tokens): 21 | 22 | # i := leftmost position 23 | # j := rightmost position of current sub-ngram 24 | # k := rightmost position of all sub-ngrams 25 | 26 | k = len_tokens if i+ngram_order >= len_tokens else i + ngram_order 27 | #print("i=",i, "k=", k, tokens[i:k]) 28 | 29 | # Build-up joined ngrams 30 | for j in range(i+1,k+1): 31 | joined_ngram = '_'.join(tokens[i:j]) 32 | if (j+1 < k): 33 | if joined_ngram in ngrams[0]: 34 | ngrams[0][joined_ngram] += 1 35 | else : 36 | ngrams[0][joined_ngram] = 1 37 | 38 | #print(" j=",j, joined_ngram) 39 | 40 | # Process sub-ngrams 41 | num_subcuts = j - (i+1) 42 | while (num_subcuts >= 1): 43 | if ( (j == k) and (num_subcuts % 2)): # skip imbalanced subcuts 44 | num_subcuts -= 1 45 | continue 46 | subcut = ' '.join([ '_'.join(tokens[i:i+num_subcuts]), '_'.join(tokens[i+num_subcuts:j]) ]) 47 | if (subcut in ngrams[1]): 48 | ngrams[1][subcut] +=1 49 | else : 50 | ngrams[1][subcut] = 1 51 | 52 | #print(" num_subcuts=", num_subcuts, "subcut=<<",subcut, ">>") 53 | num_subcuts -= 1 54 | 55 | for i in range(ngram_order): 56 | print() 57 | for k, v in sorted(ngrams[i].items()): 58 | print(k, "\t", v, sep='') 59 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # Python ClusterCat 2 | 3 | 4 | ## Installation 5 | First follow the [installation instructions](../README.md) in the above directory. 6 | After that, you normally don't need to install anything here. You can load the module `clustercat` using either Python 2 or 3. 7 | 8 | cd python 9 | python3 10 | >>> import clustercat as cc 11 | >>> clustering = cc.cluster(text=['this is a test', 'that is only a test', 'bye'], min_count=1) 12 | >>> print(clustering) 13 | 14 | If you get an error message saying that it is unable to access the clustercat binary, follow all the instructions in the error message. 15 | You'll need more text input than the toy example above to produce useful clusters.
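A fuller sketch of the same workflow (the corpus path `/tmp/corpus.tok.txt` is hypothetical; `cluster`, `save`, and `tag_string` are documented below) trains from a file, saves the mapping, and tags a sentence:

```Python
import clustercat as cc

# Train on a preprocessed (tokenized, one-sentence-per-line) corpus
clustering = cc.cluster(in_file='/tmp/corpus.tok.txt', min_count=3)
cc.save(clustering, 'clusters.tsv')  # word<TAB>cluster-ID listing
print(cc.tag_string(clustering, 'this is a test'))  # e.g. '14 3 7 3' (IDs vary)
```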
16 | 17 | To import this module from a different directory, you can add the module's directory to `$PYTHONPATH`: 18 | 19 | cd python 20 | echo "export PYTHONPATH=\$PYTHONPATH:`pwd`" >> ~/.bashrc 21 | source ~/.bashrc 22 | 23 | ## Python ClusterCat Functions 24 | ### `cluster(text=None, in_file=None, ...)` 25 | Produce a clustering, given a textual input. There is one required argument (the training input text), and many optional arguments. The one required argument is **either** `text` **or** `in_file`. The argument `text` is a list of Python strings. The argument `in_file` is a path to a text file, consisting of preprocessed (eg. tokenized) one-sentence-per-line text. The use of `text` is probably not a good idea for large corpora. 26 | 27 | ```Python 28 | cc.cluster(text=['this is a test', 'that is only a test', 'bye'], min_count=1) 29 | cc.cluster(in_file='/tmp/corpus.tok.txt', min_count=3) 30 | ``` 31 | 32 | The other optional arguments are described by running the compiled clustercat binary with the `--help` argument, except that the leading `--` from the shell argument is removed, and `-` is replaced with `_`. So for example, instead of `--tune-cycles 15`, the Python function argument would be `tune_cycles=15`. 33 | 34 | Returns a dictionary of the form `{ word : cluster_id }`. 35 | 36 | 37 | ### `save(mapping, out, format='tsv')` 38 | Save a clustering (dictionary) to file. By default the output file is a tab-separated listing of words and their cluster ID. 39 | 40 | ```Python 41 | cc.save(clustering, 'clusters.tsv') 42 | ``` 43 | 44 | 45 | ### `load(in_file, format="tsv")` 46 | Load a clustering from a file. By default the input file is a tab-separated listing of words and their cluster ID. 47 | Returns a dictionary of the clustering. 48 | 49 | ```Python 50 | clustering = cc.load('clusters.tsv') 51 | ``` 52 | 53 | 54 | ### `tag_string(mapping, text, unk="<unk>")` 55 | Tag a string with the corresponding cluster IDs. If a word is not found in the clustering, use `unk`. 56 | Returns a string. 57 | 58 | ```Python 59 | tagged_sent = cc.tag_string(clustering, "this is a test") 60 | ``` 61 | 62 | ### `tag_stdin(mapping, unk="<unk>")` 63 | This calls `tag_string()` for each line in `stdin`, and prints the result to `stdout`. 64 | -------------------------------------------------------------------------------- /python/clustercat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # By Jon Dehdari, 2016 3 | # MIT License 4 | """ Fast, flexible word clusters """ 5 | 6 | import sys 7 | import os 8 | import subprocess 9 | import distutils.spawn 10 | 11 | unk = '<unk>' 12 | 13 | def load(in_file=None, format='tsv'): 14 | """ Load a clustering from a file. By default the input file is a 15 | tab-separated listing of words and their cluster ID. Returns a dictionary of 16 | the clustering. 17 | 18 | Args: 19 | in_file (string): path to input file 20 | format (string): input file format (default: tsv) 21 | 22 | Returns: 23 | dict: word-to-tag mapping 24 | """ 25 | 26 | mapping = {} 27 | if format == 'tsv': 28 | with open(in_file) as f: 29 | # Primary sort by value (cluster ID), secondary sort by key (word) 30 | for line in f: 31 | # Keep the full split line instead of key, val to allow for 32 | # counts in optional third column 33 | tokens = line.split() 34 | mapping[tokens[0]] = int(tokens[1]) 35 | 36 | return mapping 37 | 38 |
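# Illustrative tsv input for load() above (the words, IDs, and the optional
# third count column here are hypothetical):
#   the	42
#   cat	17	209
# load() would then return {'the': 42, 'cat': 17}; counts are ignored.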
39 | def save(mapping=None, out=None, format='tsv'): 40 | """ Save a clustering (dictionary) to file. By default the output file is 41 | a tab-separated listing of words and their cluster ID. 42 | 43 | Args: 44 | mapping (dict): word-to-tag mapping 45 | out (string): path to output file 46 | format (string): output file format (default: tsv) 47 | """ 48 | 49 | if format == 'tsv': 50 | with open(out, 'w') as outfile: 51 | # Primary sort by value (cluster ID), secondary sort by key (word) 52 | for key in sorted(sorted(mapping), key=mapping.get): 53 | line = str(key) + '\t' + str(mapping[key]) + '\n' 54 | outfile.write(line) 55 | 56 | 57 | def tag_string(mapping=None, text=None, unk=unk): 58 | """Tag a string with the corresponding cluster IDs. If a word is not 59 | found in the clustering, use unk. 60 | 61 | Args: 62 | mapping (dict): word-to-tag mapping 63 | text (string): the string to be tagged 64 | unk (string): what to label unknown/unseen words that are not in 65 | mapping (default: <unk>) 66 | 67 | Returns: 68 | string: sequence of tags 69 | """ 70 | 71 | newsent = "" 72 | for word in text.split(): 73 | if word in mapping: 74 | newsent += ' ' + str(mapping[word]) 75 | elif unk in mapping: 76 | newsent += ' ' + str(mapping[unk]) 77 | else: 78 | newsent += ' ' + "<unk>" 79 | return newsent.lstrip() 80 | 81 | 82 | def tag_stdin(mapping=None, unk=unk): 83 | """ This calls tag_string() for each line in stdin, and prints the 84 | result to stdout. 85 | 86 | Args: 87 | mapping (dict): word-to-tag mapping 88 | unk (string): what to label unknown/unseen words that are not in 89 | mapping (default: <unk>) 90 | """ 91 | 92 | for line in sys.stdin: 93 | print(tag_string(mapping=mapping, text=line, unk=unk)) 94 | 95 |
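# The keyword arguments of cluster() below mirror the compiled binary's
# command-line flags, with the leading '--' dropped and '-' replaced by '_'.
# For example, cluster(in_file='corpus.txt', tune_cycles=15, quiet=True)
# builds ['--in', 'corpus.txt', '--tune-cycles', '15', '--quiet'] via the
# translation tables defined inside the function.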
In the parent directory, first run 'make install', and then add $HOME/bin/ to your $PATH, by typing the following command:\necho 'PATH=$PATH:$HOME/bin' >> $HOME/.bashrc && source $HOME/.bashrc") 
129 | exit(1) 
130 | 
131 | 
132 | # Now translate function arguments to command-line arguments 
133 | clustercat_params = {"in_file": "--in", "out": "--out", 
134 | "classes": "--classes", 
135 | "class_file": "--class-file", 
136 | "class_offset": "--class-offset", 
137 | "forward_lambda": "--forward-lambda", 
138 | "ngram_input": "--ngram-input", 
139 | "min_count": "--min-count", 
140 | "refine": "--refine", 
141 | "rev_alternate": "--rev-alternate", 
142 | "threads": "--threads", 
143 | "tune_cycles": "--tune-cycles", 
144 | "word_vectors": "--word-vectors" 
145 | } 
146 | 
147 | boolean_params = {"print_freqs": "--print-freqs", 
148 | "quiet": "--quiet", 
149 | "unidirectional": "--unidirectional", 
150 | "verbose": "--verbose" 
151 | } 
152 | 
153 | for arg, value in locals().items(): 
154 | # Check for boolean parameters 
155 | if arg in boolean_params and value is True: 
156 | cmd_str.append(boolean_params[arg]) 
157 | # Other non-boolean parameters that are not None 
158 | elif arg in clustercat_params and value is not None: 
159 | cmd_str.append(clustercat_params[arg]) 
160 | cmd_str.append(str(value)) 
161 | 
162 | #print(cmd_str, file=sys.stderr) # Use Python 3 interpreter 
163 | 
164 | cmd_out = '' 
165 | if text and not in_file: 
166 | p1 = subprocess.Popen(["printf", "\n".join(text)], 
167 | stdout=subprocess.PIPE, universal_newlines=True) 
168 | p2 = subprocess.Popen(cmd_str, stdin=p1.stdout, stdout=subprocess.PIPE, 
169 | universal_newlines=True) 
170 | p1.stdout.close() 
171 | cmd_out = p2.communicate()[0] 
172 | elif in_file and not text: 
173 | cmd_out = subprocess.check_output(cmd_str, universal_newlines=True) 
174 | else: 
175 | print("Error: supply either text or in_file argument to clustercat.cluster(), but not both") 
176 | 
177 | clusters = {} 
178 | for line in cmd_out.split("\n"): 
179 | split_line = line.split("\t") 
180 | try: 
181 | clusters[split_line[0]] = int(split_line[1]) 
182 | except (IndexError, ValueError): # skip blank or malformed output lines 
183 | pass 
184 | return clusters 
185 | 
186 | 
187 | def main(): 
188 | """ No real reason to use this as a standalone script. Just invoke the 
189 | C-compiled binary for standalone applications. But here you 
190 | go, anyways. 
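Example (illustrative file names; training text is read from stdin):
    python3 clustercat.py --out clusters.tsv < corpus.tok.txt
    python3 clustercat.py --tag clusters.tsv < corpus.tok.txt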
191 | """ 192 | import argparse 193 | parser = argparse.ArgumentParser(description='Clusters words, or tags them') 194 | 195 | parser.add_argument('-i', '--in', help="Load input training file") 196 | parser.add_argument('-o', '--out', help="Save final mapping to file") 197 | parser.add_argument('-t', '--tag', help="Tag stdin input, using clustering in supplied argument") 198 | args = parser.parse_args() 199 | 200 | if args.tag: 201 | mapping = load(in_file=args.tag) 202 | tag_stdin(mapping=mapping) 203 | else: 204 | mapping = cluster(text=sys.stdin) 205 | if args.out: 206 | save(mapping=mapping, out=args.out) 207 | else: 208 | print(mapping) 209 | 210 | if __name__ == '__main__': 211 | main() 212 | -------------------------------------------------------------------------------- /src/clustercat-array.c: -------------------------------------------------------------------------------- 1 | #include // variadic functions for arrncat 2 | #include 3 | #include 4 | #include 5 | #include "clustercat.h" // macros 6 | 7 | // Returns 0 if all values in array are 0.0; returns 1 otherwise 8 | int anyf(const float array[], unsigned int arr_len) { 9 | while (arr_len--) { 10 | if (array[arr_len]) 11 | return 1; 12 | } 13 | return 0; 14 | } 15 | 16 | // Returns 0 if all values in array are 0.0; returns 1 otherwise 17 | int any(const double array[], unsigned int arr_len) { 18 | while (arr_len--) { 19 | if (array[arr_len]) 20 | return 1; 21 | } 22 | return 0; 23 | } 24 | 25 | // Returns 1 if all values in array are non-zero; returns 0 otherwise 26 | int allf(const float array[], unsigned int arr_len) { 27 | while (arr_len--) { 28 | if (!array[arr_len]) 29 | return 0; 30 | } 31 | return 1; 32 | } 33 | 34 | // Returns 1 if all values in array are non-zero; returns 0 otherwise 35 | int all(const double array[], unsigned int arr_len) { 36 | while (arr_len--) { 37 | if (!array[arr_len]) 38 | return 0; 39 | } 40 | return 1; 41 | } 42 | 43 | float sumf(const float array[], unsigned int arr_len) { 44 | float sum = 0.0; 45 | while (arr_len--) { 46 | sum += array[arr_len]; 47 | } 48 | return sum; 49 | } 50 | 51 | double sum(const double array[], unsigned int arr_len) { 52 | double sum = 0.0; 53 | while (arr_len--) { 54 | sum += array[arr_len]; 55 | } 56 | return sum; 57 | } 58 | 59 | float productf(const float array[], unsigned int arr_len) { 60 | float product = 1.0; 61 | while (arr_len--) { 62 | product *= array[arr_len]; 63 | } 64 | return product; 65 | } 66 | 67 | double product(const double array[], unsigned int arr_len) { 68 | double product = 1.0; 69 | while (arr_len--) { 70 | product *= array[arr_len]; 71 | } 72 | return product; 73 | } 74 | 75 | float minf(const float array[], unsigned int arr_len) { 76 | arr_len--; 77 | float min = array[arr_len]; 78 | while (1) { 79 | //printf("min=%g, arr_len=%u, val=%g\n", min, arr_len, array[arr_len]); fflush(stdout); 80 | if (array[arr_len] < min) 81 | min = array[arr_len]; 82 | if (arr_len == 0) 83 | break; 84 | arr_len--; 85 | } 86 | return min; 87 | } 88 | 89 | double min(const double array[], unsigned int arr_len) { 90 | arr_len--; 91 | double min = array[arr_len]; 92 | while (1) { 93 | //printf("min=%g, arr_len=%u, val=%g\n", min, arr_len, array[arr_len]); fflush(stdout); 94 | if (array[arr_len] < min) 95 | min = array[arr_len]; 96 | if (arr_len == 0) 97 | break; 98 | arr_len--; 99 | } 100 | return min; 101 | } 102 | 103 | float maxf(const float array[], unsigned int arr_len) { 104 | arr_len--; 105 | float max = array[arr_len]; 106 | while (1) { 107 | if 
(array[arr_len] > max) 108 | max = array[arr_len]; 109 | if (arr_len == 0) 110 | break; 111 | arr_len--; 112 | } 113 | return max; 114 | } 115 | 116 | double max(const double array[], unsigned int arr_len) { 117 | arr_len--; 118 | double max = array[arr_len]; 119 | while (1) { 120 | if (array[arr_len] > max) 121 | max = array[arr_len]; 122 | if (arr_len == 0) 123 | break; 124 | arr_len--; 125 | } 126 | return max; 127 | } 128 | 129 | unsigned int which_minf(const float array[], const unsigned int arr_len) { 130 | unsigned int which_min = 0; 131 | float min = array[0]; 132 | 133 | unsigned int i = 1; 134 | for (; i < arr_len; i++) { 135 | if (array[i] < min) { 136 | which_min = i; 137 | min = array[i]; 138 | } 139 | } 140 | return which_min; 141 | } 142 | 143 | unsigned int which_min(const double array[], const unsigned int arr_len) { 144 | unsigned int which_min = 0; 145 | double min = array[0]; 146 | 147 | unsigned int i = 1; 148 | for (; i < arr_len; i++) { 149 | if (array[i] < min) { 150 | which_min = i; 151 | min = array[i]; 152 | } 153 | } 154 | return which_min; 155 | } 156 | 157 | unsigned int which_maxf(const float array[], const unsigned int arr_len) { 158 | unsigned int which_max = 0; 159 | float max = array[0]; 160 | 161 | unsigned int i = 1; 162 | for (; i < arr_len; i++) { 163 | if (array[i] > max) { 164 | which_max = i; 165 | max = array[i]; 166 | } 167 | } 168 | return which_max; 169 | } 170 | 171 | unsigned int which_max(const double array[], const unsigned int arr_len) { 172 | unsigned int which_max = 0; 173 | double max = array[0]; 174 | 175 | unsigned int i = 1; 176 | for (; i < arr_len; i++) { 177 | if (array[i] > max) { 178 | which_max = i; 179 | max = array[i]; 180 | } 181 | } 182 | return which_max; 183 | } 184 | 185 | void fprint_array(FILE *stream, const double array[const], const unsigned int arr_len, char * restrict sep) { 186 | //fputs("{ ", stream); 187 | unsigned int i = 0; 188 | for (; i < arr_len-1; i++) 189 | fprintf(stream, "%g%s", array[i], sep); 190 | fprintf(stream, "%g\n", array[arr_len-1]); 191 | } 192 | 193 | void fprint_arrayf(FILE *stream, const float array[const], const unsigned int arr_len, char * restrict sep) { 194 | //fputs("{ ", stream); 195 | unsigned int i = 0; 196 | for (; i < arr_len-1; i++) 197 | fprintf(stream, "%g%s", array[i], sep); 198 | fprintf(stream, "%g\n", array[arr_len-1]); 199 | } 200 | 201 | unsigned int scan_array_of_doubles(FILE *stream, double array[], char * restrict sep) { 202 | char line[STDIN_SENT_MAX_CHARS]; 203 | if (fgets(line, sizeof(line), stream) == NULL) // Get line 204 | return 0; 205 | int elems = 0; 206 | char * restrict token; 207 | if ((token = strtok(line, sep)) == NULL) 208 | return 0; 209 | while (token) { 210 | array[elems] = atof(token); 211 | elems++; 212 | token = strtok(NULL, sep); 213 | } 214 | 215 | return elems; 216 | } 217 | 218 | 219 | // Analogous to strncat(), but with variable number of arguments 220 | void arrncat(double full_array[], const unsigned int full_array_len, ...) 
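/* Illustrative usage note (not from the original source): each variadic pair
   is (double *array, unsigned int length), and copying stops once
   full_array_len elements have been filled. For example:

       double a[2] = {1.0, 2.0}, b[3] = {3.0, 4.0, 5.0};
       double merged[5];
       arrncat(merged, 5, a, 2u, b, 3u);   // merged == {1, 2, 3, 4, 5}
*/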
{ 221 | va_list argptr; 222 | va_start(argptr, full_array_len); 223 | 224 | double * restrict offset = full_array; 225 | double * restrict full_array_last = full_array + full_array_len; 226 | //printf("30: full_array=%p, offset=%p, full_array_len=%u, sizeof(double)=%lu, len*size=%lu, full_array_last=%p, diff=%li\n", full_array, offset, full_array_len, sizeof(double), full_array_len*sizeof(double), full_array_last, full_array_last - full_array); 227 | 228 | while (offset < full_array_last) { 229 | double * restrict arr = va_arg(argptr, double*); 230 | //printf("31\n"); 231 | unsigned int arr_len = va_arg(argptr, unsigned int); 232 | //printf("32: arr_len=%u\n", arr_len); 233 | unsigned int arr_len_bytes = arr_len * sizeof(double); 234 | //printf("33: full_array=%p, offset=%p, *<-=%g, *+1=%g, full_array_len=%u, arr_len=%u, arr_len_bytes=%u, arr[0]=%g, arr[1]=%g, arr_last=%g\n", full_array, offset, *offset, *(offset+1), full_array_len, arr_len, arr_len_bytes, arr[0], arr[1], arr[arr_len-1]); fflush(stdout); 235 | memcpy(offset, arr, arr_len_bytes); 236 | //printf("34: offset=%p, *<-=%g, *+1=%g, *-1=%g, full_array_last=%p arr_len_bytes=%u\n", offset, *offset, *(offset+1), *(offset-1), full_array_last, arr_len_bytes); fflush(stdout); 237 | offset += arr_len; 238 | //printf("35: full_array=%p, offset=%p, full_array_last=%p arr_len_bytes=%u\n", full_array, offset, full_array_last, arr_len_bytes); fflush(stdout); 239 | //printf("36: Full array: "); fprint_array(stdout, full_array, full_array_len, ", "); printf("\n"); 240 | } 241 | va_end(argptr); 242 | //printf("37: Full array: "); fprint_array(stdout, full_array, full_array_len, ", "); printf("\n"); 243 | } 244 | -------------------------------------------------------------------------------- /src/clustercat-array.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_DKLM_ARRAY_HEADER 2 | #define INCLUDE_DKLM_ARRAY_HEADER 3 | 4 | int any(const double array[], unsigned int arr_len); 5 | int anyf(const float array[], unsigned int arr_len); 6 | int all(const double array[], unsigned int arr_len); 7 | int allf(const float array[], unsigned int arr_len); 8 | 9 | double sum(const double array[], unsigned int arr_len); 10 | float sumf(const float array[], unsigned int arr_len); 11 | double product(const double array[], unsigned int arr_len); 12 | float productf(const float array[], unsigned int arr_len); 13 | 14 | double min(const double array[], unsigned int arr_len); 15 | float minf(const float array[], unsigned int arr_len); 16 | double max(const double array[], unsigned int arr_len); 17 | float maxf(const float array[], unsigned int arr_len); 18 | 19 | unsigned int which_min(const double array[], const unsigned int arr_len); 20 | unsigned int which_minf(const float array[], const unsigned int arr_len); 21 | unsigned int which_max(const double array[], const unsigned int arr_len); 22 | unsigned int which_maxf(const float array[], const unsigned int arr_len); 23 | 24 | void fprint_array(FILE *stream, const double array[], const unsigned int arr_len, char * restrict sep); 25 | void fprint_arrayf(FILE *stream, const float array[], const unsigned int arr_len, char * restrict sep); 26 | 27 | unsigned int scan_array_of_doubles(FILE *stream, double array[], char * restrict sep); 28 | 29 | void arrncat(double full_array[], const unsigned int full_array_len, ...); 30 | 31 | #endif // INCLUDE_HEADER 32 | -------------------------------------------------------------------------------- /src/clustercat-cluster.c: 
-------------------------------------------------------------------------------- 
1 | #include <time.h> // clock_t, clock(), CLOCKS_PER_SEC, etc. 
2 | #include <float.h> // FLT_MAX, etc. 
3 | #include "clustercat-cluster.h" 
4 | #include "clustercat-array.h" 
5 | #include "clustercat-math.h" 
6 | 
7 | float entropy_term(const float entropy_terms[const], const unsigned int i); 
8 | double pex_remove_word(const struct cmd_args cmd_args, const word_id_t word, const word_count_t word_count, const wclass_t from_class, const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_array_t count_array, const float entropy_terms[const], const bool is_tentative_move); 
9 | double pex_move_word(const struct cmd_args cmd_args, const word_id_t word, const word_count_t word_count, const wclass_t to_class, const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_array_t count_array, const float entropy_terms[const], const bool is_tentative_move); 
10 | 
11 | inline float entropy_term(const float entropy_terms[const], const unsigned int i) { 
12 | if (i < ENTROPY_TERMS_MAX) 
13 | return entropy_terms[i]; 
14 | else 
15 | return i * log2f(i); 
16 | } 
17 | 
18 | inline double pex_remove_word(const struct cmd_args cmd_args, const word_id_t word, const word_count_t word_count, const wclass_t from_class, const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_array_t count_array, const float entropy_terms[const], const bool is_tentative_move) { 
19 | // See Procedure MoveWord on page 758 of Uszkoreit & Brants (2008): https://www.aclweb.org/anthology/P/P08/P08-1086.pdf 
20 | register double delta = 0.0; 
21 | const unsigned int count_class = count_array[from_class]; 
22 | if (count_class > 1) 
23 | delta = entropy_term(entropy_terms, count_class); 
24 | const unsigned int new_count_class = count_class - word_count; 
25 | if (new_count_class > 1) 
26 | delta -= entropy_term(entropy_terms, new_count_class); 
27 | //printf("rm42: word=%u, word_count=%u, from_class=%u, count_class=%u, new_count_class=%u (count_class - word_count), delta=%g\n", word, word_count, from_class, count_class, new_count_class, delta); fflush(stdout); 
28 | 
29 | if (! 
is_tentative_move) 30 | count_array[from_class] = new_count_class; 31 | 32 | for (unsigned int i = 0; i < word_bigrams[word].length; i++) { 33 | word_id_t prev_word = word_bigrams[word].predecessors[i]; 34 | //printf(" rm43: i=%u, len=%u, word=%u, offset=%u (prev_word=%u + num_classes=%u * from_class=%u)\n", i, word_bigrams[word].length, word, (prev_word * cmd_args.num_classes + from_class), prev_word, cmd_args.num_classes, from_class); fflush(stdout); 35 | const unsigned int word_class_count = word_class_counts[prev_word * cmd_args.num_classes + from_class]; 36 | if (word_class_count > 1) // Can't do log(0); no need for 1 37 | delta -= entropy_term(entropy_terms, word_class_count); 38 | const unsigned int new_word_class_count = word_class_count - word_bigrams[word].bigram_counts[i]; 39 | delta += entropy_term(entropy_terms, new_word_class_count); 40 | //printf(" rm45: word=%u (#=%u), prev_word=%u, #()=%u, from_class=%u, i=%u, count_class=%u, new_count_class=%u, =<%u,%u>, #()=%u, new_#()=%u (w-c - %u), delta=%g\n", word, word_count, prev_word, word_bigrams[word].bigram_counts[i], from_class, i, count_class, new_count_class, prev_word, from_class, word_class_count, new_word_class_count, word_bigrams[word].bigram_counts[i], delta); fflush(stdout); 41 | //print_word_class_counts(cmd_args, model_metadata, word_class_counts); 42 | if (! is_tentative_move) 43 | word_class_counts[prev_word * cmd_args.num_classes + from_class] = new_word_class_count; 44 | 45 | } 46 | 47 | if (cmd_args.rev_alternate && (!is_tentative_move)) { // also update reversed word-class counts 48 | for (unsigned int i = 0; i < word_bigrams_rev[word].length; i++) { 49 | const word_id_t next_word = word_bigrams_rev[word].predecessors[i]; 50 | const unsigned int word_class_rev_count = word_class_rev_counts[next_word * cmd_args.num_classes + from_class]; 51 | const unsigned int new_word_class_rev_count = word_class_rev_count - word_bigrams_rev[word].bigram_counts[i]; 52 | //printf(" rm47: rev_next_word=%u, rev_#()=%u, rev_new_#()=%u\n", next_word, word_class_rev_count, new_word_class_rev_count); fflush(stdout); 53 | //print_word_class_counts(cmd_args, model_metadata, word_class_rev_counts); 54 | word_class_rev_counts[next_word * cmd_args.num_classes + from_class] = new_word_class_rev_count; 55 | } 56 | } 57 | 58 | return delta; 59 | } 60 | 61 | inline double pex_move_word(const struct cmd_args cmd_args, const word_id_t word, const word_count_t word_count, const wclass_t to_class, const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_array_t count_array, const float entropy_terms[const], const bool is_tentative_move) { 62 | // See Procedure MoveWord on page 758 of Uszkoreit & Brants (2008): https://www.aclweb.org/anthology/P/P08/P08-1086.pdf 63 | unsigned int count_class = count_array[to_class]; 64 | if (!count_class) // class is empty 65 | count_class = 1; 66 | const unsigned int new_count_class = count_class + word_count; // Differs from paper: replace "-" with "+" 67 | register double delta = entropy_term(entropy_terms, count_class) - entropy_term(entropy_terms, new_count_class); 68 | //printf("mv42: word=%u, word_count=%u, to_class=%u, count_class=%u, new_count_class=%u, delta=%g, is_tentative_move=%d\n", word, word_count, to_class, count_class, new_count_class, delta, is_tentative_move); fflush(stdout); 69 | const float backward_lambda = 1 - cmd_args.forward_lambda; 70 | 71 | 
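/* Added note: `delta` accumulates the change in the class-based training-data
   log-likelihood that would result from placing `word` in `to_class`, built
   from the precomputed n*log2(n) entropy terms. When is_tentative_move is
   true, no counts are modified, so the caller can compare candidate classes
   cheaply; only a committed move (is_tentative_move == false) updates
   count_array and the word-class counts below. */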
if (! is_tentative_move) 72 | count_array[to_class] = new_count_class; 73 | 74 | for (unsigned int i = 0; i < word_bigrams[word].length; i++) { 75 | word_id_t prev_word = word_bigrams[word].predecessors[i]; 76 | //printf(" mv43: i=%u, len=%u, word=%u, offset=%u (prev_word=%u + num_classes=%u * to_class=%u)\n", i, word_bigrams[word].length, word, (prev_word * cmd_args.num_classes + to_class), prev_word, cmd_args.num_classes, to_class); fflush(stdout); 77 | const unsigned int word_class_count = word_class_counts[prev_word * cmd_args.num_classes + to_class]; 78 | if (word_class_count > 1) { // Can't do log(0); no need for 1 79 | if (cmd_args.unidirectional) { 80 | delta -= entropy_term(entropy_terms, word_class_count); 81 | } else { 82 | delta -= entropy_term(entropy_terms, word_class_count) * cmd_args.forward_lambda; 83 | } 84 | } 85 | const unsigned int new_word_class_count = word_class_count + word_bigrams[word].bigram_counts[i]; // Differs from paper: replace "-" with "+" 86 | if (new_word_class_count > 1) { // Can't do log(0) 87 | if (cmd_args.unidirectional) { 88 | delta += entropy_term(entropy_terms, new_word_class_count); 89 | } else { 90 | delta += entropy_term(entropy_terms, new_word_class_count) * cmd_args.forward_lambda; 91 | } 92 | } 93 | //printf(" mv45: word=%u; prev_word=%u, to_class=%u, i=%u, word_count=%u, count_class=%u, new_count_class=%u, =<%u,%hu>, #()=%u, new_#()=%u, delta=%g\n", word, prev_word, to_class, i, word_count, count_class, new_count_class, prev_word, to_class, word_class_count, new_word_class_count, delta); fflush(stdout); 94 | if (! is_tentative_move) 95 | word_class_counts[prev_word * cmd_args.num_classes + to_class] = new_word_class_count; 96 | 97 | } 98 | 99 | if (cmd_args.rev_alternate) { // also update reversed word-class counts; reversed order of conditionals since the first clause here is more common in this function 100 | for (unsigned int i = 0; i < word_bigrams_rev[word].length; i++) { 101 | const word_id_t next_word = word_bigrams_rev[word].predecessors[i]; 102 | const unsigned int word_class_rev_count = word_class_rev_counts[next_word * cmd_args.num_classes + to_class]; 103 | if (word_class_rev_count > 1) // Can't do log(0); no need for 1 104 | if (!cmd_args.unidirectional) 105 | delta -= entropy_term(entropy_terms, word_class_rev_count) * backward_lambda; 106 | 107 | const unsigned int new_word_class_rev_count = word_class_rev_count + word_bigrams_rev[word].bigram_counts[i]; 108 | if (new_word_class_rev_count > 1) // Can't do log(0); no need for 1 109 | if (!cmd_args.unidirectional) 110 | //delta += entropy_term(entropy_terms, word_class_rev_count) * backward_lambda; 111 | delta += entropy_term(entropy_terms, new_word_class_rev_count) * backward_lambda; 112 | //printf("word=%u, word_class_rev_count=%u, new_word_class_rev_count=%u, delta=%g\n", word, word_class_rev_count, new_word_class_rev_count, delta); 113 | if (!is_tentative_move) 114 | word_class_rev_counts[next_word * cmd_args.num_classes + to_class] = new_word_class_rev_count; 115 | } 116 | } 117 | 118 | return delta; 119 | } 120 | 121 | void cluster(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const word_count_t word_counts[const], char * word_list[restrict], wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts) { 122 | unsigned long steps = 0; 123 | 124 | if (cmd_args.class_algo == EXCHANGE || 
cmd_args.class_algo == EXCHANGE_BROWN) { // Exchange algorithm: See Sven Martin, Jörg Liermann, Hermann Ney. 1998. Algorithms For Bigram And Trigram Word Clustering. Speech Communication 24. 19-37. http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.53.2354 
125 | // Get initial logprob 
126 | count_arrays_t count_arrays = malloc(cmd_args.max_array * sizeof(void *)); 
127 | init_count_arrays(cmd_args, count_arrays); 
128 | tally_class_ngram_counts(cmd_args, model_metadata, word_bigrams, word2class, count_arrays); 
129 | unsigned int num_classes_current = (cmd_args.num_classes > 15) && (cmd_args.refine) ? powi(2,cmd_args.refine) : cmd_args.num_classes; // Don't bother with class refinement if the number of classes is really small. powi() is declared in clustercat-math.h 
130 | 
131 | // Build precomputed entropy terms 
132 | float * restrict entropy_terms = malloc(ENTROPY_TERMS_MAX * sizeof(float)); 
133 | build_entropy_terms(cmd_args, entropy_terms, ENTROPY_TERMS_MAX); 
134 | 
135 | if (cmd_args.verbose > 3) { 
136 | printf("cluster(): 42: "); long unsigned int class_sum=0; for (wclass_t i = 0; i < cmd_args.num_classes; i++) { 
137 | printf("c_%u=%lu, ", i, (unsigned long)count_arrays[0][i]); 
138 | class_sum += count_arrays[0][i]; 
139 | } printf("\nClass Sum=%lu; Corpus Tokens=%lu\n", class_sum, model_metadata.token_count); fflush(stdout); 
140 | } 
141 | double best_log_prob = training_data_log_likelihood(cmd_args, model_metadata, count_arrays, word_counts, word2class); 
142 | 
143 | if (cmd_args.verbose >= -1) { 
144 | fprintf(stderr, "%s: Expected Steps: %'lu (%'u word types x %'u classes x %'u cycles); initial logprob=%g, PP=%g\n", argv_0_basename, (unsigned long)model_metadata.type_count * cmd_args.num_classes * cmd_args.tune_cycles, model_metadata.type_count, cmd_args.num_classes, cmd_args.tune_cycles, best_log_prob, perplexity(best_log_prob, (model_metadata.token_count + model_metadata.line_count))); fflush(stderr); 
145 | } 
146 | 
147 | time_t time_start_cycles; 
148 | time(&time_start_cycles); 
149 | unsigned short cycle = 1; // Keep this around afterwards to print out number of actually-completed cycles 
150 | word_id_t moved_count = 0; 
151 | count_arrays_t temp_count_arrays = malloc(cmd_args.max_array * sizeof(void *)); 
152 | init_count_arrays(cmd_args, temp_count_arrays); 
153 | for (; cycle <= cmd_args.tune_cycles; cycle++) { 
154 | if (cmd_args.refine && (cycle == 4)) // Current setting forces bump to full cluster size after 3 iterations, but you can change this line and the next for a different schedule 
155 | num_classes_current = cmd_args.num_classes; 
156 | if ((num_classes_current != cmd_args.num_classes) && (num_classes_current > (cmd_args.num_classes / 4.0))) { // If the coarse cluster size is close to the final size, just go do the final size 
157 | num_classes_current = cmd_args.num_classes; 
158 | time(&time_start_cycles); // restart timer, when full clustering starts 
159 | } 
160 | 
161 | const bool is_nonreversed_cycle = (cmd_args.rev_alternate == 0) || (cycle % (cmd_args.rev_alternate+1)); // Only do a reverse predictive exchange (using the reversed word-class counts) after every cmd_args.rev_alternate cycles; if rev_alternate==0 then always do this part. 
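/* Added note: with rev_alternate == 2, for example, cycle % 3 == 0 marks the
   reversed cycles, so the schedule is: cycles 1-2 normal (forward) exchange,
   cycle 3 reversed exchange, cycles 4-5 normal, cycle 6 reversed, and so on. */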
162 | 
163 | clear_count_arrays(cmd_args, temp_count_arrays); 
164 | double queried_log_prob = 0.0; 
165 | if (model_metadata.token_count < 5e8 || cycle == cmd_args.tune_cycles || cycle == 2 || cycle == 3) { // For large training sets, only calculate PP on the interesting iterations 
166 | tally_class_ngram_counts(cmd_args, model_metadata, word_bigrams, word2class, temp_count_arrays); 
167 | queried_log_prob = training_data_log_likelihood(cmd_args, model_metadata, temp_count_arrays, word_counts, word2class); 
168 | } 
169 | 
170 | // ETA stuff 
171 | const time_t time_this_cycle = time(NULL); 
172 | const double time_elapsed = difftime(time_this_cycle, time_start_cycles) + 7.0; // a little is added since time prediction in early cycles tends to be too optimistic 
173 | const double time_avg_per_cycle = (time_elapsed / ((double)cycle-1)); 
174 | const unsigned int remaining_cycles = cmd_args.tune_cycles - cycle + 1; 
175 | const double time_remaining = ( time_avg_per_cycle * remaining_cycles); 
176 | const time_t eta = time_this_cycle + time_remaining; 
177 | 
178 | if (cmd_args.verbose >= -1) { 
179 | if (is_nonreversed_cycle) 
180 | fprintf(stderr, "ccat: Normal cycle %-2u", cycle); 
181 | else 
182 | fprintf(stderr, "ccat: Rev cycle %-2u", cycle); 
183 | fprintf(stderr, " C=%-3u", num_classes_current); 
184 | if (cycle > 1) { 
185 | fprintf(stderr, " Words moved last cycle: %.2g%% (%u/%u).", (100 * (moved_count / (float)model_metadata.type_count)), moved_count, model_metadata.type_count); 
186 | if (cycle > 4) { 
187 | char eta_string[300]; 
188 | strftime(eta_string, 300, "%x %X", localtime(&eta)); 
189 | fprintf(stderr, " Time left: %lim %lis. ETA: %s", (long)time_remaining/60, ((long)time_remaining % 60), eta_string); 
190 | } 
191 | if (queried_log_prob) { 
192 | if (cmd_args.ngram_input) { 
193 | fprintf(stderr, " LL=%g", queried_log_prob); // can't get reliable PP if input is ngram counts 
194 | } else { 
195 | fprintf(stderr, " LL=%.3g PP=%g", queried_log_prob, perplexity(queried_log_prob,(model_metadata.token_count + model_metadata.line_count))); 
196 | } 
197 | } 
198 | fprintf(stderr, "\n"); 
199 | } 
200 | else if ( cmd_args.refine) 
201 | fprintf(stderr, " Starting with %u coarse classes, for the first few cycles\n", num_classes_current); 
202 | else 
203 | fprintf(stderr, "\n"); 
204 | fflush(stderr); 
205 | } 
206 | moved_count = 0; 
207 | 
208 | //#pragma omp parallel for num_threads(cmd_args.num_threads) reduction(+:steps) // non-determinism 
209 | for (word_id_t word_i = 0; word_i < model_metadata.type_count; word_i++) { 
210 | //for (word_id_t word_i = model_metadata.type_count-1; word_i != -1; word_i--) { 
211 | if (cycle < 3 && word_i < num_classes_current) // don't move high-frequency words in the first (few) iteration(s) 
212 | continue; 
213 | const word_count_t word_i_count = word_bigrams[word_i].headword_count; 
214 | const wclass_t old_class = word2class[word_i]; 
215 | double scores[cmd_args.num_classes]; // This doesn't need to be private in the OMP parallelization since each thread is writing to a different element in the array 
216 | memset(scores, 0, sizeof(double) * cmd_args.num_classes); 
217 | //const double delta_remove_word = pex_remove_word(cmd_args, word_i, word_i_count, old_class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays, true); 
218 | //const double delta_remove_word = 0.0; // Not really necessary 
219 | //const double delta_remove_word_rev = 0.0; // Not really necessary 
220 | 
221 | //printf("cluster(): 43: "); long unsigned int class_sum=0; for (wclass_t i = 0; 
i < cmd_args.num_classes; i++) { 222 | // printf("c_%u=%u, ", i, count_arrays[0][i]); 223 | // class_sum += count_arrays[0][i]; 224 | //} printf("\nClass Sum=%lu; Corpus Tokens=%lu\n", class_sum, model_metadata.token_count); fflush(stdout); 225 | 226 | #pragma omp parallel for num_threads(cmd_args.num_threads) reduction(+:steps) 227 | for (wclass_t class = 0; class < num_classes_current; class++) { // class values range from 0 to num_classes_current-1 228 | if (is_nonreversed_cycle) { 229 | scores[class] = pex_move_word(cmd_args, word_i, word_i_count, class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, true); 230 | } else { // This is the reversed one 231 | scores[class] = pex_move_word(cmd_args, word_i, word_i_count, class, word_bigrams_rev, word_bigrams, word_class_rev_counts, word_class_counts, count_arrays[0], entropy_terms, true); 232 | } 233 | steps++; 234 | } 235 | //scores[old_class] -= 0.80 / cycle; // TA 236 | 237 | const wclass_t best_hypothesis_class = which_max(scores, num_classes_current); 238 | const double best_hypothesis_score = max(scores, num_classes_current); 239 | 240 | if (cmd_args.verbose > 1) { 241 | printf("Orig score for word w_«%u» using class «%hu» is %g; Hypos %u-%u: ", word_i, old_class, scores[old_class], 1, num_classes_current); 242 | fprint_array(stdout, scores, num_classes_current, ","); fflush(stdout); 243 | //if (best_hypothesis_score > 0) { // Shouldn't happen 244 | // fprintf(stderr, "Error: best_hypothesis_score=%g for class %hu > 0\n", best_hypothesis_score, best_hypothesis_class); fflush(stderr); 245 | // exit(9); 246 | //} 247 | } 248 | 249 | if (old_class != best_hypothesis_class) { // We've improved 250 | moved_count++; 251 | 252 | if (cmd_args.verbose > 0) { 253 | fprintf(stderr, " Moving id=%-7u count=%-7lu %-18s %u -> %u\t(%g -> %g)\n", word_i, (unsigned long)word_bigrams[word_i].headword_count, word_list[word_i], old_class, best_hypothesis_class, scores[old_class], best_hypothesis_score); fflush(stderr); 254 | } 255 | //word2class[word_i] = best_hypothesis_class; 256 | word2class[word_i] = best_hypothesis_class; 257 | if (isnan(best_hypothesis_score)) { // shouldn't happen 258 | fprintf(stderr, "Error: best_hypothesis_score=%g :-(\n", best_hypothesis_score); fflush(stderr); 259 | exit(5); 260 | } else { 261 | best_log_prob += best_hypothesis_score; 262 | } 263 | 264 | if (is_nonreversed_cycle) { 265 | pex_remove_word(cmd_args, word_i, word_i_count, old_class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, false); 266 | pex_move_word(cmd_args, word_i, word_i_count, best_hypothesis_class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, false); 267 | } else { // This is the reversed one 268 | pex_remove_word(cmd_args, word_i, word_i_count, old_class, word_bigrams_rev, word_bigrams, word_class_rev_counts, word_class_counts, count_arrays[0], entropy_terms, false); 269 | pex_move_word(cmd_args, word_i, word_i_count, best_hypothesis_class, word_bigrams_rev, word_bigrams, word_class_rev_counts, word_class_counts, count_arrays[0], entropy_terms, false); 270 | } 271 | } 272 | } 273 | 274 | //if (!moved_count) // Nothing moved in last cycle, so that's it 275 | // break; 276 | } 277 | 278 | if (cmd_args.verbose >= -1) { 279 | fprintf(stderr, "%s: Completed steps: %'lu\n", argv_0_basename, steps); fflush(stderr); 280 | } 281 | //fprintf(stderr, "%s: Completed steps: %'lu (%'u word 
types x %'u classes x %'u cycles); best logprob=%g, PP=%g\n", argv_0_basename, steps, model_metadata.type_count, num_classes_current, cycle-1, best_log_prob, perplexity(best_log_prob,(model_metadata.token_count - model_metadata.line_count))); fflush(stderr); 
282 | 
283 | if (cmd_args.class_algo == EXCHANGE_BROWN) 
284 | post_exchange_brown_cluster(cmd_args, model_metadata, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays); 
285 | 
286 | free_count_arrays(cmd_args, temp_count_arrays); 
287 | free(temp_count_arrays); 
288 | free_count_arrays(cmd_args, count_arrays); 
289 | free(count_arrays); 
290 | free(entropy_terms); 
291 | 
292 | } else if (cmd_args.class_algo == BROWN) { // Agglomerative clustering. Stops when the number of current clusters is equal to the desired number in cmd_args.num_classes 
293 | // "Things equal to nothing else are equal to each other." --Anon 
294 | for (unsigned long current_num_classes = model_metadata.type_count; current_num_classes > cmd_args.num_classes; current_num_classes--) { 
295 | for (word_id_t word_i = 0; word_i < model_metadata.type_count; word_i++) { 
296 | float log_probs[cmd_args.num_classes]; 
297 | //#pragma omp parallel for num_threads(cmd_args.num_threads) 
298 | for (wclass_t class = 0; class < cmd_args.num_classes; class++, steps++) { 
299 | // Get log prob 
300 | log_probs[class] = -1 * (class+1); // Dummy predicate 
301 | } 
302 | wclass_t best_class = which_maxf(log_probs, cmd_args.num_classes); 
303 | printf("Moving w_%u to class %u\n", word_i, best_class); 
304 | } 
305 | } 
306 | } 
307 | } 
308 | 
309 | void print_words_and_vectors(FILE * out_file, const struct cmd_args cmd_args, const struct_model_metadata model_metadata, char * word_list[restrict], wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts) { 
310 | count_arrays_t count_arrays = malloc(cmd_args.max_array * sizeof(void *)); 
311 | init_count_arrays(cmd_args, count_arrays); 
312 | tally_class_ngram_counts(cmd_args, model_metadata, word_bigrams, word2class, count_arrays); 
313 | 
314 | // Build precomputed entropy terms 
315 | float * restrict entropy_terms = malloc(ENTROPY_TERMS_MAX * sizeof(float)); 
316 | build_entropy_terms(cmd_args, entropy_terms, ENTROPY_TERMS_MAX); 
317 | 
318 | if ( ! cmd_args.print_freqs) // greedo compatible 
319 | fprintf(out_file, "%lu %u\n", (long unsigned)model_metadata.type_count, cmd_args.num_classes); // Like output in word2vec 
320 | 
321 | for (word_id_t word_i = 0; word_i < model_metadata.type_count; word_i++) { 
322 | const word_count_t word_i_count = word_bigrams[word_i].headword_count; 
323 | float scores[cmd_args.num_classes]; // This doesn't need to be private in the OMP parallelization since each thread is writing to a different element in the array. 
We use a float here to be compatible with word2vec 324 | float score_min = FLT_MAX; // use this later for rescaling 325 | 326 | #pragma omp parallel for num_threads(cmd_args.num_threads) 327 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) { // class values range from 0 to cmd_args.num_classes-1 328 | scores[class] = sqrt( -(float)pex_move_word(cmd_args, word_i, word_i_count, class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, true)); 329 | if (scores[class] < score_min) 330 | score_min = scores[class]; 331 | } 332 | 333 | // Rescale vectors 334 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) { 335 | scores[class] -= score_min; 336 | } 337 | 338 | if (cmd_args.print_freqs) // greedo compatible 339 | fprintf(out_file, "%lu %s ", (long unsigned) word_i_count, word_list[word_i]); 340 | else // word2vec compatible 341 | fprintf(out_file, "%s ", word_list[word_i]); 342 | 343 | if (cmd_args.print_word_vectors == TEXT_VEC) 344 | fprint_arrayf(out_file, scores, cmd_args.num_classes, " "); 345 | else 346 | fwrite(scores, sizeof(float), cmd_args.num_classes, out_file); 347 | } 348 | 349 | free_count_arrays(cmd_args, count_arrays); 350 | free(count_arrays); 351 | free(entropy_terms); 352 | } 353 | 354 | void post_exchange_brown_cluster(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_arrays_t count_arrays) { 355 | 356 | // Build precomputed entropy terms 357 | float * restrict entropy_terms = malloc(ENTROPY_TERMS_MAX * sizeof(float)); 358 | build_entropy_terms(cmd_args, entropy_terms, ENTROPY_TERMS_MAX); 359 | 360 | // Convert word2class to an array of classes pointing to arrays of words, which will successively get merged together 361 | struct_class_listing class2words[cmd_args.num_classes]; 362 | memset(class2words, 0, sizeof(struct_class_listing) * cmd_args.num_classes); 363 | get_class_listing(cmd_args, model_metadata, word2class, class2words); // invert word2class array so that we know what words are associated with a given class 364 | 365 | // Loop through classes, finding best pair of classes to merge. Use pex_move_word() to find best pairs. Record merges separately to reduce overhead. 
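/* Added note: each pass below scores, for every ordered pair of classes
   (class_1 < class_2), the total log-likelihood delta of tentatively moving
   all of class_2's words into class_1 (summing pex_move_word() deltas), and
   keeps the best merge partner per class_1 in scores_1_which/scores_1_val. */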
366 | for (wclass_t total_merges = 0; total_merges < cmd_args.num_classes-1; total_merges++) { 367 | // The scores arrays don't need to be private in the OMP parallelization, since each thread is writing to different elements in the array 368 | wclass_t scores_1_which[cmd_args.num_classes]; 369 | double scores_1_val[cmd_args.num_classes]; 370 | memset(scores_1_which, 0, sizeof(wclass_t) * cmd_args.num_classes); 371 | memset(scores_1_val, 0, sizeof(double) * cmd_args.num_classes); 372 | 373 | #pragma omp parallel for num_threads(cmd_args.num_threads) 374 | for (wclass_t class_1 = 0; class_1 < cmd_args.num_classes-1; class_1++) { 375 | const size_t scores_2_length = cmd_args.num_classes - class_1; 376 | double scores_2[scores_2_length]; 377 | memset(scores_2, 0, sizeof(double) * scores_2_length); 378 | 379 | for (wclass_t class_2 = class_1+1; class_2 < cmd_args.num_classes; class_2++) { 380 | for (size_t word_offset = 0; word_offset < class2words[class_2].length; word_offset++) { // Sum of all words 381 | const word_id_t word = class2words[class_2].words[word_offset]; 382 | scores_2[class_2] += pex_move_word(cmd_args, word, word_bigrams[word].headword_count, class_1, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts, count_arrays[0], entropy_terms, true); 383 | } 384 | scores_1_which[class_1] = which_max(scores_2, scores_2_length); 385 | scores_1_val[class_1] = max(scores_2, scores_2_length); 386 | 387 | } 388 | //const double best_pairing_val = max(scores_1_val, cmd_args.num_classes); 389 | } 390 | } 391 | 392 | free_class_listing(cmd_args, class2words); 393 | free(entropy_terms); 394 | } 395 | 396 | 397 | void get_class_listing(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const wclass_t word2class[const], struct_class_listing * restrict class2words) { 398 | // Invert word2class array so that we know what words are associated with a given class 399 | 400 | // First pass through the word2class array to get counts of how many words are associated with a given class, then later allocate enough memory for these 401 | for (word_id_t word = 0; word < model_metadata.type_count; word++) { 402 | const wclass_t class = word2class[word]; 403 | class2words[class].length++; 404 | } 405 | 406 | // Allocate enough memory for all words in a given class, then zero-out length values, so that we know where next word should go 407 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) { 408 | class2words[class].words = malloc(sizeof(word_id_t) * class2words[class].length); 409 | class2words[class].length = 0; 410 | } 411 | 412 | // Now add each word to the word array, and increment local offset 413 | for (word_id_t word = 0; word < model_metadata.type_count; word++) { 414 | const wclass_t class = word2class[word]; 415 | class2words[class].words[class2words[class].length] = word; 416 | class2words[class].length++; // The final value of this should be the same as before we zeroed this value out 417 | } 418 | } 419 | 420 | void free_class_listing(const struct cmd_args cmd_args, struct_class_listing * restrict class2words) { 421 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) 422 | free(class2words[class].words); 423 | } 424 | 425 | void build_entropy_terms(const struct cmd_args cmd_args, float * restrict entropy_terms, const unsigned int entropy_terms_max) { 426 | entropy_terms[0] = 0.0; 427 | #pragma omp parallel for num_threads(cmd_args.num_threads) 428 | for (unsigned long i = 1; i < entropy_terms_max; i++) 429 | 
entropy_terms[i] = i * log2f(i); 430 | } 431 | -------------------------------------------------------------------------------- /src/clustercat-cluster.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_CC_CLUSTER_HEADER 2 | #define INCLUDE_CC_CLUSTER_HEADER 3 | 4 | #include "clustercat.h" 5 | 6 | typedef struct { // This is for an array pointing to this struct having a pointer to an array of word_id's all within the same class. We also keep track of the length of that array. 7 | word_id_t * words; 8 | unsigned int length; 9 | } struct_class_listing; 10 | 11 | void cluster(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const word_count_t word_counts[const], char * word_list[restrict], wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts); 12 | 13 | void print_words_and_vectors(FILE * out_file, const struct cmd_args cmd_args, const struct_model_metadata model_metadata, char * word_list[restrict], wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts); 14 | 15 | void post_exchange_brown_cluster(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, wclass_t word2class[], const struct_word_bigram_entry word_bigrams[const], const struct_word_bigram_entry word_bigrams_rev[const], unsigned int * restrict word_class_counts, unsigned int * restrict word_class_rev_counts, count_arrays_t count_arrays); 16 | 17 | void build_entropy_terms(const struct cmd_args cmd_args, float * restrict entropy_terms, const unsigned int entropy_terms_max); 18 | 19 | void get_class_listing(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const wclass_t word2class[const], struct_class_listing * restrict class2words); 20 | void free_class_listing(const struct cmd_args cmd_args, struct_class_listing * restrict class2words); 21 | #endif // INCLUDE_HEADER 22 | -------------------------------------------------------------------------------- /src/clustercat-data.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_CLUSTERCAT_DATA_HEADER 2 | #define INCLUDE_CLUSTERCAT_DATA_HEADER 3 | 4 | #include "clustercat-map.h" 5 | //#include "clustercat-tree.h" 6 | 7 | // Thanks Dipstick 8 | #define STR(x) #x 9 | #define SHOW_DEFINE(x) printf("%s=%s\n", #x, STR(x)) 10 | // SHOW_DEFINE(DATA_STRUCT_FLOAT_NAME); // for example 11 | 12 | // Default to storing word-word entries in hash table using uthash 13 | // You can change this by compiling with -DATA_STORE_TREE_LCRS or -DATA_STORE_TRIE 14 | #if defined ATA_STORE_KHASH // https://github.com/attractivechaos/klib 15 | #define DATA_STRUCT_FLOAT_HUMAN_NAME "khash_map" 16 | #define DATA_STRUCT_FLOAT_NAME word_word_float_khash 17 | #define DATA_STRUCT_FLOAT_ADDR 18 | #define DATA_STRUCT_FLOAT_TYPE kh_struct_khash_float_t 19 | #define DATA_STRUCT_FLOAT_TYPE_IN_STRUCT kh_struct_khash_float_t 20 | #define DATA_STRUCT_FLOAT_SIZE sizeof(kh_struct_khash_float_t) 21 | #define DECLARE_DATA_STRUCT_FLOAT KHASH_MAP_INIT_STR(DATA_STRUCT_FLOAT_TYPE, float); 22 | #define INIT_DATA_STRUCT_FLOAT khash_t(struct_khash_float) * DATA_STRUCT_FLOAT_NAME = kh_init(struct_khash_float); 23 | #define 
UPDATE_ENTRY_FLOAT(db,key,val) { \ 24 | int ret; \ 25 | khint_t k = kh_put(struct_khash_float, (&db), (key), &ret); \ 26 | if (!ret) kh_del(struct_khash_float, (&db), (k)); \ 27 | kh_value((&db), (k)) = (val); \ 28 | } 29 | #define FIND_ENTRY_FLOAT(db,key) ( kh_get(struct_khash_float, (db), (key))) 30 | //#define PRINT_ENTRIES_FLOAT(db, prefix, sep_char, min_count) ({ \ 31 | // unsigned long number_of_entries = 0; \ 32 | // for (khint_t k = kh_begin(db); k != kh_end(db); ++k) \ 33 | // if (kh_exist(db, k)) { \ 34 | // printf("foobar\n"); \ 35 | //// printf("%s%s%c%i\n", prefix, entry->key, sep_char, entry->count); 36 | // number_of_entries++; \ 37 | // } \ 38 | // return number_of_entries; \ 39 | //}) 40 | #define PRINT_ENTRIES_FLOAT(db, prefix, sep_char, min_count) (1) 41 | #endif 42 | 43 | typedef struct { 44 | struct_map_word *word_map; 45 | struct_map_word *word_word_map; 46 | struct_map_word *ngram_map; 47 | struct_map_word *class_map; 48 | char **unique_words; 49 | } struct_model_maps; 50 | 51 | 52 | #endif // INCLUDE_HEADER 53 | -------------------------------------------------------------------------------- /src/clustercat-dbg.c: -------------------------------------------------------------------------------- 1 | #include "clustercat-dbg.h" 2 | 3 | void print_word_class_counts(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const word_class_count_t * restrict word_class_counts) { 4 | for (wclass_t class = 0; class < cmd_args.num_classes; class++) { 5 | printf("Class=%u Offsets=%u,%u,...%u:\n\t", class, class, class+cmd_args.num_classes, (model_metadata.type_count-1) * cmd_args.num_classes + class); 6 | for (word_id_t word = 0; word < model_metadata.type_count; word++) { 7 | printf("#(<%u,%hu>)=%u ", word, class, word_class_counts[word * cmd_args.num_classes + class]); 8 | } 9 | printf("\n"); 10 | } 11 | fflush(stdout); 12 | } 13 | 14 | void print_word_bigrams(const struct_model_metadata model_metadata, const struct_word_bigram_entry * restrict word_bigrams, char ** restrict word_list) { 15 | printf("word_bigrams:\n"); fflush(stdout); 16 | for (word_id_t word_i = 0; word_i < model_metadata.type_count; word_i++) { 17 | printf(" %18s=%u -> {%lu, [", word_list[word_i], word_i, word_bigrams[word_i].length); fflush(stdout); 18 | for (word_id_t word_j = 0; word_j < word_bigrams[word_i].length; word_j++) { 19 | if (word_j > 0) 20 | printf(", "); 21 | printf("%s=%u (%ux)", word_list[word_bigrams[word_i].predecessors[word_j]], word_bigrams[word_i].predecessors[word_j], word_bigrams[word_i].bigram_counts[word_j]); 22 | } 23 | printf("]}\n"); fflush(stdout); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/clustercat-dbg.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_CC_DBG_HEADER 2 | #define INCLUDE_CC_DBG_HEADER 3 | 4 | #include "clustercat.h" 5 | 6 | void print_word_class_counts(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const word_class_count_t * restrict word_class_counts); 7 | 8 | void print_word_bigrams(const struct_model_metadata model_metadata, const struct_word_bigram_entry * restrict word_bigrams, char ** restrict word_list); 9 | 10 | #endif // INCLUDE_HEADER 11 | -------------------------------------------------------------------------------- /src/clustercat-import-class-file.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 
"clustercat-import-class-file.h" 4 | #include "clustercat-map.h" 5 | 6 | // Parse TSV file input and overwrite relevant word mappings 7 | void import_class_file(struct_map_word **word_map, wclass_t word2class[restrict], const char * restrict class_file_name, const wclass_t num_classes) { 8 | char * restrict line_end; 9 | char * restrict line = calloc(MAX_WORD_LEN + 9, 1); 10 | const word_id_t unk_id = map_find_id(word_map, UNKNOWN_WORD, -1); 11 | 12 | FILE *file = fopen(class_file_name, "r"); 13 | if (!file) { 14 | fprintf(stderr, "%s: fopen of '%s' failed: %s.\n", argv_0_basename, class_file_name, strerror(errno)); 15 | exit(EXIT_FAILURE); 16 | } 17 | while (fgets(line, MAX_WORD_LEN + 8, file) != 0) { 18 | 19 | line_end = strchr(line, '\n'); 20 | *line_end = '\0'; 21 | 22 | // Parse each line 23 | unsigned int keylen = strcspn(line, PRIMARY_SEP_STRING); 24 | line[keylen] = '\0'; // Split key and count 25 | char * restrict key = line; 26 | wclass_t class = atoi(line + keylen + 1); 27 | if (num_classes <= class) { 28 | fprintf(stderr, " Error: Imported word classes from file \"%s\" must be in a range [0,%u-1]. Word \"%s\" has class %i. If --num-classes is unset, a value is automatically chosen. See --help\n", class_file_name, num_classes, key, class); fflush(stderr); 29 | exit(13); 30 | } 31 | //printf("keylen=%i, key=<<%s>>, class=<<%d>>\n", keylen, key, class); 32 | word_id_t key_int = map_find_id(word_map, key, unk_id); 33 | word2class[key_int] = class; 34 | } 35 | 36 | fclose(file); 37 | free(line); 38 | } 39 | -------------------------------------------------------------------------------- /src/clustercat-import-class-file.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_CLUSTERCAT_IMPORT_CLASS_FILE_HEADER 2 | #define INCLUDE_CLUSTERCAT_IMPORT_CLASS_FILE_HEADER 3 | 4 | #include "clustercat.h" // wclass_t 5 | 6 | void import_class_file(struct_map_word **word_map, wclass_t word2class[restrict], const char * restrict class_file_name, const wclass_t num_classes); 7 | 8 | #endif // INCLUDE_HEADER 9 | -------------------------------------------------------------------------------- /src/clustercat-io.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "clustercat.h" 4 | #include "clustercat-data.h" 5 | #include "clustercat-array.h" 6 | #include "clustercat-io.h" 7 | 8 | struct_model_metadata process_input(const struct cmd_args cmd_args, FILE *file, struct_map_word ** initial_word_map, struct_map_bigram ** initial_bigram_map, size_t *memusage) { 9 | struct_model_metadata model_metadata = {0}; 10 | map_update_count(initial_word_map, UNKNOWN_WORD, 0, 0); // initialize entry for , , and 11 | map_update_count(initial_word_map, "", 0, 1); 12 | map_update_count(initial_word_map, "", 0, 2); 13 | const word_id_t unk_id = map_find_id(initial_word_map, UNKNOWN_WORD, 0); 14 | const word_id_t start_id = map_find_id(initial_word_map, "", 1); 15 | const word_id_t end_id = map_find_id(initial_word_map, "", 2); 16 | const size_t sizeof_struct_map_word = sizeof(struct_map_word); 17 | const size_t sizeof_struct_map_bigram = sizeof(struct_map_bigram); 18 | model_metadata.type_count = 3; // start with , , and , and . 
19 | 
20 | // n-gram input 
21 | if (cmd_args.ngram_input) { 
22 | char line[STDIN_SENT_MAX_CHARS]; 
23 | register unsigned int strlen_line = 0; 
24 | register unsigned long line_num = 1; 
25 | register char * count_split_pos = NULL; 
26 | register char * word_split_pos = NULL; 
27 | register unsigned long count = 0; 
28 | 
29 | while (!feof(file)) { 
30 | if (! fgets(line, STDIN_SENT_MAX_CHARS, file)) 
31 | break; 
32 | if (*line == '\n') // ignore empty lines 
33 | continue; 
34 | strlen_line = strlen(line); 
35 | if (strlen_line == STDIN_SENT_MAX_CHARS-1) 
36 | fprintf(stderr, "\n%s: Warning: Input line too long, at buffer line %lu. The full line was:\n%s\n", argv_0_basename, line_num, line); 
37 | line[strlen_line-1] = '\0'; // rm newline 
38 | 
39 | // Split words from counts 
40 | count_split_pos = strchr(line, '\t'); 
41 | if (count_split_pos == NULL) { // check before dereferencing 
42 | fprintf(stderr, "\n%s: Warning: Malformed n-gram input line number %lu. The line was:\n%s\n", argv_0_basename, line_num, line); fflush(stderr); 
43 | } else { 
44 | *count_split_pos = '\0'; 
45 | count = strtoul(count_split_pos+1, NULL, 10); 
46 | } 
47 | 
48 | // Try to split word1 from word2 
49 | word_split_pos = strchr(line, ' '); 
50 | 
51 | if (word_split_pos) { // Line has bigrams 
52 | *word_split_pos = '\0'; 
53 | 
54 | // Lookup each word 
55 | const word_id_t w1 = map_find_id(initial_word_map, line, unk_id); 
56 | const word_id_t w2 = map_find_id(initial_word_map, word_split_pos+1, unk_id); 
57 | if (w1 == unk_id || w2 == unk_id) // Unseen word(s) in bigram 
58 | fprintf(stderr, "%s: Warning: Unseen word(s) in bigram '%s %s' on input line %lu will be assigned to '%s'. Otherwise, include in unigram counts first.\n", argv_0_basename, line, word_split_pos+1, line_num, UNKNOWN_WORD); 
59 | 
60 | // Form bigram 
61 | const struct_word_bigram bigram = {w1, w2}; 
62 | 
63 | // Update bigram count 
64 | if (map_update_bigram(initial_bigram_map, &bigram, count)) // update the w1+w2 bigram count in the bigram map 
65 | *memusage += sizeof_struct_map_bigram; 
66 | 
67 | } else { // Line has unigrams 
68 | if (model_metadata.type_count == map_update_count(initial_word_map, line, count, model_metadata.type_count)) { // <unk>'s word_id is set to 0. 
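/* Added note: map_update_count() returns the word's existing id, or the
   candidate id we passed in (type_count) for a previously unseen word, so
   equality here means a new vocabulary entry was just created. */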
69 | model_metadata.type_count++; 
70 | *memusage += sizeof_struct_map_word; 
71 | } 
72 | 
73 | } 
74 | 
75 | //if (word_split_pos) // line could be unigram count 
76 | // printf("w1=<<%s>>; w2=<<%s>>; count=<<%s>>==%lu\n", line, word_split_pos+1, count_split_pos+1, count); 
77 | //else 
78 | // printf("w1=<<%s>>; count=<<%s>>==%lu\n", line, count_split_pos+1, count); 
79 | //fflush(stdout); 
80 | 
81 | line_num++; 
82 | } 
83 | 
84 | 
85 | // Normal text input 
86 | } else { 
87 | char curr_word[MAX_WORD_LEN + 1]; curr_word[MAX_WORD_LEN] = '\0'; 
88 | register unsigned int chars_in_sent = 0; 
89 | register int ch = 0; 
90 | unsigned int curr_word_pos = 0; 
91 | unsigned int prev_word_id = start_id; 
92 | 
93 | while (!feof(file)) { 
94 | ch = getc(file); 
95 | chars_in_sent++; 
96 | //printf("«%c» ", ch); fflush(stdout); 
97 | if (ch == ' ' || ch == '\t' || ch == '\n') { // end of a word 
98 | 
99 | if (chars_in_sent > STDIN_SENT_MAX_CHARS) { // Line too long 
100 | curr_word_pos = 0; 
101 | curr_word[0] = '\0'; // truncate word 
102 | } else { 
103 | curr_word[curr_word_pos] = '\0'; // terminate word 
104 | } 
105 | 
106 | //printf("chars_in_sent=%u; max_chars=%u; curr_word=%s\n", chars_in_sent, STDIN_SENT_MAX_CHARS, curr_word); fflush(stdout); 
107 | 
108 | if (!strncmp(curr_word, "", 1)) { // ignore empty words, due to leading, trailing, and multiple spaces 
109 | //printf("skipping empty word; ch=«%c»\n", ch); fflush(stdout); 
110 | if (ch == '\n') { // trailing spaces require more stuff to do 
111 | const struct_word_bigram bigram = {prev_word_id, end_id}; 
112 | if (map_increment_bigram(initial_bigram_map, &bigram)) // increment previous+</s> bigram in bigram map 
113 | *memusage += sizeof_struct_map_bigram; 
114 | chars_in_sent = 0; 
115 | prev_word_id = start_id; 
116 | model_metadata.line_count++; 
117 | } 
118 | continue; 
119 | } 
120 | //printf("curr_word=%s, prev_id=%u\n", curr_word, prev_word_id); fflush(stdout); 
121 | model_metadata.token_count++; 
122 | curr_word_pos = 0; 
123 | // increment current word in word map 
124 | const word_id_t curr_word_id = map_increment_count(initial_word_map, curr_word, model_metadata.type_count); // <unk>'s word_id is set to 0. 
125 | 
126 | if (curr_word_id == model_metadata.type_count) { // previous call to map_increment_count() had a new word 
127 | model_metadata.type_count++; 
128 | *memusage += sizeof_struct_map_word; 
129 | } 
130 | 
131 | // increment previous+current bigram in bigram map 
132 | const struct_word_bigram bigram = {prev_word_id, curr_word_id}; 
133 | //printf("{%u,%u}\n", prev_word_id, curr_word_id); fflush(stdout); 
134 | if (map_increment_bigram(initial_bigram_map, &bigram)) // true if bigram is new 
135 | *memusage += sizeof_struct_map_bigram; 
136 | 
137 | //printf("process_input(): curr_word=<<%s>>; curr_word_id=%u, prev_word_id=%u\n", curr_word, curr_word_id, prev_word_id); fflush(stdout); 
138 | if (ch == '\n') { // end of line 
139 | const struct_word_bigram bigram = {curr_word_id, end_id}; 
140 | if (map_increment_bigram(initial_bigram_map, &bigram)) // increment current+</s> bigram in bigram map 
141 | *memusage += sizeof_struct_map_bigram; 
142 | chars_in_sent = 0; 
143 | prev_word_id = start_id; 
144 | model_metadata.line_count++; 
145 | } else { 
146 | prev_word_id = curr_word_id; 
147 | } 
148 | 
149 | } else { // normal character; within a word 
150 | if (curr_word_pos > MAX_WORD_LEN) // word is too long; do nothing until space or newline 
151 | continue; 
152 | else 
153 | curr_word[curr_word_pos++] = ch; 
154 | } 
155 | } 
156 | } 
157 | 
158 | // Set counts of <s> and </s> once, based on line_count 
159 | map_update_count(initial_word_map, "<s>", model_metadata.line_count, 1); 
160 | map_update_count(initial_word_map, "</s>", model_metadata.line_count, 2); 
161 | return model_metadata; 
162 | } 
163 | 
-------------------------------------------------------------------------------- /src/clustercat-io.h: -------------------------------------------------------------------------------- 
1 | #ifndef INCLUDE_CLUSTERCAT_IO 
2 | #define INCLUDE_CLUSTERCAT_IO 
3 | 
4 | #include "clustercat.h" 
5 | #include "clustercat-data.h" 
6 | 
7 | // Import 
8 | struct_model_metadata process_input(const struct cmd_args cmd_args, FILE *file, struct_map_word ** initial_word_map, struct_map_bigram ** initial_bigram_map, size_t *memusage); 
9 | 
10 | #endif // INCLUDE_HEADER 
11 | 
-------------------------------------------------------------------------------- /src/clustercat-map.c: -------------------------------------------------------------------------------- 
1 | #include "clustercat-map.h" 
2 | 
3 | inline bool map_increment_bigram(struct_map_bigram **map, const struct_word_bigram * bigram) { 
4 | struct_map_bigram *local_s; 
5 | HASH_FIND(hh, *map, bigram, sizeof(struct_word_bigram), local_s); // id already in the hash? 
6 | if (local_s == NULL) { 
7 | local_s = (struct_map_bigram *)malloc(sizeof(struct_map_bigram)); 
8 | //memcpy(local_s->key, bigram, sizeof(struct_word_bigram)); 
9 | local_s->key = *bigram; 
10 | local_s->count = 1; 
11 | HASH_ADD(hh, *map, key, sizeof(struct_word_bigram), local_s); 
12 | return true; 
13 | } else { 
14 | (local_s->count)++; 
15 | return false; 
16 | } 
17 | } 
18 | 
19 | inline bool map_update_bigram(struct_map_bigram **map, const struct_word_bigram * bigram, const word_bigram_count_t count) { 
20 | struct_map_bigram *local_s; 
21 | HASH_FIND(hh, *map, bigram, sizeof(struct_word_bigram), local_s); // id already in the hash? 
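/* Added note: these bigram maps are keyed on the entire struct_word_bigram
   value (a pair of word ids), so HASH_FIND/HASH_ADD are passed
   sizeof(struct_word_bigram) as the key length and compare keys bytewise. */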
22 | if (local_s == NULL) { 23 | local_s = (struct_map_bigram *)malloc(sizeof(struct_map_bigram)); 24 | //memcpy(local_s->key, bigram, sizeof(struct_word_bigram)); 25 | local_s->key = *bigram; 26 | local_s->count = count; 27 | HASH_ADD(hh, *map, key, sizeof(struct_word_bigram), local_s); 28 | return true; 29 | } else { 30 | local_s->count += count; 31 | return false; 32 | } 33 | } 34 | 35 | void map_print_bigrams(struct_map_bigram **bigram_map, char **word_list) { 36 | struct_map_bigram *entry, *tmp; 37 | struct_word_bigram bigram_key; 38 | word_id_t w_1, w_2; 39 | word_bigram_count_t count; 40 | 41 | printf("bigram_map:\n"); 42 | HASH_ITER(hh, *bigram_map, entry, tmp) { 43 | count = entry->count; 44 | bigram_key = entry->key; 45 | w_1 = bigram_key.word_1; 46 | w_2 = bigram_key.word_2; 47 | if (w_1 == (word_id_t)-1 || w_2 == (word_id_t)-1) // Don't print dummy values 48 | continue; 49 | printf(" {%s=%u, %s=%u}: #=%u\n", word_list[w_1], w_1, word_list[w_2], w_2, count); 50 | //printf(" {%u, %u}: #=%u\n", w_1, w_2, count); fflush(stdout); 51 | } 52 | printf("\n"); fflush(stdout); 53 | } 54 | 55 | void remap_and_rev_bigram_map(struct_map_bigram ** initial_bigram_map, struct_map_bigram ** new_bigram_map, struct_map_bigram ** new_bigram_map_rev, word_id_t * restrict word_id_remap, const word_id_t real_unk_id) { 56 | // Iterates through initial bigram hash map and builds a new hash map based on the mapping of old word id's to new ids. Alongside this, it also builds a reversed counterpart. 57 | struct_map_bigram *entry, *tmp; 58 | struct_word_bigram orig_bigram, new_bigram, new_bigram_rev; 59 | word_id_t w_1, w_2; 60 | word_bigram_count_t count; 61 | //printf("initial_bigram_map hash_count=%u\n", HASH_COUNT(initial_bigram_map)); 62 | //printf("word_id_remap71: [%u,%u,%u,%u,%u,%u,...]\n", word_id_remap[0], word_id_remap[1], word_id_remap[2], word_id_remap[3], word_id_remap[4], word_id_remap[5]); 63 | 64 | HASH_ITER(hh, *initial_bigram_map, entry, tmp) { 65 | count = entry->count; 66 | orig_bigram = entry->key; 67 | w_1 = word_id_remap[orig_bigram.word_1]; 68 | w_2 = word_id_remap[orig_bigram.word_2]; 69 | if (w_1 == (word_id_t) -1) // reassign temporary placeholder unk_id to final unk_id 70 | w_1 = real_unk_id; 71 | if (w_2 == (word_id_t) -1) 72 | w_2 = real_unk_id; 73 | new_bigram = (struct_word_bigram) {w_1, w_2}; 74 | new_bigram_rev = (struct_word_bigram) {w_2, w_1}; 75 | //printf("remap_and_rev_bigram_map: count=%u, orig_w_1=%u, new_w_1=%u, orig_w_2=%u, new_w_2=%u\n", count, orig_bigram.word_1, w_1, orig_bigram.word_2, w_2); fflush(stdout); 76 | 77 | //#pragma omp parallel sections // Both bigram listing and reverse bigram listing can be done in parallel 78 | { 79 | //#pragma omp section 80 | { map_update_bigram(new_bigram_map, &new_bigram, count); } 81 | //const word_bigram_count_t bigram_count = map_update_bigram(&new_bigram_map, &new_bigram, count); 82 | //printf("map_update_bigram: {%u,%u} += %u; now %u\n", new_bigram.word_1, new_bigram.word_2, count, bigram_count); 83 | //#pragma omp section 84 | { map_update_bigram(new_bigram_map_rev, &new_bigram_rev, count); } 85 | } 86 | } 87 | } 88 | 89 | inline void map_add_entry(struct_map_word **map, char * restrict entry_key, const word_count_t count) { // Based on uthash's docs 90 | struct_map_word *local_s; 91 | 92 | //HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash? 
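/* Contract sketch: with the HASH_FIND_STR existence check above commented
 * out, map_add_entry() assumes entry_key is not already in the map; adding a
 * duplicate key would leave two entries, only one of which HASH_FIND can
 * reach, so callers must guarantee the key is new. */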
93 | //if (local_s == NULL) { 94 | local_s = (struct_map_word *)malloc(sizeof(struct_map_word)); 95 | unsigned short strlen_entry_key = strlen(entry_key); 96 | local_s->key = malloc(strlen_entry_key + 1); 97 | strcpy(local_s->key, entry_key); 98 | HASH_ADD_KEYPTR(hh, *map, local_s->key, strlen_entry_key, local_s); 99 | //} 100 | local_s->count = count; 101 | } 102 | 103 | inline void map_add_class(struct_map_word_class **map, const char * restrict entry_key, const unsigned long word_count, const wclass_t entry_class) { 104 | struct_map_word_class *local_s; 105 | 106 | //HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash? 107 | //if (local_s == NULL) { 108 | local_s = (struct_map_word_class *)malloc(sizeof(struct_map_word_class)); 109 | strncpy(local_s->key, entry_key, KEYLEN-1); local_s->key[KEYLEN-1] = '\0'; // strncpy() doesn't terminate if entry_key fills the buffer 110 | HASH_ADD_STR(*map, key, local_s); 111 | //} 112 | local_s->word_count = word_count; 113 | local_s->class = entry_class; 114 | } 115 | 116 | inline void map_update_class(struct_map_word_class **map, const char * restrict entry_key, const unsigned short entry_class) { 117 | struct_map_word_class *local_s; 118 | 119 | HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash? 120 | if (local_s == NULL) { 121 | local_s = (struct_map_word_class *)malloc(sizeof(struct_map_word_class)); 122 | strncpy(local_s->key, entry_key, KEYLEN-1); local_s->key[KEYLEN-1] = '\0'; // strncpy() doesn't terminate if entry_key fills the buffer 123 | HASH_ADD_STR(*map, key, local_s); 124 | } 125 | local_s->class = entry_class; 126 | } 127 | 128 | inline void map_set_word_id(struct_map_word **map, const char * restrict entry_key, const word_id_t word_id) { 129 | struct_map_word *local_s; // local_s->word_id uninitialized here; assign value after filtering 130 | 131 | #pragma omp critical (map_set_word_id_lookup) 132 | { 133 | HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash? 134 | } 135 | if (local_s == NULL) { 136 | printf("Error: map_set_word_id(): word '%s' should already be in word_map\n", entry_key); // Shouldn't happen 137 | exit(5); 138 | } 139 | #pragma omp critical (map_set_word_id_assignment) 140 | { local_s->word_id = word_id; } 141 | } 142 | 143 | inline word_id_t map_increment_count(struct_map_word **map, const char * restrict entry_key, const word_id_t word_id) { // Based on uthash's docs 144 | struct_map_word *local_s; // local_s->word_id uninitialized here; assign value after filtering 145 | 146 | #pragma omp critical (map_increment_count_lookup) 147 | { 148 | HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash?
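/* Thread-safety note: the lookup above and the possible insertion below
 * share the one named critical section (map_increment_count_lookup), so two
 * threads cannot both miss on the same key and insert it twice; the count
 * increment afterwards is serialized by a separate named critical section. */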
149 | if (local_s == NULL) { 150 | local_s = (struct_map_word *)malloc(sizeof(struct_map_word)); 151 | local_s->count = 0; 152 | local_s->word_id = word_id; 153 | unsigned short strlen_entry_key = strlen(entry_key); 154 | local_s->key = malloc(strlen_entry_key + 1); 155 | strcpy(local_s->key, entry_key); 156 | HASH_ADD_KEYPTR(hh, *map, local_s->key, strlen_entry_key, local_s); 157 | } 158 | } 159 | #pragma omp critical (map_increment_count_increment) 160 | { ++local_s->count; } 161 | //printf("map: count of %s is now %u\n", entry_key, local_s->count); 162 | return local_s->word_id; 163 | } 164 | 165 | inline wclass_count_t map_increment_count_fixed_width(struct_map_class **map, const wclass_t entry_key[const]) { // Based on uthash's docs 166 | struct_map_class *local_s; 167 | size_t sizeof_key = sizeof(wclass_t) * CLASSLEN; 168 | //printf("map++: sizeof_key=%zu, CLASSLEN=%u, cls_entry=[%hu,%hu,%hu,%hu]\n", sizeof_key, CLASSLEN, entry_key[0], entry_key[1], entry_key[2], entry_key[3]); 169 | 170 | //#pragma omp critical // not needed since each thread gets its own class_map 171 | { 172 | //printf("***41***: sizeof_key=%zu, sizeof(wclass_t)=%zu, CLASSLEN=%u, key=<%u,%u,%u,%u>\n", sizeof_key, sizeof(wclass_t), CLASSLEN, entry_key[0], entry_key[1], entry_key[2], entry_key[3]); fflush(stdout); 173 | HASH_FIND(hh, *map, entry_key, sizeof_key, local_s); // id already in the hash? 174 | if (local_s == NULL) { 175 | local_s = (struct_map_class *)malloc(sizeof(struct_map_class)); 176 | local_s->count = 0; 177 | memcpy(local_s->key, entry_key, sizeof_key); 178 | HASH_ADD(hh, *map, key, sizeof_key, local_s); 179 | } 180 | //printf("\t***42***: count: %u\n", local_s->count); fflush(stdout); 181 | } 182 | #pragma omp critical (map_increment_count_fixed_width_increment) 183 | { ++local_s->count; } 184 | //printf("map: count of [%hu,%hu,%hu,%hu] is now %u\n", entry_key[0],entry_key[1],entry_key[2],entry_key[3], local_s->count); 185 | return local_s->count; 186 | } 187 | 188 | inline wclass_count_t map_find_count_fixed_width(struct_map_class *map[const], const wclass_t entry_key[const]) { // Based on uthash's docs 189 | struct_map_class *local_s; 190 | size_t sizeof_key = sizeof(wclass_t) * CLASSLEN; 191 | wclass_count_t local_count = 0; 192 | 193 | HASH_FIND(hh, *map, entry_key, sizeof_key, local_s); // id already in the hash? 194 | if (local_s != NULL) { // Deal with OOV 195 | local_count = local_s->count; 196 | } 197 | //printf("map: count=%u for cls_entry=[%hu,%hu,%hu,%hu]\n", local_count, entry_key[0], entry_key[1], entry_key[2], entry_key[3]); 198 | return local_count; 199 | } 200 | 201 | inline word_id_t map_update_count(struct_map_word **map, const char * restrict entry_key, const word_count_t count, const word_id_t word_id) { // Based on uthash's docs 202 | struct_map_word *local_s; 203 | 204 | #pragma omp critical 205 | { 206 | HASH_FIND_STR(*map, entry_key, local_s); // id already in the hash? 
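/* Usage sketch (mirroring the calls in clustercat.c): the unknown-word and
 * sentence-boundary types are seeded with fixed ids before any input is read:
 *   map_update_count(&word_map, UNKNOWN_WORD, 0, 0);
 *   map_update_count(&word_map, "<s>", 0, 1);
 *   map_update_count(&word_map, "</s>", 0, 2);
 * A later call with the same key only adds to the stored count; the word_id
 * argument is ignored for keys that already exist. */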
207 | if (local_s == NULL) { 208 | local_s = (struct_map_word *)malloc(sizeof(struct_map_word)); 209 | local_s->count = count; 210 | local_s->word_id = word_id; 211 | unsigned short strlen_entry_key = strlen(entry_key); 212 | local_s->key = malloc(strlen_entry_key + 1); 213 | strcpy(local_s->key, entry_key); 214 | HASH_ADD_KEYPTR(hh, *map, local_s->key, strlen_entry_key, local_s); 215 | } else { 216 | local_s->count += count; 217 | } 218 | } 219 | return local_s->word_id; 220 | } 221 | 222 | inline word_count_t map_find_count(struct_map_word *map[const], const char * restrict entry_key) { // Based on uthash's docs 223 | struct_map_word *local_s; 224 | word_count_t local_count = 0; 225 | 226 | HASH_FIND_STR(*map, entry_key, local_s); // local_s: output pointer 227 | if (local_s != NULL) { // Deal with OOV 228 | local_count = local_s->count; 229 | } 230 | return local_count; 231 | } 232 | 233 | inline word_id_t map_find_id(struct_map_word *map[const], const char * restrict entry_key, const word_id_t unknown_id) { // Based on uthash's docs 234 | struct_map_word *local_s; 235 | word_id_t local_id = unknown_id; 236 | 237 | HASH_FIND_STR(*map, entry_key, local_s); 238 | if (local_s != NULL) { // Deal with OOV 239 | local_id = local_s->word_id; 240 | } 241 | return local_id; 242 | } 243 | 244 | struct_map_word map_find_entry(struct_map_word *map[const], const char * restrict entry_key) { // Based on uthash's docs 245 | struct_map_word *local_s; 246 | 247 | HASH_FIND_STR(*map, entry_key, local_s); 248 | return *local_s; 249 | } 250 | 251 | inline wclass_t get_class(struct_map_word_class *map[const], const char * restrict entry_key, const wclass_t unk) { 252 | struct_map_word_class *local_s; 253 | 254 | HASH_FIND_STR(*map, entry_key, local_s); // local_s: output pointer 255 | if (local_s != NULL) { // Word is found 256 | return local_s->class; 257 | } else { // Word is not found 258 | return unk; 259 | } 260 | } 261 | 262 | word_id_t get_keys(struct_map_word *map[const], char *keys[]) { 263 | struct_map_word *entry, *tmp; 264 | word_id_t number_of_keys = 0; 265 | 266 | HASH_ITER(hh, *map, entry, tmp) { 267 | // Build-up array of keys 268 | unsigned short wlen = strlen(entry->key); 269 | keys[number_of_keys] = (char *) malloc(wlen + 1); 270 | strcpy(keys[number_of_keys], entry->key); 271 | //printf("key=%s, i=%lu, count=%u\n", entry->key, (unsigned long)number_of_keys, entry->count); 272 | number_of_keys++; 273 | } 274 | return number_of_keys; 275 | } 276 | 277 | word_id_t get_ids(struct_map_word *map[const], word_id_t word_ids[restrict]) { // most useful if map is already sorted by count; then you can directly map from old id to new id. 
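/* Example (a sketch): if the map holds entries whose old word_id's are 5, 0, 3
 * in (count-sorted) iteration order, this fills word_ids[5]=0, word_ids[0]=1,
 * word_ids[3]=2; that is, word_ids[] maps each old id to its new, rank-based id. */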
278 | struct_map_word *entry, *tmp; 279 | word_id_t number_of_keys = 0; // 0-2 are reserved for <unk>, <s>, and </s> 280 | 281 | HASH_ITER(hh, *map, entry, tmp) { 282 | //word_ids[number_of_keys] = entry->word_id; // Build-up array of word_id's, from new id to old one 283 | const word_id_t word_id = entry->word_id; 284 | //if (word_id < 3) // don't change id's for <unk>, <s>, or </s> 285 | // continue; 286 | word_ids[word_id] = number_of_keys; // Build-up array of word_id's, from old id to new one 287 | //printf("get_ids: old_id=%u\n", word_id); fflush(stdout); 288 | number_of_keys++; 289 | } 290 | return number_of_keys; 291 | } 292 | 293 | void delete_entry(struct_map_word **map, struct_map_word *entry) { // Based on uthash's docs 294 | HASH_DEL(*map, entry); // entry: pointer to deletee 295 | free(entry->key); // key is a malloc'd string 296 | free(entry); 297 | } 298 | 299 | void delete_all(struct_map_word **map) { 300 | struct_map_word *current_entry, *tmp; 301 | 302 | HASH_ITER(hh, *map, current_entry, tmp) { // Based on uthash's docs 303 | HASH_DEL(*map, current_entry); // delete it (map advances to next) 304 | free(current_entry); // free it 305 | } 306 | } 307 | 308 | void delete_all_class(struct_map_class **map) { 309 | struct_map_class *current_entry, *tmp; 310 | 311 | HASH_ITER(hh, *map, current_entry, tmp) { // Based on uthash's docs 312 | HASH_DEL(*map, current_entry); // delete it (map advances to next) 313 | free(current_entry); // free it 314 | } 315 | } 316 | 317 | void delete_all_bigram(struct_map_bigram **map) { 318 | struct_map_bigram *current_entry, *tmp; 319 | 320 | HASH_ITER(hh, *map, current_entry, tmp) { // Based on uthash's docs 321 | HASH_DEL(*map, current_entry); // delete it (map advances to next) 322 | free(current_entry); // free it 323 | } 324 | } 325 | 326 | void print_words_and_classes(FILE * out_file, word_id_t type_count, char **word_list, const word_count_t word_counts[const], const wclass_t word2class[const], const int class_offset, const bool print_freqs) { 327 | struct_map_word_class *map = NULL; 328 | 329 | for (word_id_t word_id = 0; word_id < type_count; word_id++) { // Populate new word2class_map, so we can do fun stuff like primary- and secondary-sort easily 330 | //printf("adding %s=%hu to temp word2class_map\n", word_list[word_id], word2class[word_id]); fflush(stdout); 331 | map_add_class(&map, word_list[word_id], (unsigned long)word_counts[word_id], word2class[word_id]); 332 | } 333 | 334 | sort_by_key(&map); // Tertiary sort, alphabetically by key 335 | word_class_sort_by_count(&map); // Secondary sort, by count 336 | sort_by_class(&map); // Primary sort, numerically by class 337 | 338 | struct_map_word_class *s, *tmp; 339 | HASH_ITER(hh, map, s, tmp) { // Based on uthash's docs; tmp keeps the iteration safe while entries are deleted 340 | fprintf(out_file, "%s\t%li", s->key, (long)(s->class) + class_offset); 341 | if (print_freqs) 342 | fprintf(out_file, "\t%lu", (long unsigned)(s->word_count)); 343 | fprintf(out_file, "\n"); 344 | HASH_DEL(map, s); // delete it (map advances to next) 345 | free(s); // free the whole struct; its key is a fixed-width array inside it, not separately malloc'd 346 | //fprintf(stderr, "49.11: next=%zu\n", (struct_map_word_class *)(s->hh.next)); fflush(stderr); 347 | } 348 | } 349 | 350 | int count_sort(struct_map_word *a, struct_map_word *b) { // Based on uthash's docs 351 | return (b->count - a->count); // sort descending: most frequent to least frequent 352 | } 353 | 354 | void sort_by_count(struct_map_word **map) { // Based on uthash's docs 355 | HASH_SORT(*map, count_sort); 356 | } 357 | 358 | int id_sort(struct_map_word *a, struct_map_word *b) { 359 |
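/* Note on these subtraction-based comparators (applies to count_sort() above
 * as well): the unsigned operands wrap around and the result is then
 * converted to int, which orders correctly only while the two values differ
 * by less than INT_MAX. That holds for the id and count ranges used here. */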
return (a->word_id - b->word_id); // sort ascending 360 | } 361 | 362 | void sort_by_id(struct_map_word **map) { 363 | HASH_SORT(*map, id_sort); 364 | } 365 | 366 | int word_class_count_sort(struct_map_word_class *a, struct_map_word_class *b) { 367 | return (b->word_count - a->word_count); // sort descending: most frequent to least frequent 368 | } 369 | 370 | void word_class_sort_by_count(struct_map_word_class **map) { 371 | HASH_SORT(*map, word_class_count_sort); 372 | } 373 | 374 | int key_sort(struct_map_word_class *a, struct_map_word_class *b) { 375 | return strcmp(a->key, b->key); 376 | } 377 | 378 | void sort_by_key(struct_map_word_class **map) { 379 | HASH_SORT(*map, key_sort); 380 | } 381 | 382 | int class_sort(struct_map_word_class *a, struct_map_word_class *b) { // Based on uthash's docs 383 | return (a->class - b->class); 384 | } 385 | 386 | void sort_by_class(struct_map_word_class **map) { 387 | HASH_SORT(*map, class_sort); 388 | } 389 | 390 | inline int bigram_sort_word_1(struct_map_bigram *a, struct_map_bigram *b) { // Based on uthash's docs 391 | return ((a->key).word_1 - (b->key).word_1); 392 | } 393 | 394 | inline int bigram_sort_word_2(struct_map_bigram *a, struct_map_bigram *b) { // Based on uthash's docs 395 | return ((a->key).word_2 - (b->key).word_2); 396 | } 397 | 398 | void sort_bigrams(struct_map_bigram **map) { 399 | HASH_SORT(*map, bigram_sort_word_2); 400 | //HASH_SORT(*map, bigram_sort_word_1); 401 | } 402 | 403 | unsigned long map_count(struct_map_word *map[const]) { 404 | return HASH_COUNT(*map); 405 | } 406 | 407 | unsigned long map_print_entries(struct_map_word **map, const char * restrict prefix, const char sep_char, const word_count_t min_count) { 408 | struct_map_word *entry, *tmp; 409 | unsigned long number_of_entries = 0; 410 | 411 | HASH_ITER(hh, *map, entry, tmp) { 412 | if (entry->count >= min_count) { 413 | printf("%s%s%c%lu\n", prefix, entry->key, sep_char, (unsigned long)entry->count); 414 | number_of_entries++; 415 | } 416 | } 417 | return number_of_entries; 418 | } 419 | -------------------------------------------------------------------------------- /src/clustercat-map.h: 1 | #ifndef INCLUDE_CLUSTERCAT_MAP_HEADER 2 | #define INCLUDE_CLUSTERCAT_MAP_HEADER 3 | 4 | #include <stdio.h> // FILE 5 | #include <stdbool.h> // bool 6 | #include "uthash.h" 7 | 8 | #ifdef ATA_STORE_KHASH 9 | #include "khash.h" 10 | KHASH_MAP_INIT_STR(struct_khash_float, float); 11 | #endif 12 | 13 | // Defaults 14 | #define KEYLEN 80 15 | #define CLASSLEN 3 // Longest possible class ngram to store 16 | typedef unsigned short wclass_t; // Max number of word classes 17 | typedef unsigned int wclass_count_t; // Max count of a given word class 18 | typedef unsigned int word_id_t; // Max number of words 19 | typedef unsigned int word_count_t; // Max count of a given word 20 | typedef unsigned int word_bigram_count_t; // Max count of a given word bigram 21 | typedef unsigned int class_bigram_count_t; // Max count of a given class bigram 22 | typedef unsigned int word_class_count_t; // Max count of a given <word,class> tuple 23 | 24 | typedef struct { 25 | word_id_t word_1; 26 | word_id_t word_2; 27 | } struct_word_bigram; 28 | 29 | 30 | typedef struct { // We need an O(1) map that we can iterate over later 31 | struct_word_bigram key; 32 | word_bigram_count_t count; 33 | UT_hash_handle hh; // makes this structure hashable 34 | } struct_map_bigram; 35 | 36 | typedef struct { 37 | char * restrict key; 38 | word_count_t count; 39 | word_id_t
word_id; 40 | UT_hash_handle hh; // makes this structure hashable 41 | } struct_map_word; 42 | 43 | typedef struct { // Maps a class to its count 44 | wclass_t key[CLASSLEN]; 45 | wclass_count_t count; 46 | UT_hash_handle hh; // makes this structure hashable 47 | } struct_map_class; 48 | 49 | typedef struct { // Maps a word to its class 50 | char key[KEYLEN]; 51 | unsigned long word_count; 52 | wclass_t class; 53 | UT_hash_handle hh; // makes this structure hashable 54 | } struct_map_word_class; 55 | 56 | void map_add_entry(struct_map_word **map, char * restrict entry_key, const word_count_t count); 57 | 58 | void map_add_class(struct_map_word_class **map, const char * restrict entry_key, const unsigned long word_count, const wclass_t entry_class); 59 | 60 | void map_update_class(struct_map_word_class **map, const char * restrict entry_key, const wclass_t entry_class); 61 | 62 | void map_set_word_id(struct_map_word **map, const char * restrict entry_key, const word_id_t word_id); 63 | 64 | word_id_t map_increment_count(struct_map_word **map, const char * restrict entry_key, const word_id_t word_id); 65 | 66 | wclass_count_t map_increment_count_fixed_width(struct_map_class **map, const wclass_t entry_key[const]); 67 | 68 | bool map_increment_bigram(struct_map_bigram **map, const struct_word_bigram * bigram); 69 | bool map_update_bigram(struct_map_bigram **map, const struct_word_bigram * bigram, const word_bigram_count_t count); 70 | void map_print_bigrams(struct_map_bigram **map, char **word_list); 71 | void remap_and_rev_bigram_map(struct_map_bigram ** initial_bigram_map, struct_map_bigram ** new_bigram_map, struct_map_bigram ** new_bigram_map_rev, word_id_t * restrict word_id_remap, const word_id_t real_unk_id); 72 | 73 | word_id_t map_update_count(struct_map_word **map, const char * restrict entry_key, const word_count_t count, const word_id_t word_id); 74 | 75 | struct_map_word map_find_entry(struct_map_word *map[const], const char * restrict entry_key); 76 | word_count_t map_find_count(struct_map_word *map[const], const char * restrict entry_key); 77 | wclass_count_t map_find_count_fixed_width(struct_map_class *map[const], const wclass_t entry_key[const]); 78 | 79 | word_id_t map_find_id(struct_map_word *map[const], const char * restrict entry_key, const word_id_t unknown_id); 80 | 81 | wclass_t get_class(struct_map_word_class *map[const], const char * restrict entry_key, const wclass_t unk); 82 | 83 | word_id_t get_keys(struct_map_word *map[const], char *keys[]); 84 | word_id_t get_ids(struct_map_word *map[const], word_id_t word_ids[restrict]); 85 | 86 | void sort_by_class(struct_map_word_class **map); 87 | void sort_by_key(struct_map_word_class **map); 88 | void sort_by_id(struct_map_word **map); 89 | void sort_by_count(struct_map_word **map); 90 | void word_class_sort_by_count(struct_map_word_class **map); 91 | void sort_bigrams(struct_map_bigram **map); 92 | 93 | unsigned long map_count(struct_map_word *map[const]); 94 | 95 | unsigned long map_print_entries(struct_map_word **map, const char * restrict prefix, const char sep_char, const word_count_t min_count); 96 | void print_words_and_classes(FILE * out_file, word_id_t type_count, char **word_list, const word_count_t word_counts[const], const wclass_t word2class[const], const int class_offset, const bool print_freqs); 97 | 98 | void delete_all(struct_map_word **map); 99 | void delete_all_class(struct_map_class **map); 100 | void delete_all_bigram(struct_map_bigram **map); 101 | void delete_entry(struct_map_word **map, 
struct_map_word *entry); 102 | 103 | #endif // INCLUDE_HEADER 104 | -------------------------------------------------------------------------------- /src/clustercat-math.c: 1 | #include "clustercat.h" // Model importing/exporting functions 2 | #include "clustercat-math.h" 3 | 4 | double dot_product(const double probs[const], const double weights[const], int length) { 5 | double sum = 0; 6 | double sum_weights = 0; 7 | length--; 8 | 9 | for (; length >= 0; --length) { 10 | sum_weights += weights[length]; 11 | sum += probs[length] * weights[length]; 12 | //printf("dot_product: sum=%g += probs[%i]=%g * weights[%i]=%g; length=%i;\n", sum, length, probs[length], length, weights[length], length); 13 | } 14 | //printf("dot_product: final sum = %g = prob_sum=%g/weight_sum=%g\n", sum/sum_weights, sum, sum_weights); 15 | return sum_weights ? (sum / sum_weights) : 0.0; 16 | } 17 | 18 | float dot_productf(const float probs[const], const float weights[const], int length) { 19 | float sum = 0; 20 | float sum_weights = 0; 21 | length--; 22 | 23 | for (; length >= 0; --length) { 24 | sum_weights += weights[length]; 25 | sum += probs[length] * weights[length]; 26 | //printf("dot_product: sum=%g += probs[%i]=%g * weights[%i]=%g; length=%i;\n", sum, length, probs[length], length, weights[length], length); 27 | } 28 | //printf("dot_product: final sum = %g = prob_sum=%g/weight_sum=%g\n", sum/sum_weights, sum, sum_weights); 29 | return sum_weights ? (sum / sum_weights) : 0.0; 30 | } 31 | 32 | long int powi(long int base, long int exp) { // Integer exponentiation 33 | long int result = 1; 34 | while (exp--) 35 | result *= base; 36 | return result; 37 | } 38 | 39 | double perplexity(const double log_probs, const unsigned long num_words_queried) { 40 | // Assumes log_probs used log2() 41 | return pow(2, -log_probs / (double)num_words_queried); 42 | } 43 | 44 | -------------------------------------------------------------------------------- /src/clustercat-math.h: 1 | #ifndef INCLUDE_CLUSTERCAT_MATH 2 | #define INCLUDE_CLUSTERCAT_MATH 3 | 4 | double dot_product(const double probs[const], const double weights[const], int length); 5 | float dot_productf(const float probs[const], const float weights[const], int length); 6 | 7 | long int powi(long int base, long int exp); 8 | 9 | double perplexity(const double log_probs, const unsigned long num_words_queried); 10 | 11 | #endif // INCLUDE_HEADER 12 | -------------------------------------------------------------------------------- /src/clustercat-tokenize.c: 1 | #include <string.h> // strcspn(), strncpy() 2 | #include "clustercat-tokenize.h" 3 | 4 | // Simple threadsafe tokenization for plaintext, copying words into **sent_words 5 | // Remember to free using tokenize_simple_free() 6 | sentlen_t tokenize_simple(char * restrict sent_string, char * restrict * restrict sent_words) { 7 | sentlen_t i; 8 | char * restrict pch; 9 | 10 | sent_words[0] = "<s>"; 11 | 12 | for (i = 1, pch = sent_string; i < SENT_LEN_MAX ; i++) { 13 | sentlen_t toklen = strcspn(pch, " \n\t"); 14 | 15 | if (toklen == 0) { // End of sentence 16 | sent_words[i] = "</s>"; 17 | break; 18 | } 19 | 20 | sent_words[i] = malloc(toklen+1); 21 | strncpy(sent_words[i], pch, toklen); // Threadsafe copy doesn't touch original 22 | sent_words[i][toklen] = '\0'; 23 | 24 | pch += toklen+1; 25 | } 26 | 27 | return i; 28
| } 29 | 30 | void tokenize_simple_free(char ** restrict sent_words, sentlen_t length) { 31 | sentlen_t i = 1; 32 | for (; i < length-1; ++i) { // Assumes word_0 is <s> and word_sentlen is </s>, which weren't malloc'd 33 | free(sent_words[i]); 34 | } 35 | free(sent_words); 36 | } 37 | -------------------------------------------------------------------------------- /src/clustercat-tokenize.h: 1 | #ifndef INCLUDE_CLUSTERCAT_TOKENIZE 2 | #define INCLUDE_CLUSTERCAT_TOKENIZE 3 | 4 | #include "clustercat.h" 5 | 6 | sentlen_t tokenize_simple(char * restrict sent_string, char * restrict * restrict sent_words); 7 | void tokenize_simple_free(char ** restrict sent_words, sentlen_t length); 8 | 9 | #endif // INCLUDE_HEADER 10 | -------------------------------------------------------------------------------- /src/clustercat.c: 1 | /** Induces word categories 2 | * By Jon Dehdari, 2014-2016 3 | * Usage: ./clustercat [options] < corpus.tok.txt > classes.tsv 4 | **/ 5 | 6 | #include <limits.h> // UCHAR_MAX, UINT_MAX 7 | #include <float.h> // DBL_MAX, etc. 8 | #include <math.h> // isnan() 9 | #include <time.h> // clock_t, clock(), CLOCKS_PER_SEC 10 | #include <libgen.h> // basename() 11 | #include <locale.h> // OPTIONAL! Comment-out on non-Posix machines, and the function setlocale() in the first line of main() 12 | 13 | #include "clustercat.h" // Model importing/exporting functions 14 | #include "clustercat-array.h" // which_maxf() 15 | #include "clustercat-data.h" 16 | #include "clustercat-cluster.h" // cluster() 17 | #include "clustercat-dbg.h" // for printing out various complex data structures 18 | #include "clustercat-import-class-file.h" // import_class_file() 19 | #include "clustercat-io.h" // process_input() 20 | #include "clustercat-math.h" // perplexity(), powi() 21 | 22 | #define USAGE_LEN 10000 23 | #define LOG2ADD(a,b) (log2(a) + log2(1 + (b) / (a) )) 24 | 25 | // Declarations 26 | void get_usage_string(char * restrict usage_string, int usage_len); 27 | void parse_cmd_args(const int argc, char **argv, char * restrict usage, struct cmd_args *cmd_args); 28 | char * restrict class_algo = NULL; 29 | char * restrict in_train_file_string = NULL; 30 | char * restrict out_file_string = NULL; 31 | char * restrict initial_class_file = NULL; 32 | char * argv_0_basename = NULL; 33 | 34 | struct_map_word *word_map = NULL; // Must initialize to NULL 35 | struct_map_bigram *initial_bigram_map = NULL; // Must initialize to NULL 36 | struct_map_bigram *new_bigram_map = NULL; // Must initialize to NULL 37 | struct_map_bigram *new_bigram_map_rev = NULL; // Must initialize to NULL 38 | char usage[USAGE_LEN]; 39 | size_t memusage = 0; 40 | 41 | 42 | // Defaults 43 | struct cmd_args cmd_args = { 44 | .class_algo = EXCHANGE, 45 | .class_offset = 0, 46 | .forward_lambda = 0.55, 47 | .min_count = 3, // or max(2, floor(N^0.14 - 7)) 48 | .max_array = 2, 49 | .ngram_input = false, 50 | .num_threads = 8, 51 | .num_classes = 0, 52 | .print_freqs = false, 53 | .print_word_vectors = NO_VEC, 54 | .refine = 2, 55 | .rev_alternate = 3, 56 | .tune_cycles = 15, 57 | .unidirectional = false, 58 | .verbose = 0, 59 | }; 60 | 61 | 62 | 63 | int main(int argc, char **argv) { 64 | setlocale(LC_ALL, ""); // Comment-out on non-Posix systems 65 | clock_t time_start = clock(); 66 | time_t time_t_start; 67 | time(&time_t_start); 68 | argv_0_basename = basename(argv[0]); 69 | get_usage_string(usage, USAGE_LEN); // This is a big scary string, so build it elsewhere
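/* Overview (a sketch of the flow below): parse args; seed word_map with
 * <unk>/<s>/</s>; process_input() builds the word and bigram maps; rare words
 * are folded into <unk> and ids remapped; bigram and <word><class> count
 * structures are built; cluster() runs the clustering; the final word-to-class
 * mapping (or word vectors) is printed. */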
70 | 71 | //printf("sizeof(cmd_args)=%zd\n", sizeof(cmd_args)); 72 | parse_cmd_args(argc, argv, usage, &cmd_args); 73 | 74 | if (cmd_args.class_algo == EXCHANGE || cmd_args.class_algo == EXCHANGE_BROWN) 75 | memusage += sizeof(float) * ENTROPY_TERMS_MAX; // We'll build the precomputed entropy terms after reporting memusage 76 | 77 | struct_model_metadata global_metadata; 78 | 79 | // The list of unique words should always include <s>, unknown word, and </s> 80 | map_update_count(&word_map, UNKNOWN_WORD, 0, 0); // Should always be first 81 | map_update_count(&word_map, "<s>", 0, 1); 82 | map_update_count(&word_map, "</s>", 0, 2); 83 | 84 | // Open input 85 | FILE *in_train_file = stdin; 86 | if (in_train_file_string) 87 | in_train_file = fopen(in_train_file_string, "r"); 88 | if (in_train_file == NULL) { 89 | fprintf(stderr, "%s: Error: Unable to open input file %s\n", argv_0_basename, in_train_file_string); fflush(stderr); 90 | exit(15); 91 | } 92 | 93 | // Process input sentences 94 | size_t input_memusage = 0; 95 | const struct_model_metadata input_model_metadata = process_input(cmd_args, in_train_file, &word_map, &initial_bigram_map, &input_memusage); 96 | memusage += input_memusage; 97 | fclose(in_train_file); 98 | 99 | clock_t time_input_processed = clock(); 100 | if (cmd_args.verbose >= -1) { 101 | fprintf(stderr, "%s: Corpus processed in %'.2f CPU secs. %'lu lines, %'u types, %'lu tokens, current memusage: %'.1fMB\n", argv_0_basename, (double)(time_input_processed - time_start)/CLOCKS_PER_SEC, input_model_metadata.line_count, input_model_metadata.type_count, input_model_metadata.token_count, (double)memusage / 1048576); fflush(stderr); 102 | } 103 | 104 | global_metadata.token_count = input_model_metadata.token_count; 105 | global_metadata.type_count = map_count(&word_map); 106 | 107 | // Filter out infrequent words, reassign word_id's, and build a mapping from old word_id's to new word_id's 108 | sort_by_count(&word_map); 109 | word_id_t * restrict word_id_remap = calloc(sizeof(word_id_t), input_model_metadata.type_count); 110 | get_ids(&word_map, word_id_remap); 111 | word_id_t number_of_deleted_words = filter_infrequent_words(cmd_args, &global_metadata, &word_map, word_id_remap); 112 | 113 | // Get list of unique words 114 | char * * restrict word_list = (char **)malloc(sizeof(char*) * global_metadata.type_count); 115 | memusage += sizeof(char*) * global_metadata.type_count; 116 | reassign_word_ids(&word_map, word_list, word_id_remap); 117 | get_keys(&word_map, word_list); 118 | sort_by_id(&word_map); 119 | 120 | 121 | // Check or set number of classes 122 | if (cmd_args.num_classes >= global_metadata.type_count) { // User manually set a number of classes that is too high 123 | fprintf(stderr, "%s: Error: Number of classes (%u) is not less than vocabulary size (%u).
Decrease the value of --classes\n", argv_0_basename, cmd_args.num_classes, global_metadata.type_count); fflush(stderr); 124 | exit(3); 125 | } else if (cmd_args.num_classes == 0) { // User did not manually set number of classes at all 126 | cmd_args.num_classes = (wclass_t) (sqrt(global_metadata.type_count) * 1.2); 127 | } 128 | 129 | // Build array of word_counts 130 | word_count_t * restrict word_counts = malloc(sizeof(word_count_t) * global_metadata.type_count); 131 | memusage += sizeof(word_count_t) * global_metadata.type_count; 132 | build_word_count_array(&word_map, word_list, word_counts, global_metadata.type_count); 133 | 134 | // Initialize clusters, and possibly read-in external class file 135 | wclass_t * restrict word2class = malloc(sizeof(wclass_t) * global_metadata.type_count); 136 | memusage += sizeof(wclass_t) * global_metadata.type_count; 137 | init_clusters(cmd_args, global_metadata.type_count, word2class, word_counts, word_list); 138 | if (initial_class_file != NULL) 139 | import_class_file(&word_map, word2class, initial_class_file, cmd_args.num_classes); // Overwrite subset of word mappings, from user-provided initial_class_file 140 | 141 | // Remap word_id's in initial_bigram_map 142 | remap_and_rev_bigram_map(&initial_bigram_map, &new_bigram_map, &new_bigram_map_rev, word_id_remap, map_find_id(&word_map, UNKNOWN_WORD, -1)); 143 | global_metadata.start_sent_id = map_find_id(&word_map, "<s>", -1); // need this for tallying emission probs 144 | global_metadata.end_sent_id = map_find_id(&word_map, "</s>", -1); // need this for tallying emission probs 145 | global_metadata.line_count = map_find_count(&word_map, "<s>"); // Used for calculating perplexity 146 | 147 | if (global_metadata.line_count == 0) { 148 | fprintf(stderr, "%s: Warning: Number of lines is 0. Include <s> and </s> in your ngram counts, or perplexity values will be unreliable.\n", argv_0_basename); fflush(stderr); 149 | } 150 | 151 | //printf("init_bigram_map hash_count=%u\n", HASH_COUNT(initial_bigram_map)); fflush(stdout); 152 | //printf("new_bigram_map hash_count=%u\n", HASH_COUNT(new_bigram_map)); fflush(stdout); 153 | free(word_id_remap); 154 | memusage -= sizeof(word_id_t) * input_model_metadata.type_count; 155 | delete_all(&word_map); // static 156 | delete_all_bigram(&initial_bigram_map); // static 157 | memusage -= input_memusage; 158 | 159 | // Initialize and set word bigram listing 160 | clock_t time_bigram_start = clock(); 161 | size_t bigram_memusage = 0; size_t bigram_rev_memusage = 0; 162 | struct_word_bigram_entry * restrict word_bigrams = NULL; 163 | struct_word_bigram_entry * restrict word_bigrams_rev = NULL; 164 | 165 | if (cmd_args.verbose >= -1) { 166 | fprintf(stderr, "%s: Word bigram listing ... 
", argv_0_basename); fflush(stderr); 167 | } 168 | 169 | #pragma omp parallel sections // Both bigram listing and reverse bigram listing can be done in parallel 170 | { 171 | #pragma omp section 172 | { 173 | //sort_bigrams(&new_bigram_map); // speeds things up later 174 | word_bigrams = calloc(global_metadata.type_count, sizeof(struct_word_bigram_entry)); 175 | memusage += sizeof(struct_word_bigram_entry) * global_metadata.type_count; 176 | bigram_memusage = set_bigram_counts(word_bigrams, new_bigram_map); 177 | // Copy entries in word_counts to struct_word_bigram_entry.headword_count since that struct entry is already loaded when clustering 178 | for (word_id_t word = 0; word < global_metadata.type_count; word++) 179 | word_bigrams[word].headword_count = word_counts[word]; 180 | } 181 | 182 | // Initialize and set *reverse* word bigram listing 183 | #pragma omp section 184 | { 185 | if (cmd_args.rev_alternate) { // Don't bother building this if it won't be used 186 | //sort_bigrams(&new_bigram_map_rev); // speeds things up later 187 | word_bigrams_rev = calloc(global_metadata.type_count, sizeof(struct_word_bigram_entry)); 188 | memusage += sizeof(struct_word_bigram_entry) * global_metadata.type_count; 189 | bigram_rev_memusage = set_bigram_counts(word_bigrams_rev, new_bigram_map_rev); 190 | // Copy entries in word_counts to struct_word_bigram_entry.headword_count since that struct entry is already loaded when clustering 191 | for (word_id_t word = 0; word < global_metadata.type_count; word++) 192 | word_bigrams_rev[word].headword_count = word_counts[word]; 193 | } 194 | } 195 | } 196 | 197 | delete_all_bigram(&new_bigram_map); 198 | delete_all_bigram(&new_bigram_map_rev); 199 | memusage += bigram_memusage + bigram_rev_memusage; 200 | clock_t time_bigram_end = clock(); 201 | if (cmd_args.verbose >= -1) { 202 | fprintf(stderr, "in %'.2f CPU secs. Bigram memusage: %'.1f MB\n", (double)(time_bigram_end - time_bigram_start)/CLOCKS_PER_SEC, (bigram_memusage + bigram_rev_memusage)/(double)1048576); fflush(stderr); 203 | } 204 | 205 | //print_word_bigrams(global_metadata, word_bigrams, word_list); 206 | 207 | // Build counts, which consists of a word followed by a given class 208 | word_class_count_t * restrict word_class_counts = calloc(1 + cmd_args.num_classes * global_metadata.type_count , sizeof(word_class_count_t)); 209 | if (word_class_counts == NULL) { 210 | fprintf(stderr, "%s: Error: Unable to allocate enough memory for . %'.1f MB needed. Maybe increase --min-count\n", argv_0_basename, ((cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / (double)1048576 )); fflush(stderr); 211 | exit(13); 212 | } 213 | memusage += cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t); 214 | fprintf(stderr, "%s: Allocating %'.1f MB for word_class_counts: num_classes=%u x type_count=%u x sizeof(w-cl-count_t)=%zu\n", argv_0_basename, (double)(cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / 1048576 , cmd_args.num_classes, global_metadata.type_count, sizeof(word_class_count_t)); fflush(stderr); 215 | build_word_class_counts(cmd_args, word_class_counts, word2class, word_bigrams, global_metadata.type_count/*, word_list*/); 216 | //print_word_class_counts(cmd_args, global_metadata, word_class_counts); 217 | 218 | // Build reverse: counts: class followed by word. 
This and the normal one are both pretty fast, so no need to parallelize this 219 | word_class_count_t * restrict word_class_rev_counts = NULL; 220 | if (cmd_args.rev_alternate) { // Don't bother building this if it won't be used 221 | word_class_rev_counts = calloc(1 + cmd_args.num_classes * global_metadata.type_count , sizeof(word_class_count_t)); 222 | if (word_class_rev_counts == NULL) { 223 | fprintf(stderr, "%s: Warning: Unable to allocate enough memory for <class><word> counts. %'.1f MB needed. Falling back to --rev-alternate 0\n", argv_0_basename, ((cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / (double)1048576 )); fflush(stderr); 224 | cmd_args.rev_alternate = 0; 225 | } else { 226 | memusage += cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t); 227 | fprintf(stderr, "%s: Allocating %'.1f MB for word_class_rev_counts: num_classes=%u x type_count=%u x sizeof(w-cl-count_t)=%zu\n", argv_0_basename, (double)(cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / 1048576 , cmd_args.num_classes, global_metadata.type_count, sizeof(word_class_count_t)); fflush(stderr); 228 | build_word_class_counts(cmd_args, word_class_rev_counts, word2class, word_bigrams_rev, global_metadata.type_count/*, word_list*/); 229 | } 230 | 231 | } 232 | 233 | // Calculate memusage for count_arrays 234 | for (unsigned char i = 1; i <= cmd_args.max_array; i++) { 235 | memusage += 2 * (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)); 236 | //printf("11 memusage += %zu (now=%zu) count_arrays\n", 2 * (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)), memusage); fflush(stdout); 237 | } 238 | 239 | clock_t time_model_built = clock(); 240 | if (cmd_args.verbose >= -1) { 241 | fprintf(stderr, "%s: Finished loading %'lu tokens and %'u types (%'u filtered) from %'lu lines in %'.2f CPU secs\n", argv_0_basename, global_metadata.token_count, global_metadata.type_count, number_of_deleted_words, global_metadata.line_count, (double)(time_model_built - time_start)/CLOCKS_PER_SEC); fflush(stderr); 242 | } 243 | if (cmd_args.verbose >= -1) { 244 | fprintf(stderr, "%s: Approximate memory usage at clustering: %'.1fMB\n", argv_0_basename, (double)memusage / 1048576); fflush(stderr); 245 | } 246 | 247 | cluster(cmd_args, global_metadata, word_counts, word_list, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts); 248 | 249 | // Now print the final word2class mapping 250 | if (cmd_args.verbose >= 0) { 251 | FILE *out_file = stdout; 252 | if (out_file_string) 253 | out_file = fopen(out_file_string, "w"); 254 | if (out_file == NULL) { 255 | fprintf(stderr, "%s: Error: Unable to open output file %s\n", argv_0_basename, out_file_string); fflush(stderr); 256 | exit(16); 257 | } 258 | if (cmd_args.class_algo == EXCHANGE && (!cmd_args.print_word_vectors)) { 259 | print_words_and_classes(out_file, global_metadata.type_count, word_list, word_counts, word2class, (int)cmd_args.class_offset, cmd_args.print_freqs); 260 | } else if (cmd_args.class_algo == EXCHANGE && cmd_args.print_word_vectors) { 261 | print_words_and_vectors(out_file, cmd_args, global_metadata, word_list, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts); 262 | } 263 | fclose(out_file); 264 | } 265 | 266 | clock_t time_clustered = clock(); 267 | time_t time_t_end; 268 | time(&time_t_end); 269 | double time_secs_total = difftime(time_t_end, time_t_start); 270 | if (cmd_args.verbose >= -1) 271 | fprintf(stderr, "%s: Finished 
clustering in %'.2f CPU seconds. Total wall clock time was about %lim %lis\n", argv_0_basename, (double)(time_clustered - time_model_built)/CLOCKS_PER_SEC, (long)time_secs_total/60, ((long)time_secs_total % 60) ); 272 | 273 | free(word2class); 274 | free(word_bigrams); 275 | free(word_list); 276 | free(word_counts); 277 | exit(0); 278 | } 279 | 280 | 281 | void get_usage_string(char * restrict usage_string, int usage_len) { 282 | 283 | snprintf(usage_string, usage_len, "ClusterCat (c) 2014-2016 Jon Dehdari - LGPL v3 or Mozilla Public License v2\n\ 284 | \n\ 285 | Usage: clustercat [options] < corpus.tok.txt > classes.tsv \n\ 286 | \n\ 287 | Function: Induces word categories from plaintext\n\ 288 | \n\ 289 | Options:\n\ 290 | -c, --classes Set number of word classes (default: 1.2 * square root of vocabulary size)\n\ 291 | --class-file Initialize exchange word classes from an existing clustering tsv file (default: pseudo-random initialization\n\ 292 | for exchange). If you use this option, you probably can set --tune-cycles to 3 or so\n\ 293 | --class-offset Print final word classes starting at a given number (default: %d)\n\ 294 | --forward-lambda Set interpolation weight for forward bigram class model, in range of [0,1] (default: %g)\n\ 295 | -h, --help Print this usage\n\ 296 | --in Specify input training file (default: stdin)\n\ 297 | --ngram-input Input is a listing of n-grams and their counts. Otherwise input is a normal corpus\n\ 298 | --min-count Minimum count of entries in training set to consider (default: %d occurrences)\n\ 299 | --max-array Set maximum order of n-grams for which to use an array instead of a sparse hash map (default: %d-grams)\n\ 300 | --out Specify output file (default: stdout)\n\ 301 | --print-freqs Print word frequencies after words and classes in final clustering output (useful for visualization)\n\ 302 | -q, --quiet Print less output. Use additional -q for even less output\n\ 303 | --refine Set initial class refinement value (c==0 -> no refinement; otherwise 2^n. Default:c==2 -> 4 initial clusters)\n\ 304 | --rev-alternate How often to alternate using reverse predictive exchange. 0==never, 1==after every normal cycle (default: %u)\n\ 305 | -j, --threads Set number of threads to run simultaneously (default: %d threads)\n\ 306 | --tune-cycles Set max number of cycles to tune on (default: %d cycles)\n\ 307 | --unidirectional Disable simultaneous bidirectional predictive exchange. Results in faster cycles, but slower & worse convergence\n\ 308 | If you want to do basic predictive exchange, use: --rev-alternate 0 --unidirectional\n\ 309 | -v, --verbose Print additional info to stderr. Use additional -v for more verbosity\n\ 310 | --word-vectors Print word vectors (a.k.a. word embeddings) instead of discrete classes.\n\ 311 | Specify as either 'text' or 'binary'. The binary format is compatible with word2vec\n\ 312 | \n\ 313 | ", cmd_args.class_offset, cmd_args.forward_lambda, cmd_args.min_count, cmd_args.max_array, cmd_args.rev_alternate, cmd_args.num_threads, cmd_args.tune_cycles); 314 | } 315 | // --class-algo Set class-induction algorithm {brown,exchange,exchange-then-brown} (default: exchange)\n\ 316 | // -o, --order Maximum n-gram order in training set to consider (default: %d-grams)\n\ 317 | // -w, --weights 'f f ...' Set class interpolation weights for: 3-gram, 2-gram, 1-gram, rev 2-gram, rev 3-gram. 
(default: %s)\n\ 318 | 319 | void parse_cmd_args(int argc, char **argv, char * restrict usage, struct cmd_args *cmd_args) { 320 | for (int arg_i = 0; arg_i < argc; arg_i++) // Print command-line invocation, for reproducibility 321 | if (cmd_args->verbose >= -1) { 322 | fprintf(stderr, "%s ", argv[arg_i]); fflush(stderr); 323 | } 324 | if (cmd_args->verbose >= -1) { 325 | fprintf(stderr, "\n"); fflush(stderr); 326 | } 327 | 328 | for (int arg_i = 1; arg_i < argc; arg_i++) { 329 | if (!(strcmp(argv[arg_i], "-h") && strcmp(argv[arg_i], "--help"))) { 330 | printf("%s", usage); 331 | exit(0); 332 | } else if (!strcmp(argv[arg_i], "--class-algo")) { 333 | char * restrict class_algo_string = argv[arg_i+1]; 334 | arg_i++; 335 | if (!strcmp(class_algo_string, "brown")) 336 | cmd_args->class_algo = BROWN; 337 | else if (!strcmp(class_algo_string, "exchange")) 338 | cmd_args->class_algo = EXCHANGE; 339 | else if (!strcmp(class_algo_string, "exchange-then-brown")) 340 | cmd_args->class_algo = EXCHANGE_BROWN; 341 | else { printf("%s", usage); exit(1); } 342 | } else if (!strcmp(argv[arg_i], "--class-file")) { 343 | initial_class_file = argv[arg_i+1]; 344 | arg_i++; 345 | } else if (!strcmp(argv[arg_i], "--class-offset")) { 346 | cmd_args->class_offset = (signed char)atoi(argv[arg_i+1]); 347 | arg_i++; 348 | } else if (!strcmp(argv[arg_i], "--forward-lambda")) { 349 | cmd_args->forward_lambda = (float)atof(argv[arg_i+1]); 350 | arg_i++; 351 | } else if (!strcmp(argv[arg_i], "--in")) { 352 | in_train_file_string = argv[arg_i+1]; 353 | arg_i++; 354 | } else if (!(strcmp(argv[arg_i], "-j") && strcmp(argv[arg_i], "--threads") && strcmp(argv[arg_i], "--jobs"))) { 355 | cmd_args->num_threads = (unsigned int) atol(argv[arg_i+1]); 356 | arg_i++; 357 | } else if (!strcmp(argv[arg_i], "--min-count")) { 358 | cmd_args->min_count = (unsigned int) atol(argv[arg_i+1]); 359 | arg_i++; 360 | } else if (!strcmp(argv[arg_i], "--max-array")) { 361 | cmd_args->max_array = (unsigned char) atol(argv[arg_i+1]); 362 | if ((cmd_args->max_array) < 1 || (cmd_args->max_array > 3)) { 363 | printf("%s: --max-array value should be between 1-3\n", argv_0_basename); 364 | fflush(stderr); 365 | exit(10); 366 | } 367 | arg_i++; 368 | } else if (!(strcmp(argv[arg_i], "--ngram-input"))) { 369 | cmd_args->ngram_input = true; 370 | } else if (!(strcmp(argv[arg_i], "-c") && strcmp(argv[arg_i], "-n") && strcmp(argv[arg_i], "--classes") && strcmp(argv[arg_i], "--num-classes"))) { 371 | cmd_args->num_classes = (wclass_t) atol(argv[arg_i+1]); 372 | arg_i++; 373 | } else if (!strcmp(argv[arg_i], "--out")) { 374 | out_file_string = argv[arg_i+1]; 375 | arg_i++; 376 | } else if (!(strcmp(argv[arg_i], "--print-freqs"))) { 377 | cmd_args->print_freqs = true; 378 | } else if (!(strcmp(argv[arg_i], "-q") && strcmp(argv[arg_i], "--quiet"))) { 379 | cmd_args->verbose--; 380 | } else if (!(strcmp(argv[arg_i], "--refine"))) { 381 | cmd_args->refine = (unsigned char) atol(argv[arg_i+1]); 382 | arg_i++; 383 | } else if (!strcmp(argv[arg_i], "--rev-alternate")) { 384 | cmd_args->rev_alternate = (unsigned char) atoi(argv[arg_i+1]); 385 | arg_i++; 386 | } else if (!strcmp(argv[arg_i], "--tune-cycles")) { 387 | cmd_args->tune_cycles = (unsigned short) atol(argv[arg_i+1]); 388 | arg_i++; 389 | } else if (!(strcmp(argv[arg_i], "--unidirectional"))) { 390 | cmd_args->unidirectional = true; 391 | } else if (!(strcmp(argv[arg_i], "-v") && strcmp(argv[arg_i], "--verbose"))) { 392 | cmd_args->verbose++; 393 | } else if (!(strcmp(argv[arg_i], "--word-vectors"))) { 394 | 
char * restrict print_word_vectors_string = argv[arg_i+1]; 395 | arg_i++; 396 | if (!strcmp(print_word_vectors_string, "text")) 397 | cmd_args->print_word_vectors = TEXT_VEC; 398 | else if (!strcmp(print_word_vectors_string, "binary")) 399 | cmd_args->print_word_vectors = BINARY_VEC; 400 | else { printf("Error: Please specify either 'text' or 'binary' after the --word-vectors flag.\n\n%s", usage); exit(1); } 401 | } else if (!strncmp(argv[arg_i], "-", 1)) { // Unknown flag 402 | printf("%s: Unknown command-line argument: %s\n\n", argv_0_basename, argv[arg_i]); 403 | printf("%s", usage); fflush(stderr); 404 | exit(2); 405 | } 406 | } 407 | } 408 | 409 | void build_word_count_array(struct_map_word **word_map, char * restrict word_list[const], word_count_t word_counts[restrict], const word_id_t type_count) { 410 | for (word_id_t i = 0; i < type_count; i++) { 411 | word_counts[i] = map_find_count(word_map, word_list[i]); 412 | } 413 | } 414 | 415 | void populate_word_ids(struct_map_word **word_map, char * restrict word_list[const], const word_id_t type_count) { 416 | for (word_id_t i = 0; i < type_count; i++) { 417 | map_set_word_id(word_map, word_list[i], i); 418 | } 419 | } 420 | 421 | void reassign_word_ids(struct_map_word **word_map, char * restrict word_list[restrict], word_id_t * restrict word_id_remap) { 422 | sort_by_count(word_map); 423 | struct_map_word *entry, *tmp; 424 | word_id_t i = 0; 425 | 426 | HASH_ITER(hh, *word_map, entry, tmp) { 427 | const word_id_t word_id = entry->word_id; 428 | char * word = entry->key; 429 | word_id_remap[word_id] = i; // set remap 430 | word_list[i] = entry->key; 431 | //printf("reassigning w=%s %u -> %u; count=%u\n", entry->key, word_id, i, entry->count); fflush(stdout); 432 | map_set_word_id(word_map, word, i); // reset word_id in word_map 433 | i++; 434 | } 435 | } 436 | 437 | word_id_t filter_infrequent_words(const struct cmd_args cmd_args, struct_model_metadata * restrict model_metadata, struct_map_word ** word_map, word_id_t * restrict word_id_remap) { // word_map must already be sorted by word frequency! 
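/* Worked example (a sketch): with --min-count 3, a word seen twice is deleted
 * from word_map, its count is added to <unk> (word_id 0), its slot in
 * word_id_remap[] is set to the placeholder (word_id_t)-1 (resolved to the
 * real unk id later by remap_and_rev_bigram_map()), and type_count drops by
 * one; <unk>, <s>, and </s> themselves are never filtered, whatever their counts. */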
438 | 439 | unsigned long number_of_deleted_words = 0; 440 | unsigned long vocab_size = model_metadata->type_count; // Save this to separate variable since we'll modify model_metadata.type_count later 441 | // Get keys 442 | // Iterate over keys 443 | // If count of key_i < threshold, 444 | // increment count of <unk> by count of key_i, 445 | // decrement model_metadata.type_count by one 446 | // free & delete entry in map, 447 | 448 | char **local_word_list = (char **)malloc(model_metadata->type_count * sizeof(char*)); 449 | //char * local_word_list[model_metadata->type_count]; 450 | if (vocab_size != get_keys(word_map, local_word_list)) { 451 | printf("Error: model_metadata->type_count (%lu) != get_keys() (%lu)\n", (long unsigned) vocab_size, (long unsigned) get_keys(word_map, local_word_list) ); fflush(stderr); 452 | exit(4); 453 | } 454 | 455 | unsigned long new_id = 0; 456 | for (unsigned long word_i = 0; word_i < vocab_size; word_i++, new_id++) { 457 | char * word = local_word_list[word_i]; 458 | //if ((!strncmp(word, UNKNOWN_WORD, MAX_WORD_LEN)) || (!strncmp(word, "<s>", MAX_WORD_LEN)) || (!strncmp(word, "</s>", MAX_WORD_LEN))) { // Deal with <unk>, <s>, and </s> 459 | // //new_id--; 460 | // continue; 461 | //} 462 | 463 | unsigned long word_i_count = map_find_count(word_map, word); // We'll use this a couple times 464 | if ((word_i_count < cmd_args.min_count) && (strncmp(word, UNKNOWN_WORD, MAX_WORD_LEN)) && (strncmp(word, "<s>", MAX_WORD_LEN)) && (strncmp(word, "</s>", MAX_WORD_LEN))) { // Don't delete <unk>, <s>, or </s> 465 | number_of_deleted_words++; 466 | if (cmd_args.verbose > 3) { 467 | printf("Filtering-out word: %s (old id=%lu, new id=0) (%lu < %hu);\tcount(%s)=%lu\n", word, word_i, (unsigned long)word_i_count, cmd_args.min_count, UNKNOWN_WORD, (unsigned long)map_find_count(word_map, UNKNOWN_WORD)); fflush(stdout); 468 | } 469 | word_id_remap[map_find_id(word_map, word, (word_id_t) -1)] = (word_id_t) -1; // set value of dud word in remap to temporary unk, which is -1. This gets changed later 470 | map_update_count(word_map, UNKNOWN_WORD, word_i_count, 0); 471 | model_metadata->type_count--; 472 | struct_map_word *local_s; 473 | HASH_FIND_STR(*word_map, word, local_s); 474 | delete_entry(word_map, local_s); 475 | } else { // Keep word 476 | //printf("Keeping word: %s (old id=%u, new id=%lu) (%lu >= %hu);\tcount(%s)=%u\n", word, map_find_id(word_map, word, -1), new_id, word_i_count, cmd_args.min_count, UNKNOWN_WORD, map_find_count(word_map, UNKNOWN_WORD)); fflush(stdout); 477 | //map_set_word_id(word_map, word, new_id); // word_id's 0-2 are reserved for <unk>, <s>, and </s> 478 | //printf(" Kept word: %s (new map id=%u, new_id=%lu) (%lu >= %hu);\tcount(%s)=%u\n", word, map_find_id(word_map, word, -1), new_id, word_i_count, cmd_args.min_count, UNKNOWN_WORD, map_find_count(word_map, UNKNOWN_WORD)); fflush(stdout); 479 | } 480 | } 481 | //map_set_word_id(word_map, UNKNOWN_WORD, 0); // word_id's 0-2 are reserved for <unk>, <s>, and </s> 482 | //map_set_word_id(word_map, "<s>", 1); // word_id's 0-2 are reserved for <unk>, <s>, and </s> 483 | //map_set_word_id(word_map, "</s>", 2); // word_id's 0-2 are reserved for <unk>, <s>, and </s> 484 | 485 | free(local_word_list); 486 | return number_of_deleted_words; 487 | } 488 | 489 | void tally_class_ngram_counts(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const struct_word_bigram_entry word_bigrams[const], const wclass_t word2class[const], count_arrays_t count_arrays) { // Right now it's a drop-in replacement for tally_class_counts_in_store(), but it's not the best way of doing things (eg.
for unigram counts, tallying & querying in two separate steps, etc). So this will need to be modified after getting rid of the sent-store 490 | for (word_id_t word_id = 0; word_id < model_metadata.type_count; word_id++) { 491 | const wclass_t headword_class = word2class[word_id]; 492 | count_arrays[0][headword_class] += word_bigrams[word_id].headword_count; 493 | //printf("tally_class_ngram_counts: word=??, word_id=%u, type_count=%u, headword_class=%hu, headword_count=%u, class_count=%lu\n", word_id, model_metadata.type_count, headword_class, word_bigrams[word_id].headword_count, (unsigned long)count_arrays[0][headword_class]); fflush(stdout); 494 | for (unsigned int i = 0; i < word_bigrams[word_id].length; i++) { 495 | const word_id_t prev_word = word_bigrams[word_id].predecessors[i]; 496 | wclass_t prev_class = word2class[prev_word]; 497 | const size_t offset = prev_class + cmd_args.num_classes * headword_class; 498 | //printf(" tally_class_ngram_counts: prev_word=%u, prev_class=%hu, offset=%zu\n", prev_word, prev_class, offset); fflush(stdout); 499 | count_arrays[1][offset] += word_bigrams[word_id].bigram_counts[i]; 500 | } 501 | } 502 | } 503 | 504 | 505 | void init_clusters(const struct cmd_args cmd_args, word_id_t vocab_size, wclass_t word2class[restrict], const word_count_t word_counts[const], char * word_list[restrict]) { 506 | register unsigned long word_i = 0; 507 | 508 | if (cmd_args.class_algo == EXCHANGE || cmd_args.class_algo == EXCHANGE_BROWN) { // It doesn't really matter how you initialize word classes in exchange algo. This assigns words from the word list an incrementing class number from [0,num_classes-1]. So it's a simple pseudo-randomized initialization. 509 | register wclass_t class = 0; // [0,num_classes-1] 510 | for (; word_i < vocab_size; word_i++, class++) { 511 | if (class == cmd_args.num_classes) // reset 512 | class = 0; 513 | if (cmd_args.verbose > 3) 514 | printf("cls=%-4u w_i=%-8lu #(w)=%-8u str(w)=%-20s vocab_size=%u\n", class, word_i, word_counts[word_i], word_list[word_i], vocab_size); 515 | word2class[word_i] = class; 516 | } 517 | 518 | } else if (cmd_args.class_algo == BROWN) { // Really simple initialization: one class per word 519 | for (unsigned long class = 0; word_i < vocab_size; word_i++, class++) 520 | word2class[word_i] = class; 521 | } 522 | } 523 | 524 | size_t set_bigram_counts(struct_word_bigram_entry * restrict word_bigrams, struct_map_bigram * bigram_map) { 525 | 526 | // Build a hash map of bigrams, since we need random access when traversing the corpus. 527 | // Then we convert that to an array of linked lists, since we'll need sequential access during the clustering phase of predictive exchange clustering. 528 | 529 | sort_bigrams(&bigram_map); 530 | 531 | register size_t memusage = 0; 532 | register word_id_t word_2; 533 | register word_id_t word_2_last = 0; 534 | register unsigned int length = 0; 535 | word_id_t * word_buffer = malloc(sizeof(word_id_t) * MAX_WORD_PREDECESSORS); 536 | word_bigram_count_t * count_buffer = malloc(sizeof(word_bigram_count_t) * MAX_WORD_PREDECESSORS); 537 | 538 | // Add a dummy entry at the end of the hash map in order to simplify iterating through it, since it must track changes in head words. 
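/* Sentinel sketch: with the map sorted on word_2, e.g.
 *   {7,2}:4  {9,2}:1  {3,5}:2  {-1,-1}:0   (dummy appended below)
 * the loop flushes word 2's predecessor list {7,9} when word_2 changes to 5,
 * and the dummy's impossible ids force the final flush for word 5 without
 * special-casing the last real entry. */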
539 | struct_word_bigram dummy = {-1, -1}; // Make sure this bigram is new, so that it's appended to the end 540 | map_update_bigram(&bigram_map, &dummy, 0); 541 | 542 | // Iterate through the bigram map to get counts of word_2's, so we know how much to allocate for each predecessor list 543 | struct_map_bigram *entry, *tmp; 544 | HASH_ITER(hh, bigram_map, entry, tmp) { 545 | word_2 = (entry->key).word_2; 546 | //printf("\n[%u,%u]=%u, w2_last=%u, length=%u\n", (entry->key).word_1, (entry->key).word_2, entry->count, word_2_last, length); fflush(stdout); 547 | if (word_2 == word_2_last) { // Within successive entry; ie. 2nd entry or greater 548 | if (length >= MAX_WORD_PREDECESSORS) { // Check bounds before writing, so the buffers can't overflow 549 | fprintf(stderr, "Error: MAX_WORD_PREDECESSORS exceeded (%lu). Increase it in clustercat.h and recompile. Add the -B flag to 'make' to force recompilation.\n", (long unsigned int)MAX_WORD_PREDECESSORS); fflush(stderr); 550 | exit(14); 551 | } 552 | word_buffer[length] = (entry->key).word_1; 553 | count_buffer[length] = entry->count; 554 | length++; 555 | 556 | } else { // New entry; process previous entry 557 | word_bigrams[word_2_last].length = length; 558 | word_bigrams[word_2_last].predecessors = malloc(length * sizeof(word_id_t)); 559 | memcpy(word_bigrams[word_2_last].predecessors, word_buffer, length * sizeof(word_id_t)); 560 | memusage += length * sizeof(word_id_t); 561 | word_bigrams[word_2_last].bigram_counts = malloc(length * sizeof(word_bigram_count_t)); 562 | memcpy(word_bigrams[word_2_last].bigram_counts, count_buffer, length * sizeof(word_bigram_count_t)); 563 | memusage += length * sizeof(word_bigram_count_t); 564 | //printf("word_2_last=%u, length=%u word_1s: ", word_2_last, length); 565 | //for (unsigned int i = 0; i < length; i++) { 566 | // printf("<%u,%u> ", word_bigrams[word_2_last].predecessors[i], word_bigrams[word_2_last].bigram_counts[i]); 567 | //} 568 | //printf("\n"); 569 | 570 | word_2_last = word_2; 571 | word_buffer[0] = (entry->key).word_1; 572 | count_buffer[0] = entry->count; 573 | length = 1; 574 | } 575 | } 576 | 577 | free(word_buffer); 578 | free(count_buffer); 579 | //delete_all_bigram(&map_bigram); 580 | 581 | return memusage; 582 | } 583 | 584 | void build_word_class_counts(const struct cmd_args cmd_args, word_class_count_t * restrict word_class_counts, const wclass_t word2class[const], const struct_word_bigram_entry * const word_bigrams, const word_id_t type_count/*, char ** restrict word_list*/) { 585 | //long sum = 0; 586 | // set counts 587 | for (word_id_t word = 0; word < type_count; word++) { 588 | for (unsigned int i = 0; i < word_bigrams[word].length; i++) { 589 | word_id_t prev_word = word_bigrams[word].predecessors[i]; 590 | const wclass_t class_i = word2class[word]; 591 | word_class_counts[prev_word * cmd_args.num_classes + class_i] += word_bigrams[word].bigram_counts[i]; 592 | //printf("i=%hu, <%s,%s>=<%u,%u>, <prev_word,class_i>=<%u,%u>, num_classes=%u, offset=%u (%u * %u + %u), orig_val=%u\n", i, word_list[prev_word], word_list[word], prev_word, word, prev_word, class_i, cmd_args.num_classes, prev_word * cmd_args.num_classes + class_i, prev_word, cmd_args.num_classes, class_i, word_class_counts[prev_word * cmd_args.num_classes + class_i]); fflush(stdout); 593 | //sum += word_bigrams[word].bigram_counts[i]; 594 | //printf(" <%u,%u>=%u at pos %zu\n", prev_word, class_i, word_class_counts[prev_word * cmd_args.num_classes + class_i], ((size_t)prev_word * cmd_args.num_classes + class_i)); fflush(stdout); 595 | } 596 | } 597 | //printf(": sum: %lu; [%u,%u,%u,%u,%u,%u,%u,%u,%u,%u...]\n",
sum, word_class_counts[0], word_class_counts[1], word_class_counts[2], word_class_counts[3], word_class_counts[4], word_class_counts[5], word_class_counts[6], word_class_counts[7], word_class_counts[8], word_class_counts[9]); 598 | } 599 | 600 | double training_data_log_likelihood(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const count_arrays_t count_arrays, const word_count_t word_counts[const], const wclass_t word2class[const]) { 601 | const double backward_lambda = 1 - cmd_args.forward_lambda; 602 | 603 | // Transition Probs 604 | double transition_logprob = 0; 605 | // Bigrams 606 | #pragma omp parallel for num_threads(cmd_args.num_threads) reduction(+:transition_logprob) 607 | for (word_bigram_count_t ngram = 0; ngram < (powi(cmd_args.num_classes, 2)); ngram++) { 608 | const class_bigram_count_t bigram_count = count_arrays[1][ngram]; 609 | if (!bigram_count) // bigram doesn't exist in training set 610 | continue; 611 | const wclass_t c_1 = ngram % cmd_args.num_classes; 612 | const wclass_t c_2 = ngram / cmd_args.num_classes; 613 | const wclass_count_t c_1_count = count_arrays[0][c_1]; 614 | const wclass_count_t c_2_count = count_arrays[0][c_2]; 615 | const double a = cmd_args.forward_lambda * (bigram_count / (double)c_1_count); 616 | const double b = backward_lambda * (bigram_count / (double)c_2_count); 617 | transition_logprob += LOG2ADD(a,b) * bigram_count; 618 | //printf("ngram=%u, c_1=%u, #(c_1)=%lu, c_2=%u, #(c_2)=%lu, #(c_1,c_2)=%lu, trans_prob=%g\n", ngram, c_1, (unsigned long)c_1_count, c_2, (unsigned long)c_2_count, (unsigned long)bigram_count, transition_logprob); fflush(stdout); 619 | } 620 | 621 | // Emission Probs 622 | //long double emission_prob = 0; 623 | double emission_logprob = 0; 624 | //#pragma omp parallel for num_threads(cmd_args.num_threads) reduction(+:emission_logprob) 625 | for (word_id_t word = 0; word < model_metadata.type_count; word++) { 626 | //if (word == model_metadata.start_sent_id) // Don't tally emission prob for <s> 627 | // continue; 628 | const word_count_t word_count = word_counts[word]; 629 | if (!word_count) // Don't tally emission prob for <unk> if min-count is 1 630 | continue; 631 | const wclass_t class = word2class[word]; 632 | const wclass_count_t class_count = count_arrays[0][class]; 633 | emission_logprob += log2(word_count / (double)class_count) * word_count; 634 | //printf("word=%u, class=%u, emission_logprob=%g after += %g = log2(word_count=%lu / class_count=%u) * word_count=%lu\n", word, (unsigned int)class, emission_logprob, log2(word_count / (double)class_count) * word_count, (unsigned long)word_count, class_count, (unsigned long)word_count); fflush(stdout); 635 | } 636 | 637 | //printf("emission_logprob=%g, transition_logprob=%g, LL=%g\n", emission_logprob, transition_logprob, emission_logprob + transition_logprob); 638 | return emission_logprob + transition_logprob; 639 | } 640 | 641 | void init_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays) { 642 | for (unsigned char i = 1; i <= cmd_args.max_array; i++) { // Start with unigrams in count_arrays[0], ... 643 | count_arrays[i-1] = calloc(powi(cmd_args.num_classes, i), sizeof(wclass_count_t)); // powi() is in clustercat-math.c 644 | if (count_arrays[i-1] == NULL) { 645 | fprintf(stderr, "%s: Error: Unable to allocate enough memory for %u-grams. I tried to allocate %zu MB per thread (%zuB * %u^%u).
Reduce the number of desired classes using --classes (current value: %u)\n", argv_0_basename, i, sizeof(wclass_count_t) * powi(cmd_args.num_classes, i) / 1048576, sizeof(wclass_count_t), cmd_args.num_classes, i, cmd_args.num_classes ); fflush(stderr); 646 | exit(12); 647 | } 648 | //printf("Allocating %zu B (cmd_args.num_classes=%u^i=%u * sizeof(uint)=%zu)\n", (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)), cmd_args.num_classes, i, sizeof(wclass_count_t)); 649 | } 650 | } 651 | 652 | void clear_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays) { 653 | for (unsigned char i = 1; i <= cmd_args.max_array; i++) { // Start with unigrams in count_arrays[0], ... 654 | memset(count_arrays[i-1], 0, powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)); // powi() is in clustercat-math.c 655 | } 656 | } 657 | 658 | void free_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays) { 659 | for (unsigned char i = 1; i <= cmd_args.max_array; i++) { // Start with unigrams in count_arrays[0], ... 660 | free(count_arrays[i-1]); 661 | } 662 | } 663 | -------------------------------------------------------------------------------- /src/clustercat.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDE_CLUSTERCAT_HEADER 2 | #define INCLUDE_CLUSTERCAT_HEADER 3 | 4 | #include <stdio.h> 5 | #include <stdlib.h> 6 | #include <string.h> 7 | #include <stdbool.h> 8 | #include <math.h> // log(), exp(), pow() 9 | #include <libgen.h> // basename() 10 | #include <limits.h> // USHRT_MAX, UINT_MAX 11 | #include <stdint.h> 12 | #include "clustercat-math.h" // powi() 13 | 14 | // Defaults 15 | #define PRIMARY_SEP_CHAR '\t' 16 | #define PRIMARY_SEP_STRING "\t" 17 | #define SECONDARY_SEP_CHAR ' ' 18 | #define SECONDARY_SEP_STRING " " 19 | #define TOK_CHARS " \t\n" 20 | #define UNKNOWN_WORD "<unk>" 21 | // Number of characters to read-in for each line 22 | #define STDIN_SENT_MAX_CHARS 8000 23 | #define MAX_WORD_LEN 128 24 | #define MAX_WORD_PREDECESSORS 20000000 25 | #define ENTROPY_TERMS_MAX 10000000 26 | 27 | enum class_algos {EXCHANGE, BROWN, EXCHANGE_BROWN}; 28 | enum print_word_vectors {NO_VEC, TEXT_VEC, BINARY_VEC}; 29 | 30 | #include "clustercat-data.h" // bad.
chicken-and-egg typedef deps 31 | 32 | typedef unsigned short sentlen_t; // Number of words in a sentence 33 | #define SENT_LEN_MAX USHRT_MAX 34 | //typedef unsigned short wclass_t; // Defined in clustercat-map.h 35 | //typedef unsigned int word_id_t; // Defined in clustercat-map.h 36 | typedef word_count_t * * restrict count_arrays_t; 37 | typedef word_count_t * restrict count_array_t; 38 | 39 | typedef struct { 40 | unsigned long token_count; 41 | unsigned long line_count; 42 | word_id_t type_count; 43 | word_id_t start_sent_id; // need this for tallying emission probs 44 | word_id_t end_sent_id; // need this for tallying emission probs 45 | } struct_model_metadata; 46 | 47 | // typedef {...} struct_word_bigram; // see clustercat-map.h 48 | 49 | typedef struct { // Each entry holds an array of the predecessors of a given word, a parallel array of their bigram counts, and the length of those arrays 50 | word_id_t * predecessors; 51 | word_bigram_count_t * bigram_counts; 52 | unsigned long length; 53 | word_count_t headword_count; 54 | } struct_word_bigram_entry; 55 | 56 | extern char *argv_0_basename; // Allow for global access to filename 57 | 58 | struct cmd_args { 59 | float forward_lambda; 60 | wclass_t num_classes; 61 | unsigned short min_count : 12; 62 | signed char verbose : 4; // Negative values increasingly suppress normal output 63 | unsigned short tune_cycles : 8; 64 | unsigned char refine; // 0=no refinement; otherwise 2^n 65 | signed char class_offset: 4; 66 | unsigned short num_threads : 8; 67 | unsigned char rev_alternate: 3; // How often to alternate using reverse pex. 0 == never, 1 == after every normal pex cycle, ... 68 | unsigned char max_array : 2; 69 | unsigned char class_algo : 2; // enum class_algos 70 | unsigned char print_word_vectors : 2; // enum print_word_vectors 71 | bool ngram_input; 72 | bool print_freqs; 73 | bool unidirectional; 74 | }; 75 | 76 | void populate_word_ids(struct_map_word **ngram_map, char * restrict unique_words[const], const word_id_t type_count); 77 | void reassign_word_ids(struct_map_word **word_map, char * restrict word_list[restrict], word_id_t * restrict word_id_remap); 78 | void build_word_count_array(struct_map_word **ngram_map, char * restrict unique_words[const], word_count_t word_counts[restrict], const word_id_t type_count); 79 | 80 | void tally_class_ngram_counts(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const struct_word_bigram_entry word_bigrams[const], const wclass_t word2class[const], count_arrays_t count_arrays); 81 | word_id_t filter_infrequent_words(const struct cmd_args cmd_args, struct_model_metadata * restrict model_metadata, struct_map_word ** ngram_map, word_id_t * restrict word_id_remap); 82 | void init_clusters(const struct cmd_args cmd_args, word_id_t vocab_size, wclass_t word2class[restrict], const word_count_t word_counts[const], char * word_list[restrict]); 83 | size_t set_bigram_counts(struct_word_bigram_entry * restrict word_bigrams, struct_map_bigram * bigram_map); 84 | void build_word_class_counts(const struct cmd_args cmd_args, word_class_count_t * restrict word_class_counts, const wclass_t word2class[const], const struct_word_bigram_entry * const word_bigrams, const word_id_t type_count/*, char ** restrict word_list*/); 85 | double training_data_log_likelihood(const struct cmd_args cmd_args, const struct_model_metadata model_metadata, const count_arrays_t count_arrays, const word_count_t word_counts[const], const wclass_t word2class[const]); 86 | 87 | void
init_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays); 88 | void clear_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays); 89 | void free_count_arrays(const struct cmd_args cmd_args, count_arrays_t count_arrays); 90 | 91 | // Like atoi/strtol, but each position can hold any value in [0,num_classes) rather than just the ascii digits 0-9, and the least-significant position comes first ("little endian"). Hence, with num_classes=256, [104,101] ("he") -> 25960 (ie. 104 + (101*256)), and [3,7,11] -> 722691 (ie. 3 + (7*256) + (11*256*256)). 92 | // Using a class n-gram array is fast, at the expense of memory usage for lots of unattested ngrams, especially for higher-order n-grams. 93 | // Trigrams are probably the highest order you'd want to use as an array, since the memory usage would be: sizeof(wclass_count_t) * |C|^3 where |C| is the number of word classes. 94 | // |C| can be represented using an unsigned short (16 bits == 65k classes) for exchange clustering, but probably should be an unsigned int (32 bits == 4 billion classes) for Brown clustering, since initially every word type is its own class. 95 | inline size_t array_offset(wclass_t * pointer, const unsigned int max, const wclass_t num_classes) { 96 | register uint_fast8_t ptr_i = 1; 97 | register size_t total_offset = (*pointer); 98 | 99 | for (; ptr_i < max; ptr_i++) { // little endian 100 | //printf("1: atosize_t: pointer=%p; all vals: [%hu,%hu,%hu]; total_offset=%zu; max=%u\n", pointer, *pointer, *(pointer+1), *(pointer+2), total_offset, max); fflush(stdout); 101 | total_offset += (pointer[ptr_i]) * powi(num_classes, ptr_i); 102 | //printf("2: adding ((pointer[%u]=%u)* powi(%hu, %u)=%lu)=%lu\n", ptr_i, pointer[ptr_i], num_classes, ptr_i, powi(num_classes, ptr_i), pointer[ptr_i] * powi(num_classes, ptr_i)); fflush(stdout); 103 | } 104 | //printf("3: atosize_t: pointer=%p; val0=%hu; total_offset=%zu; max=%u\n\n", pointer, *pointer, total_offset, max); fflush(stdout); 105 | return total_offset; 106 | } 107 | 108 | 109 | 110 | #endif // INCLUDE_CLUSTERCAT_HEADER 111 | -------------------------------------------------------------------------------- /src/ext/uthash/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2005-2014, Troy D. Hanson http://troydhanson.github.com/uthash/ 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 11 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 12 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 13 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 14 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 15 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 16 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 17 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 18 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 19 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 20 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21 | 22 | -------------------------------------------------------------------------------- /src/ext/uthash/README.md: -------------------------------------------------------------------------------- 1 | 2 | Documentation for uthash is available at: 3 | 4 | http://troydhanson.github.com/uthash/ 5 | 6 | 7 | -------------------------------------------------------------------------------- /src/ext/word2vec/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /src/ext/word2vec/README.txt: -------------------------------------------------------------------------------- 1 | Tools for computing distributed representation of words 2 | ------------------------------------------------------ 3 | 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts.
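For orientation, a representative invocation of the upstream word2vec trainer might look like the following (the trainer binary itself is not bundled in this subdirectory, which carries only distance.c and word-analogy.c, so this is illustrative; corpus.txt and vectors.bin are placeholder names). Each flag corresponds to one of the options enumerated in the list that follows:

    ./word2vec -train corpus.txt -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 12 -binary 1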
5 | 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous 7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should specify the following: 8 | - desired vector dimensionality 9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model 10 | - training algorithm: hierarchical softmax and/or negative sampling 11 | - threshold for downsampling the frequent words 12 | - number of threads to use 13 | - the format of the output word vector file (text or binary) 14 | 15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 16 | 17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training 18 | is finished, the user can interactively explore the similarity of the words. 19 | 20 | More information about the scripts is provided at https://code.google.com/p/word2vec/ 21 | 22 | -------------------------------------------------------------------------------- /src/ext/word2vec/distance.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
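// Editorial note (not part of the upstream file): a minimal session with this
// tool, assuming "vectors.bin" (hypothetical name) holds vectors in the binary
// format produced by the word2vec trainer:
//   $ make distance && ./distance vectors.bin
//   Enter word or sentence (EXIT or CTRL-d to break): paris
// The program then prints the N (= 40) vocabulary words whose unit-normalized
// vectors have the largest dot product (cosine similarity) with the query.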
14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char *bestw[N]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | float *M; 32 | char *vocab; 33 | if (argc < 2) { 34 | printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n"); 35 | return 0; 36 | } 37 | strcpy(file_name, argv[1]); 38 | f = fopen(file_name, "rb"); 39 | if (f == NULL) { 40 | printf("Input file not found\n"); 41 | return -1; 42 | } 43 | fscanf(f, "%lld", &words); 44 | fscanf(f, "%lld", &size); 45 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 46 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 47 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 48 | if (M == NULL) { 49 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 50 | return -1; 51 | } 52 | for (b = 0; b < words; b++) { 53 | a = 0; 54 | while (1) { 55 | vocab[b * max_w + a] = fgetc(f); 56 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 57 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 58 | } 59 | vocab[b * max_w + a] = 0; 60 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 61 | len = 0; 62 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 63 | len = sqrt(len); 64 | for (a = 0; a < size; a++) M[a + b * size] /= len; 65 | } 66 | fclose(f); 67 | while (1) { 68 | for (a = 0; a < N; a++) bestd[a] = 0; 69 | for (a = 0; a < N; a++) bestw[a][0] = 0; 70 | printf("Enter word or sentence (EXIT or CTRL-d to break): "); 71 | a = 0; 72 | while (1) { 73 | st1[a] = fgetc(stdin); 74 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 75 | st1[a] = 0; 76 | break; 77 | } 78 | a++; 79 | } 80 | if ((!strcmp(st1, "EXIT")) || st1[0] == -1) { 81 | printf("\n"); 82 | break; 83 | } 84 | cn = 0; 85 | b = 0; 86 | c = 0; 87 | while (1) { 88 | st[cn][b] = st1[c]; 89 | b++; 90 | c++; 91 | st[cn][b] = 0; 92 | if (st1[c] == 0) break; 93 | if (st1[c] == ' ') { 94 | cn++; 95 | b = 0; 96 | c++; 97 | } 98 | } 99 | cn++; 100 | for (a = 0; a < cn; a++) { 101 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 102 | if (b == words) b = -1; 103 | bi[a] = b; 104 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 105 | if (b == -1) { 106 | printf("Out of dictionary word!\n"); 107 | break; 108 | } 109 | } 110 | if (b == -1) continue; 111 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 112 | for (a = 0; a < size; a++) vec[a] = 0; 113 | for (b = 0; b < cn; b++) { 114 | if (bi[b] == -1) continue; 115 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 116 | } 117 | len = 0; 118 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 119 | len = sqrt(len); 120 | for (a = 0; a < size; a++) vec[a] /= len; 121 | for (a = 0; a < N; a++) bestd[a] = -1; 122 | for (a = 0; a < N; a++) bestw[a][0] = 0; 123 | for (c = 0; c < words; c++) { 124 | a = 0; 125 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 126 | if (a == 1) continue; 127 | dist = 0; 128 | for (a = 0; a <
size; a++) dist += vec[a] * M[a + c * size]; 129 | for (a = 0; a < N; a++) { 130 | if (dist > bestd[a]) { 131 | for (d = N - 1; d > a; d--) { 132 | bestd[d] = bestd[d - 1]; 133 | strcpy(bestw[d], bestw[d - 1]); 134 | } 135 | bestd[a] = dist; 136 | strcpy(bestw[a], &vocab[c * max_w]); 137 | break; 138 | } 139 | } 140 | } 141 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 142 | } 143 | return 0; 144 | } 145 | -------------------------------------------------------------------------------- /src/ext/word2vec/makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result 4 | 5 | all: distance word-analogy 6 | 7 | distance : distance.c 8 | $(CC) distance.c -o distance $(CFLAGS) 9 | word-analogy : word-analogy.c 10 | $(CC) word-analogy.c -o word-analogy $(CFLAGS) 11 | 12 | clean: 13 | rm -rf distance word-analogy 14 | -------------------------------------------------------------------------------- /src/ext/word2vec/word-analogy.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
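// Editorial note (not part of the upstream file): usage parallels distance.c,
// except that three words are read per query ("vectors.bin" is again a
// hypothetical file name):
//   $ make word-analogy && ./word-analogy vectors.bin
//   Enter three words (EXIT or CTRL-d to break): paris france berlin
// The code below then ranks words by cosine similarity to the offset vector
// vec = M[bi[1]] - M[bi[0]] + M[bi[2]], ie. france - paris + berlin.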
14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char bestw[N][max_size]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | float *M; 32 | char *vocab; 33 | if (argc < 2) { 34 | printf("Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n"); 35 | return 0; 36 | } 37 | strcpy(file_name, argv[1]); 38 | f = fopen(file_name, "rb"); 39 | if (f == NULL) { 40 | printf("Input file not found\n"); 41 | return -1; 42 | } 43 | fscanf(f, "%lld", &words); 44 | fscanf(f, "%lld", &size); 45 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 46 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 47 | if (M == NULL) { 48 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 49 | return -1; 50 | } 51 | for (b = 0; b < words; b++) { 52 | a = 0; 53 | while (1) { 54 | vocab[b * max_w + a] = fgetc(f); 55 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 56 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 57 | } 58 | vocab[b * max_w + a] = 0; 59 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 60 | len = 0; 61 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 62 | len = sqrt(len); 63 | for (a = 0; a < size; a++) M[a + b * size] /= len; 64 | } 65 | fclose(f); 66 | while (1) { 67 | for (a = 0; a < N; a++) bestd[a] = 0; 68 | for (a = 0; a < N; a++) bestw[a][0] = 0; 69 | printf("Enter three words (EXIT or CTRL-d to break): "); 70 | a = 0; 71 | while (1) { 72 | st1[a] = fgetc(stdin); 73 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 74 | st1[a] = 0; 75 | break; 76 | } 77 | a++; 78 | } 79 | if ((!strcmp(st1, "EXIT")) || st1[0] == -1) { 80 | printf("\n"); 81 | break; 82 | } 83 | cn = 0; 84 | b = 0; 85 | c = 0; 86 | while (1) { 87 | st[cn][b] = st1[c]; 88 | b++; 89 | c++; 90 | st[cn][b] = 0; 91 | if (st1[c] == 0) break; 92 | if (st1[c] == ' ') { 93 | cn++; 94 | b = 0; 95 | c++; 96 | } 97 | } 98 | cn++; 99 | if (cn < 3) { 100 | printf("Only %lld words were entered..
three words are needed at the input to perform the calculation\n", cn); 101 | continue; 102 | } 103 | for (a = 0; a < cn; a++) { 104 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 105 | if (b == words) b = 0; 106 | bi[a] = b; 107 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 108 | if (b == 0) { 109 | printf("Out of dictionary word!\n"); 110 | break; 111 | } 112 | } 113 | if (b == 0) continue; 114 | printf("\n Word Distance\n------------------------------------------------------------------------\n"); 115 | for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size]; 116 | len = 0; 117 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 118 | len = sqrt(len); 119 | for (a = 0; a < size; a++) vec[a] /= len; 120 | for (a = 0; a < N; a++) bestd[a] = 0; 121 | for (a = 0; a < N; a++) bestw[a][0] = 0; 122 | for (c = 0; c < words; c++) { 123 | if (c == bi[0]) continue; 124 | if (c == bi[1]) continue; 125 | if (c == bi[2]) continue; 126 | a = 0; 127 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 128 | if (a == 1) continue; 129 | dist = 0; 130 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 131 | for (a = 0; a < N; a++) { 132 | if (dist > bestd[a]) { 133 | for (d = N - 1; d > a; d--) { 134 | bestd[d] = bestd[d - 1]; 135 | strcpy(bestw[d], bestw[d - 1]); 136 | } 137 | bestd[a] = dist; 138 | strcpy(bestw[a], &vocab[c * max_w]); 139 | break; 140 | } 141 | } 142 | } 143 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 144 | } 145 | return 0; 146 | } 147 | -------------------------------------------------------------------------------- /visualization/d3/basque_cluster_thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonsafari/clustercat/e6f618a5f70fe6de5f7c620ccaec22364f954aef/visualization/d3/basque_cluster_thumbnail.png -------------------------------------------------------------------------------- /visualization/d3/french_cluster_thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonsafari/clustercat/e6f618a5f70fe6de5f7c620ccaec22364f954aef/visualization/d3/french_cluster_thumbnail.png -------------------------------------------------------------------------------- /visualization/d3/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 38 | 39 |

Word Clusters 40 | Made using ClusterCat 41 | Click to zoom in/out 42 | 43 | 125 | 126 | Uses D3 127 | Download json data 128 | 129 | -------------------------------------------------------------------------------- /visualization/d3/russian_cluster_thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonsafari/clustercat/e6f618a5f70fe6de5f7c620ccaec22364f954aef/visualization/d3/russian_cluster_thumbnail.png --------------------------------------------------------------------------------
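The array_offset() helper in src/clustercat.h above reads a class n-gram as a little-endian mixed-radix number: position i contributes pointer[i] * num_classes^i. A self-contained sketch of the same arithmetic (hypothetical values, not part of the sources above):

#include <stdio.h>
#include <stddef.h>

int main(void) {
	const size_t num_classes = 256;             // |C|; hypothetical value
	const unsigned short ngram[3] = {3, 7, 11}; // class trigram, least-significant position first
	size_t offset = 0, radix = 1;
	for (int i = 0; i < 3; i++, radix *= num_classes)
		offset += ngram[i] * radix;             // mirrors total_offset += pointer[ptr_i] * powi(num_classes, ptr_i)
	printf("offset = %zu\n", offset);           // 3 + 7*256 + 11*256*256 = 722691
	return 0;
}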