├── .gitattributes ├── .gitignore ├── CHANGES.txt ├── GRADLE.CHEATSHEET ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── build.gradle ├── docs ├── assets │ ├── css │ │ ├── bootstrap.min.css │ │ └── prettify.css │ ├── img │ │ ├── glyphicons-halflings-white.png │ │ ├── glyphicons-halflings.png │ │ ├── mapping.odg │ │ └── mapping.png │ └── js │ │ ├── carrotsearch.circles.js │ │ ├── carrotsearch.foamtree.js │ │ ├── config.js │ │ ├── jquery-2.0.2.min.js │ │ ├── prettify.js │ │ └── sample-data.js ├── curl │ ├── 01-index-data.sh │ ├── 02-simple-clustering.sh │ ├── 03-field-mapping.sh │ └── 04-delete-test-index.sh ├── examples.html └── index.html ├── gradle ├── publishing.gradle ├── validation │ ├── spotless.gradle │ └── spotless │ │ └── source-header.txt └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat └── src ├── javaRestTest ├── java │ └── org │ │ └── carrot2 │ │ └── elasticsearch │ │ ├── ClusteringActionIT.java │ │ ├── ClusteringActionRestIT.java │ │ ├── ListAlgorithmsActionIT.java │ │ ├── MultithreadedClusteringIT.java │ │ ├── SampleIndexTestCase.java │ │ └── TestInfra.java └── resources │ ├── log4j.properties │ └── org │ └── carrot2 │ └── elasticsearch │ ├── _ClusteringActionRestIT │ ├── post_invalid_attribute_value.json │ ├── post_invalid_query.json │ ├── post_language_field.json │ ├── post_multiple_field_mapping.json │ ├── post_nonexistent_algorithmId.json │ ├── post_nonexistent_fields.json │ ├── post_runtime_attributes.json │ ├── post_with_clusters.json │ ├── post_with_fields.json │ ├── post_with_highlighted_fields.json │ └── post_with_source_fields.json │ └── _TestInfra │ └── datamining.json ├── main ├── config │ └── config.yml ├── java │ └── org │ │ └── carrot2 │ │ └── elasticsearch │ │ ├── ClusteringAction.java │ │ ├── ClusteringActionRequest.java │ │ ├── ClusteringActionRequestBuilder.java │ │ ├── ClusteringActionResponse.java │ │ ├── ClusteringActionTransport.java │ │ ├── ClusteringContext.java │ │ ├── 
ClusteringException.java │ │ ├── ClusteringPlugin.java │ │ ├── DocumentGroup.java │ │ ├── FieldMappingSpec.java │ │ ├── FieldSource.java │ │ ├── InputDocument.java │ │ ├── ListAlgorithmsAction.java │ │ ├── LoggerUtils.java │ │ ├── LogicalField.java │ │ ├── OptionalQueryHintSetterVisitor.java │ │ ├── PathResourceLookup.java │ │ ├── Preconditions.java │ │ └── ToString.java └── plugin-metadata │ └── plugin-security.policy └── yamlRestTest ├── java └── org │ └── carrot2 │ └── elasticsearch │ └── ListAlgorithmsActionRestIT.java └── resources └── rest-api-spec ├── api └── algorithms.json └── test └── elasticsearch-carrot2 ├── 00_sanity.yml └── 01_list_algorithms.yml /.gitattributes: -------------------------------------------------------------------------------- 1 | versions.lock text eol=lf 2 | *.gradle text eol=lf 3 | *.java text eol=lf 4 | *.txt text eol=lf 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .DS_Store 3 | .gradle 4 | /.project 5 | /.classpath 6 | /.settings 7 | *.versionsBackup 8 | .local* 9 | build/ 10 | 11 | # idea files 12 | .idea 13 | *.iml 14 | *.ipr 15 | *.iws 16 | 17 | # l3g license for tests. 18 | src/main/config/license.xml 19 | 20 | # eclipse 21 | bin/ 22 | out/ 23 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | 2 | ElasticSearch-Carrot2 Change Log 3 | 4 | ================ master ==================== 5 | 6 | ================ ElasticSearch-Carrot2 7.17.7 =================== 7 | 8 | * ES 7.17.7 compatibility build (manually verified). 9 | * Upgrade to Carrot2 4.3.1. 10 | 11 | ================ ElasticSearch-Carrot2 7.14.1 =================== 12 | 13 | * ES 7.14.1 compatibility build (manually verified). 
14 | 15 | ================ ElasticSearch-Carrot2 7.13.4 =================== 16 | 17 | * ES 7.13.4 compatibility build. 18 | 19 | ================ ElasticSearch-Carrot2 7.12.0 =================== 20 | 21 | * ES 7.12.0 compatibility build. 22 | 23 | * Upgrade to Carrot2 4.2.1. 24 | 25 | ================ ElasticSearch-Carrot2 7.11.2 =================== 26 | 27 | * ES 7.11.2 API changes and compatibility build. 28 | 29 | ================ ElasticSearch-Carrot2 7.10.2 =================== 30 | 31 | * ES 7.10.2 compatibility build. 32 | 33 | ================ ElasticSearch-Carrot2 7.10.1 =================== 34 | 35 | * ES 7.10.1 compatibility build. 36 | 37 | ================ ElasticSearch-Carrot2 7.10.0 =================== 38 | 39 | * ES 7.10.0 compatibility build. 40 | 41 | * Cleanups of build infrastructure. 42 | 43 | ================ ElasticSearch-Carrot2 7.9.3 ==================== 44 | 45 | * ES 7.9.3 compatibility build. 46 | 47 | ================ ElasticSearch-Carrot2 7.9.2 ==================== 48 | 49 | * ES 7.9.2 compatibility build. 50 | 51 | ================ ElasticSearch-Carrot2 7.8.1 ==================== 52 | 53 | * ES 7.8.1 compatibility build. 54 | 55 | * Upgrade to Carrot2 4.0.4. 56 | 57 | ================ ElasticSearch-Carrot2 7.7.1 ==================== 58 | 59 | Other changes 60 | 61 | * Upgrade to Carrot2 4.0.3. Adjust to API changes in language resources. 62 | 63 | * API updates to ES 7.7.1. Build scripts updates. 64 | 65 | ================ ElasticSearch-Carrot2 7.6.0 ==================== 66 | 67 | Backward-incompatible changes 68 | 69 | * Upgrade of Carrot2 to 4.0.0-beta3. This results in a number of backward-incompatible 70 | changes to the plugin. 71 | 72 | 1) The local field for URL (and the corresponding mapping) is gone without replacement. 73 | 2) Names of clustering algorithms have changed. They are now (note white space): 74 | Lingo, STC, Bisecting K-Means, Lingo3G (optional, commercial) 75 | 3) The language field is no longer a two-letter code. 
The list of supported languages 76 | depends on the algorithm and availability of resources and is printed at startup; 77 | (service extension points are used to load algorithms and languages). 78 | 4) Names and structure of attributes for clustering algorithms have changed. They reflect 79 | the new, updated Carrot2 API, see: 80 | https://carrot2.github.io/release/4.0.0-beta3/doc/rest-api-reference/ 81 | 5) Algorithm suites and predefined component descriptors (in XML) have been removed and 82 | are no longer supported. Algorithms are loaded via service extensions. 83 | 6) There are two new request attributes: "language" specifying default clustering 84 | language for documents that don't declare it explicitly and "create_ungrouped" which 85 | forces the plugin to create a synthetic group with unclustered document references 86 | (this corresponds to "other topics" group from previous Carrot2 versions). 87 | 7) Nearly all plugin options have been removed. The single option remaining is "resources" 88 | with an array of ES configuration folder-relative locations where algorithm resources 89 | are looked up. 90 | 8) Default lexical resources from Carrot2 are included under configuration folder and 91 | used by default. 92 | 93 | ================ ElasticSearch-Carrot2 7.5.0 ==================== 94 | 95 | Other changes 96 | 97 | * Build updates to ES 7.5.0. 98 | 99 | ================ ElasticSearch-Carrot2 7.3.2 ==================== 100 | 101 | Other changes 102 | 103 | * Build updates to ES 7.3.2. 104 | 105 | ================ ElasticSearch-Carrot2 7.2.1 ==================== 106 | 107 | Other changes 108 | 109 | * Build updates to ES 7.2.1. 110 | * Switch dependency from simple-xml to simple-xml-safe. 111 | 112 | ================ ElasticSearch-Carrot2 7.2.0 ==================== 113 | 114 | Other changes 115 | 116 | * Build updates to ES 7.2.0. 
117 | 118 | ================ ElasticSearch-Carrot2 7.1.1 ==================== 119 | 120 | Other changes 121 | 122 | * Build updates to ES 7.1.1. 123 | 124 | ================ ElasticSearch-Carrot2 7.0.0 ==================== 125 | 126 | Other changes 127 | 128 | * Build updates to ES 7.0.0. 129 | 130 | ================ ElasticSearch-Carrot2 6.8.5 ==================== 131 | 132 | Other changes 133 | 134 | * Build updates to ES 6.8.5. 135 | 136 | ================ ElasticSearch-Carrot2 6.7.2 ==================== 137 | 138 | Other changes 139 | 140 | * Build updates to ES 6.7.2. 141 | 142 | ================ ElasticSearch-Carrot2 6.7.1 ==================== 143 | 144 | Other changes 145 | 146 | * Build updates to ES 6.7.1. 147 | 148 | ================ ElasticSearch-Carrot2 6.6.2 ==================== 149 | 150 | Other changes 151 | 152 | * Build updates to ES 6.6.2. 153 | 154 | ================ ElasticSearch-Carrot2 6.5.4 ==================== 155 | 156 | Other changes 157 | 158 | * Build updates to ES 6.5.4. 159 | 160 | ================ ElasticSearch-Carrot2 6.4.3 ==================== 161 | 162 | Other changes 163 | 164 | * Build updates to ES 6.4.3. 165 | 166 | * Permit a custom resource lookup location for each algorithm. This can be accomplished 167 | by declaring an 'esplugin.resources' attribute inside a given component's configuration 168 | attributes. For example, changing lingo-attributes.xml in the following way: 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | would cause all algorithm resources to be read from: 180 | 181 | {es-home}/config/elasticsearch-carrot2/lingo-resources 182 | 183 | ================ ElasticSearch-Carrot2 6.3.2 ==================== 184 | 185 | Other changes 186 | 187 | * Build updates to ES 6.3.2. (PR #77, thanks Sergey34). 
188 | 189 | ================ ElasticSearch-Carrot2 6.2.4 ==================== 190 | 191 | Other changes 192 | 193 | * ES security (x-pack) fixes to make it possible to run clustering in secured 194 | ES instances. 195 | 196 | * Upgrade dependency to Carrot2 3.16.0 (and Lingo3G 1.16.0). 197 | 198 | * Build updates to ES 6.2.4. 199 | 200 | ================ ElasticSearch-Carrot2 6.1.1 ==================== 201 | 202 | * Updates to API changes in ES 6.1.1. (PR #74, thanks Tom Chambers!). 203 | 204 | * Updates to API changes in ES 6.0.0. 205 | 206 | * REST tests cleanups and updates. 207 | 208 | ================ ElasticSearch-Carrot2 5.5.2 ==================== 209 | 210 | Other changes 211 | 212 | * Updates to API changes in ES 5.5.2. 213 | 214 | ================ ElasticSearch-Carrot2 5.4.0 ==================== 215 | 216 | Other changes 217 | 218 | * Updates to API changes in ES 5.4.0. 219 | 220 | ================ ElasticSearch-Carrot2 5.3.0 ==================== 221 | 222 | Other changes 223 | 224 | * Updates to API changes in ES 5.3.0. 225 | 226 | * Documentation fixes (location of resources). 227 | 228 | * Added slf4j-log4j12 logging redirector as a default dependency. 229 | 230 | * Updated FoamTree and Circles to their newest versions. 231 | 232 | * Added logging configuration to silence permission denied exceptions and 233 | other non-critical information from C2. 234 | 235 | ================ ElasticSearch-Carrot2 5.2.0 ==================== 236 | 237 | Other changes 238 | 239 | * Updates to API changes in ES 5.2.0. 240 | 241 | * GH-60: Upgrade dependency to Carrot2 3.15.1 (and Lingo3G 1.15.1). 242 | 243 | ================ ElasticSearch-Carrot2 5.1.1 ==================== 244 | 245 | Other changes 246 | 247 | * The first ES version 5.x-compatible release. Moved to Gradle build system from Maven. 248 | (PR #57, thanks Utkarsh!). 249 | 250 | * Java 8 is from now on required. 251 | 252 | * Fields will not work unless explicitly stored. Use source filtering instead. 
253 | https://www.elastic.co/guide/en/elasticsearch/reference/5.1/search-request-stored-fields.html 254 | https://www.elastic.co/guide/en/elasticsearch/reference/5.1/search-request-source-filtering.html 255 | 256 | * Updated documentation links and content. 257 | 258 | * Removed the demo _site part of the plugin. ES 5.1.1 onwards there is no 259 | support for site plugins. 260 | 261 | ================ ElasticSearch-Carrot2 2.4.3 ==================== 262 | 263 | No changes (ES version compatibility release). 264 | 265 | ================ ElasticSearch-Carrot2 2.4.2 ==================== 266 | 267 | No changes (ES version compatibility release). 268 | 269 | ================ ElasticSearch-Carrot2 2.4.1.1 ================== 270 | 271 | * GH-53: Upgrade to C2 3.15.0 and L3G 1.15.0. This is a compatibility 272 | release. It will work with ES 2.4.1. 273 | 274 | ================ ElasticSearch-Carrot2 2.4.1 ================== 275 | 276 | * GH-50: Upgrade to C2 3.14.0 and L3G 1.14.0. 277 | 278 | ================ ElasticSearch-Carrot2 2.4.0.1 ================== 279 | 280 | * GH-49: Wrong version of morfologik stemming shipped (2.0.1 281 | instead of 2.1.0). This is a bugfix release, it will work 282 | with ES 2.4.0. 283 | 284 | ================ ElasticSearch-Carrot2 2.4.0 ==================== 285 | 286 | * GH-47: Method renamed in ES (compatibility release). 287 | 288 | ================ ElasticSearch-Carrot2 2.3.4 ==================== 289 | 290 | No changes (ES version compatibility release). 291 | 292 | ================ ElasticSearch-Carrot2 2.3.2 ==================== 293 | 294 | No changes (ES version compatibility release). 295 | 296 | ================ ElasticSearch-Carrot2 2.3.0 ==================== 297 | 298 | Other changes 299 | 300 | * ES version/ API compatibility release. 301 | 302 | * Visualizations in the docs display proper resolution on high 303 | DPI screens. 
304 | 305 | ================ ElasticSearch-Carrot2 2.2.1 ==================== 306 | 307 | Other changes 308 | 309 | * GH-40: Upgrade to C2 3.12.0 and L3G 1.13.0. 310 | 311 | * GH-38: Array of fields cannot be used. (Christophe Quintard via Dawid Weiss). 312 | 313 | ================ ElasticSearch-Carrot2 2.2.0 ==================== 314 | 315 | Other changes 316 | 317 | * GH-37: Added workarounds for tighter security checks in ES 2.2.0. 318 | The plugin from now on requires custom relaxed policy which has 319 | to be manually approved during startup. 320 | 321 | ================ ElasticSearch-Carrot2 2.1.2 ==================== 322 | 323 | No changes (ES version compatibility release). 324 | 325 | ================ ElasticSearch-Carrot2 2.1.1 ==================== 326 | 327 | No changes (ES version compatibility release). 328 | 329 | ================ ElasticSearch-Carrot2 2.1.0 ==================== 330 | 331 | No changes (ES version compatibility release). 332 | 333 | ================ ElasticSearch-Carrot2 2.0.2 ==================== 334 | 335 | No changes (ES version compatibility release). 336 | 337 | ================ ElasticSearch-Carrot2 2.0.1 ==================== 338 | 339 | No changes (ES version compatibility release). 340 | 341 | ================ ElasticSearch-Carrot2 2.0.0 ==================== 342 | 343 | Other changes 344 | 345 | * GH-29: Upgrade to ES 2.0.0 (official). 346 | 347 | ================ ElasticSearch-Carrot2 2.0.0-rc1 ================ 348 | 349 | Changes in Backwards Compatibility 350 | 351 | * GH-23: Plugin code restructured for ES 2.0.0. Versioning 352 | scheme will now follow ElasticSearch strictly since starting 353 | from version 2.0.0 plugins must declare an (exact) version 354 | of ES they were compiled against. 355 | 356 | * GH-25: Plugin configuration files have been moved to reside 357 | inside the plugin. They should be automatically installed 358 | (copied) to ElasticSearch's config/ folder, where they can 359 | be tweaked. 
360 | 361 | Other changes 362 | 363 | * GH-28: Add support for es/config and plugin/config relative 364 | license locations in Lingo3G 365 | 366 | * GH-27: Add dependency on morfologik-stemming. 367 | 368 | * GH-24: Update JS visualizations in plugin documentation. 369 | 370 | * Dropped support for plugin branches 1.7.x and 1.8.x (ES 1.3.x and 371 | 1.4.x). 372 | 373 | ================ ElasticSearch-Carrot2 1.9.1 ================ 374 | 375 | Changes in Backwards Compatibility 376 | 377 | * Dependency update to Carrot2 3.10.4 (and Lingo3G 1.12.3). 378 | 379 | ================ ElasticSearch-Carrot2 1.9.0 ================ 380 | 381 | Changes in Backwards Compatibility 382 | 383 | * Dependency update to ES 1.6.0. 384 | 385 | * Dependency update to Carrot2 3.10.1 (and Lingo3G 1.12.0). 386 | 387 | * Dependency updates (test libraries). 388 | 389 | ================ ElasticSearch-Carrot2 1.8.0 ================ 390 | 391 | Changes in Backwards Compatibility 392 | 393 | * Dependency update to ES 1.4.0. 394 | 395 | Other changes 396 | 397 | * Dropped support branch for ES 1.1.x. 398 | 399 | ================ ElasticSearch-Carrot2 1.7.0 ================ 400 | 401 | Changes in Backwards Compatibility 402 | 403 | * Dependency update to ES 1.3.0. 404 | 405 | ================ ElasticSearch-Carrot2 1.6.1 ================ 406 | 407 | Changes in Backwards Compatibility 408 | 409 | * Dependency update to ES 1.2.2. 410 | 411 | * Dependency update to Carrot2 3.9.3 (and Lingo3G 1.10.0) 412 | 413 | * Demo visualizations updated to their newest version. 414 | 415 | ================ ElasticSearch-Carrot2 1.6.0 ================ 416 | 417 | Changes in Backwards Compatibility 418 | 419 | * Compatibility update to ES 1.2.0. 420 | 421 | ================ ElasticSearch-Carrot2 1.5.0 ================ 422 | 423 | Changes in Backwards Compatibility 424 | 425 | * Compatibility upgrade to Carrot2 3.9.2 and Lingo3G 1.9.1. 
426 | 427 | ================ ElasticSearch-Carrot2 1.4.0 ================ 428 | 429 | Changes in Backwards Compatibility 430 | 431 | * Compatibility upgrade to Carrot2 3.9.0 and Lingo3G 1.9.0. 432 | 433 | * include_hits parameter is now deprecated in favor of setting 434 | max_hits to 0. include_hits = false will act as an alias of 435 | setting max_hits to 0. 436 | 437 | New Features 438 | 439 | * GH-9: Add a more flexible limit of the set of returned search results. 440 | 441 | ================ ElasticSearch-Carrot2 1.3.1 ================ 442 | 443 | New features 444 | 445 | * Added an option to return only cluster labels (omit search hits 446 | in the response). [thanks @kielni] 447 | 448 | ================ ElasticSearch-Carrot2 1.3.0 ================ 449 | 450 | Changes in Backwards Compatibility 451 | 452 | * GH-6: Fixes compatibility issues to work with elasticsearch-1.0.0. 453 | 454 | ================ ElasticSearch-Carrot2 1.2.2 ================ 455 | 456 | New features 457 | 458 | * Added an option to return only cluster labels (omit search hits 459 | in the response). [thanks @kielni] 460 | 461 | ================ ElasticSearch-Carrot2 1.2.1 ================ 462 | 463 | Changes in Backwards Compatibility 464 | 465 | * GH-4: Compatibility issue with elasticsearch-0.90.10 (this 466 | release will not work for ES < 0.90.10). 467 | 468 | Bug Fixes 469 | 470 | * GH-4: Compatibility issue with elasticsearch-0.90.10 471 | 472 | ================ ElasticSearch-Carrot2 1.2.0 ================ 473 | 474 | New Features 475 | 476 | * Added an action to return all available clustering algorithms. 477 | 478 | * Added support for search-and-cluster requests using HTTP GET (with a subset 479 | of all the functionality but still useful). 480 | 481 | Bug Fixes 482 | 483 | * Proper propagation of ProcessingExceptions from Carrot2 framework (in case 484 | they happen due to, for example, invalid attribute values). 
485 | 486 | Other 487 | 488 | * Added tests for invalid attribute values and proper responses in such 489 | case (API, REST). 490 | 491 | * Refactoring of API classes (nesting classes under ClusteringAction, similar pattern 492 | will follow for all future *Action implementations). 493 | 494 | * Improved tests (test requests are converted to all XContentTypes). 495 | 496 | * Restructured the plugin documentation, added table of contents and 497 | sections for each REST API method. 498 | 499 | * Minor touches for incompatible ES API changes (removed methods in RestActions). 500 | 501 | ================ ElasticSearch-Carrot2 1.1.1 ================ 502 | 503 | Other 504 | 505 | * Minor touches for incompatible ES API changes (removed methods in RestActions). 506 | 507 | ================ ElasticSearch-Carrot2 1.1.0 ================ 508 | 509 | New features 510 | 511 | * The size of the clustering controller's component pool is adjustable using 512 | the configuration file (controller.pool-size option). 513 | 514 | * Added language field mapping descriptors. This helps with multi-lingual 515 | clustering -- hints about each document's language can be stored together 516 | with the document in the index. See the documentation for examples. 517 | 518 | Bug Fixes 519 | 520 | * GH-1: Front-page example visualizations fail with Lingo3G. 521 | 522 | Other 523 | 524 | * Update base ES dependency to 0.90.2 525 | 526 | * Verified threading (clustering runs on the search thread), added 527 | simple stress testing. 528 | 529 | * Added information about search threadpool use and potential tweaks to 530 | its default size to examples.html 531 | 532 | ================ ElasticSearch-Carrot2 1.0.1 ================ 533 | 534 | Bug Fixes 535 | 536 | * GH-1: Front-page example visualizations fail with Lingo3G. 537 | 538 | ================ ElasticSearch-Carrot2 1.0.0 ================ 539 | 540 | First open source release. 
541 | -------------------------------------------------------------------------------- /GRADLE.CHEATSHEET: -------------------------------------------------------------------------------- 1 | 2 | # Publishing. 3 | # Remember to set up ~/.gradle/gradle.properties: 4 | 5 | nexusUsername= 6 | nexusPassword= 7 | signing.keyId= 8 | signing.password= 9 | signing.secretKeyRingFile=.../secring.gpg 10 | 11 | # then: 12 | gradlew clean publishToSonatype --max-workers 1 13 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2013-2017 Carrot Search s.c. 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Elasticsearch-Carrot2 2 | Copyright 2013-2021 Carrot Search s.c. 3 | 4 | This product includes software developed by The Apache Software 5 | Foundation (http://www.apache.org/). 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Search results clustering for ElasticSearch 2 | =========================================== 3 | 4 | This clustering plugin adds on-the-fly text clustering capability 5 | to an ElasticSearch node. Clustering algorithms from the 6 | [Carrot2](https://github.com/carrot2/carrot2) project (open source) 7 | are used by default. Proprietary [Lingo3G algorithm](https://carrotsearch.com/lingo3g/) 8 | from [Carrot Search](https://carrotsearch.com) can also be used 9 | via an [extension plugin](https://github.com/carrotsearch/elasticsearch-lingo3g). 
10 | 11 | 12 | Installation 13 | ------------ 14 | 15 | In order to install a stable version of the plugin, 16 | run ElasticSearch's `plugin` utility (remember to pick the 17 | ES-compatible version of the plugin from the table below!). 18 | 19 | bin/elasticsearch-plugin install org.carrot2:elasticsearch-carrot2:7.17.7 20 | 21 | To install from sources (master branch), run: 22 | 23 | ./gradlew clean build 24 | 25 | then install with (use the full, absolute path): 26 | 27 | Linux: 28 | bin/elasticsearch-plugin install file:/.../(plugin)/build/distributions/*.zip 29 | 30 | Windows: 31 | bin\elasticsearch-plugin install file:///c:/.../(plugin)/build/distributions/*.zip 32 | 33 | If plugin installation shows a popup to request additional 34 | permissions, you have to accept it. Plugin versions for Carrot2 4.0.0+ do not 35 | require any extra permissions. 36 | 37 | 38 | Usage 39 | ----- 40 | 41 | To play with the examples in the documentation, you'll have to allow 42 | CORS requests from null (if opened directly) or localhost (if served 43 | by some local HTTP server). Add the following to ES/config/elasticsearch.yml: 44 | 45 | ``` 46 | # Allow localhost cross-origin requests. 
 47 | http.cors.enabled: true 48 | http.cors.allow-origin: /(null)|(https?:\/\/localhost(:[0-9]+)?)|(https?:\/\/carrot2\.github\.io(:[0-9]+)?)/ 49 | ``` 50 | 51 | More information about the security implications of enabling CORS is available here: 52 | https://www.elastic.co/guide/en/elasticsearch/reference/5.0/modules-http.html 53 | 54 | Finally, start ES and open up the documentation in your browser 55 | (can be opened as a file resource): 56 | 57 | (plugin sources)/docs/index.html 58 | 59 | The above regular expression enables CORS headers from github, so you can open 60 | the documentation directly from there: 61 | 62 | https://carrot2.github.io/elasticsearch-carrot2/ 63 | 64 | CURL request examples are available here: 65 | 66 | https://github.com/carrot2/elasticsearch-carrot2/tree/master/docs/curl/ 67 | 68 | 69 | Versions and compatibility 70 | -------------------------- 71 | 72 | Recommended compatibility chart (matching versions of ES, Carrot2, 73 | and optionally Lingo3G). (+) means it'll probably work with newer 74 | releases (we test against the latest version from that branch). 75 | 76 | The plugin is compiled against *an exact* version of ES 77 | and *will not work* with any other version. The numbering of the plugin 78 | will always correspond to the numbering of ES to easily identify 79 | the version of ES the plugin will work with. The only exceptions to this rule 80 | will be critical bugfixes, which will have a fourth version number: then 81 | the first three numbers denote the ES release the plugin is compiled against. 82 | 83 | If you need a point version that has not been released (yet or skipped), 84 | then update the ES version in the build script (build.gradle) and recompile from sources; 85 | this will yield a binary version of the plugin compatible with the 86 | given ES version. 87 | 88 | Lingo3G 2.x will be supported via a separate 89 | [extension plugin](https://github.com/carrotsearch/elasticsearch-lingo3g). 
90 | 91 | | Clustering Plugin | Elasticsearch | Carrot2 | Lingo3G | 92 | |-------------------| --- | --- | --- | 93 | | (master branch) | | 4.3.1 | ext-plugin | 94 | | 7.17.7 → | | 4.3.1 | ext-plugin | 95 | | 7.14.1 → | | 4.3.1 | ext-plugin | 96 | | 7.13.4 → | | 4.3.1 | ext-plugin | 97 | | 7.12.0 → | | 4.2.1 | ext-plugin | 98 | 99 | Discontinued version branches: 100 | 101 | | Clustering Plugin | Elasticsearch | Carrot2 | Lingo3G | 102 | | --- | --- | --- | --- | 103 | | 7.11.2 → | | 4.0.4 | ext-plugin | 104 | | 7.10.2.2 | 7.10.2 | 4.3.1 | ext-plugin | 105 | | 7.10.0 → 7.10.2 | | 4.0.4 | ext-plugin | 106 | | 7.9.2 → 7.9.3 | | 4.0.4 | ext-plugin | 107 | | 7.8.1 | | 4.0.4 | ext-plugin | 108 | | 7.7.1 | | 4.0.4 | ext-plugin | 109 | | 7.6.0.1 | 7.6.0 | 4.0.0-beta3 | not supported | 110 | | 7.6.0 | | 4.0.0-beta3 | not supported | 111 | | 7.0.0 → 7.5.0 | | 3.16.1 | 1.16.1 | 112 | | 6.8.5 → 6.8.9 | | 3.16.1 | 1.16.1 | 113 | | 6.7.1 | | 3.16.1 | 1.16.1 | 114 | | 6.6.2 | | 3.16.1 | 1.16.1 | 115 | | 6.5.4 | | 3.16.0 | 1.16.0 | 116 | | 6.4.3 | | 3.16.0 | 1.16.0 | 117 | | 6.3.2 | | 3.16.0 | 1.16.0 | 118 | | 6.2.4 | | 3.16.0 | 1.16.0 | 119 | | 6.2.3 | | 3.15.1 | 1.15.1 | 120 | | 6.1.1 | | 3.15.1 | 1.15.1 | 121 | | 5.5.2 | | 3.15.1 | 1.15.1 | 122 | | 5.4.0 | | 3.15.1 | 1.15.1 | 123 | | 5.3.0 | | 3.15.1 | 1.15.1 | 124 | | 5.2.0 | | 3.15.1 | 1.15.1 | 125 | | 5.1.1 | | 3.15.0 | 1.15.0 | 126 | | 2.4.2 → 2.4.3 | | 3.15.0 | 1.15.0 | 127 | | 2.4.1.1 | | 3.15.0 | 1.15.0 | 128 | | 2.4.1 → 2.4.1 | | 3.14.0 | 1.14.0 | 129 | | 2.4.0 → 2.4.0.1 | | 3.12.0 | 1.13.0 | 130 | | 2.3.0 → 2.3.4 | | 3.12.0 | 1.13.0 | 131 | | 2.2.1 | | 3.12.0 | 1.13.0 | 132 | | 2.2.0 | | 3.11.0 | 1.12.3 | 133 | | 2.1.0 → 2.1.2 | | 3.11.0 | 1.12.3 | 134 | | 2.0.0 → 2.0.2 | | 3.11.0 | 1.12.3 | 135 | | 1.9.1 | 1.6.0 → 1.7.2+? | 3.10.4 | 1.12.3 | 136 | | 1.9.0 | 1.6.0 → 1.7.0+? 
| 3.10.1 | 1.12.0 | 137 | | 1.8.0 | 1.4.0 → 1.6.0+ | 3.9.3 | 1.10.0 | 138 | | 1.7.0 | 1.3.0 → 1.3.5+ | 3.9.3 | 1.10.0 | 139 | | 1.6.0 | 1.2.0 → 1.2.2+ | 3.9.2 | 1.9.1 | 140 | | 1.5.0 | 1.1.0 → 1.1.2+ | 3.9.2 | 1.9.1 | 141 | | 1.4.0 | 1.0.0 → 1.0.3 | 3.9.0 | 1.9.0 | 142 | | 1.3.1 | 1.0.0 → 1.0.3 | 3.8.1 | 1.8.1 | 143 | | 1.3.0 | 1.0.0 → 1.0.3 | 3.8.1 | 1.8.1 | 144 | | 1.2.2 | 0.90.10→ 0.90.13 | 3.8.0 | 1.8.0 | 145 | | 1.2.1 | 0.90.10→ 0.90.11 | 3.8.0 | 1.8.0 | 146 | | 1.2.0 | 0.90.4 → 0.90.9 | 3.8.0 | 1.8.0 | 147 | | 1.1.1 | 0.90.4 → 0.90.9 | 3.8.0 | 1.8.0 | 148 | | 1.1.0 | 0.90.2 → 0.90.3 | 3.8.0 | 1.8.0 | 149 | | 1.0.1 | 0.90 → 0.90.3 | 3.7.1 | 1.7.1 | 150 | | 1.0.0 | 0.90 → 0.90.3 | 3.7.1 | 1.7.1 | 151 | 152 | License 153 | ------- 154 | 155 | This software is licensed under the Apache 2 license. Full text 156 | of the license is in the repository (`LICENSE.txt`). 157 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | buildscript { 2 | ext { 3 | localversions = [ 4 | es : "7.17.7", 5 | c2 : "4.3.1" 6 | ] 7 | } 8 | 9 | repositories { 10 | mavenCentral() 11 | } 12 | 13 | dependencies { 14 | classpath "org.elasticsearch.gradle:build-tools:${localversions.es}" 15 | } 16 | } 17 | 18 | plugins { 19 | id 'java-library' 20 | id 'idea' 21 | id 'com.diffplug.spotless' version "5.8.2" apply false 22 | } 23 | 24 | apply plugin: 'elasticsearch.esplugin' 25 | apply plugin: 'elasticsearch.java-rest-test' 26 | apply plugin: 'elasticsearch.yaml-rest-test' 27 | 28 | apply from: file('gradle/validation/spotless.gradle') 29 | 30 | // This plugin's version (typically must match that of ES). 31 | // For bugfix releases against the same ES version, you can add a bugfix suffix. 
32 | def bugfix = "" 33 | version = "${localversions.es}${bugfix}" 34 | group = 'org.carrot2' 35 | 36 | repositories { 37 | mavenLocal() 38 | mavenCentral() 39 | } 40 | 41 | ext { 42 | licenseFile = rootProject.file('LICENSE.txt') 43 | noticeFile = rootProject.file('NOTICE.txt') 44 | } 45 | 46 | esplugin { 47 | name 'elasticsearch-carrot2' 48 | description "Search results clustering plugin for Elasticsearch ${localversions.es} (Carrot2 ${localversions.c2})" 49 | classname 'org.carrot2.elasticsearch.ClusteringPlugin' 50 | } 51 | 52 | configurations { 53 | c2resources 54 | } 55 | 56 | dependencies { 57 | c2resources("org.carrot2:carrot2-core:${localversions.c2}", { 58 | transitive false 59 | }) 60 | 61 | api("org.carrot2:carrot2-core:${localversions.c2}", { 62 | exclude group: "com.carrotsearch", module: "hppc" 63 | }) 64 | 65 | testImplementation "org.assertj:assertj-core:3.13.2" 66 | 67 | // Let the javaRestTest see the classpath of main and tests. 68 | javaRestTestImplementation project.sourceSets.main.runtimeClasspath 69 | javaRestTestImplementation project.sourceSets.test.runtimeClasspath 70 | 71 | // TODO: ES 7.10.0-7.17.7 hack: missing log4j classes? 72 | yamlRestTestRuntimeClasspath "org.apache.logging.log4j:log4j-core:2.17.1" 73 | } 74 | 75 | // Set target compatibility 76 | sourceCompatibility = 11 77 | targetCompatibility = 11 78 | 79 | // We don't have unit tests, only integration tests. 80 | test.enabled = false 81 | 82 | // Add plugin configuration files to each testClusters instance. 83 | // 'extraConfigFile' doesn't allow directories, only files, so we 84 | // need to add each individually 85 | testClusters.all { 86 | fileTree(dir: 'src/main/config').each { file -> 87 | extraConfigFile 'elasticsearch-carrot2/' + file.name, file 88 | } 89 | } 90 | 91 | // Unpack and bundle the default resources with the plugin. 
92 | bundlePlugin { 93 | from({ zipTree(configurations.c2resources.singleFile).matching { include "**/*.utf8" } }, { 94 | eachFile { fcd -> 95 | fcd.path -= "org/carrot2/language/" 96 | } 97 | includeEmptyDirs = false 98 | into 'config' 99 | }) 100 | } 101 | 102 | // Configure publishing. 103 | apply from: file('gradle/publishing.gradle') 104 | 105 | // TODO: ES 7.13.4 hack. We generate our own POM. 106 | tasks.matching { it.path == ":validateElasticPom" }.all { Task t -> 107 | enabled = false 108 | } -------------------------------------------------------------------------------- /docs/assets/css/prettify.css: -------------------------------------------------------------------------------- 1 | .com { color: #93a1a1; } 2 | .lit { color: #195f91; } 3 | .pun, .opn, .clo { color: #93a1a1; } 4 | .fun { color: #dc322f; } 5 | .str, .atv { color: #D14; } 6 | .kwd, .prettyprint .tag { color: #1e347b; } 7 | .typ, .atn, .dec, .var { color: teal; } 8 | .pln { color: #48484c; } 9 | 10 | .prettyprint { 11 | padding: 8px; 12 | background-color: #f7f7f9; 13 | border: 1px solid #e1e1e8; 14 | } 15 | .prettyprint.linenums { 16 | -webkit-box-shadow: inset 40px 0 0 #fbfbfc, inset 41px 0 0 #ececf0; 17 | -moz-box-shadow: inset 40px 0 0 #fbfbfc, inset 41px 0 0 #ececf0; 18 | box-shadow: inset 40px 0 0 #fbfbfc, inset 41px 0 0 #ececf0; 19 | } 20 | 21 | /* Specify class=linenums on a pre to get line numbering */ 22 | ol.linenums { 23 | margin: 0 0 0 33px; /* IE indents via margin-left */ 24 | } 25 | ol.linenums li { 26 | padding-left: 12px; 27 | color: #bebec5; 28 | line-height: 20px; 29 | text-shadow: 0 1px 0 #fff; 30 | } -------------------------------------------------------------------------------- /docs/assets/img/glyphicons-halflings-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carrot2/elasticsearch-carrot2/b98a04b9980ee82a77a6c6e2e4ea7380d578b8d1/docs/assets/img/glyphicons-halflings-white.png 
-------------------------------------------------------------------------------- /docs/assets/img/glyphicons-halflings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carrot2/elasticsearch-carrot2/b98a04b9980ee82a77a6c6e2e4ea7380d578b8d1/docs/assets/img/glyphicons-halflings.png -------------------------------------------------------------------------------- /docs/assets/img/mapping.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carrot2/elasticsearch-carrot2/b98a04b9980ee82a77a6c6e2e4ea7380d578b8d1/docs/assets/img/mapping.odg -------------------------------------------------------------------------------- /docs/assets/img/mapping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carrot2/elasticsearch-carrot2/b98a04b9980ee82a77a6c6e2e4ea7380d578b8d1/docs/assets/img/mapping.png -------------------------------------------------------------------------------- /docs/assets/js/config.js: -------------------------------------------------------------------------------- 1 | 2 | // Declare ES's address globally. 
3 | window.ES_URL="http://localhost:9200"; -------------------------------------------------------------------------------- /docs/assets/js/prettify.js: -------------------------------------------------------------------------------- 1 | var q=null;window.PR_SHOULD_USE_CONTINUATION=!0; 2 | (function(){function L(a){function m(a){var f=a.charCodeAt(0);if(f!==92)return f;var b=a.charAt(1);return(f=r[b])?f:"0"<=b&&b<="7"?parseInt(a.substring(1),8):b==="u"||b==="x"?parseInt(a.substring(2),16):a.charCodeAt(1)}function e(a){if(a<32)return(a<16?"\\x0":"\\x")+a.toString(16);a=String.fromCharCode(a);if(a==="\\"||a==="-"||a==="["||a==="]")a="\\"+a;return a}function h(a){for(var f=a.substring(1,a.length-1).match(/\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\[0-3][0-7]{0,2}|\\[0-7]{1,2}|\\[\S\s]|[^\\]/g),a= 3 | [],b=[],o=f[0]==="^",c=o?1:0,i=f.length;c122||(d<65||j>90||b.push([Math.max(65,j)|32,Math.min(d,90)|32]),d<97||j>122||b.push([Math.max(97,j)&-33,Math.min(d,122)&-33]))}}b.sort(function(a,f){return a[0]-f[0]||f[1]-a[1]});f=[];j=[NaN,NaN];for(c=0;ci[0]&&(i[1]+1>i[0]&&b.push("-"),b.push(e(i[1])));b.push("]");return b.join("")}function y(a){for(var f=a.source.match(/\[(?:[^\\\]]|\\[\S\s])*]|\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\\d+|\\[^\dux]|\(\?[!:=]|[()^]|[^()[\\^]+/g),b=f.length,d=[],c=0,i=0;c=2&&a==="["?f[c]=h(j):a!=="\\"&&(f[c]=j.replace(/[A-Za-z]/g,function(a){a=a.charCodeAt(0);return"["+String.fromCharCode(a&-33,a|32)+"]"}));return f.join("")}for(var t=0,s=!1,l=!1,p=0,d=a.length;p=5&&"lang-"===b.substring(0,5))&&!(o&&typeof o[1]==="string"))c=!1,b="src";c||(r[f]=b)}i=d;d+=f.length;if(c){c=o[1];var j=f.indexOf(c),k=j+c.length;o[2]&&(k=f.length-o[2].length,j=k-c.length);b=b.substring(5);B(l+i,f.substring(0,j),e,p);B(l+i+j,c,C(b,c),p);B(l+i+k,f.substring(k),e,p)}else p.push(l+i,b)}a.e=p}var h={},y;(function(){for(var e=a.concat(m), 9 | l=[],p={},d=0,g=e.length;d=0;)h[n.charAt(k)]=r;r=r[1];n=""+r;p.hasOwnProperty(n)||(l.push(r),p[n]=q)}l.push(/[\S\s]/);y=L(l)})();var 
t=m.length;return e}function u(a){var m=[],e=[];a.tripleQuotedStrings?m.push(["str",/^(?:'''(?:[^'\\]|\\[\S\s]|''?(?=[^']))*(?:'''|$)|"""(?:[^"\\]|\\[\S\s]|""?(?=[^"]))*(?:"""|$)|'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$))/,q,"'\""]):a.multiLineStrings?m.push(["str",/^(?:'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$)|`(?:[^\\`]|\\[\S\s])*(?:`|$))/, 10 | q,"'\"`"]):m.push(["str",/^(?:'(?:[^\n\r'\\]|\\.)*(?:'|$)|"(?:[^\n\r"\\]|\\.)*(?:"|$))/,q,"\"'"]);a.verbatimStrings&&e.push(["str",/^@"(?:[^"]|"")*(?:"|$)/,q]);var h=a.hashComments;h&&(a.cStyleComments?(h>1?m.push(["com",/^#(?:##(?:[^#]|#(?!##))*(?:###|$)|.*)/,q,"#"]):m.push(["com",/^#(?:(?:define|elif|else|endif|error|ifdef|include|ifndef|line|pragma|undef|warning)\b|[^\n\r]*)/,q,"#"]),e.push(["str",/^<(?:(?:(?:\.\.\/)*|\/?)(?:[\w-]+(?:\/[\w-]+)+)?[\w-]+\.h|[a-z]\w*)>/,q])):m.push(["com",/^#[^\n\r]*/, 11 | q,"#"]));a.cStyleComments&&(e.push(["com",/^\/\/[^\n\r]*/,q]),e.push(["com",/^\/\*[\S\s]*?(?:\*\/|$)/,q]));a.regexLiterals&&e.push(["lang-regex",/^(?:^^\.?|[!+-]|!=|!==|#|%|%=|&|&&|&&=|&=|\(|\*|\*=|\+=|,|-=|->|\/|\/=|:|::|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|[?@[^]|\^=|\^\^|\^\^=|{|\||\|=|\|\||\|\|=|~|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\s*(\/(?=[^*/])(?:[^/[\\]|\\[\S\s]|\[(?:[^\\\]]|\\[\S\s])*(?:]|$))+\/)/]);(h=a.types)&&e.push(["typ",h]);a=(""+a.keywords).replace(/^ | $/g, 12 | "");a.length&&e.push(["kwd",RegExp("^(?:"+a.replace(/[\s,]+/g,"|")+")\\b"),q]);m.push(["pln",/^\s+/,q," \r\n\t\xa0"]);e.push(["lit",/^@[$_a-z][\w$@]*/i,q],["typ",/^(?:[@_]?[A-Z]+[a-z][\w$@]*|\w+_t\b)/,q],["pln",/^[$_a-z][\w$@]*/i,q],["lit",/^(?:0x[\da-f]+|(?:\d(?:_\d+)*\d*(?:\.\d*)?|\.\d\+)(?:e[+-]?\d+)?)[a-z]*/i,q,"0123456789"],["pln",/^\\[\S\s]?/,q],["pun",/^.[^\s\w"-$'./@\\`]*/,q]);return x(m,e)}function D(a,m){function e(a){switch(a.nodeType){case 1:if(k.test(a.className))break;if("BR"===a.nodeName)h(a), 13 | a.parentNode&&a.parentNode.removeChild(a);else 
for(a=a.firstChild;a;a=a.nextSibling)e(a);break;case 3:case 4:if(p){var b=a.nodeValue,d=b.match(t);if(d){var c=b.substring(0,d.index);a.nodeValue=c;(b=b.substring(d.index+d[0].length))&&a.parentNode.insertBefore(s.createTextNode(b),a.nextSibling);h(a);c||a.parentNode.removeChild(a)}}}}function h(a){function b(a,d){var e=d?a.cloneNode(!1):a,f=a.parentNode;if(f){var f=b(f,1),g=a.nextSibling;f.appendChild(e);for(var h=g;h;h=g)g=h.nextSibling,f.appendChild(h)}return e} 14 | for(;!a.nextSibling;)if(a=a.parentNode,!a)return;for(var a=b(a.nextSibling,0),e;(e=a.parentNode)&&e.nodeType===1;)a=e;d.push(a)}var k=/(?:^|\s)nocode(?:\s|$)/,t=/\r\n?|\n/,s=a.ownerDocument,l;a.currentStyle?l=a.currentStyle.whiteSpace:window.getComputedStyle&&(l=s.defaultView.getComputedStyle(a,q).getPropertyValue("white-space"));var p=l&&"pre"===l.substring(0,3);for(l=s.createElement("LI");a.firstChild;)l.appendChild(a.firstChild);for(var d=[l],g=0;g=0;){var h=m[e];A.hasOwnProperty(h)?window.console&&console.warn("cannot override language handler %s",h):A[h]=a}}function C(a,m){if(!a||!A.hasOwnProperty(a))a=/^\s*=o&&(h+=2);e>=c&&(a+=2)}}catch(w){"console"in window&&console.log(w&&w.stack?w.stack:w)}}var v=["break,continue,do,else,for,if,return,while"],w=[[v,"auto,case,char,const,default,double,enum,extern,float,goto,int,long,register,short,signed,sizeof,static,struct,switch,typedef,union,unsigned,void,volatile"], 18 | "catch,class,delete,false,import,new,operator,private,protected,public,this,throw,true,try,typeof"],F=[w,"alignof,align_union,asm,axiom,bool,concept,concept_map,const_cast,constexpr,decltype,dynamic_cast,explicit,export,friend,inline,late_check,mutable,namespace,nullptr,reinterpret_cast,static_assert,static_cast,template,typeid,typename,using,virtual,where"],G=[w,"abstract,boolean,byte,extends,final,finally,implements,import,instanceof,null,native,package,strictfp,super,synchronized,throws,transient"], 19 | 
H=[G,"as,base,by,checked,decimal,delegate,descending,dynamic,event,fixed,foreach,from,group,implicit,in,interface,internal,into,is,lock,object,out,override,orderby,params,partial,readonly,ref,sbyte,sealed,stackalloc,string,select,uint,ulong,unchecked,unsafe,ushort,var"],w=[w,"debugger,eval,export,function,get,null,set,undefined,var,with,Infinity,NaN"],I=[v,"and,as,assert,class,def,del,elif,except,exec,finally,from,global,import,in,is,lambda,nonlocal,not,or,pass,print,raise,try,with,yield,False,True,None"], 20 | J=[v,"alias,and,begin,case,class,def,defined,elsif,end,ensure,false,in,module,next,nil,not,or,redo,rescue,retry,self,super,then,true,undef,unless,until,when,yield,BEGIN,END"],v=[v,"case,done,elif,esac,eval,fi,function,in,local,set,then,until"],K=/^(DIR|FILE|vector|(de|priority_)?queue|list|stack|(const_)?iterator|(multi)?(set|map)|bitset|u?(int|float)\d*)/,N=/\S/,O=u({keywords:[F,H,w,"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END"+ 21 | I,J,v],hashComments:!0,cStyleComments:!0,multiLineStrings:!0,regexLiterals:!0}),A={};k(O,["default-code"]);k(x([],[["pln",/^[^]*(?:>|$)/],["com",/^<\!--[\S\s]*?(?:--\>|$)/],["lang-",/^<\?([\S\s]+?)(?:\?>|$)/],["lang-",/^<%([\S\s]+?)(?:%>|$)/],["pun",/^(?:<[%?]|[%?]>)/],["lang-",/^]*>([\S\s]+?)<\/xmp\b[^>]*>/i],["lang-js",/^]*>([\S\s]*?)(<\/script\b[^>]*>)/i],["lang-css",/^]*>([\S\s]*?)(<\/style\b[^>]*>)/i],["lang-in.tag",/^(<\/?[a-z][^<>]*>)/i]]), 22 | ["default-markup","htm","html","mxml","xhtml","xml","xsl"]);k(x([["pln",/^\s+/,q," 
\t\r\n"],["atv",/^(?:"[^"]*"?|'[^']*'?)/,q,"\"'"]],[["tag",/^^<\/?[a-z](?:[\w-.:]*\w)?|\/?>$/i],["atn",/^(?!style[\s=]|on)[a-z](?:[\w:-]*\w)?/i],["lang-uq.val",/^=\s*([^\s"'>]*(?:[^\s"'/>]|\/(?=\s)))/],["pun",/^[/<->]+/],["lang-js",/^on\w+\s*=\s*"([^"]+)"/i],["lang-js",/^on\w+\s*=\s*'([^']+)'/i],["lang-js",/^on\w+\s*=\s*([^\s"'>]+)/i],["lang-css",/^style\s*=\s*"([^"]+)"/i],["lang-css",/^style\s*=\s*'([^']+)'/i],["lang-css", 23 | /^style\s*=\s*([^\s"'>]+)/i]]),["in.tag"]);k(x([],[["atv",/^[\S\s]+/]]),["uq.val"]);k(u({keywords:F,hashComments:!0,cStyleComments:!0,types:K}),["c","cc","cpp","cxx","cyc","m"]);k(u({keywords:"null,true,false"}),["json"]);k(u({keywords:H,hashComments:!0,cStyleComments:!0,verbatimStrings:!0,types:K}),["cs"]);k(u({keywords:G,cStyleComments:!0}),["java"]);k(u({keywords:v,hashComments:!0,multiLineStrings:!0}),["bsh","csh","sh"]);k(u({keywords:I,hashComments:!0,multiLineStrings:!0,tripleQuotedStrings:!0}), 24 | ["cv","py"]);k(u({keywords:"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END",hashComments:!0,multiLineStrings:!0,regexLiterals:!0}),["perl","pl","pm"]);k(u({keywords:J,hashComments:!0,multiLineStrings:!0,regexLiterals:!0}),["rb"]);k(u({keywords:w,cStyleComments:!0,regexLiterals:!0}),["js"]);k(u({keywords:"all,and,by,catch,class,else,extends,false,finally,for,if,in,is,isnt,loop,new,no,not,null,of,off,on,or,return,super,then,true,try,unless,until,when,while,yes", 25 | hashComments:3,cStyleComments:!0,multilineStrings:!0,tripleQuotedStrings:!0,regexLiterals:!0}),["coffee"]);k(x([],[["str",/^[\S\s]+/]]),["regex"]);window.prettyPrintOne=function(a,m,e){var h=document.createElement("PRE");h.innerHTML=a;e&&D(h,e);E({g:m,i:e,h:h});return h.innerHTML};window.prettyPrint=function(a){function m(){for(var e=window.PR_SHOULD_USE_CONTINUATION?l.now()+250:Infinity;p=0){var k=k.match(g),f,b;if(b= 26 | !k){b=n;for(var o=void 
0,c=b.firstChild;c;c=c.nextSibling)var i=c.nodeType,o=i===1?o?b:c:i===3?N.test(c.nodeValue)?b:o:o;b=(f=o===b?void 0:o)&&"CODE"===f.tagName}b&&(k=f.className.match(g));k&&(k=k[1]);b=!1;for(o=n.parentNode;o;o=o.parentNode)if((o.tagName==="pre"||o.tagName==="code"||o.tagName==="xmp")&&o.className&&o.className.indexOf("prettyprint")>=0){b=!0;break}b||((b=(b=n.className.match(/\blinenums\b(?::(\d+))?/))?b[1]&&b[1].length?+b[1]:!0:!1)&&D(n,b),d={g:k,h:n,i:b},E(d))}}p 2 | 3 | 4 | 5 | Carrot² search results clustering plugin for ElasticSearch 6 | 7 | 8 | 9 | 10 | 34 | 35 | 36 | 37 |
38 |
39 |
40 |
41 | ElasticSearch instance cannot be reached. 42 | This manual requires a running instance of ES to render outputs. 43 |
44 |
45 |
46 | 47 |
48 |
49 | 59 |
60 |
61 | 62 |
63 |
0
64 |
65 |

66 | This documentation has a hardcoded ElasticSearch URL pointing 67 | at: . Make sure you 68 | have configured CORS permissions properly by adding the following 69 | to ES's configuration file: 70 |

71 |
# Allow localhost cross-origin requests
 72 | http.cors.enabled: true
 73 | http.cors.allow-origin: /(null)|(https?:\/\/localhost(:[0-9]+)?)|(https?:\/\/carrot2\.github\.io(:[0-9]+)?)/
74 |
75 |
76 | 77 |
78 |
1
79 |
80 |
81 | First, 82 | documents to be clustered. 83 |
84 |
85 |
86 | 87 |
88 |
2
89 |
90 | Then, type in a query like: 91 |
92 | 93 | and 94 | 95 | with 96 | 97 | algorithm. 98 | 99 |
100 |
101 |
102 | 103 |
104 |
105 | Loading Carrot Search FoamTree visualization... 106 |
107 |
108 | Loading Carrot Search Circles visualization... 109 |
110 |
111 | 112 |
113 |
3
114 | 118 |
119 | 120 | 121 | 122 | 123 | 124 | 125 | 279 |
280 | 281 | 282 | -------------------------------------------------------------------------------- /gradle/publishing.gradle: -------------------------------------------------------------------------------- 1 | // Configure maven central publishing independently from ES's default infrastructure. 2 | 3 | configure(rootProject) { 4 | apply plugin: 'maven-publish' 5 | apply plugin: 'signing' 6 | apply plugin: 'java-library' 7 | 8 | ext { 9 | mavenBuildRepo = file("${buildDir}/maven") 10 | } 11 | 12 | publishing { 13 | repositories { 14 | maven { 15 | name = 'build' 16 | url = mavenBuildRepo 17 | } 18 | 19 | maven { 20 | name = 'sonatype' 21 | url "https://s01.oss.sonatype.org/service/local/staging/deploy/maven2" 22 | credentials { 23 | if (project.hasProperty('nexusUsername')) { 24 | username project.nexusUsername 25 | } 26 | if (project.hasProperty('nexusPassword')) { 27 | password project.nexusPassword 28 | } 29 | } 30 | } 31 | } 32 | 33 | plugins.withType(JavaPlugin) { 34 | // Do not generate gradle metadata files. 35 | tasks.withType(GenerateModuleMetadata) { 36 | enabled = false 37 | } 38 | 39 | // Disable ES POM validation tasks. 40 | tasks.matching {it.path in [":validateMavenPom", ":validateNebulaPom"]}.all { task -> 41 | task.enabled = false 42 | } 43 | 44 | java { 45 | withSourcesJar() 46 | withJavadocJar() 47 | } 48 | 49 | publications { 50 | maven(MavenPublication) { 51 | from components.java 52 | group = project.group 53 | artifactId = project.archivesBaseName 54 | 55 | // artifact sourcesJar 56 | // artifact javadocJar 57 | artifact bundlePlugin 58 | 59 | pom { 60 | inceptionYear = "2013" 61 | artifactId 'elasticsearch-carrot2' 62 | licenses { 63 | license { 64 | name = 'The Apache License, Version 2.0' 65 | url = 'https://www.apache.org/licenses/LICENSE-2.0.txt' 66 | } 67 | } 68 | organization { 69 | name = "Carrot Search s.c." 
70 | url = "https://www.carrotsearch.com" 71 | } 72 | developers { 73 | developer { 74 | id = 'stanislaw.osinski' 75 | name = 'Stanisław Osiński' 76 | email = 'stanislaw.osinski@carrotsearch.com' 77 | } 78 | developer { 79 | id = 'dawid.weiss' 80 | name = 'Dawid Weiss' 81 | email = 'dawid.weiss@carrotsearch.com' 82 | } 83 | } 84 | 85 | url = 'https://github.com/carrot2/elasticsearch-carrot2' 86 | scm { 87 | connection = 'scm:git:https://github.com/carrot2/elasticsearch-carrot2' 88 | developerConnection = 'scm:git:git@github.com:carrot2/elasticsearch-carrot2.git' 89 | url = 'https://github.com/carrot2/elasticsearch-carrot2' 90 | } 91 | 92 | name = esplugin.name 93 | description = esplugin.description 94 | } 95 | } 96 | } 97 | } 98 | 99 | signing { 100 | sign publishing.publications.maven 101 | } 102 | 103 | task publishToSonatype() { 104 | group "Publishing" 105 | description "Publish to SonaType Nexus at ${publishing.repositories.sonatype.url}." 106 | 107 | dependsOn publishMavenPublicationToSonatypeRepository 108 | } 109 | 110 | task publishToBuild() { 111 | group "Publishing" 112 | description "Publish Maven artifacts locally to ${mavenBuildRepo}" 113 | 114 | dependsOn ":publishMavenPublicationToBuildRepository" 115 | } 116 | 117 | tasks.matching { it.path == ":publishMavenPublicationToBuildRepository" }.all { 118 | doFirst { 119 | delete mavenBuildRepo 120 | } 121 | } 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /gradle/validation/spotless.gradle: -------------------------------------------------------------------------------- 1 | 2 | allprojects { prj -> 3 | plugins.withType(JavaPlugin) { 4 | prj.apply plugin: 'com.diffplug.spotless' 5 | 6 | spotless { 7 | java { 8 | googleJavaFormat('1.9') 9 | licenseHeaderFile rootProject.file("gradle/validation/spotless/source-header.txt") 10 | lineEndings 'UNIX' 11 | endWithNewline() 12 | } 13 | 14 | check.dependsOn(spotlessCheck) 15 | } 16 | 17 | task tidy() { 18 | 
description "Applies formatters and cleanups to sources." 19 | group "verification" 20 | 21 | dependsOn spotlessApply 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /gradle/validation/spotless/source-header.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carrot2/elasticsearch-carrot2/b98a04b9980ee82a77a6c6e2e4ea7380d578b8d1/gradle/validation/spotless/source-header.txt -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carrot2/elasticsearch-carrot2/b98a04b9980ee82a77a6c6e2e4ea7380d578b8d1/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.5.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 
13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS='"-Xmx64m"' 32 | 33 | # Use the maximum available, or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn () { 37 | echo "$*" 38 | } 39 | 40 | die () { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 
89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? -ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) 
set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save () { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 
13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS="-Xmx64m" 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /src/javaRestTest/java/org/carrot2/elasticsearch/ClusteringActionIT.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.Arrays; 7 | import java.util.HashMap; 8 | import java.util.LinkedHashMap; 9 | import java.util.List; 10 | import java.util.Map; 11 | import java.util.stream.Collectors; 12 | import org.assertj.core.api.Assertions; 13 | import org.carrot2.attrs.Attrs; 14 | import org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm; 15 | import org.carrot2.clustering.lingo.LingoClusteringAlgorithm; 16 | import org.carrot2.clustering.stc.STCClusteringAlgorithm; 17 | import org.carrot2.elasticsearch.ListAlgorithmsAction.ListAlgorithmsActionRequestBuilder; 18 | import org.carrot2.elasticsearch.ListAlgorithmsAction.ListAlgorithmsActionResponse; 19 | import org.carrot2.language.LanguageComponentsLoader; 20 | import org.elasticsearch.ElasticsearchException; 21 | import org.elasticsearch.action.search.SearchRequestBuilder; 22 | import org.elasticsearch.common.Strings; 23 | import org.elasticsearch.common.bytes.BytesArray; 24 | import org.elasticsearch.common.xcontent.XContentHelper; 25 | import org.elasticsearch.index.query.QueryBuilders; 26 | import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; 27 | import org.elasticsearch.xcontent.DeprecationHandler; 28 | import org.elasticsearch.xcontent.NamedXContentRegistry; 29 | import org.elasticsearch.xcontent.ObjectPath; 30 | import org.elasticsearch.xcontent.ToXContent; 31 | import org.elasticsearch.xcontent.XContentBuilder; 32 | import org.elasticsearch.xcontent.XContentFactory; 33 | import 
org.elasticsearch.xcontent.XContentParser; 34 | import org.elasticsearch.xcontent.XContentType; 35 | 36 | /** API tests for {@link ClusteringAction}. */ 37 | public class ClusteringActionIT extends SampleIndexTestCase { 38 | /** Clusters a term query on the source title and highlighted content fields, then runs the shared response checks. */ public void testComplexQuery() throws IOException { 39 | ClusteringActionResponse result = 40 | new ClusteringActionRequestBuilder(client) 41 | .setQueryHint("data mining") 42 | .addSourceFieldMapping("title", LogicalField.TITLE) 43 | .addHighlightedFieldMapping("content", LogicalField.CONTENT) 44 | .setDefaultLanguage("English") 45 | .setSearchRequest( 46 | client 47 | .prepareSearch() 48 | .setIndices(INDEX_TEST) 49 | .setSize(100) 50 | .setQuery(QueryBuilders.termQuery("content", "data")) 51 | .setFetchSource(new String[] {"title"}, null) 52 | // One highlighter only: a second highlighter(...) call would replace this builder and drop the empty pre/post tags. 53 | .highlighter(new HighlightBuilder().preTags("").postTags("").field("content"))) 54 | .execute() 55 | .actionGet(); 56 | 57 | checkValid(result); 58 | checkJsonSerialization(result); 59 | } 60 | /** Runs the same clustering request once per language reported by LanguageComponentsLoader and records the cluster labels per language. */ 61 | public void testDefaultLanguage() throws IOException { 62 | LinkedHashMap<String, List<String>> labelsByLanguage = new LinkedHashMap<>(); 63 | String[] languages = new LanguageComponentsLoader().load().languages().toArray(String[]::new); 64 | for (String lang : languages) { 65 | ClusteringActionResponse response = 66 | new ClusteringActionRequestBuilder(client) 67 | .setQueryHint("data mining") 68 | .addSourceFieldMapping("title", LogicalField.TITLE) 69 | .addHighlightedFieldMapping("content", LogicalField.CONTENT) 70 | .setDefaultLanguage(lang) 71 | .setSearchRequest( 72 | client 73 | .prepareSearch() 74 | .setIndices(INDEX_TEST) 75 | .setSize(100) 76 | .setQuery(QueryBuilders.termQuery("content", "data")) 77 | .setFetchSource(new String[] {"title"}, null)) 78 | .execute() 79 | .actionGet(); 80 | 81 | checkValid(response); 82 | checkJsonSerialization(response); 83 | 84 | labelsByLanguage.put( 85 | lang, 86 | Arrays.stream(response.getDocumentGroups()) 87 | .map(DocumentGroup::getLabel) 88 
| .collect(Collectors.toList())); 89 | } 90 | 91 | List english = labelsByLanguage.get("English"); 92 | List italian = labelsByLanguage.get("Italian"); 93 | List shared = new ArrayList<>(english); 94 | shared.retainAll(italian); 95 | Assertions.assertThat(shared).hasSizeLessThanOrEqualTo((int) (english.size() * 0.75)); 96 | } 97 | 98 | public void testAttributes() throws IOException { 99 | LingoClusteringAlgorithm algorithm = new LingoClusteringAlgorithm(); 100 | algorithm.desiredClusterCount.set(5); 101 | 102 | Map extract = Attrs.extract(algorithm); 103 | Attrs.populate(algorithm, extract); 104 | 105 | ClusteringActionResponse result = 106 | new ClusteringActionRequestBuilder(client) 107 | .setQueryHint("data mining") 108 | .addSourceFieldMapping("title", LogicalField.TITLE) 109 | .addSourceFieldMapping("content", LogicalField.CONTENT) 110 | .addAttributes(Attrs.extract(algorithm)) 111 | .setSearchRequest( 112 | client 113 | .prepareSearch() 114 | .setIndices(INDEX_TEST) 115 | .setSize(100) 116 | .setQuery(QueryBuilders.matchAllQuery()) 117 | .setFetchSource(new String[] {"title", "content"}, null)) 118 | .execute() 119 | .actionGet(); 120 | 121 | checkValid(result); 122 | checkJsonSerialization(result); 123 | 124 | Assertions.assertThat(result.getDocumentGroups().length).isBetween(0, 5 + 1); 125 | } 126 | 127 | public void testLanguageField() throws IOException { 128 | Map attrs = new HashMap<>(); 129 | 130 | ClusteringActionResponse result = 131 | new ClusteringActionRequestBuilder(client) 132 | .setQueryHint("data mining") 133 | .addSourceFieldMapping("title", LogicalField.TITLE) 134 | .addSourceFieldMapping("content", LogicalField.CONTENT) 135 | .addSourceFieldMapping("rndlang", LogicalField.LANGUAGE) 136 | .addAttributes(attrs) 137 | .setSearchRequest( 138 | client 139 | .prepareSearch() 140 | .setIndices(INDEX_TEST) 141 | .setSize(100) 142 | .setQuery(QueryBuilders.termQuery("content", "data")) 143 | .setFetchSource(new String[] {"title", "content", 
"rndlang"}, null)) 144 | .get(); 145 | 146 | checkValid(result); 147 | checkJsonSerialization(result); 148 | 149 | // We should receive groups for multiple languages 150 | String[] languages = 151 | result.getInfo().get(ClusteringActionResponse.Fields.Info.LANGUAGES).split(","); 152 | 153 | Assertions.assertThat(languages) 154 | .describedAs( 155 | "Expected a lot of languages to appear in top groups: " + Arrays.toString(languages)) 156 | .hasSizeGreaterThan(5); 157 | 158 | DocumentGroup[] groups = result.getDocumentGroups(); 159 | List groupLabels = 160 | Arrays.stream(groups) 161 | .map(grp -> grp.getLabel() + " (" + grp.getDocumentReferences().length + ")") 162 | .collect(Collectors.toList()); 163 | Assertions.assertThat(groupLabels).hasSizeGreaterThan(5); 164 | } 165 | 166 | public void testListAlgorithms() { 167 | ListAlgorithmsActionResponse response = new ListAlgorithmsActionRequestBuilder(client).get(); 168 | 169 | List algorithms = response.getAlgorithms(); 170 | Assertions.assertThat(algorithms) 171 | .isNotEmpty() 172 | .contains( 173 | LingoClusteringAlgorithm.NAME, 174 | STCClusteringAlgorithm.NAME, 175 | BisectingKMeansClusteringAlgorithm.NAME); 176 | } 177 | 178 | public void testNonexistentFields() throws IOException { 179 | ClusteringActionResponse result = 180 | new ClusteringActionRequestBuilder(client) 181 | .setQueryHint("data mining") 182 | .addSourceFieldMapping("_nonexistent_", LogicalField.TITLE) 183 | .addSourceFieldMapping("_nonexistent_", LogicalField.CONTENT) 184 | .setCreateUngroupedDocumentsCluster(true) 185 | .setSearchRequest( 186 | client 187 | .prepareSearch() 188 | .setIndices(INDEX_TEST) 189 | .setSize(100) 190 | .setQuery(QueryBuilders.termQuery("content", "data")) 191 | .setFetchSource(new String[] {"title", "content"}, null)) 192 | .execute() 193 | .actionGet(); 194 | 195 | // There should be no clusters, but no errors. 
196 | checkValid(result); 197 | checkJsonSerialization(result); 198 | 199 | // Only the ungrouped-documents group (requested via setCreateUngroupedDocumentsCluster) may appear; any other group is a real cluster. 200 | DocumentGroup[] documentGroups = result.getDocumentGroups(); 201 | for (DocumentGroup group : documentGroups) { 202 | if (!group.isUngroupedDocuments()) { 203 | fail("Expected no clusters for non-existent fields."); 204 | } 205 | } 206 | } 207 | 208 | public void testNonexistentAlgorithmId() { 209 | // An unknown algorithm id is expected to surface as an IllegalArgumentException. 210 | try { 211 | new ClusteringActionRequestBuilder(client) 212 | .setQueryHint("") 213 | .addSourceFieldMapping("_nonexistent_", LogicalField.TITLE) 214 | .setAlgorithm("_nonexistent_") 215 | .setSearchRequest( 216 | client 217 | .prepareSearch() 218 | .setIndices(INDEX_TEST) 219 | .setSize(100) 220 | .setQuery(QueryBuilders.termQuery("content", "data")) 221 | .setFetchSource(new String[] {"title", "content"}, null)) 222 | .execute() 223 | .actionGet(); 224 | throw Preconditions.unreachable(); 225 | } catch (IllegalArgumentException e) { 226 | Assertions.assertThat(e).hasMessageContaining("No such algorithm:"); 227 | } 228 | } 229 | 230 | public void testPropagatingAlgorithmException() { 231 | // The query should result in an error. 232 | try { 233 | // Out of allowed range (should cause an exception). 
234 | Map attrs = new HashMap<>(); 235 | attrs.put("ignoreWordIfInHigherDocsPercent", Double.MAX_VALUE); 236 | 237 | new ClusteringActionRequestBuilder(client) 238 | .setQueryHint("") 239 | .addSourceFieldMapping("title", LogicalField.TITLE) 240 | .addSourceFieldMapping("content", LogicalField.CONTENT) 241 | .setAlgorithm(STCClusteringAlgorithm.NAME) 242 | .addAttributes(attrs) 243 | .setSearchRequest( 244 | client 245 | .prepareSearch() 246 | .setIndices(INDEX_TEST) 247 | .setSize(100) 248 | .setQuery(QueryBuilders.termQuery("content", "data")) 249 | .setFetchSource(new String[] {"title", "content"}, null)) 250 | .execute() 251 | .actionGet(); 252 | throw Preconditions.unreachable(); 253 | } catch (ElasticsearchException e) { 254 | Assertions.assertThat(e).hasMessageContaining("Clustering error:"); 255 | } 256 | } 257 | 258 | public void testIncludeHits() throws IOException { 259 | // same search with and without hits 260 | SearchRequestBuilder req = 261 | client 262 | .prepareSearch() 263 | .setIndices(INDEX_TEST) 264 | .setSize(2) 265 | .setQuery(QueryBuilders.termQuery("content", "data")) 266 | .setFetchSource(new String[] {"content"}, null); 267 | 268 | // with hits (default) 269 | ClusteringActionResponse resultWithHits = 270 | new ClusteringActionRequestBuilder(client) 271 | .setQueryHint("data mining") 272 | .setAlgorithm(STCClusteringAlgorithm.NAME) 273 | .addSourceFieldMapping("title", LogicalField.TITLE) 274 | .setCreateUngroupedDocumentsCluster(true) 275 | .setSearchRequest(req) 276 | .execute() 277 | .actionGet(); 278 | checkValid(resultWithHits); 279 | checkJsonSerialization(resultWithHits); 280 | 281 | var asMap = asMap(resultWithHits); 282 | Assertions.assertThat(ObjectPath.eval("hits.total.value", asMap)).isEqualTo(96); 283 | Assertions.assertThat((List) ObjectPath.eval("hits.hits", asMap)).isNotEmpty(); 284 | 285 | // without hits 286 | ClusteringActionResponse resultWithoutHits = 287 | new ClusteringActionRequestBuilder(client) 288 | 
.setQueryHint("data mining") 289 | .setMaxHits(0) 290 | .setAlgorithm(STCClusteringAlgorithm.NAME) 291 | .addSourceFieldMapping("title", LogicalField.TITLE) 292 | .setCreateUngroupedDocumentsCluster(true) 293 | .setSearchRequest(req) 294 | .execute() 295 | .actionGet(); 296 | checkValid(resultWithoutHits); 297 | checkJsonSerialization(resultWithoutHits); 298 | 299 | asMap = asMap(resultWithoutHits); 300 | Assertions.assertThat(ObjectPath.eval("hits.total.value", asMap)).isEqualTo(96); 301 | Assertions.assertThat((List) ObjectPath.eval("hits.hits", asMap)).isEmpty(); 302 | } 303 | 304 | public void testMaxHits() throws IOException { 305 | // Search request (size 2) passed to the single clustering call below. 306 | SearchRequestBuilder req = 307 | client 308 | .prepareSearch() 309 | .setIndices(INDEX_TEST) 310 | .setSize(2) 311 | .setQuery(QueryBuilders.termQuery("content", "data")) 312 | .setFetchSource(new String[] {"content"}, null); 313 | 314 | // Limit the set of hits to just top 2. 315 | ClusteringActionResponse limitedHits = 316 | new ClusteringActionRequestBuilder(client) 317 | .setQueryHint("data mining") 318 | .setMaxHits(2) 319 | .setAlgorithm(STCClusteringAlgorithm.NAME) 320 | .addSourceFieldMapping("title", LogicalField.TITLE) 321 | .setCreateUngroupedDocumentsCluster(true) 322 | .setSearchRequest(req) 323 | .execute() 324 | .actionGet(); 325 | checkValid(limitedHits); 326 | checkJsonSerialization(limitedHits); 327 | 328 | Assertions.assertThat(limitedHits.getSearchResponse().getHits().getHits()).hasSize(2); 329 | 330 | var asMap = asMap(limitedHits); 331 | Assertions.assertThat(ObjectPath.eval("hits.total.value", asMap)).isEqualTo(96); 332 | Assertions.assertThat((List) ObjectPath.eval("hits.hits", asMap)).hasSize(2); 333 | } 334 | 335 | private Map asMap(ClusteringActionResponse resultWithHits) throws IOException { 336 | XContentBuilder builder = XContentFactory.jsonBuilder().prettyPrint(); 337 | builder.startObject(); 338 | resultWithHits.toXContent(builder, ToXContent.EMPTY_PARAMS); 
339 | builder.endObject(); 340 | 341 | Map responseJson; 342 | try (XContentParser parser = 343 | XContentHelper.createParser( 344 | NamedXContentRegistry.EMPTY, 345 | DeprecationHandler.THROW_UNSUPPORTED_OPERATION, 346 | new BytesArray(Strings.toString(builder)), 347 | XContentType.JSON)) { 348 | responseJson = parser.mapOrdered(); 349 | } 350 | return responseJson; 351 | } 352 | } 353 | -------------------------------------------------------------------------------- /src/javaRestTest/java/org/carrot2/elasticsearch/ClusteringActionRestIT.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import static org.carrot2.elasticsearch.TestInfra.TestDocument; 5 | import static org.carrot2.elasticsearch.TestInfra.jsonResource; 6 | 7 | import java.io.IOException; 8 | import java.net.HttpURLConnection; 9 | import java.nio.charset.StandardCharsets; 10 | import java.nio.charset.UnsupportedCharsetException; 11 | import java.util.Arrays; 12 | import java.util.List; 13 | import java.util.Map; 14 | import java.util.Random; 15 | import java.util.Set; 16 | import java.util.stream.Collectors; 17 | import java.util.stream.Stream; 18 | import org.assertj.core.api.Assertions; 19 | import org.carrot2.clustering.stc.STCClusteringAlgorithm; 20 | import org.carrot2.elasticsearch.ClusteringAction.RestClusteringAction; 21 | import org.carrot2.language.LanguageComponentsLoader; 22 | import org.elasticsearch.client.Request; 23 | import org.elasticsearch.client.Response; 24 | import org.elasticsearch.client.ResponseException; 25 | import org.elasticsearch.common.Strings; 26 | import org.elasticsearch.common.bytes.BytesArray; 27 | import org.elasticsearch.common.settings.Settings; 28 | import org.elasticsearch.common.xcontent.XContentHelper; 29 | import org.elasticsearch.test.rest.ESRestTestCase; 30 | import org.elasticsearch.xcontent.DeprecationHandler; 31 | import 
org.elasticsearch.xcontent.NamedXContentRegistry; 32 | import org.elasticsearch.xcontent.ObjectPath; 33 | import org.elasticsearch.xcontent.XContentFactory; 34 | import org.elasticsearch.xcontent.XContentParser; 35 | import org.elasticsearch.xcontent.XContentType; 36 | import org.junit.Before; 37 | 38 | /** REST API tests for {@link ClusteringAction}. */ 39 | public class ClusteringActionRestIT extends ESRestTestCase { 40 | protected static final String INDEX_RNDLANG = "test"; 41 | protected static final String INDEX_NOLANG = "empty"; 42 | 43 | @Before 44 | public void setupData() throws IOException { 45 | addTestData(); 46 | } 47 | 48 | public void testPostMultipleFieldMapping() throws Exception { 49 | postNoError("post_multiple_field_mapping.json"); 50 | } 51 | 52 | public void testPostWithHighlightedFields() throws Exception { 53 | postNoError("post_with_highlighted_fields.json"); 54 | } 55 | 56 | public void testPostWithFields() throws Exception { 57 | postNoError("post_with_fields.json"); 58 | } 59 | 60 | public void testPostWithSourceFields() throws Exception { 61 | postNoError("post_with_source_fields.json"); 62 | } 63 | 64 | public void testGetClusteringRequest() throws Exception { 65 | var request = new Request("GET", "/" + INDEX_RNDLANG + "/" + RestClusteringAction.NAME); 66 | request.addParameter("pretty", "true"); 67 | request.addParameter("q", "data mining"); 68 | request.addParameter("_source", "url,title,content"); 69 | request.addParameter("size", "100"); 70 | request.addParameter("query_hint", "data mining"); 71 | request.addParameter(ClusteringActionRequest.JSON_CREATE_UNGROUPED_CLUSTER, "true"); 72 | request.addParameter("field_mapping_content", "_source.title,_source.content"); 73 | request.addParameter("algorithm", STCClusteringAlgorithm.NAME); 74 | 75 | var response = checkHttpResponseContainsClusters(request); 76 | Assertions.assertThat((List) response.get("clusters")).hasSizeGreaterThan(5); 77 | } 78 | 79 | public void 
testRestApiRuntimeAttributes() throws Exception { 80 | var response = postNoError("post_runtime_attributes.json"); 81 | Assertions.assertThat((List) response.get("clusters")) 82 | .hasSizeBetween(1, /* max. cluster size cap */ 5 + /* other topics */ 1); 83 | } 84 | 85 | public void testLanguageField() throws Exception { 86 | var response = postNoError("post_language_field.json"); 87 | Assertions.assertThat((List) response.get("clusters")).hasSizeGreaterThan(1); 88 | 89 | Assertions.assertThat(((String) ObjectPath.eval("info.languages", response)).split(",")) 90 | .hasSizeGreaterThan(3); 91 | } 92 | 93 | public void testNonexistentFields() throws Exception { 94 | // Mapping non-existent fields is expected to still yield a successful response. 95 | // No local Request is built here: postNoError creates and sends the POST itself 96 | // and asserts that the response contains a "clusters" element. 97 | postNoError("post_nonexistent_fields.json"); 98 | } 99 | 100 | public void testNonexistentAlgorithmId() throws Exception { 101 | var request = new Request("POST", "/" + INDEX_RNDLANG + "/" + RestClusteringAction.NAME); 102 | request.addParameter("pretty", "true"); 103 | request.setJsonEntity(jsonResource(getClass(), "post_nonexistent_algorithmId.json")); 104 | 105 | expectErrorResponseWithMessage( 106 | request, HttpURLConnection.HTTP_BAD_REQUEST, "No such algorithm: _nonexistent_"); 107 | } 108 | 109 | public void testInvalidSearchQuery() throws Exception { 110 | var request = new Request("POST", "/" + INDEX_RNDLANG + "/" + RestClusteringAction.NAME); 111 | request.addParameter("pretty", "true"); 112 | request.setJsonEntity(jsonResource(getClass(), "post_invalid_query.json")); 113 | 114 | expectErrorResponseWithMessage( 115 | request, HttpURLConnection.HTTP_BAD_REQUEST, "parsing_exception"); 116 | } 117 | 118 | public void testPropagatingAlgorithmException() throws Exception { 119 | var request = new Request("POST", "/" + INDEX_RNDLANG + "/" + RestClusteringAction.NAME); 120 | request.addParameter("pretty", "true"); 121 | 
request.setJsonEntity(jsonResource(getClass(), "post_invalid_attribute_value.json")); 122 | 123 | expectErrorResponseWithMessage( 124 | request, HttpURLConnection.HTTP_INTERNAL_ERROR, "Clustering error: Value must be <= 1.0"); 125 | } 126 | 127 | void expectErrorResponseWithMessage(Request request, int expectedStatus, String messageSubstring) 128 | throws IOException { 129 | Response response; 130 | try { 131 | response = client().performRequest(request); 132 | fail("Expected response exception but received: " + response); 133 | } catch (ResponseException e) { 134 | response = e.getResponse(); 135 | } 136 | 137 | byte[] responseBytes = response.getEntity().getContent().readAllBytes(); 138 | String responseString = new String(responseBytes, StandardCharsets.UTF_8); 139 | String responseDescription = 140 | "HTTP response status: " 141 | + response.getStatusLine().toString() 142 | + ", " 143 | + "HTTP body: " 144 | + responseString; 145 | 146 | Assertions.assertThat(response.getStatusLine().getStatusCode()) 147 | .describedAs(responseDescription) 148 | .isEqualTo(expectedStatus); 149 | 150 | XContentType xContentType = 151 | XContentType.fromMediaTypeOrFormat(response.getHeader("Content-Type")); 152 | try (XContentParser parser = 153 | XContentHelper.createParser( 154 | NamedXContentRegistry.EMPTY, 155 | DeprecationHandler.THROW_UNSUPPORTED_OPERATION, 156 | new BytesArray(responseBytes), 157 | xContentType)) { 158 | Map responseJson = parser.mapOrdered(); 159 | 160 | Assertions.assertThat(responseJson).describedAs(responseString).containsKey("error"); 161 | 162 | Assertions.assertThat(responseJson.get("error").toString()) 163 | .describedAs(responseString) 164 | .contains(messageSubstring); 165 | } 166 | } 167 | 168 | public Map postNoError(String jsonResource) throws Exception { 169 | var request = new Request("POST", "/" + INDEX_RNDLANG + "/" + RestClusteringAction.NAME); 170 | request.addParameter("pretty", "true"); 171 | 
request.setJsonEntity(jsonResource(getClass(), jsonResource)); 172 | return checkHttpResponseContainsClusters(request); 173 | } 174 | 175 | private Map checkHttpResponseContainsClusters(Request request) 176 | throws IOException { 177 | Map response = responseAsMap(client().performRequest(request)); 178 | Object clusters = response.get("clusters"); 179 | Assertions.assertThat(clusters).isNotNull(); 180 | 181 | System.out.println( 182 | "Clusters:\n" 183 | + Strings.toString( 184 | XContentFactory.jsonBuilder() 185 | .prettyPrint() 186 | .startObject() 187 | .field("clusters", clusters) 188 | .endObject())); 189 | return response; 190 | } 191 | 192 | private static void addTestData() throws UnsupportedCharsetException, IOException { 193 | Random rnd = random(); 194 | String[] languages = new LanguageComponentsLoader().load().languages().toArray(String[]::new); 195 | Arrays.sort(languages); 196 | 197 | List docs = TestInfra.load("datamining.json"); 198 | 199 | index( 200 | INDEX_RNDLANG, 201 | docs.stream() 202 | .map( 203 | doc -> 204 | doc.cloneWith( 205 | Map.ofEntries( 206 | Map.entry("lang", "English"), 207 | Map.entry("rndlang", languages[rnd.nextInt(languages.length)]))))); 208 | 209 | index(INDEX_NOLANG, docs.stream()); 210 | } 211 | 212 | public static void index(String index, Stream docStream) throws IOException { 213 | if (indexExists(index)) { 214 | return; 215 | } 216 | 217 | List docs = docStream.collect(Collectors.toList()); 218 | 219 | Set fields = 220 | docs.stream() 221 | .flatMap(doc -> doc.fields().stream().map(Map.Entry::getKey)) 222 | .collect(Collectors.toSet()); 223 | 224 | var xc = XContentFactory.jsonBuilder().startObject(); 225 | for (String field : fields) { 226 | xc.startObject(field).field("type", "text").endObject(); 227 | } 228 | xc.endObject(); 229 | createIndex(index, Settings.EMPTY, "\"properties\": " + Strings.toString(xc)); 230 | 231 | Request request = new Request("PUT", "/" + index + "/_bulk"); 232 | 
request.addParameter("refresh", "true"); 233 | 234 | StringBuilder bulk = new StringBuilder(); 235 | int idx = 0; 236 | for (var doc : docs) { 237 | bulk.append( 238 | Strings.toString( 239 | XContentFactory.jsonBuilder() 240 | .startObject() 241 | .startObject("index") 242 | .field("_id", Integer.toString(idx++)) 243 | .endObject() 244 | .endObject())); 245 | bulk.append("\n"); 246 | 247 | bulk.append(doc.toJson().replaceAll("[\r\n]+", " ")); 248 | bulk.append("\n"); 249 | } 250 | System.out.println(bulk.toString()); 251 | request.setJsonEntity(bulk.toString()); 252 | client().performRequest(request); 253 | } 254 | } 255 |
-------------------------------------------------------------------------------- /src/javaRestTest/java/org/carrot2/elasticsearch/ListAlgorithmsActionIT.java: --------------------------------------------------------------------------------

package org.carrot2.elasticsearch;

import static org.elasticsearch.test.ESIntegTestCase.Scope.SUITE;

import java.util.Arrays;
import java.util.Collection;
import org.assertj.core.api.Assertions;
import org.carrot2.elasticsearch.ListAlgorithmsAction.ListAlgorithmsActionRequestBuilder;
import org.carrot2.elasticsearch.ListAlgorithmsAction.ListAlgorithmsActionResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.ESIntegTestCase.ClusterScope;

/** Java API test for {@link ListAlgorithmsAction}: the plugin must report its clustering algorithms. */
@ClusterScope(scope = SUITE, transportClientRatio = 0)
public class ListAlgorithmsActionIT extends ESIntegTestCase {
  /** Installs the clustering plugin on every test node. */
  @Override
  protected Collection<Class<? extends Plugin>> nodePlugins() {
    return Arrays.asList(ClusteringPlugin.class);
  }

  /** Transport clients load the same plugins as the nodes. */
  @Override
  protected Collection<Class<? extends Plugin>> transportClientPlugins() {
    return nodePlugins();
  }

  /** Requests the algorithm list and expects exactly the three names asserted below. */
  public void testAlgorithmsAreListed() throws Exception {
    Client client = client();

    ListAlgorithmsActionResponse response = new ListAlgorithmsActionRequestBuilder(client).get();
    Assertions.assertThat(response.getAlgorithms())
        .describedAs("A list of algorithms")
        .containsOnly("Lingo", "STC", "Bisecting K-Means");
  }
}
-------------------------------------------------------------------------------- /src/javaRestTest/java/org/carrot2/elasticsearch/MultithreadedClusteringIT.java: --------------------------------------------------------------------------------

package org.carrot2.elasticsearch;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.assertj.core.api.Assertions;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;

/** Java API tests. */
public class MultithreadedClusteringIT extends SampleIndexTestCase {

  /**
   * Submits 100 identical clustering requests through a pool of 10 threads and checks that every
   * one of them yields a valid, JSON-serializable response.
   *
   * @throws Exception if any request fails; a failure inside a task surfaces from {@link
   *     Future#get()}.
   */
  public void testRequestFlood() throws Exception {
    final Client client = client();

    List<Callable<ClusteringActionResponse>> tasks = new ArrayList<>();

    final int requests = 100;
    final int threads = 10;

    logger.debug("Stress testing: " + client.getClass().getSimpleName() + "| ");
    for (int i = 0; i < requests; i++) {
      tasks.add(
          () -> {
            logger.debug(">");

            ClusteringActionResponse result =
                new ClusteringActionRequestBuilder(client)
                    .setQueryHint("data mining")
                    .addFieldMapping("title", LogicalField.TITLE)
                    .addHighlightedFieldMapping("content", LogicalField.CONTENT)
                    .setSearchRequest(
                        client
                            .prepareSearch()
                            .setIndices(INDEX_TEST)
                            .setTypes("test")
                            .setSize(100)
                            .setQuery(QueryBuilders.termQuery("content", "data"))
                            .highlighter(
                                new HighlightBuilder().preTags("").postTags("").field("content"))
                            .storedFields("title"))
                    .execute()
                    .actionGet();

            logger.debug("<");
            checkValid(result);
            checkJsonSerialization(result);
            return result;
          });
    }

    ExecutorService executor = Executors.newFixedThreadPool(threads);
    try {
      // invokeAll blocks until every task has completed, so get() below never waits.
      for (Future<ClusteringActionResponse> future : executor.invokeAll(tasks)) {
        ClusteringActionResponse response = future.get();
        Assertions.assertThat(response).isNotNull();
        Assertions.assertThat(response.getSearchResponse()).isNotNull();
      }
    } finally {
      executor.shutdown();
      logger.debug("Done.");
    }
  }
}
-------------------------------------------------------------------------------- /src/javaRestTest/java/org/carrot2/elasticsearch/SampleIndexTestCase.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import java.io.IOException; 5 | import java.net.InetSocketAddress; 6 | import java.util.ArrayDeque; 7 | import java.util.Arrays; 8 | import java.util.Collection; 9 | import java.util.Collections; 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | import java.util.Random; 13 | import org.assertj.core.api.Assertions; 14 | import org.carrot2.elasticsearch.ClusteringActionResponse.Fields; 15 | import org.carrot2.language.LanguageComponentsLoader; 16 | import org.elasticsearch.action.admin.indices.create.CreateIndexResponse; 17 | import org.elasticsearch.action.bulk.BulkRequestBuilder; 18 | import org.elasticsearch.client.Client; 19 | import org.elasticsearch.common.Strings; 20 | import org.elasticsearch.common.network.NetworkAddress; 21 | import org.elasticsearch.plugins.Plugin; 22 | import org.elasticsearch.search.SearchHit; 23 | import org.elasticsearch.search.SearchHits; 24 | import org.elasticsearch.test.ESIntegTestCase; 25 | import org.elasticsearch.xcontent.DeprecationHandler; 26 | import org.elasticsearch.xcontent.NamedXContentRegistry; 27 | import
org.elasticsearch.xcontent.ToXContent; 28 | import org.elasticsearch.xcontent.XContentBuilder; 29 | import org.elasticsearch.xcontent.XContentFactory; 30 | import org.elasticsearch.xcontent.XContentParser; 31 | import org.elasticsearch.xcontent.XContentType; 32 | import org.elasticsearch.xcontent.json.JsonXContent; 33 | import org.junit.Before; 34 |

/** Perform tests on sample data. */
public abstract class SampleIndexTestCase extends ESIntegTestCase {
  protected String restBaseUrl;
  protected Client client;

  /** Installs the clustering plugin on every test node. */
  @Override
  protected Collection<Class<? extends Plugin>> nodePlugins() {
    return Collections.singletonList(ClusteringPlugin.class);
  }

  /** Transport clients load the same plugins as the nodes. */
  @Override
  protected Collection<Class<? extends Plugin>> transportClientPlugins() {
    return nodePlugins();
  }

  protected static final String INDEX_TEST = "test";
  protected static final String INDEX_EMPTY = "empty";

  /**
   * Builds the mapping JSON shared by both sample indices: five {@code text} fields declared under
   * the given mapping type.
   *
   * @param type mapping type name used as the top-level key
   * @return mapping definition as a JSON string
   */
  private static String textFieldsMapping(String type) {
    return "{"
        + "  \"" + type + "\": {"
        + "    \"properties\": {"
        + "      \"url\": { \"type\": \"text\" },"
        + "      \"title\": { \"type\": \"text\" },"
        + "      \"content\": { \"type\": \"text\" },"
        + "      \"lang\": { \"type\": \"text\" },"
        + "      \"rndlang\": { \"type\": \"text\" }"
        + "    }"
        + "  }"
        + "}";
  }

  /**
   * Prepares the fixture before each test: creates and fills the {@link #INDEX_TEST} and {@link
   * #INDEX_EMPTY} indices when {@link #INDEX_TEST} does not exist yet, waits for both to turn green
   * and records the base URL of a randomly picked HTTP endpoint in {@link #restBaseUrl}.
   *
   * @throws Exception if index creation, sample data loading or indexing fails
   */
  @Before
  public void createTestIndex() throws Exception {
    // Create and populate the sample indices unless they are already in place.
    client = client();
    if (!client.admin().indices().prepareExists(INDEX_TEST).get().isExists()) {
      CreateIndexResponse response =
          client
              .admin()
              .indices()
              .prepareCreate(INDEX_TEST)
              .addMapping("test", textFieldsMapping("test"), XContentType.JSON)
              .get();
      Assertions.assertThat(response.isAcknowledged()).isTrue();

      response =
          client
              .admin()
              .indices()
              .prepareCreate(INDEX_EMPTY)
              .addMapping("empty", textFieldsMapping("empty"), XContentType.JSON)
              .get();
      Assertions.assertThat(response.isAcknowledged()).isTrue();

      // Create content at random in the test index.
      Random rnd = random();
      String[] languages = new LanguageComponentsLoader().load().languages().toArray(String[]::new);
      // Sorted so that the random pick below depends only on the test seed.
      Arrays.sort(languages);

      BulkRequestBuilder bulk = client.prepareBulk();
      TestInfra.load("datamining.json").stream()
          .map(
              doc ->
                  doc.cloneWith(
                      Map.ofEntries(
                          Map.entry("lang", "English"),
                          Map.entry("rndlang", languages[rnd.nextInt(languages.length)]))))
          .forEach(
              doc -> {
                bulk.add(
                    client
                        .prepareIndex()
                        .setIndex(INDEX_TEST)
                        .setType("test")
                        .setSource(doc.toXContent()));
              });

      // A single document with empty fields goes to the "empty" index.
      bulk.add(
          client
              .prepareIndex()
              .setIndex(INDEX_EMPTY)
              .setType("empty")
              .setSource(
                  new TestInfra.TestDocument(Map.of("url", "", "title", "", "content", ""))
                      .toXContent()));

      // Fail fast if any sample document was rejected instead of running tests on partial data.
      var bulkResponse = bulk.execute().actionGet();
      Assertions.assertThat(bulkResponse.hasFailures())
          .as("bulk indexing of sample documents reported failures")
          .isFalse();

      flushAndRefresh(INDEX_TEST);
      flushAndRefresh(INDEX_EMPTY);
    }
    ensureGreen(INDEX_TEST);
    ensureGreen(INDEX_EMPTY);

    InetSocketAddress endpoint = randomFrom(cluster().httpAddresses());
    this.restBaseUrl = "http://" + NetworkAddress.format(endpoint);
  }

  /**
   * Check for valid {@link ClusteringActionResponse}.
   *
   * <p>Asserts that there is at least one document group, that every visited group has a non-empty
   * label, that (when the response carries all hits) each document reference of a group resolves
   * to a returned hit, and that the info section contains all expected keys. Only the groups
   * returned by {@code getDocumentGroups()} are visited; nothing is added to the queue inside the
   * loop.
   *
   * @param result response to verify
   */
  protected static void checkValid(ClusteringActionResponse result) {
    Assertions.assertThat(result.getDocumentGroups())
        .as("top-level clusters")
        .isNotNull()
        .isNotEmpty();

    Map<String, SearchHit> idToHit = new HashMap<>();
    SearchHits hits = result.getSearchResponse().getHits();
    if (hits != null) {
      for (SearchHit hit : hits) {
        idToHit.put(hit.getId(), hit);
      }
    }

    // A missing, empty or Integer.MAX_VALUE max-hits entry means the hit list was not truncated.
    String maxHits = result.getInfo().get(ClusteringActionResponse.Fields.Info.MAX_HITS);
    final boolean containsAllHits =
        (maxHits == null || maxHits.isEmpty() || Integer.parseInt(maxHits) == Integer.MAX_VALUE);

    ArrayDeque<DocumentGroup> queue = new ArrayDeque<>();
    queue.addAll(Arrays.asList(result.getDocumentGroups()));
    while (!queue.isEmpty()) {
      DocumentGroup g = queue.pop();

      Assertions.assertThat(g.getLabel()).as("label").isNotNull().isNotEmpty();

      if (containsAllHits) {
        String[] documentReferences = g.getDocumentReferences();
        Assertions.assertThat(idToHit.keySet())
            .as("docRefs")
            .containsAll(Arrays.asList(documentReferences));
      }
    }

    Assertions.assertThat(result.getInfo())
        .containsKey(ClusteringActionResponse.Fields.Info.ALGORITHM)
        .containsKey(ClusteringActionResponse.Fields.Info.CLUSTERING_MILLIS)
        .containsKey(ClusteringActionResponse.Fields.Info.SEARCH_MILLIS)
        .containsKey(ClusteringActionResponse.Fields.Info.TOTAL_MILLIS)
        .containsKey(ClusteringActionResponse.Fields.Info.MAX_HITS)
        .containsKey(ClusteringActionResponse.Fields.Info.LANGUAGES);
  }

  /**
   * Roundtrip to/from JSON.
   *
   * <p>Serializes the response to pretty-printed JSON, parses it back into a map and asserts that
   * the clusters key is present.
   *
   * @param result response to serialize
   * @throws IOException if serialization or parsing fails
   */
  protected static void checkJsonSerialization(ClusteringActionResponse result) throws IOException {
    XContentBuilder builder = XContentFactory.jsonBuilder().prettyPrint();
    builder.startObject();
    result.toXContent(builder, ToXContent.EMPTY_PARAMS);
    builder.endObject();
    String json = Strings.toString(builder);

    try (XContentParser parser =
        JsonXContent.jsonXContent.createParser(
            NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION, json)) {
      Map<String, Object> mapAndClose = parser.map();
      Assertions.assertThat(mapAndClose).as("json-result").containsKey(Fields.CLUSTERS);
    }
  }
}
-------------------------------------------------------------------------------- /src/javaRestTest/java/org/carrot2/elasticsearch/TestInfra.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.UncheckedIOException; 7 | import java.nio.charset.StandardCharsets; 8 | import java.util.ArrayList; 9 | import java.util.Collection; 10 | import java.util.Collections; 11 | import java.util.List; 12 | import java.util.Map; 13 | import java.util.function.BiConsumer; 14 | import java.util.stream.Collectors; 15 | import org.carrot2.clustering.Document; 16 | import org.elasticsearch.common.Strings; 17 | import org.elasticsearch.xcontent.DeprecationHandler; 18 | import org.elasticsearch.xcontent.NamedXContentRegistry; 19 | import org.elasticsearch.xcontent.XContentBuilder; 20 | import org.elasticsearch.xcontent.XContentFactory; 21 | import org.elasticsearch.xcontent.XContentParser; 22 | import org.elasticsearch.xcontent.json.JsonXContent; 23 | 24 | /** Facade loading sample document data.
*/
final class TestInfra {

  /** An ordered list of field name/value pairs usable both as a clustering {@link Document} and as index source. */
  public static final class TestDocument implements Document {
    private final ArrayList<Map.Entry<String, String>> fieldValues = new ArrayList<>();

    /**
     * @param fields field name to value map; entries are copied in the map's iteration order
     */
    public TestDocument(Map<String, String> fields) {
      this(fields.entrySet());
    }

    /**
     * @param fieldValues field entries to copy into this document
     */
    public TestDocument(Collection<Map.Entry<String, String>> fieldValues) {
      this.fieldValues.addAll(fieldValues);
    }

    /**
     * Creates a copy of this document with additional fields appended after the existing ones.
     * Existing entries are not replaced, even when a field name repeats.
     *
     * @param fields extra fields to append
     * @return a new document; this instance is left unchanged
     */
    public TestDocument cloneWith(Map<String, String> fields) {
      ArrayList<Map.Entry<String, String>> cloned = new ArrayList<>(fieldValues);
      cloned.addAll(fields.entrySet());
      return new TestDocument(cloned);
    }

    /** Passes every field name/value pair to the consumer, in insertion order. */
    @Override
    public void visitFields(BiConsumer<String, String> fieldConsumer) {
      fieldValues.forEach(e -> fieldConsumer.accept(e.getKey(), e.getValue()));
    }

    /**
     * Renders the fields as a single pretty-printed JSON object.
     *
     * @return builder holding the completed object
     * @throws UncheckedIOException if the builder reports an I/O error
     */
    public XContentBuilder toXContent() {
      try {
        var xc = XContentFactory.jsonBuilder().prettyPrint().startObject();
        for (var e : fieldValues) {
          xc.field(e.getKey(), e.getValue());
        }
        return xc.endObject();
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
    }

    /**
     * @return read-only view of the field entries
     */
    public List<Map.Entry<String, String>> fields() {
      return Collections.unmodifiableList(fieldValues);
    }

    /**
     * @return the JSON text produced by {@link #toXContent()}
     */
    public String toJson() {
      return Strings.toString(toXContent());
    }
  }

  /**
   * Loads sample documents from a JSON resource whose top-level value is an array of objects.
   *
   * @param resource resource name, resolved as described in {@link #resource(Class, String)}
   * @return one document per array element
   * @throws IOException if the resource is missing or cannot be read or parsed
   */
  public static List<TestDocument> load(String resource) throws IOException {
    var json = jsonResource(TestInfra.class, resource);

    try (XContentParser parser =
        JsonXContent.jsonXContent.createParser(
            NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION, json)) {
      return parser.list().stream()
          .map(
              entry -> {
                // NOTE(review): assumes every element is an object with string values — confirm
                // against the sample data.
                @SuppressWarnings("unchecked")
                var fields = (Map<String, String>) entry;
                return new TestDocument(fields);
              })
          .collect(Collectors.toList());
    }
  }

  /**
   * Reads a class-relative resource as UTF-8 text.
   *
   * @param clazz class the resource is resolved against
   * @param resourceName resource file name
   * @return resource content decoded as UTF-8
   * @throws IOException if the resource is missing or cannot be read
   */
  public static String jsonResource(Class<?> clazz, String resourceName) throws IOException {
    return new String(resource(clazz, resourceName), StandardCharsets.UTF_8);
  }

  /**
   * Reads a resource located at {@code _<SimpleClassName>/<resourceName>} relative to the given
   * class.
   *
   * @param clazz class the resource is resolved against
   * @param resourceName resource file name
   * @return raw resource bytes
   * @throws IOException if the resource does not exist or cannot be read
   */
  public static byte[] resource(Class<?> clazz, String resourceName) throws IOException {
    String path = "_" + clazz.getSimpleName() + "/" + resourceName;
    try (InputStream is = clazz.getResourceAsStream(path)) {
      // getResourceAsStream signals a missing resource with null rather than an exception.
      if (is == null) {
        throw new IOException("Resource not found relative to " + clazz.getName() + ": " + path);
      }
      return is.readAllBytes();
    }
  }
}
-------------------------------------------------------------------------------- /src/javaRestTest/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=DEBUG, out 2 | 3 | log4j.appender.out=org.apache.log4j.ConsoleAppender 4 | log4j.appender.out.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.out.layout.conversionPattern=[%d{ISO8601}][%-5p][%-25c] %m%n 6 | -------------------------------------------------------------------------------- /src/javaRestTest/resources/org/carrot2/elasticsearch/_ClusteringActionRestIT/post_invalid_attribute_value.json: -------------------------------------------------------------------------------- 1 | { 2 | "search_request": { 3 | "_source" : ["url", "title", "content"], 4 | "query" : { 5 | "match" : { 6 | "content" : "data mining" 7 | } 8 | }, 9 | "size": 100 10 | }, 11 | 12 | "query_hint": "data mining", 13 | "field_mapping": { 14 | "title" : ["_source.title"], 15 | "content": ["_source.content"] 16 | }, 17 | "algorithm": "STC", 18 | "attributes": { 19 | "ignoreWordIfInHigherDocsPercent": 100 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/javaRestTest/resources/org/carrot2/elasticsearch/_ClusteringActionRestIT/post_invalid_query.json: -------------------------------------------------------------------------------- 1 | { 2 | "search_request": { 3 | "_source" : ["url", "title", "content"], 4 | "query" : { 5 | "match" : { 6 | "content" : "data mining", 7 | "operator": "_foo_" 8 | } 9 | }, 10 | "size": 100 11 | }, 12 | 13 | "query_hint": "data mining", 14 | "field_mapping": { 15 | "title" : ["_source.title"], 16 | "content": ["_source.content"] 17 | }, 18 | "algorithm": "stc", 19 | "attributes": {}
20 | } 21 | -------------------------------------------------------------------------------- /src/javaRestTest/resources/org/carrot2/elasticsearch/_ClusteringActionRestIT/post_language_field.json: -------------------------------------------------------------------------------- 1 | { 2 | "search_request": { 3 | "_source" : ["url", "title", "content", "rndlang"], 4 | "query" : { 5 | "match" : { 6 | "content" : "data mining" 7 | } 8 | }, 9 | "size": 300 10 | }, 11 | 12 | "query_hint": "data mining", 13 | "field_mapping": { 14 | "title" : ["_source.title"], 15 | "content" : ["_source.content"], 16 | "language": ["_source.rndlang"] 17 | }, 18 | "algorithm": "Lingo", 19 | "attributes": { 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/javaRestTest/resources/org/carrot2/elasticsearch/_ClusteringActionRestIT/post_multiple_field_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "search_request": { 3 | "_source" : [ 4 | "url", 5 | "title", 6 | "content" 7 | ], 8 | "query" : { 9 | "match" : { 10 | "content" : "data mining" 11 | } 12 | }, 13 | "size": 100 14 | }, 15 | 16 | "query_hint": "data mining", 17 | "field_mapping": { 18 | "title" : ["_source.title", "_source.content"] 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/javaRestTest/resources/org/carrot2/elasticsearch/_ClusteringActionRestIT/post_nonexistent_algorithmId.json: -------------------------------------------------------------------------------- 1 | { 2 | "search_request": { 3 | "_source" : ["url", "title", "content"], 4 | "query" : { 5 | "match" : { 6 | "content" : "data mining" 7 | } 8 | }, 9 | "size": 100 10 | }, 11 | 12 | "query_hint": "data mining", 13 | "field_mapping": { 14 | "title" : ["_source.title"], 15 | "content": ["_source.content"] 16 | }, 17 | "algorithm": "_nonexistent_", 18 | "attributes": {} 19 | } 20 | 
-------------------------------------------------------------------------------- /src/javaRestTest/resources/org/carrot2/elasticsearch/_ClusteringActionRestIT/post_nonexistent_fields.json: -------------------------------------------------------------------------------- 1 | { 2 | "search_request": { 3 | "_source" : ["url", "title", "content"], 4 | "query" : { 5 | "match" : { 6 | "content" : "data mining" 7 | } 8 | }, 9 | "size": 100 10 | }, 11 | 12 | "query_hint": "data mining", 13 | "field_mapping": { 14 | "title" : ["_source._nonexistent_"], 15 | "content": ["_source._nonexistent_"] 16 | }, 17 | "algorithm": "Lingo", 18 | "attributes": {} 19 | } 20 | -------------------------------------------------------------------------------- /src/javaRestTest/resources/org/carrot2/elasticsearch/_ClusteringActionRestIT/post_runtime_attributes.json: -------------------------------------------------------------------------------- 1 | { 2 | "search_request": { 3 | "_source" : ["url", "title", "content"], 4 | "query" : { 5 | "match" : { 6 | "content" : "data mining" 7 | } 8 | }, 9 | "size": 100 10 | }, 11 | 12 | "query_hint": "data mining", 13 | "create_ungrouped": true, 14 | "field_mapping": { 15 | "title" : ["_source.title"], 16 | "content": ["_source.content"] 17 | }, 18 | "algorithm": "Lingo", 19 | "attributes": { 20 | "desiredClusterCount": 5 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/javaRestTest/resources/org/carrot2/elasticsearch/_ClusteringActionRestIT/post_with_clusters.json: -------------------------------------------------------------------------------- 1 | { 2 | "search_request": { 3 | "query" : { 4 | "match" : { 5 | "content" : "data mining" 6 | } 7 | }, 8 | "size": 100 9 | }, 10 | 11 | "query_hint": "data mining", 12 | "field_mapping": { 13 | "title" : ["_source.title"], 14 | "content": ["_source.content"] 15 | }, 16 | "attributes": { 17 | "allow-one-document-clusters": true, 18 | "clusters": [ 19 | { 
"label": "Top Level"}, 20 | { "label": "Conference", 21 | "clusters": [ 22 | { "label": "Foo" } 23 | ] 24 | } 25 | ] 26 | }, 27 | "algorithm": "Lingo3G" 28 | } 29 | -------------------------------------------------------------------------------- /src/javaRestTest/resources/org/carrot2/elasticsearch/_ClusteringActionRestIT/post_with_fields.json: -------------------------------------------------------------------------------- 1 | { 2 | "search_request": { 3 | "_source" : ["url", "title", "content"], 4 | "query" : { 5 | "match" : { 6 | "content" : "data mining" 7 | } 8 | }, 9 | "size": 100 10 | }, 11 | 12 | "query_hint": "data mining", 13 | "field_mapping": { 14 | "title" : ["_source.title"], 15 | "content": ["_source.content"] 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/javaRestTest/resources/org/carrot2/elasticsearch/_ClusteringActionRestIT/post_with_highlighted_fields.json: -------------------------------------------------------------------------------- 1 | { 2 | "search_request": { 3 | "_source" : [ 4 | "url", 5 | "title", 6 | "content" 7 | ], 8 | "highlight" : { 9 | "pre_tags" : ["", ""], 10 | "post_tags" : ["", ""], 11 | "fields" : { 12 | "content" : { "fragment_size" : 150, "number_of_fragments" : 3 }, 13 | "title" : { "fragment_size" : 150, "number_of_fragments" : 3 } 14 | } 15 | }, 16 | "query" : { 17 | "match" : { 18 | "content" : "data mining" 19 | } 20 | }, 21 | "size": 100 22 | }, 23 | 24 | "query_hint": "data mining", 25 | "field_mapping": { 26 | "title" : ["_source.title"], 27 | "content": ["highlight.content"] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/javaRestTest/resources/org/carrot2/elasticsearch/_ClusteringActionRestIT/post_with_source_fields.json: -------------------------------------------------------------------------------- 1 | { 2 | "search_request": { 3 | "query" : { 4 | "match" : { 5 | "content" : "data mining" 6 | } 7 
| }, 8 | "size": 100 9 | }, 10 | 11 | "query_hint": "data mining", 12 | "field_mapping": { 13 | "title" : ["_source.title"], 14 | "content": ["_source.content"] 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/config/config.yml: -------------------------------------------------------------------------------- 1 | 2 | # Declare locations of external algorithm resources. 3 | resources: [ 4 | "elasticsearch-carrot2", 5 | "elasticsearch-lingo3g" 6 | ] -------------------------------------------------------------------------------- /src/main/java/org/carrot2/elasticsearch/ClusteringAction.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import static org.carrot2.elasticsearch.LoggerUtils.emitErrorResponse; 5 | import static org.elasticsearch.rest.RestRequest.Method.GET; 6 | import static org.elasticsearch.rest.RestRequest.Method.POST; 7 | 8 | import java.io.IOException; 9 | import java.util.Arrays; 10 | import java.util.EnumMap; 11 | import java.util.HashSet; 12 | import java.util.List; 13 | import java.util.Locale; 14 | import java.util.Map; 15 | import java.util.Set; 16 | import java.util.stream.Collectors; 17 | import org.apache.logging.log4j.LogManager; 18 | import org.apache.logging.log4j.Logger; 19 | import org.elasticsearch.action.ActionListener; 20 | import org.elasticsearch.action.ActionType; 21 | import org.elasticsearch.action.search.SearchRequest; 22 | import org.elasticsearch.client.node.NodeClient; 23 | import org.elasticsearch.common.Strings; 24 | import org.elasticsearch.common.io.stream.Writeable; 25 | import org.elasticsearch.common.util.concurrent.ThreadContext; 26 | import org.elasticsearch.rest.BaseRestHandler; 27 | import org.elasticsearch.rest.BytesRestResponse; 28 | import org.elasticsearch.rest.RestRequest; 29 | import org.elasticsearch.rest.action.search.RestSearchAction; 30 | import 
org.elasticsearch.xcontent.XContentBuilder;

/** Perform clustering of search results. */
public class ClusteringAction extends ActionType<ClusteringActionResponse> {
  /* Action name. */
  public static final String NAME = "indices:data/read/cluster";

  /* Reusable singleton. */
  public static final ClusteringAction INSTANCE = new ClusteringAction();

  private ClusteringAction() {
    super(NAME, ClusteringActionResponse::new);
  }

  /** Returns the stream reader used to deserialize {@link ClusteringActionResponse}. */
  @Override
  public Writeable.Reader<ClusteringActionResponse> getResponseReader() {
    return ClusteringActionResponse::new;
  }

  /** An {@link BaseRestHandler} for {@link ClusteringAction}. */
  public static class RestClusteringAction extends BaseRestHandler {
    protected Logger logger = LogManager.getLogger(getClass());

    /** Action name suffix. */
    public static String NAME = "_search_with_clusters";

    /** Thread-context headers that are carried over to the dispatched clustering request. */
    private static final Set<String> PASS_SECURITY_HEADERS =
        new HashSet<>(Arrays.asList("es-security-runas-user", "_xpack_security_authentication"));

    /** REST parameter name for each logical field's mapping, e.g. {@code field_mapping_title}. */
    private static final EnumMap<LogicalField, String> GET_REQUEST_FIELDMAPPERS;

    static {
      GET_REQUEST_FIELDMAPPERS = new EnumMap<>(LogicalField.class);
      for (LogicalField lf : LogicalField.values()) {
        GET_REQUEST_FIELDMAPPERS.put(lf, "field_mapping_" + lf.name().toLowerCase(Locale.ROOT));
      }
    }

    /** Registers the endpoint for GET and POST, with optional index and type path segments. */
    @Override
    public List<Route> routes() {
      return Arrays.asList(
          new Route(POST, "/" + NAME),
          new Route(POST, "/{index}/" + NAME),
          new Route(POST, "/{index}/{type}/" + NAME),
          new Route(GET, "/" + NAME),
          new Route(GET, "/{index}/" + NAME),
          new Route(GET, "/{index}/{type}/" + NAME));
    }

    @Override
    public String getName() {
      return NAME;
    }

    /**
     * Translates a REST request into a {@link ClusteringActionRequest} and returns the consumer
     * that executes it and writes the response to the channel.
     *
     * <p>POST requests must carry a body, from which the whole clustering request is parsed. GET
     * requests must not carry a body; the search part is parsed like a regular search request and
     * the clustering options are read from URL parameters.
     *
     * @param request incoming REST request
     * @param client node client used to execute the action
     * @return consumer that emits either the clustering response or an error response
     * @throws IOException declared by the overridden method
     */
    @Override
    @SuppressWarnings({"try", "deprecation"})
    public RestChannelConsumer prepareRequest(RestRequest request, NodeClient client)
        throws IOException {
      // A POST request must have a body.
      if (request.method() == POST && !request.hasContent()) {
        return channel ->
            emitErrorResponse(
                channel,
                logger,
                new IllegalArgumentException("Request body was expected for a POST request."));
      }

      // Contrary to ES's default search handler we will not support
      // GET requests with a body (this is against HTTP spec guidance
      // in my opinion -- GET requests should not have a body).
      if (request.method() == GET && request.hasContent()) {
        return channel ->
            emitErrorResponse(
                channel,
                logger,
                new IllegalArgumentException("Request body was unexpected for a GET request."));
      }

      // Build an action request with data from the request.

      // Parse incoming arguments depending on the HTTP method used to make
      // the request.
      final ClusteringActionRequestBuilder actionBuilder =
          new ClusteringActionRequestBuilder(client);
      SearchRequest searchRequest = new SearchRequest();
      switch (request.method()) {
        case POST:
          searchRequest.indices(Strings.splitStringByCommaToArray(request.param("index")));
          searchRequest.types(Strings.splitStringByCommaToArray(request.param("type")));
          actionBuilder.setSearchRequest(searchRequest);
          actionBuilder.setSource(
              request.content(), request.getXContentType(), request.getXContentRegistry());
          break;

        case GET:
          RestSearchAction.parseSearchRequest(
              searchRequest,
              request,
              null,
              client.getNamedWriteableRegistry(),
              (size) -> {
                searchRequest.source().size(size);
              });
          actionBuilder.setSearchRequest(searchRequest);
          fillFromGetRequest(actionBuilder, request);
          break;

        default:
          throw org.carrot2.elasticsearch.Preconditions.unreachable();
      }

      // Capture the selected headers now, on the thread handling the REST request; they are
      // copied back after the thread context is stashed inside the consumer below.
      Map<String, String> securityHeaders =
          client.threadPool().getThreadContext().getHeaders().entrySet().stream()
              .filter(e -> PASS_SECURITY_HEADERS.contains(e.getKey()))
              .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

      // Dispatch clustering request.
      return channel -> {
        try (ThreadContext.StoredContext ignored =
            client.threadPool().getThreadContext().stashContext()) {
          client.threadPool().getThreadContext().copyHeaders(securityHeaders.entrySet());
          client.execute(
              ClusteringAction.INSTANCE,
              actionBuilder.request(),
              new ActionListener<ClusteringActionResponse>() {
                @Override
                public void onResponse(ClusteringActionResponse response) {
                  try {
                    XContentBuilder builder = channel.newBuilder();
                    builder.startObject();
                    response.toXContent(builder, request);
                    builder.endObject();
                    channel.sendResponse(
                        new BytesRestResponse(response.getSearchResponse().status(), builder));
                  } catch (Exception e) {
                    logger.debug("Failed to emit response.", e);
                    onFailure(e);
                  }
                }

                @Override
                public void onFailure(Exception e) {
                  emitErrorResponse(channel, logger, e);
                }
              });
        }
      };
    }

    /** Extract and parse HTTP GET parameters for the clustering request. */
    private void fillFromGetRequest(
        ClusteringActionRequestBuilder actionBuilder, RestRequest request) {
      // Use the search query as the query hint, if explicit query hint
      // is not available.
      if (request.hasParam(ClusteringActionRequest.JSON_QUERY_HINT)) {
        actionBuilder.setQueryHint(request.param(ClusteringActionRequest.JSON_QUERY_HINT));
      } else {
        actionBuilder.setQueryHint(request.param("q"));
      }

      if (request.hasParam(ClusteringActionRequest.JSON_ALGORITHM)) {
        actionBuilder.setAlgorithm(request.param(ClusteringActionRequest.JSON_ALGORITHM));
      }

      if (request.hasParam(ClusteringActionRequest.JSON_MAX_HITS)) {
        actionBuilder.setMaxHits(request.param(ClusteringActionRequest.JSON_MAX_HITS));
      }

      if (request.hasParam(ClusteringActionRequest.JSON_CREATE_UNGROUPED_CLUSTER)) {
        actionBuilder.setCreateUngroupedDocumentsCluster(
            Boolean.parseBoolean(
                request.param(ClusteringActionRequest.JSON_CREATE_UNGROUPED_CLUSTER)));
      }

      if (request.hasParam(ClusteringActionRequest.JSON_LANGUAGE)) {
        actionBuilder.setDefaultLanguage(request.param(ClusteringActionRequest.JSON_LANGUAGE));
      }

      // Field mappers: each parameter holds a comma-separated list of field specs.
      for (Map.Entry<LogicalField, String> e : GET_REQUEST_FIELDMAPPERS.entrySet()) {
        if (request.hasParam(e.getValue())) {
          for (String spec : Strings.splitStringByCommaToArray(request.param(e.getValue()))) {
            actionBuilder.addFieldMappingSpec(spec, e.getKey());
          }
        }
      }
    }
  }
}
-------------------------------------------------------------------------------- /src/main/java/org/carrot2/elasticsearch/ClusteringActionRequest.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import static org.elasticsearch.action.ValidateActions.addValidationError; 5 | 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Objects; 11 | import org.elasticsearch.ElasticsearchException; 12 | import org.elasticsearch.action.ActionRequest; 13 | import org.elasticsearch.action.ActionRequestValidationException; 14 | import org.elasticsearch.action.IndicesRequest; 15 | import org.elasticsearch.action.search.SearchRequest; 16 | import org.elasticsearch.action.search.SearchRequestBuilder; 17 | import org.elasticsearch.action.support.IndicesOptions; 18 | import org.elasticsearch.common.Strings; 19 | import org.elasticsearch.common.bytes.BytesReference; 20 | import org.elasticsearch.common.io.stream.StreamInput; 21 | import org.elasticsearch.common.io.stream.StreamOutput; 22 | import org.elasticsearch.common.xcontent.XContentHelper; 23 | import org.elasticsearch.search.builder.SearchSourceBuilder; 24 | import org.elasticsearch.xcontent.DeprecationHandler; 25 | import org.elasticsearch.xcontent.NamedXContentRegistry; 26 | import org.elasticsearch.xcontent.XContentBuilder; 27 | import org.elasticsearch.xcontent.XContentFactory; 28 | import org.elasticsearch.xcontent.XContentParser; 29 | import org.elasticsearch.xcontent.XContentType; 30 | 31 | /** An {@link ActionRequest} for {@link
ClusteringAction}. */ 32 | public class ClusteringActionRequest extends ActionRequest implements IndicesRequest.Replaceable { 33 | public static String JSON_QUERY_HINT = "query_hint"; 34 | public static String JSON_FIELD_MAPPING = "field_mapping"; 35 | public static String JSON_ALGORITHM = "algorithm"; 36 | public static String JSON_ATTRIBUTES = "attributes"; 37 | public static String JSON_SEARCH_REQUEST = "search_request"; 38 | public static String JSON_MAX_HITS = "max_hits"; 39 | public static String JSON_CREATE_UNGROUPED_CLUSTER = "create_ungrouped"; 40 | public static String JSON_LANGUAGE = "language"; 41 | 42 | private SearchRequest searchRequest; 43 | private String queryHint; 44 | private List fieldMapping = new ArrayList<>(); 45 | private String algorithm; 46 | private int maxHits = Integer.MAX_VALUE; 47 | private Map attributes; 48 | boolean createUngroupedDocumentsCluster; 49 | private String defaultLanguage = "English"; 50 | 51 | /** 52 | * Set the {@link SearchRequest} to use for fetching documents to be clustered. The search request 53 | * must fetch enough documents for clustering to make sense (set size appropriately). 54 | * 55 | * @param searchRequest search request to set 56 | * @return same builder instance 57 | */ 58 | public ClusteringActionRequest setSearchRequest(SearchRequest searchRequest) { 59 | this.searchRequest = searchRequest; 60 | return this; 61 | } 62 | 63 | /** 64 | * @param builder The search builder 65 | * @return Returns same object for chaining. 
66 | * @see #setSearchRequest(SearchRequest) 67 | */ 68 | public ClusteringActionRequest setSearchRequest(SearchRequestBuilder builder) { 69 | return setSearchRequest(builder.request()); 70 | } 71 | 72 | ClusteringActionRequest() {} 73 | 74 | public ClusteringActionRequest(StreamInput in) throws IOException { 75 | SearchRequest searchRequest = new SearchRequest(in); 76 | 77 | this.searchRequest = searchRequest; 78 | this.queryHint = in.readOptionalString(); 79 | this.algorithm = in.readOptionalString(); 80 | this.maxHits = in.readInt(); 81 | this.createUngroupedDocumentsCluster = in.readBoolean(); 82 | this.defaultLanguage = in.readString(); 83 | 84 | int count = in.readVInt(); 85 | while (count-- > 0) { 86 | FieldMappingSpec spec = new FieldMappingSpec(in); 87 | fieldMapping.add(spec); 88 | } 89 | 90 | boolean hasAttributes = in.readBoolean(); 91 | if (hasAttributes) { 92 | attributes = in.readMap(); 93 | } 94 | } 95 | 96 | public SearchRequest getSearchRequest() { 97 | return searchRequest; 98 | } 99 | 100 | /** 101 | * @param queryHint A set of terms which correspond to the query. This hint helps the clustering 102 | * algorithm to avoid trivial clusters around the query terms. Typically the query terms hint 103 | * will be identical to what the user typed in the search box. 104 | *

The hint may be an empty string but must not be null. 105 | * @return same builder instance 106 | */ 107 | public ClusteringActionRequest setQueryHint(String queryHint) { 108 | this.queryHint = queryHint; 109 | return this; 110 | } 111 | 112 | /** 113 | * @return Query hint 114 | * @see #setQueryHint(String) 115 | */ 116 | public String getQueryHint() { 117 | return queryHint; 118 | } 119 | 120 | /** 121 | * Sets the identifier of the clustering algorithm to use. If null, the default 122 | * algorithm will be used (depending on what's available). 123 | * 124 | * @param algorithm identifier of the clustering algorithm to use. 125 | * @return Same object for chaining 126 | */ 127 | public ClusteringActionRequest setAlgorithm(String algorithm) { 128 | this.algorithm = algorithm; 129 | return this; 130 | } 131 | 132 | /** 133 | * @return The current algorithm to use for clustering 134 | * @see #setAlgorithm 135 | */ 136 | public String getAlgorithm() { 137 | return algorithm; 138 | } 139 | 140 | /** 141 | * Sets the maximum number of hits to return with the response. Setting this value to zero will 142 | * only return clusters, without any hits (can be used to save bandwidth if only cluster labels 143 | * are needed). 144 | * 145 | *

Set to {@link Integer#MAX_VALUE} to include all the hits. 146 | * 147 | * @param maxHits Maximum hits 148 | */ 149 | public void setMaxHits(int maxHits) { 150 | assert maxHits >= 0; 151 | this.maxHits = maxHits; 152 | } 153 | 154 | /** 155 | * Sets {@link #setMaxHits(int)} from a string. An empty string or null means all hits should be 156 | * included. 157 | * 158 | * @param value Maximum number of hits. 159 | */ 160 | public void setMaxHits(String value) { 161 | if (value == null || value.trim().isEmpty()) { 162 | setMaxHits(Integer.MAX_VALUE); 163 | } else { 164 | setMaxHits(Integer.parseInt(value)); 165 | } 166 | } 167 | 168 | /** 169 | * @return Returns the maximum number of hits to be returned as part of the response. * If equal 170 | * to {@link Integer#MAX_VALUE}, then all hits will be returned. 171 | */ 172 | public int getMaxHits() { 173 | return maxHits; 174 | } 175 | 176 | /** 177 | * Sets a map of runtime override attributes for clustering algorithms. 178 | * 179 | * @param map Clustering attributes to use. 180 | * @return Same object for chaining 181 | */ 182 | public ClusteringActionRequest setAttributes(Map map) { 183 | this.attributes = map; 184 | return this; 185 | } 186 | 187 | /** 188 | * @return Clustering algorithm attributes map 189 | * @see #setAttributes(Map) 190 | */ 191 | public Map getAttributes() { 192 | return attributes; 193 | } 194 | 195 | /** 196 | * Parses some {@link org.elasticsearch.xcontent.XContent} and fills in the request. 
197 | * 198 | * @param source arg 199 | * @param xContentType arg 200 | * @param xContentRegistry arg 201 | */ 202 | @SuppressWarnings("unchecked") 203 | public void source( 204 | BytesReference source, XContentType xContentType, NamedXContentRegistry xContentRegistry) { 205 | if (source == null || source.length() == 0) { 206 | return; 207 | } 208 | 209 | try (XContentParser parser = 210 | XContentHelper.createParser( 211 | xContentRegistry, 212 | DeprecationHandler.THROW_UNSUPPORTED_OPERATION, 213 | source, 214 | xContentType)) { 215 | // We should avoid reparsing search_request here 216 | // but it's terribly difficult to slice the underlying byte 217 | // buffer to get just the search request. 218 | Map asMap = parser.mapOrdered(); 219 | 220 | Boolean createUngrouped = (Boolean) asMap.get(JSON_CREATE_UNGROUPED_CLUSTER); 221 | if (createUngrouped != null) { 222 | setCreateUngroupedDocumentsCluster(createUngrouped); 223 | } 224 | 225 | String queryHint = (String) asMap.get(JSON_QUERY_HINT); 226 | if (queryHint != null) { 227 | setQueryHint(queryHint); 228 | } 229 | 230 | String defaultLanguage = (String) asMap.get(JSON_LANGUAGE); 231 | if (defaultLanguage != null) { 232 | setDefaultLanguage(defaultLanguage); 233 | } 234 | 235 | Map> fieldMapping = 236 | (Map>) asMap.get(JSON_FIELD_MAPPING); 237 | if (fieldMapping != null) { 238 | parseFieldSpecs(fieldMapping); 239 | } 240 | 241 | String algorithm = (String) asMap.get(JSON_ALGORITHM); 242 | if (algorithm != null) { 243 | setAlgorithm(algorithm); 244 | } 245 | 246 | Map attributes = (Map) asMap.get(JSON_ATTRIBUTES); 247 | if (attributes != null) { 248 | setAttributes(attributes); 249 | } 250 | 251 | Map searchRequestMap = (Map) asMap.get(JSON_SEARCH_REQUEST); 252 | if (searchRequestMap != null) { 253 | if (this.searchRequest == null) { 254 | searchRequest = new SearchRequest(); 255 | } 256 | 257 | XContentBuilder builder = 258 | XContentFactory.contentBuilder(XContentType.JSON).map(searchRequestMap); 259 | 
XContentParser searchXParser = 260 | XContentFactory.xContent(XContentType.JSON) 261 | .createParser( 262 | xContentRegistry, 263 | DeprecationHandler.THROW_UNSUPPORTED_OPERATION, 264 | Strings.toString(builder)); 265 | SearchSourceBuilder searchSourceBuilder = SearchSourceBuilder.fromXContent(searchXParser); 266 | searchRequest.source(searchSourceBuilder); 267 | } 268 | 269 | Object maxHits = asMap.get(JSON_MAX_HITS); 270 | if (maxHits != null) { 271 | setMaxHits(maxHits.toString()); 272 | } 273 | } catch (Exception e) { 274 | String sSource = "_na_"; 275 | try { 276 | sSource = XContentHelper.convertToJson(source, false, false, xContentType); 277 | } catch (Throwable e1) { 278 | // ignore 279 | } 280 | throw new ClusteringException("Failed to parse source [" + sSource + "]", e); 281 | } 282 | } 283 | 284 | private void parseFieldSpecs(Map> fieldSpecs) { 285 | for (Map.Entry> e : fieldSpecs.entrySet()) { 286 | LogicalField logicalField = LogicalField.valueOfCaseInsensitive(e.getKey()); 287 | if (logicalField != null) { 288 | for (String fieldSpec : e.getValue()) { 289 | addFieldMappingSpec(fieldSpec, logicalField); 290 | } 291 | } 292 | } 293 | } 294 | 295 | /** 296 | * Map a hit's field to a logical section of a document to be clustered (title, content or URL). 297 | * 298 | * @param fieldName field name 299 | * @param logicalField logical field mapping. 300 | * @return Same object for chaining 301 | * @see LogicalField 302 | */ 303 | public ClusteringActionRequest addFieldMapping(String fieldName, LogicalField logicalField) { 304 | fieldMapping.add(new FieldMappingSpec(fieldName, logicalField, FieldSource.FIELD)); 305 | return this; 306 | } 307 | 308 | /** 309 | * Map a hit's source field (field unpacked from the _source document) to a logical 310 | * section of a document to be clustered (title, content or URL). 311 | * 312 | * @param sourceFieldName field name 313 | * @param logicalField logical field mapping. 
314 |    * @return Same object for chaining
315 |    * @see LogicalField
316 |    */
317 |   public ClusteringActionRequest addSourceFieldMapping(
318 |       String sourceFieldName, LogicalField logicalField) {
319 |     fieldMapping.add(new FieldMappingSpec(sourceFieldName, logicalField, FieldSource.SOURCE));
320 |     return this;
321 |   }
322 |
323 |   /**
324 |    * Map a hit's highlighted field (fragments of the original field) to a logical section of a
325 |    * document to be clustered. This may be used to decrease the amount of information passed to the
326 |    * clustering engine but also to "focus" the clustering engine on the context of the query.
327 |    *
328 |    * @param fieldName name of the highlighted field
329 |    * @param logicalField logical field the highlighted fragments are mapped to.
330 |    * @return Same object for chaining
331 |    */
332 |   public ClusteringActionRequest addHighlightedFieldMapping(
333 |       String fieldName, LogicalField logicalField) {
334 |     fieldMapping.add(new FieldMappingSpec(fieldName, logicalField, FieldSource.HIGHLIGHT));
335 |     return this;
336 |   }
337 |
338 |   /**
339 |    * Add a (valid!) field mapping specification to a logical field.
340 |    *
341 |    * @param fieldSpec field specification
342 |    * @param logicalField logical field mapping.
343 | * @return Same object for chaining 344 | * @see FieldSource 345 | */ 346 | public ClusteringActionRequest addFieldMappingSpec(String fieldSpec, LogicalField logicalField) { 347 | FieldSource.ParsedFieldSource pfs = FieldSource.parseSpec(fieldSpec); 348 | if (pfs.source != null) { 349 | switch (pfs.source) { 350 | case HIGHLIGHT: 351 | addHighlightedFieldMapping(pfs.fieldName, logicalField); 352 | break; 353 | 354 | case FIELD: 355 | addFieldMapping(pfs.fieldName, logicalField); 356 | break; 357 | 358 | case SOURCE: 359 | addSourceFieldMapping(pfs.fieldName, logicalField); 360 | break; 361 | 362 | default: 363 | throw new RuntimeException(); 364 | } 365 | } 366 | 367 | if (pfs.source == null) { 368 | throw new ElasticsearchException( 369 | "Field mapping specification must contain a " 370 | + " valid source prefix for the field source: " 371 | + fieldSpec); 372 | } 373 | 374 | return this; 375 | } 376 | 377 | /** Access to prepared field mapping. */ 378 | List getFieldMapping() { 379 | return fieldMapping; 380 | } 381 | 382 | @Override 383 | public ActionRequestValidationException validate() { 384 | ActionRequestValidationException validationException = null; 385 | if (searchRequest == null) { 386 | validationException = addValidationError("No delegate search request", validationException); 387 | } 388 | 389 | if (queryHint == null) { 390 | validationException = 391 | addValidationError("query hint may be empty but must not be null.", validationException); 392 | } 393 | 394 | if (fieldMapping.isEmpty()) { 395 | validationException = 396 | addValidationError( 397 | "At least one field should be mapped to a logical document field.", 398 | validationException); 399 | } 400 | 401 | ActionRequestValidationException ex = searchRequest.validate(); 402 | if (ex != null) { 403 | if (validationException == null) { 404 | validationException = new ActionRequestValidationException(); 405 | } 406 | validationException.addValidationErrors(ex.validationErrors()); 407 | } 408 
| 409 | return validationException; 410 | } 411 | 412 | @Override 413 | public void writeTo(StreamOutput out) throws IOException { 414 | assert searchRequest != null; 415 | this.searchRequest.writeTo(out); 416 | out.writeOptionalString(queryHint); 417 | out.writeOptionalString(algorithm); 418 | out.writeInt(maxHits); 419 | out.writeBoolean(createUngroupedDocumentsCluster); 420 | out.writeString(defaultLanguage); 421 | 422 | out.writeVInt(fieldMapping.size()); 423 | for (FieldMappingSpec spec : fieldMapping) { 424 | spec.writeTo(out); 425 | } 426 | 427 | boolean hasAttributes = (attributes != null); 428 | out.writeBoolean(hasAttributes); 429 | if (hasAttributes) { 430 | out.writeMap(attributes); 431 | } 432 | } 433 | 434 | @Override 435 | public IndicesRequest indices(String... strings) { 436 | return searchRequest.indices(strings); 437 | } 438 | 439 | @Override 440 | public String[] indices() { 441 | return searchRequest.indices(); 442 | } 443 | 444 | @Override 445 | public IndicesOptions indicesOptions() { 446 | return searchRequest.indicesOptions(); 447 | } 448 | 449 | public void setCreateUngroupedDocumentsCluster(boolean enabled) { 450 | this.createUngroupedDocumentsCluster = enabled; 451 | } 452 | 453 | public void setDefaultLanguage(String defaultLanguage) { 454 | this.defaultLanguage = Objects.requireNonNull(defaultLanguage); 455 | } 456 | 457 | public String getDefaultLanguage() { 458 | return defaultLanguage; 459 | } 460 | } 461 | -------------------------------------------------------------------------------- /src/main/java/org/carrot2/elasticsearch/ClusteringActionRequestBuilder.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | import org.elasticsearch.action.ActionRequestBuilder; 7 | import org.elasticsearch.action.search.SearchRequest; 8 | import org.elasticsearch.action.search.SearchRequestBuilder; 9 | import 
org.elasticsearch.client.ElasticsearchClient; 10 | import org.elasticsearch.common.bytes.BytesReference; 11 | import org.elasticsearch.xcontent.NamedXContentRegistry; 12 | import org.elasticsearch.xcontent.XContentType; 13 | 14 | /** An {@link ActionRequestBuilder} for {@link ClusteringAction}. */ 15 | public class ClusteringActionRequestBuilder 16 | extends ActionRequestBuilder { 17 | 18 | public ClusteringActionRequestBuilder(ElasticsearchClient client) { 19 | super(client, ClusteringAction.INSTANCE, new ClusteringActionRequest()); 20 | } 21 | 22 | public ClusteringActionRequestBuilder setSearchRequest(SearchRequestBuilder builder) { 23 | super.request.setSearchRequest(builder); 24 | return this; 25 | } 26 | 27 | public ClusteringActionRequestBuilder setSearchRequest(SearchRequest searchRequest) { 28 | super.request.setSearchRequest(searchRequest); 29 | return this; 30 | } 31 | 32 | public ClusteringActionRequestBuilder setQueryHint(String queryHint) { 33 | if (queryHint == null) { 34 | throw new IllegalArgumentException("Query hint may be empty but must not be null."); 35 | } 36 | super.request.setQueryHint(queryHint); 37 | return this; 38 | } 39 | 40 | public ClusteringActionRequestBuilder setAlgorithm(String algorithm) { 41 | super.request.setAlgorithm(algorithm); 42 | return this; 43 | } 44 | 45 | public ClusteringActionRequestBuilder setSource( 46 | BytesReference content, XContentType xContentType, NamedXContentRegistry xContentRegistry) { 47 | super.request.source(content, xContentType, xContentRegistry); 48 | return this; 49 | } 50 | 51 | public ClusteringActionRequestBuilder setMaxHits(int maxHits) { 52 | super.request.setMaxHits(maxHits); 53 | return this; 54 | } 55 | 56 | public ClusteringActionRequestBuilder setMaxHits(String maxHits) { 57 | super.request.setMaxHits(maxHits); 58 | return this; 59 | } 60 | 61 | public ClusteringActionRequestBuilder addAttributes(Map attributes) { 62 | if (super.request.getAttributes() == null) { 63 | 
super.request.setAttributes(new HashMap()); 64 | } 65 | super.request.getAttributes().putAll(attributes); 66 | return this; 67 | } 68 | 69 | public ClusteringActionRequestBuilder addAttribute(String key, Object value) { 70 | HashMap tmp = new HashMap(); 71 | tmp.put(key, value); 72 | return addAttributes(tmp); 73 | } 74 | 75 | public ClusteringActionRequestBuilder setAttributes(Map attributes) { 76 | super.request.setAttributes(attributes); 77 | return this; 78 | } 79 | 80 | public ClusteringActionRequestBuilder addFieldMapping( 81 | String fieldName, LogicalField logicalField) { 82 | super.request.addFieldMapping(fieldName, logicalField); 83 | return this; 84 | } 85 | 86 | public ClusteringActionRequestBuilder addSourceFieldMapping( 87 | String fieldName, LogicalField logicalField) { 88 | super.request.addSourceFieldMapping(fieldName, logicalField); 89 | return this; 90 | } 91 | 92 | public ClusteringActionRequestBuilder addHighlightedFieldMapping( 93 | String fieldName, LogicalField logicalField) { 94 | super.request.addHighlightedFieldMapping(fieldName, logicalField); 95 | return this; 96 | } 97 | 98 | public ClusteringActionRequestBuilder addFieldMappingSpec( 99 | String fieldSpec, LogicalField logicalField) { 100 | super.request.addFieldMappingSpec(fieldSpec, logicalField); 101 | return this; 102 | } 103 | 104 | public ClusteringActionRequestBuilder setCreateUngroupedDocumentsCluster(boolean enabled) { 105 | super.request.setCreateUngroupedDocumentsCluster(enabled); 106 | return this; 107 | } 108 | 109 | public ClusteringActionRequestBuilder setDefaultLanguage(String language) { 110 | super.request.setDefaultLanguage(language); 111 | return this; 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/org/carrot2/elasticsearch/ClusteringActionResponse.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import 
java.io.IOException; 5 | import java.util.Collections; 6 | import java.util.LinkedHashMap; 7 | import java.util.Map; 8 | import org.elasticsearch.action.ActionResponse; 9 | import org.elasticsearch.action.search.SearchResponse; 10 | import org.elasticsearch.common.io.stream.StreamInput; 11 | import org.elasticsearch.common.io.stream.StreamOutput; 12 | import org.elasticsearch.xcontent.ToXContent; 13 | import org.elasticsearch.xcontent.XContentBuilder; 14 | 15 | /** An {@link ActionResponse} for {@link ClusteringAction}. */ 16 | public class ClusteringActionResponse extends ActionResponse implements ToXContent { 17 | /** Clustering-related response fields. */ 18 | public static final class Fields { 19 | static final String SEARCH_RESPONSE = "search_response"; 20 | static final String CLUSTERS = "clusters"; 21 | static final String INFO = "info"; 22 | 23 | // from SearchResponse 24 | static final String _SCROLL_ID = "_scroll_id"; 25 | static final String _SHARDS = "_shards"; 26 | static final String TOTAL = "total"; 27 | static final String SUCCESSFUL = "successful"; 28 | static final String FAILED = "failed"; 29 | static final String FAILURES = "failures"; 30 | static final String STATUS = "status"; 31 | static final String INDEX = "index"; 32 | static final String SHARD = "shard"; 33 | static final String REASON = "reason"; 34 | static final String TOOK = "took"; 35 | static final String TIMED_OUT = "timed_out"; 36 | 37 | /** {@link Fields#INFO} keys. 
*/ 38 | public static final class Info { 39 | public static final String ALGORITHM = "algorithm"; 40 | public static final String SEARCH_MILLIS = "search-millis"; 41 | public static final String CLUSTERING_MILLIS = "clustering-millis"; 42 | public static final String TOTAL_MILLIS = "total-millis"; 43 | public static final String INCLUDE_HITS = "include-hits"; 44 | public static final String MAX_HITS = "max-hits"; 45 | public static final String LANGUAGES = "languages"; 46 | } 47 | } 48 | 49 | private SearchResponse searchResponse; 50 | private DocumentGroup[] topGroups; 51 | private Map info; 52 | 53 | ClusteringActionResponse(StreamInput in) throws IOException { 54 | boolean hasSearchResponse = in.readBoolean(); 55 | if (hasSearchResponse) { 56 | this.searchResponse = new SearchResponse(in); 57 | } 58 | 59 | int documentGroupsCount = in.readVInt(); 60 | topGroups = new DocumentGroup[documentGroupsCount]; 61 | for (int i = 0; i < documentGroupsCount; i++) { 62 | DocumentGroup group = new DocumentGroup(in); 63 | topGroups[i] = group; 64 | } 65 | 66 | int entries = in.readVInt(); 67 | info = new LinkedHashMap<>(); 68 | for (int i = 0; i < entries; i++) { 69 | info.put(in.readOptionalString(), in.readOptionalString()); 70 | } 71 | } 72 | 73 | public ClusteringActionResponse( 74 | SearchResponse searchResponse, DocumentGroup[] topGroups, Map info) { 75 | this.searchResponse = Preconditions.checkNotNull(searchResponse); 76 | this.topGroups = Preconditions.checkNotNull(topGroups); 77 | this.info = Collections.unmodifiableMap(Preconditions.checkNotNull(info)); 78 | } 79 | 80 | public SearchResponse getSearchResponse() { 81 | return searchResponse; 82 | } 83 | 84 | public DocumentGroup[] getDocumentGroups() { 85 | return topGroups; 86 | } 87 | 88 | public Map getInfo() { 89 | return info; 90 | } 91 | 92 | @Override 93 | public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { 94 | if (searchResponse != null) { 95 | 
searchResponse.innerToXContent(builder, ToXContent.EMPTY_PARAMS); 96 | } 97 | 98 | builder.startArray(Fields.CLUSTERS); 99 | if (topGroups != null) { 100 | for (DocumentGroup group : topGroups) { 101 | group.toXContent(builder, params); 102 | } 103 | } 104 | builder.endArray(); 105 | builder.field(Fields.INFO, info); 106 | return builder; 107 | } 108 | 109 | @Override 110 | public void writeTo(StreamOutput out) throws IOException { 111 | boolean hasSearchResponse = searchResponse != null; 112 | out.writeBoolean(hasSearchResponse); 113 | if (hasSearchResponse) { 114 | this.searchResponse.writeTo(out); 115 | } 116 | 117 | out.writeVInt(topGroups == null ? 0 : topGroups.length); 118 | if (topGroups != null) { 119 | for (DocumentGroup group : topGroups) { 120 | group.writeTo(out); 121 | } 122 | } 123 | 124 | out.writeVInt(info == null ? 0 : info.size()); 125 | if (info != null) { 126 | for (Map.Entry e : info.entrySet()) { 127 | out.writeOptionalString(e.getKey()); 128 | out.writeOptionalString(e.getValue()); 129 | } 130 | } 131 | } 132 | 133 | @Override 134 | public String toString() { 135 | return ToString.objectToJson(this); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/java/org/carrot2/elasticsearch/ClusteringContext.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import java.nio.file.Files; 5 | import java.nio.file.Path; 6 | import java.security.AccessController; 7 | import java.security.PrivilegedExceptionAction; 8 | import java.util.Arrays; 9 | import java.util.Collection; 10 | import java.util.LinkedHashMap; 11 | import java.util.List; 12 | import java.util.Optional; 13 | import java.util.function.Supplier; 14 | import java.util.stream.Collectors; 15 | import org.apache.logging.log4j.LogManager; 16 | import org.apache.logging.log4j.Logger; 17 | import org.carrot2.clustering.ClusteringAlgorithm; 18 | import 
org.carrot2.clustering.ClusteringAlgorithmProvider; 19 | import org.carrot2.language.LanguageComponents; 20 | import org.carrot2.language.LanguageComponentsLoader; 21 | import org.carrot2.language.LanguageComponentsProvider; 22 | import org.carrot2.language.LoadedLanguages; 23 | import org.carrot2.util.ChainedResourceLookup; 24 | import org.elasticsearch.ElasticsearchException; 25 | import org.elasticsearch.common.component.AbstractLifecycleComponent; 26 | import org.elasticsearch.common.settings.Settings; 27 | import org.elasticsearch.core.SuppressForbidden; 28 | import org.elasticsearch.env.Environment; 29 | import org.elasticsearch.node.Node; 30 | 31 | /** Holds the language components initialized and ready throughout the {@link Node}'s lifecycle. */ 32 | public class ClusteringContext extends AbstractLifecycleComponent { 33 | public static final String PROP_RESOURCES = "resources"; 34 | 35 | private final Environment environment; 36 | private final LinkedHashMap algorithmProviders; 37 | private final LinkedHashMap> languageComponentProviders; 38 | 39 | private LinkedHashMap languages; 40 | private Logger logger; 41 | 42 | public ClusteringContext( 43 | Environment environment, 44 | LinkedHashMap algorithmProviders, 45 | LinkedHashMap> languageComponentProviders) { 46 | this.environment = environment; 47 | this.logger = LogManager.getLogger("plugin.carrot2"); 48 | this.algorithmProviders = algorithmProviders; 49 | this.languageComponentProviders = languageComponentProviders; 50 | } 51 | 52 | @SuppressForbidden(reason = "C2 integration (File API)") 53 | @SuppressWarnings("removal") 54 | @Override 55 | protected void doStart() throws ElasticsearchException { 56 | try { 57 | Path esConfig = environment.configFile(); 58 | Path pluginConfigPath = esConfig.resolve(ClusteringPlugin.PLUGIN_NAME); 59 | 60 | if (!Files.isDirectory(pluginConfigPath)) { 61 | throw new ElasticsearchException("Missing configuration folder?: {}", pluginConfigPath); 62 | } 63 | 64 | 
Settings.Builder builder = Settings.builder(); 65 | for (String configName : 66 | new String[] {"config.yml", "config.yaml", "config.json", "config.properties"}) { 67 | Path resolved = pluginConfigPath.resolve(configName); 68 | if (Files.exists(resolved)) { 69 | builder.loadFromPath(resolved); 70 | } 71 | } 72 | Settings c2Settings = builder.build(); 73 | 74 | List resourceLocations = 75 | c2Settings.getAsList(PROP_RESOURCES).stream() 76 | .map(p -> esConfig.resolve(p).toAbsolutePath()) 77 | .filter( 78 | p -> { 79 | boolean exists = Files.exists(p); 80 | if (!exists) { 81 | logger.info( 82 | "Clustering algorithm resource location does not exist, ignored: {}", p); 83 | } 84 | return exists; 85 | }) 86 | .collect(Collectors.toList()); 87 | 88 | LanguageComponentsLoader loader = LanguageComponents.loader(); 89 | 90 | if (!resourceLocations.isEmpty()) { 91 | logger.info( 92 | "Clustering algorithm resources first looked up relative to: {}", resourceLocations); 93 | loader.withResourceLookup( 94 | (provider) -> 95 | new ChainedResourceLookup( 96 | Arrays.asList( 97 | new PathResourceLookup(resourceLocations), 98 | provider.defaultResourceLookup()))); 99 | } else { 100 | logger.info("Resources read from defaults (JARs)."); 101 | } 102 | 103 | // Only load the resources of algorithms we're interested in. 104 | loader.limitToAlgorithms( 105 | algorithmProviders.values().stream() 106 | .map(Supplier::get) 107 | .toArray(ClusteringAlgorithm[]::new)); 108 | 109 | AccessController.doPrivileged( 110 | (PrivilegedExceptionAction) 111 | () -> { 112 | languages = new LinkedHashMap<>(); 113 | LoadedLanguages loadedLanguages = loader.load(languageComponentProviders); 114 | for (String lang : loadedLanguages.languages()) { 115 | languages.put(lang, loadedLanguages.language(lang)); 116 | } 117 | 118 | // Debug info about loaded languages. 
119 | if (logger.isDebugEnabled()) { 120 | for (String lang : loadedLanguages.languages()) { 121 | logger.trace( 122 | "Loaded language '" 123 | + lang 124 | + "' with components: " 125 | + "\n - " 126 | + loadedLanguages.language(lang).components().stream() 127 | .map(c -> c.getSimpleName()) 128 | .collect(Collectors.joining("\n - "))); 129 | } 130 | } 131 | 132 | // Remove algorithms for which there are no languages that are supported. 133 | algorithmProviders 134 | .entrySet() 135 | .removeIf(e -> !isAlgorithmAvailable(e.getValue(), languages.values())); 136 | 137 | algorithmProviders.forEach( 138 | (name, prov) -> { 139 | String supportedLanguages = 140 | languages.values().stream() 141 | .filter(lc -> prov.get().supports(lc)) 142 | .map(LanguageComponents::language) 143 | .collect(Collectors.joining(", ")); 144 | 145 | logger.info( 146 | "Clustering algorithm {} loaded with support for the following languages: {}", 147 | name, 148 | supportedLanguages); 149 | }); 150 | 151 | return null; 152 | }); 153 | } catch (Exception e) { 154 | throw new ElasticsearchException("Could not initialize clustering.", e); 155 | } 156 | 157 | if (algorithmProviders.isEmpty()) { 158 | throw new ElasticsearchException( 159 | "No registered/ available clustering algorithms? Check the logs, it's odd."); 160 | } 161 | } 162 | 163 | /** @return Return a list of available algorithm component identifiers. */ 164 | public LinkedHashMap getAlgorithms() { 165 | return algorithmProviders; 166 | } 167 | 168 | @Override 169 | protected void doStop() throws ElasticsearchException { 170 | // Noop. 171 | } 172 | 173 | @Override 174 | protected void doClose() throws ElasticsearchException { 175 | // Noop. 
176 | } 177 | 178 | public LanguageComponents getLanguageComponents(String lang) { 179 | return languages.get(lang); 180 | } 181 | 182 | public boolean isLanguageSupported(String langCode) { 183 | return languages.containsKey(langCode); 184 | } 185 | 186 | private boolean isAlgorithmAvailable( 187 | ClusteringAlgorithmProvider provider, Collection languages) { 188 | ClusteringAlgorithm algorithm = provider.get(); 189 | Optional first = languages.stream().filter(algorithm::supports).findFirst(); 190 | if (first.isEmpty()) { 191 | logger.warn("Algorithm does not support any of the available languages: {}", provider.name()); 192 | return false; 193 | } else { 194 | return true; 195 | } 196 | } 197 | } 198 | -------------------------------------------------------------------------------- /src/main/java/org/carrot2/elasticsearch/ClusteringException.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import java.io.IOException; 5 | import org.elasticsearch.ElasticsearchException; 6 | import org.elasticsearch.ElasticsearchWrapperException; 7 | import org.elasticsearch.common.io.stream.StreamInput; 8 | 9 | /** Generic exception implementing {@link org.elasticsearch.ElasticsearchWrapperException} */ 10 | @SuppressWarnings("serial") 11 | public class ClusteringException extends ElasticsearchException 12 | implements ElasticsearchWrapperException { 13 | 14 | public ClusteringException(Throwable cause) { 15 | super(cause); 16 | } 17 | 18 | public ClusteringException(String msg, Object... args) { 19 | super(msg, args); 20 | } 21 | 22 | public ClusteringException(String msg, Throwable cause, Object... 
args) { 23 | super(msg, cause, args); 24 | } 25 | 26 | public ClusteringException(StreamInput in) throws IOException { 27 | super(in); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/org/carrot2/elasticsearch/ClusteringPlugin.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.Collection; 7 | import java.util.Collections; 8 | import java.util.LinkedHashMap; 9 | import java.util.List; 10 | import java.util.Map; 11 | import java.util.ServiceLoader; 12 | import java.util.function.Supplier; 13 | import org.carrot2.clustering.ClusteringAlgorithmProvider; 14 | import org.carrot2.language.LanguageComponentsProvider; 15 | import org.elasticsearch.action.ActionRequest; 16 | import org.elasticsearch.action.ActionResponse; 17 | import org.elasticsearch.client.Client; 18 | import org.elasticsearch.client.transport.TransportClient; 19 | import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; 20 | import org.elasticsearch.cluster.node.DiscoveryNodes; 21 | import org.elasticsearch.cluster.service.ClusterService; 22 | import org.elasticsearch.common.io.stream.NamedWriteableRegistry; 23 | import org.elasticsearch.common.settings.ClusterSettings; 24 | import org.elasticsearch.common.settings.IndexScopedSettings; 25 | import org.elasticsearch.common.settings.Settings; 26 | import org.elasticsearch.common.settings.SettingsFilter; 27 | import org.elasticsearch.env.Environment; 28 | import org.elasticsearch.env.NodeEnvironment; 29 | import org.elasticsearch.plugins.ActionPlugin; 30 | import org.elasticsearch.plugins.ExtensiblePlugin; 31 | import org.elasticsearch.plugins.Plugin; 32 | import org.elasticsearch.repositories.RepositoriesService; 33 | import org.elasticsearch.rest.RestController; 34 | import org.elasticsearch.rest.RestHandler; 35 | 
import org.elasticsearch.script.ScriptService; 36 | import org.elasticsearch.threadpool.ThreadPool; 37 | import org.elasticsearch.watcher.ResourceWatcherService; 38 | import org.elasticsearch.xcontent.NamedXContentRegistry; 39 | 40 | public class ClusteringPlugin extends Plugin implements ExtensiblePlugin, ActionPlugin { 41 | /** Master on/off switch property for the plugin (general settings). */ 42 | public static final String DEFAULT_ENABLED_PROPERTY_NAME = "carrot2.enabled"; 43 | 44 | /** Plugin name. */ 45 | public static final String PLUGIN_NAME = "elasticsearch-carrot2"; 46 | 47 | /** All algorithm providers. */ 48 | private final LinkedHashMap algorithmProviders = 49 | new LinkedHashMap<>(); 50 | 51 | /** All language component providers. */ 52 | private final Map> languageComponentProviders = 53 | new LinkedHashMap<>(); 54 | 55 | private final boolean transportClient; 56 | private final boolean pluginEnabled; 57 | 58 | public ClusteringPlugin(Settings settings) { 59 | this.pluginEnabled = settings.getAsBoolean(DEFAULT_ENABLED_PROPERTY_NAME, true); 60 | this.transportClient = 61 | TransportClient.CLIENT_TYPE.equals(Client.CLIENT_TYPE_SETTING_S.get(settings)); 62 | 63 | // load our own class loader's extensions. 
64 | loadExtensions(getClass().getClassLoader()); 65 | } 66 | 67 | @Override 68 | public List> getActions() { 69 | if (pluginEnabled) { 70 | return Arrays.asList( 71 | new ActionHandler<>(ClusteringAction.INSTANCE, ClusteringActionTransport.class), 72 | new ActionHandler<>( 73 | ListAlgorithmsAction.INSTANCE, 74 | ListAlgorithmsAction.TransportListAlgorithmsAction.class)); 75 | } 76 | return Collections.emptyList(); 77 | } 78 | 79 | @Override 80 | public List getRestHandlers( 81 | Settings settings, 82 | RestController restController, 83 | ClusterSettings clusterSettings, 84 | IndexScopedSettings indexScopedSettings, 85 | SettingsFilter settingsFilter, 86 | IndexNameExpressionResolver indexNameExpressionResolver, 87 | Supplier nodesInCluster) { 88 | return Arrays.asList( 89 | new ClusteringAction.RestClusteringAction(), 90 | new ListAlgorithmsAction.RestListAlgorithmsAction()); 91 | } 92 | 93 | @Override 94 | public Collection createComponents( 95 | Client client, 96 | ClusterService clusterService, 97 | ThreadPool threadPool, 98 | ResourceWatcherService resourceWatcherService, 99 | ScriptService scriptService, 100 | NamedXContentRegistry xContentRegistry, 101 | Environment environment, 102 | NodeEnvironment nodeEnvironment, 103 | NamedWriteableRegistry namedWriteableRegistry, 104 | IndexNameExpressionResolver indexNameExpressionResolver, 105 | Supplier repositoriesServiceSupplier) { 106 | List components = new ArrayList<>(); 107 | if (pluginEnabled && !transportClient) { 108 | components.add( 109 | new ClusteringContext( 110 | environment, 111 | reorderAlgorithms(algorithmProviders), 112 | new LinkedHashMap<>(languageComponentProviders))); 113 | } 114 | return components; 115 | } 116 | 117 | /** This places Lingo3G in front of the algorithm list if it is available. 
*/ 118 | private LinkedHashMap reorderAlgorithms( 119 | LinkedHashMap providers) { 120 | String[] desiredOrder = {"Lingo3G", "Lingo", "STC", "Bisecting K-Means"}; 121 | LinkedHashMap copy = new LinkedHashMap<>(); 122 | for (String name : desiredOrder) { 123 | if (providers.containsKey(name)) { 124 | copy.put(name, providers.get(name)); 125 | } 126 | } 127 | providers.forEach( 128 | (name, provider) -> { 129 | if (!copy.containsKey(name)) { 130 | copy.put(name, provider); 131 | } 132 | }); 133 | return copy; 134 | } 135 | 136 | @Override 137 | public void loadExtensions(ExtensionLoader loader) { 138 | loadExtensions( 139 | loader.loadExtensions(ClusteringAlgorithmProvider.class), 140 | loader.loadExtensions(LanguageComponentsProvider.class)); 141 | } 142 | 143 | private void loadExtensions(ClassLoader classLoader) { 144 | loadExtensions( 145 | ServiceLoader.load(ClusteringAlgorithmProvider.class, classLoader), 146 | ServiceLoader.load(LanguageComponentsProvider.class, classLoader)); 147 | } 148 | 149 | private void loadExtensions( 150 | Iterable clusteringAlgorithmProviders, 151 | Iterable languageComponentsProviders) { 152 | clusteringAlgorithmProviders.forEach( 153 | (provider) -> { 154 | String name = provider.name(); 155 | if (algorithmProviders.containsKey(name)) { 156 | throw new RuntimeException("More than one provider for algorithm " + name + "?"); 157 | } 158 | algorithmProviders.put(name, provider); 159 | }); 160 | 161 | languageComponentsProviders.forEach( 162 | provider -> { 163 | for (String lang : provider.languages()) { 164 | languageComponentProviders 165 | .computeIfAbsent(lang, (k) -> new ArrayList<>()) 166 | .add(provider); 167 | } 168 | }); 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /src/main/java/org/carrot2/elasticsearch/DocumentGroup.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import 
java.io.IOException; 5 | import java.util.Arrays; 6 | import java.util.HashSet; 7 | import java.util.Set; 8 | import org.carrot2.clustering.Cluster; 9 | import org.elasticsearch.common.Strings; 10 | import org.elasticsearch.common.io.stream.StreamInput; 11 | import org.elasticsearch.common.io.stream.StreamOutput; 12 | import org.elasticsearch.common.io.stream.Writeable; 13 | import org.elasticsearch.xcontent.ToXContent; 14 | import org.elasticsearch.xcontent.XContentBuilder; 15 | import org.elasticsearch.xcontent.XContentFactory; 16 | 17 | /** 18 | * A {@link DocumentGroup} acts as an adapter over {@link Cluster}, providing additional 19 | * serialization methods and only exposing a subset of {@link Cluster}'s data. 20 | */ 21 | public class DocumentGroup implements ToXContent, Writeable { 22 | private static final DocumentGroup[] EMPTY_DOC_GROUP = new DocumentGroup[0]; 23 | private static final String[] EMPTY_STRING_ARRAY = new String[0]; 24 | 25 | private int id; 26 | private String[] phrases = EMPTY_STRING_ARRAY; 27 | private double score; 28 | private String[] documentReferences = EMPTY_STRING_ARRAY; 29 | private DocumentGroup[] subgroups = EMPTY_DOC_GROUP; 30 | private boolean ungroupedDocuments; 31 | private Set uniqueDocuments; 32 | 33 | public DocumentGroup() {} 34 | 35 | DocumentGroup(StreamInput in) throws IOException { 36 | id = in.readVInt(); 37 | score = in.readDouble(); 38 | phrases = in.readStringArray(); 39 | ungroupedDocuments = in.readBoolean(); 40 | documentReferences = in.readStringArray(); 41 | 42 | int max = in.readVInt(); 43 | subgroups = new DocumentGroup[max]; 44 | for (int i = 0; i < max; i++) { 45 | subgroups[i] = new DocumentGroup(in); 46 | } 47 | } 48 | 49 | public DocumentGroup[] getSubgroups() { 50 | return subgroups; 51 | } 52 | 53 | public void setSubgroups(DocumentGroup[] subclusters) { 54 | this.subgroups = Preconditions.checkNotNull(subclusters); 55 | } 56 | 57 | public void setId(int id) { 58 | this.id = id; 59 | } 60 | 61 | 
public int getId() { 62 | return id; 63 | } 64 | 65 | public void setPhrases(String[] phrases) { 66 | this.phrases = Preconditions.checkNotNull(phrases); 67 | } 68 | 69 | public String[] getPhrases() { 70 | return phrases; 71 | } 72 | 73 | public String getLabel() { 74 | return String.join(", ", getPhrases()); 75 | } 76 | 77 | public void setScore(Double score) { 78 | this.score = (score == null ? 0 : score); 79 | } 80 | 81 | public double getScore() { 82 | return score; 83 | } 84 | 85 | public void setDocumentReferences(String[] documentReferences) { 86 | this.documentReferences = Preconditions.checkNotNull(documentReferences); 87 | } 88 | 89 | public String[] getDocumentReferences() { 90 | return documentReferences; 91 | } 92 | 93 | public void setUngroupedDocuments(boolean ungroupedDocuments) { 94 | this.ungroupedDocuments = ungroupedDocuments; 95 | } 96 | 97 | public boolean isUngroupedDocuments() { 98 | return ungroupedDocuments; 99 | } 100 | 101 | public Set uniqueDocuments() { 102 | // Compute lazily. 
103 | if (uniqueDocuments == null) { 104 | uniqueDocuments = new HashSet<>(); 105 | uniqueDocuments.addAll(Arrays.asList(getDocumentReferences())); 106 | for (DocumentGroup group : subgroups) { 107 | uniqueDocuments.addAll(group.uniqueDocuments); 108 | } 109 | } 110 | return uniqueDocuments; 111 | } 112 | 113 | @Override 114 | public void writeTo(StreamOutput out) throws IOException { 115 | out.writeVInt(id); 116 | out.writeDouble(score); 117 | out.writeStringArray(phrases); 118 | out.writeBoolean(ungroupedDocuments); 119 | out.writeStringArray(documentReferences); 120 | 121 | out.writeVInt(subgroups.length); 122 | for (DocumentGroup group : subgroups) { 123 | group.writeTo(out); 124 | } 125 | } 126 | 127 | @Override 128 | public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { 129 | builder.startObject(); 130 | builder 131 | .field("id", id) 132 | .field("score", score) 133 | .field("label", getLabel()) 134 | .array("phrases", phrases); 135 | 136 | if (ungroupedDocuments) { 137 | builder.field("other_topics", ungroupedDocuments); 138 | } 139 | 140 | if (documentReferences.length > 0) { 141 | builder.array("documents", documentReferences); 142 | } 143 | 144 | if (subgroups.length > 0) { 145 | builder.startArray("clusters"); 146 | for (DocumentGroup group : subgroups) { 147 | group.toXContent(builder, params); 148 | } 149 | builder.endArray(); 150 | } 151 | 152 | builder.endObject(); 153 | return builder; 154 | } 155 | 156 | public String toString() { 157 | try { 158 | XContentBuilder builder = XContentFactory.jsonBuilder().prettyPrint(); 159 | toXContent(builder, EMPTY_PARAMS); 160 | return Strings.toString(builder); 161 | } catch (IOException e) { 162 | return "{ \"error\" : \"" + e.getMessage() + "\"}"; 163 | } 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/main/java/org/carrot2/elasticsearch/FieldMappingSpec.java: 
/** The source of data for a logical document field. */
enum FieldSource {
  HIGHLIGHT("highlight."),
  FIELD("fields."),
  SOURCE("_source.");

  /** Field specification prefix for this source. */
  private final String fieldSpecPrefix;

  /** Result of {@link #parseSpec(String)}: the matched source and the remaining field name. */
  static class ParsedFieldSource {
    final FieldSource source;
    final String fieldName;

    ParsedFieldSource(FieldSource source, String fieldName) {
      this.source = source;
      this.fieldName = fieldName;
    }
  }

  /**
   * Splits a field specification such as {@code "fields.title"} into its source and field name.
   *
   * @param fieldSourceSpec the specification; may be null
   * @return the parsed source and the text following its prefix, or {@code null} if the
   *     specification is null or does not start with any known prefix
   */
  static ParsedFieldSource parseSpec(String fieldSourceSpec) {
    if (fieldSourceSpec != null) {
      for (FieldSource fs : cachedByOrdinal) {
        if (fieldSourceSpec.startsWith(fs.fieldSpecPrefix)) {
          return new ParsedFieldSource(fs, fieldSourceSpec.substring(fs.fieldSpecPrefix.length()));
        }
      }
    }
    return null;
  }

  /**
   * Constants indexed by ordinal. Declared final (as in {@link LogicalField}) so the shared lookup
   * table cannot be reassigned.
   */
  static final FieldSource[] cachedByOrdinal = values();

  /**
   * @return the constant with the given ordinal
   * @throws ArrayIndexOutOfBoundsException if the ordinal is out of range
   */
  static FieldSource fromOrdinal(int ordinal) {
    return cachedByOrdinal[ordinal];
  }

  FieldSource(String fieldSpecPrefix) {
    this.fieldSpecPrefix = fieldSpecPrefix;
  }
}
language() { 32 | return language; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/carrot2/elasticsearch/ListAlgorithmsAction.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import static org.carrot2.elasticsearch.LoggerUtils.emitErrorResponse; 5 | 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.Arrays; 9 | import java.util.Collections; 10 | import java.util.List; 11 | import org.apache.logging.log4j.LogManager; 12 | import org.apache.logging.log4j.Logger; 13 | import org.elasticsearch.action.ActionListener; 14 | import org.elasticsearch.action.ActionRequest; 15 | import org.elasticsearch.action.ActionRequestBuilder; 16 | import org.elasticsearch.action.ActionRequestValidationException; 17 | import org.elasticsearch.action.ActionResponse; 18 | import org.elasticsearch.action.ActionType; 19 | import org.elasticsearch.action.support.ActionFilters; 20 | import org.elasticsearch.action.support.TransportAction; 21 | import org.elasticsearch.client.ElasticsearchClient; 22 | import org.elasticsearch.client.node.NodeClient; 23 | import org.elasticsearch.common.inject.Inject; 24 | import org.elasticsearch.common.io.stream.StreamInput; 25 | import org.elasticsearch.common.io.stream.StreamOutput; 26 | import org.elasticsearch.common.io.stream.Writeable; 27 | import org.elasticsearch.rest.BaseRestHandler; 28 | import org.elasticsearch.rest.BytesRestResponse; 29 | import org.elasticsearch.rest.RestRequest; 30 | import org.elasticsearch.rest.RestRequest.Method; 31 | import org.elasticsearch.rest.RestStatus; 32 | import org.elasticsearch.tasks.Task; 33 | import org.elasticsearch.threadpool.ThreadPool; 34 | import org.elasticsearch.transport.TransportChannel; 35 | import org.elasticsearch.transport.TransportRequestHandler; 36 | import org.elasticsearch.transport.TransportService; 37 | 
import org.elasticsearch.xcontent.ToXContent; 38 | import org.elasticsearch.xcontent.XContentBuilder; 39 | 40 | /** List all available clustering algorithms. */ 41 | public class ListAlgorithmsAction 42 | extends ActionType { 43 | /* Action name. */ 44 | public static final String NAME = "cluster:monitor/carrot2/algorithms"; 45 | 46 | /* Reusable singleton. */ 47 | public static final ListAlgorithmsAction INSTANCE = new ListAlgorithmsAction(); 48 | 49 | private ListAlgorithmsAction() { 50 | super(NAME, ListAlgorithmsActionResponse::new); 51 | } 52 | 53 | @Override 54 | public Writeable.Reader getResponseReader() { 55 | return ListAlgorithmsActionResponse::new; 56 | } 57 | 58 | /** An {@link ActionRequest} for {@link ListAlgorithmsAction}. */ 59 | public static class ListAlgorithmsActionRequest extends ActionRequest { 60 | 61 | ListAlgorithmsActionRequest() {} 62 | 63 | ListAlgorithmsActionRequest(StreamInput in) throws IOException { 64 | super(in); 65 | } 66 | 67 | @Override 68 | public ActionRequestValidationException validate() { 69 | return /* Nothing to validate. */ null; 70 | } 71 | } 72 | 73 | /** An {@link ActionRequestBuilder} for {@link ListAlgorithmsAction}. */ 74 | public static class ListAlgorithmsActionRequestBuilder 75 | extends ActionRequestBuilder { 76 | public ListAlgorithmsActionRequestBuilder(ElasticsearchClient client) { 77 | super(client, ListAlgorithmsAction.INSTANCE, new ListAlgorithmsActionRequest()); 78 | } 79 | } 80 | 81 | /** A {@link ActionResponse} for {@link ListAlgorithmsAction}. */ 82 | public static class ListAlgorithmsActionResponse extends ActionResponse implements ToXContent { 83 | private static final String[] EMPTY_LIST = {}; 84 | private String[] algorithms; 85 | 86 | /** Clustering-related response fields. 
*/ 87 | static final class Fields { 88 | static final String ALGORITHMS = "algorithms"; 89 | } 90 | 91 | public ListAlgorithmsActionResponse(StreamInput in) throws IOException { 92 | super(in); 93 | algorithms = in.readStringArray(); 94 | } 95 | 96 | public ListAlgorithmsActionResponse(List algorithms) { 97 | this.algorithms = algorithms.toArray(new String[0]); 98 | } 99 | 100 | public List getAlgorithms() { 101 | return Collections.unmodifiableList(Arrays.asList(algorithms)); 102 | } 103 | 104 | @Override 105 | public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { 106 | return builder.array(Fields.ALGORITHMS, algorithms); 107 | } 108 | 109 | @Override 110 | public void writeTo(StreamOutput out) throws IOException { 111 | out.writeStringArray(algorithms); 112 | } 113 | 114 | @Override 115 | public String toString() { 116 | return ToString.objectToJson(this); 117 | } 118 | } 119 | 120 | /** 121 | * A {@link TransportAction} for actually executing {@link ListAlgorithmsActionRequest} and 122 | * providing {@link ListAlgorithmsActionResponse}. 
123 | */ 124 | public static class TransportListAlgorithmsAction 125 | extends TransportAction { 126 | 127 | protected Logger logger = LogManager.getLogger(getClass()); 128 | private final ClusteringContext controllerSingleton; 129 | 130 | @Inject 131 | public TransportListAlgorithmsAction( 132 | TransportService transportService, 133 | ClusteringContext controllerSingleton, 134 | ActionFilters actionFilters) { 135 | super( 136 | ListAlgorithmsAction.NAME, 137 | actionFilters, 138 | transportService.getLocalNodeConnection(), 139 | transportService.getTaskManager()); 140 | this.controllerSingleton = controllerSingleton; 141 | transportService.registerRequestHandler( 142 | ListAlgorithmsAction.NAME, 143 | ThreadPool.Names.SAME, 144 | ListAlgorithmsActionRequest::new, 145 | new TransportHandler()); 146 | } 147 | 148 | @Override 149 | protected void doExecute( 150 | Task task, 151 | ListAlgorithmsActionRequest request, 152 | ActionListener listener) { 153 | listener.onResponse( 154 | new ListAlgorithmsActionResponse( 155 | new ArrayList<>(controllerSingleton.getAlgorithms().keySet()))); 156 | } 157 | 158 | private final class TransportHandler 159 | implements TransportRequestHandler { 160 | @Override 161 | public void messageReceived( 162 | final ListAlgorithmsActionRequest request, final TransportChannel channel, Task task) 163 | throws Exception { 164 | execute( 165 | request, 166 | new ActionListener() { 167 | @Override 168 | public void onResponse(ListAlgorithmsActionResponse response) { 169 | try { 170 | channel.sendResponse(response); 171 | } catch (Exception e) { 172 | onFailure(e); 173 | } 174 | } 175 | 176 | @Override 177 | public void onFailure(Exception e) { 178 | try { 179 | channel.sendResponse(e); 180 | } catch (Exception e1) { 181 | logger.warn( 182 | "Failed to send error response for action [" 183 | + NAME 184 | + "] and request [" 185 | + request 186 | + "]", 187 | e1); 188 | } 189 | } 190 | }); 191 | } 192 | } 193 | } 194 | 195 | /** {@link 
BaseRestHandler} for serving {@link ListAlgorithmsAction}. */ 196 | public static class RestListAlgorithmsAction extends BaseRestHandler { 197 | /* Action name suffix. */ 198 | public static String NAME = "_algorithms"; 199 | 200 | protected Logger logger = LogManager.getLogger(getClass()); 201 | 202 | @Override 203 | public List routes() { 204 | return Arrays.asList(new Route(Method.POST, "/" + NAME), new Route(Method.GET, "/" + NAME)); 205 | } 206 | 207 | @Override 208 | public String getName() { 209 | return NAME; 210 | } 211 | 212 | @Override 213 | public RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) { 214 | if (request.hasContent()) { 215 | return channel -> 216 | emitErrorResponse( 217 | channel, logger, new IllegalArgumentException("Request body was expected.")); 218 | } 219 | 220 | ListAlgorithmsActionRequest actionRequest = new ListAlgorithmsActionRequest(); 221 | return channel -> 222 | client.execute( 223 | INSTANCE, 224 | actionRequest, 225 | new ActionListener() { 226 | @Override 227 | public void onResponse(ListAlgorithmsActionResponse response) { 228 | try { 229 | XContentBuilder builder = channel.newBuilder(); 230 | builder.startObject(); 231 | response.toXContent(builder, request); 232 | builder.endObject(); 233 | channel.sendResponse(new BytesRestResponse(RestStatus.OK, builder)); 234 | } catch (Exception e) { 235 | logger.debug("Failed to emit response.", e); 236 | onFailure(e); 237 | } 238 | } 239 | 240 | @Override 241 | public void onFailure(Exception e) { 242 | emitErrorResponse(channel, logger, e); 243 | } 244 | }); 245 | } 246 | } 247 | } 248 | -------------------------------------------------------------------------------- /src/main/java/org/carrot2/elasticsearch/LoggerUtils.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import java.io.IOException; 5 | import org.apache.logging.log4j.Logger; 6 | import 
/**
 * Logical fields of a document to be clustered.
 *
 * @see ClusteringActionRequest#addFieldMappingSpec(String, LogicalField)
 * @see ClusteringActionRequest#addFieldMapping(String, LogicalField)
 * @see ClusteringActionRequest#addHighlightedFieldMapping(String, LogicalField)
 * @see ClusteringActionRequest#addSourceFieldMapping(String, LogicalField)
 */
public enum LogicalField {
  TITLE,
  CONTENT,
  LANGUAGE;

  /** Constants indexed by ordinal. */
  static final LogicalField[] cachedByOrdinal = values();

  /**
   * @return the constant with the given ordinal
   * @throws ArrayIndexOutOfBoundsException if the ordinal is out of range
   */
  static LogicalField fromOrdinal(int ordinal) {
    return cachedByOrdinal[ordinal];
  }

  /** Maps each constant's upper-case name and its lower-case form to the constant. */
  static final HashMap<String, LogicalField> aliases;

  static {
    aliases = new HashMap<>();
    for (LogicalField v : LogicalField.values()) {
      aliases.put(v.name(), v);
      aliases.put(v.name().toLowerCase(Locale.ROOT), v);
    }
  }

  /**
   * Same as {@link LogicalField#valueOf(String)} but ignores letter case and does not throw an
   * exception on invalid values (returns null).
   *
   * @param enumValue the name to look up, in any letter case; may be null
   * @return the matching constant, or {@code null} if the name is null or unknown
   */
  static LogicalField valueOfCaseInsensitive(String enumValue) {
    if (enumValue == null) {
      return null;
    }
    LogicalField exact = aliases.get(enumValue);
    if (exact != null) {
      return exact;
    }
    // The alias table only holds all-upper and all-lower spellings; normalize mixed-case input
    // (e.g. "Title") with a locale-independent upper-casing before giving up.
    return aliases.get(enumValue.toUpperCase(Locale.ROOT));
  }
}
-------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import java.io.BufferedInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.nio.file.Files; 8 | import java.nio.file.Path; 9 | import java.util.List; 10 | import java.util.stream.Collectors; 11 | import org.carrot2.util.ResourceLookup; 12 | 13 | public class PathResourceLookup implements ResourceLookup { 14 | private final List locations; 15 | 16 | public PathResourceLookup(List locations) { 17 | if (locations == null || locations.isEmpty()) { 18 | throw new RuntimeException("At least one resource location is required."); 19 | } 20 | this.locations = locations; 21 | } 22 | 23 | @Override 24 | public InputStream open(String resource) throws IOException { 25 | Path p = locate(resource); 26 | if (p == null) { 27 | throw new IOException( 28 | "Resource " 29 | + p 30 | + " not found relative to: " 31 | + locations.stream() 32 | .map(path -> path.toAbsolutePath().toString()) 33 | .collect(Collectors.joining(", "))); 34 | } 35 | return new BufferedInputStream(Files.newInputStream(p)); 36 | } 37 | 38 | @Override 39 | public boolean exists(String resource) { 40 | return locate(resource) != null; 41 | } 42 | 43 | @Override 44 | public String pathOf(String resource) { 45 | return "[" 46 | + locations.stream() 47 | .map(path -> path.resolve(resource).toAbsolutePath().toString()) 48 | .collect(Collectors.joining(" | ")) 49 | + "]"; 50 | } 51 | 52 | private Path locate(String resource) { 53 | for (Path base : locations) { 54 | Path p = base.resolve(resource); 55 | if (Files.exists(p)) { 56 | return p; 57 | } 58 | } 59 | return null; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/org/carrot2/elasticsearch/Preconditions.java: -------------------------------------------------------------------------------- 1 | 2 | package 
/** Small argument- and state-checking helpers used within this package. */
final class Preconditions {
  /**
   * Signals that a code path believed to be unreachable has been executed. The declared return
   * type lets callers write the call as a {@code throw} statement so the compiler sees the path
   * as terminating:
   *
   * <pre>
   * throw Preconditions.unreachable();
   * </pre>
   *
   * @return never returns normally
   * @throws RuntimeException always
   */
  public static RuntimeException unreachable() throws RuntimeException {
    final RuntimeException failure = new RuntimeException("Unreachable code assertion hit.");
    throw failure;
  }

  /**
   * Ensures the given reference is not null.
   *
   * @param object the reference to check
   * @return {@code object} itself when it is non-null
   * @throws IllegalArgumentException if {@code object} is null
   */
  public static <T> T checkNotNull(T object) throws RuntimeException {
    if (object == null) {
      throw new IllegalArgumentException("Cannot be null");
    }
    return object;
  }
}
-------------------------------------------------------------------------------- /src/yamlRestTest/java/org/carrot2/elasticsearch/ListAlgorithmsActionRestIT.java: -------------------------------------------------------------------------------- 1 | 2 | package org.carrot2.elasticsearch; 3 | 4 | import com.carrotsearch.randomizedtesting.annotations.Name; 5 | import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; 6 | import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate; 7 | import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase; 8 | 9 | /** REST API tests for {@code ListAlgorithmsAction}. */ 10 | public class ListAlgorithmsActionRestIT extends ESClientYamlSuiteTestCase { 11 | 12 | public ListAlgorithmsActionRestIT(@Name("yaml") ClientYamlTestCandidate testCandidate) { 13 | super(testCandidate); 14 | } 15 | 16 | @ParametersFactory 17 | public static Iterable parameters() throws Exception { 18 | return createParameters(); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/yamlRestTest/resources/rest-api-spec/api/algorithms.json: -------------------------------------------------------------------------------- 1 | { 2 | "algorithms": { 3 | "documentation": "List all available clustering algorithms.", 4 | "stability" : "stable", 5 | "visibility": "public", 6 | "url": { 7 | "paths": [ 8 | { 9 | "path": "_algorithms", 10 | "methods": ["GET", "POST"], 11 | "parts": {} 12 | } 13 | ] 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/yamlRestTest/resources/rest-api-spec/test/elasticsearch-carrot2/00_sanity.yml: -------------------------------------------------------------------------------- 1 | "Test that the clustering plugin is loaded": 2 | - do: 3 | cat.plugins: 4 | local: true 5 | h: component 6 | 7 | - match: 8 | $body: /^elasticsearch-carrot2\n$/ 9 | 
-------------------------------------------------------------------------------- /src/yamlRestTest/resources/rest-api-spec/test/elasticsearch-carrot2/01_list_algorithms.yml: -------------------------------------------------------------------------------- 1 | "List default algorithms": 2 | - do: 3 | algorithms: {} 4 | 5 | - match: { algorithms: ["Lingo", "STC", "Bisecting K-Means"] } 6 | --------------------------------------------------------------------------------