├── scripts
│   ├── stop.sh
│   ├── restart.sh
│   ├── delete-index.sh
│   ├── remove.sh
│   ├── install.sh
│   ├── create-index.sh
│   ├── build.sh
│   ├── start.sh
│   └── add-docs.sh
├── data
│   ├── 1.json
│   ├── 2.json
│   └── biocaddie.json
├── logs.sh
├── rebuild.sh
├── .gitignore
├── src
│   ├── main
│   │   ├── resources
│   │   │   ├── plugin-descriptor.properties
│   │   │   └── stoplist.all
│   │   ├── assemblies
│   │   │   └── plugin.xml
│   │   └── java
│   │       └── org
│   │           └── nationaldataservice
│   │               └── elasticsearch
│   │                   └── rocchio
│   │                       ├── RocchioPlugin.java
│   │                       ├── RocchioException.java
│   │                       ├── RocchioExpandRestAction.java
│   │                       └── Rocchio.java
│   └── test
│       ├── java
│       │   └── org
│       │       └── nationaldataservice
│       │           └── elasticsearch
│       │               └── rocchio
│       │                   └── test
│       │                       ├── unit
│       │                       │   └── RocchioTest.java
│       │                       └── integration
│       │                           ├── RocchioIT.java
│       │                           └── AbstractITCase.java
│       └── ant
│           └── integration-tests.xml
├── test.sh
├── LICENSE
├── README.md
└── pom.xml
/scripts/stop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker stop elastic-qe-5.3.2 4 | -------------------------------------------------------------------------------- /data/1.json: -------------------------------------------------------------------------------- 1 | { 2 | "fullname": "Hello", 3 | "text": "World" 4 | } 5 | -------------------------------------------------------------------------------- /data/2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fullname": "Hello", 3 | "text": "World 2" 4 | } 5 | -------------------------------------------------------------------------------- /scripts/restart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker restart elastic-qe-5.3.2 4 | -------------------------------------------------------------------------------- /logs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker logs -f elastic-qe-5.3.2 --tail 100 4 | 5 | -------------------------------------------------------------------------------- /scripts/delete-index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -u elastic:changeme -XDELETE localhost:9200/biocaddie?pretty 4 | -------------------------------------------------------------------------------- /scripts/remove.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker exec -it elastic-qe-5.3.2 bin/elasticsearch-plugin remove rocchio 4 | -------------------------------------------------------------------------------- /rebuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | scripts/remove.sh; scripts/build.sh && scripts/install.sh && scripts/restart.sh && ./logs.sh 4 | -------------------------------------------------------------------------------- /scripts/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker exec -it elastic-qe-5.3.2 bin/elasticsearch-plugin install file:///plugin-src/target/releases/rocchio-0.0.1-SNAPSHOT.zip 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Eclipse project metadata 2 | .settings/ 3 | .classpath 4 | .project 5 | 6 | # Build output 7 | target/ 8 | 9 | # ElasticSearch data 10 | es-data/ 11 |
-------------------------------------------------------------------------------- /scripts/create-index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -u elastic:changeme -XPUT --header 'Content-Type: application/json' localhost:9200/biocaddie?pretty -d@data/biocaddie.json 4 | -------------------------------------------------------------------------------- /scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mvn clean package && exit 0 \ 4 | || echo "WARNING: No native Maven installed - using Docker instead" \ 5 | && docker run --rm -it -v $(pwd):/workspace -w /workspace maven:3 mvn clean package && exit 0 6 | 7 | exit 1 8 | -------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | description=${project.description}. 2 | version=${project.version} 3 | name=${project.artifactId} 4 | classname=org.nationaldataservice.elasticsearch.rocchio.RocchioPlugin 5 | java.version=1.8 6 | elasticsearch.version=${elasticsearch.version} -------------------------------------------------------------------------------- /scripts/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker start elastic-qe-5.3.2 && exit 0 || docker run --name=elastic-qe-5.3.2 -it -d -p 9200:9200 -v $(pwd):/plugin-src/ -v $HOME/es-5.3.2-data:/usr/share/elasticsearch/data -e "http.host=0.0.0.0" -e "transport.host=127.0.0.1" docker.elastic.co/elasticsearch/elasticsearch:5.3.2 && exit 0 4 | -------------------------------------------------------------------------------- /data/biocaddie.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "dataset": { 4 | "_all": { 5 | "type": "text", 6 | "term_vector": "with_positions_offsets_payloads", 7 | "store" : true, 8 | "analyzer" : "fulltext_analyzer" 9 | } 10 | } 11 | }, 12 | "settings": { 13 | "index" : { 14 | "number_of_shards" : 1, 15 | "number_of_replicas" : 0 16 | }, 17 | "analysis": { 18 | "analyzer": { 19 | "fulltext_analyzer": { 20 | "type": "custom", 21 | "tokenizer": "whitespace", 22 | "filter": [ 23 | "lowercase", 24 | "type_as_payload" 25 | ] 26 | } 27 | } 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Point to a specific instance of elasticsearch (defaults to Docker instance) 4 | TEST_HOST="localhost" 5 | TEST_PORT="9200" 6 | TEST_USERNAME="elastic" 7 | TEST_PASSWORD="changeme" 8 | 9 | # Specify expansion / search parameters 10 | TEST_INDEX="biocaddie" 11 | SEARCH_TYPE="dataset" 12 | TEST_QUERY="multiple+sclerosis" 13 | STOP_LIST="a+an+the+and+or+of+from+on+was+to+is+-+were+at+as+we" 14 | 15 | # Override additional parameters here 16 | ADDITIONAL_ARGS="&fbTerms=20&fbDocs=50" 17 | 18 | # Run Rocchio and return the expanded query (expansion only; no search is performed) 19 | curl -u "${TEST_USERNAME}:${TEST_PASSWORD}" ${TEST_HOST}:${TEST_PORT}/${TEST_INDEX}/${SEARCH_TYPE}/_expand'?pretty'${ADDITIONAL_ARGS}'&query='${TEST_QUERY} 20 | -------------------------------------------------------------------------------- /src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1
| <?xml version="1.0"?> 2 | <assembly> 3 |   <id>plugin</id> 4 |   <formats> 5 |     <format>zip</format> 6 |   </formats> 7 |   <includeBaseDirectory>false</includeBaseDirectory> 8 |   <files> 9 |     <file> 10 |       <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source> 11 |       <outputDirectory>elasticsearch</outputDirectory> 12 |       <filtered>true</filtered> 13 |     </file> 14 |   </files> 15 |   <dependencySets> 16 |     <dependencySet> 17 |       <outputDirectory>elasticsearch</outputDirectory> 18 |       <useProjectArtifact>true</useProjectArtifact> 19 |       <useTransitiveFiltering>true</useTransitiveFiltering> 20 |     </dependencySet> 21 |   </dependencySets> 22 | </assembly> -------------------------------------------------------------------------------- /scripts/add-docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Change this to match the path to your (unzipped) biocaddie benchmark dataset 5 | dataset_directory=$HOME/update_json_folder 6 | 7 | echo 'Started indexing!' 8 | for docid in {1..790000} 9 | do 10 | if [ "$1" == "-vvvv" ]; then 11 | echo "Indexing document: $docid" 12 | elif [ "$1" == "-vvv" -a "$(expr $docid % 10)" == "0" ]; then 13 | echo "Indexing document: $docid" 14 | elif [ "$1" == "-vv" -a "$(expr $docid % 100)" == "0" ]; then 15 | echo "Indexing document: $docid" 16 | elif [ "$1" == "-v" -a "$(expr $docid % 1000)" == "0" ]; then 17 | echo "Indexing document: $docid" 18 | elif [ "$1" != "-q" -a "$(expr $docid % 100000)" == "0" ]; then 19 | echo "Indexing document: $docid" 20 | fi 21 | 22 | curl --silent -u elastic:changeme -XPUT --header 'Content-Type: application/json' localhost:9200/biocaddie/dataset/$docid?pretty -d@$dataset_directory/$docid.json > /dev/null 23 | done 24 | 25 | echo 'Indexing complete!' 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 National Data Service 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /src/main/java/org/nationaldataservice/elasticsearch/rocchio/RocchioPlugin.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio; 2 | 3 | import java.util.Arrays; 4 | 5 | import java.util.List; 6 | import java.util.function.Supplier; 7 | 8 | import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; 9 | import org.elasticsearch.cluster.node.DiscoveryNodes; 10 | import org.elasticsearch.common.settings.ClusterSettings; 11 | import org.elasticsearch.common.settings.IndexScopedSettings; 12 | import org.elasticsearch.common.settings.Settings; 13 | import org.elasticsearch.common.settings.SettingsFilter; 14 | import org.elasticsearch.plugins.ActionPlugin; 15 | import org.elasticsearch.plugins.Plugin; 16 | import org.elasticsearch.rest.RestController; 17 | import org.elasticsearch.rest.RestHandler; 18 | 19 | public class RocchioPlugin extends Plugin implements ActionPlugin { 20 | @Override 21 | public List<RestHandler> getRestHandlers(Settings settings, RestController restController, 22 | ClusterSettings clusterSettings, IndexScopedSettings indexScopedSettings, SettingsFilter settingsFilter, 23 | IndexNameExpressionResolver indexNameExpressionResolver, Supplier<DiscoveryNodes> nodesInCluster) { 24 | return Arrays.asList(new RocchioExpandRestAction(settings, restController)); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/nationaldataservice/elasticsearch/rocchio/RocchioException.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio; 2 | 3 | 4 | public class RocchioException extends Exception { 5 | /** 6 | * Unique id to identify this {@link Exception} 7 | */ 8 | private static final long serialVersionUID = 5961496592606387768L; 9 | 10 | /** 11 | * An {@link Exception} encountered during {@link Rocchio} operations 12 | */ 13 | public RocchioException() { 14 | 15 | } 16 | 17 | /** 18 | * An {@link Exception} encountered during {@link Rocchio} operations 19 | * 20 | * @param message the {@link String} error message 21 | */ 22 | public RocchioException(String message) { 23 | super(message); 24 | } 25 | 26 | /** 27 | * An exception encountered during {@link Rocchio} operations 28 | * 29 | * @param cause the {@link Throwable} underlying cause 30 | */ 31 | public RocchioException(Throwable cause) { 32 | super(cause); 33 | } 34 | 35 | /** 36 | * An exception encountered during {@link Rocchio} operations 37 | * 38 | * @param message the {@link String} error message 39 | * @param cause the {@link Throwable} underlying cause 40 | */ 41 | public RocchioException(String message, Throwable cause) { 42 | super(message, cause); 43 | } 44 | 45 | /** 46 | * An exception encountered during {@link Rocchio} operations 47 | * 48 | * @param message the {@link String} error message 49 | * @param cause the {@link Throwable} underlying cause 50 | * @param enableSuppression a {@code boolean} indicating whether suppression is enabled 51 | * @param writableStackTrace a {@code boolean} indicating whether the stack trace is writable 52 | */ 53 | public RocchioException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 54 | super(message, cause, enableSuppression, writableStackTrace); 55 | } 56 | 57 | } 58 |
-------------------------------------------------------------------------------- /src/main/resources/stoplist.all: -------------------------------------------------------------------------------- 1 | category 2 | wikipedia 3 | http 4 | html 5 | www 6 | https 7 | com 8 | php 9 | htm 10 | free 11 | cfm 12 | asp 13 | jsp 14 | a 15 | about 16 | above 17 | according 18 | across 19 | after 20 | afterwards 21 | again 22 | against 23 | albeit 24 | all 25 | almost 26 | alone 27 | along 28 | already 29 | also 30 | although 31 | always 32 | am 33 | among 34 | amongst 35 | an 36 | and 37 | another 38 | any 39 | anybody 40 | anyhow 41 | anyone 42 | anything 43 | anyway 44 | anywhere 45 | apart 46 | are 47 | around 48 | as 49 | at 50 | av 51 | be 52 | became 53 | because 54 | become 55 | becomes 56 | becoming 57 | been 58 | before 59 | beforehand 60 | behind 61 | being 62 | below 63 | beside 64 | besides 65 | between 66 | beyond 67 | both 68 | but 69 | by 70 | can 71 | cannot 72 | canst 73 | certain 74 | cf 75 | choose 76 | contrariwise 77 | cos 78 | could 79 | cu 80 | day 81 | do 82 | does 83 | doesn't 84 | doing 85 | dost 86 | doth 87 | double 88 | down 89 | dual 90 | during 91 | each 92 | either 93 | else 94 | elsewhere 95 | enough 96 | et 97 | etc 98 | even 99 | ever 100 | every 101 | everybody 102 | everyone 103 | everything 104 | everywhere 105 | except 106 | excepted 107 | excepting 108 | exception 109 | exclude 110 | excluding 111 | exclusive 112 | far 113 | farther 114 | farthest 115 | few 116 | ff 117 | first 118 | for 119 | formerly 120 | forth 121 | forward 122 | from 123 | front 124 | further 125 | furthermore 126 | furthest 127 | get 128 | go 129 | had 130 | halves 131 | hardly 132 | has 133 | hast 134 | hath 135 | have 136 | he 137 | hence 138 | henceforth 139 | her 140 | here 141 | hereabouts 142 | hereafter 143 | hereby 144 | herein 145 | hereto 146 | hereupon 147 | hers 148 | herself 149 | him 150 | himself 151 | hindmost 152 | his 153 | hither 154 | hitherto 155 | how 156 | however 157 | howsoever 158 | i 159 | ie 160 | if 161 | in 162 | inasmuch 163 | inc 164 | include 165 | included 166 | including 167 | indeed 168 | indoors 169 | inside 170 | insomuch 171 | instead 172 | into 173 | inward 174 | inwards 175 | is 176 | it 177 | its 178 | itself 179 | just 180 | kind 181 | kg 182 | km 183 | last 184 | latter 185 | latterly 186 | less 187 | lest 188 | let 189 | like 190 | little 191 | ltd 192 | many 193 | may 194 | maybe 195 | me 196 | meantime 197 | meanwhile 198 | might 199 | moreover 200 | most 201 | mostly 202 | more 203 | mr 204 | mrs 205 | ms 206 | much 207 | must 208 | my 209 | myself 210 | namely 211 | need 212 | neither 213 | never 214 | nevertheless 215 | next 216 | no 217 | nobody 218 | none 219 | nonetheless 220 | noone 221 | nope 222 | nor 223 | not 224 | nothing 225 | notwithstanding 226 | now 227 | nowadays 228 | nowhere 229 | of 230 | off 231 | often 232 | ok 233 | on 234 | once 235 | one 236 | only 237 | onto 238 | or 239 | other 240 | others 241 | otherwise 242 | ought 243 | our 244 | ours 245 | ourselves 246 | out 247 | outside 248 | over 249 | own 250 | per 251 | perhaps 252 | plenty 253 | provide 254 | quite 255 | rather 256 | really 257 | round 258 | said 259 | sake 260 | same 261 | sang 262 | save 263 | saw 264 | see 265 | seeing 266 | seem 267 | seemed 268 | seeming 269 | seems 270 | seen 271 | seldom 272 | selves 273 | sent 274 | several 275 | shalt 276 | she 277 | should 278 | shown 279 | sideways 280 | since 281 | slept 282 | slew 283 | slung 284 | slunk 285 
| smote 286 | so 287 | some 288 | somebody 289 | somehow 290 | someone 291 | something 292 | sometime 293 | sometimes 294 | somewhat 295 | somewhere 296 | spake 297 | spat 298 | spoke 299 | spoken 300 | sprang 301 | sprung 302 | stave 303 | staves 304 | still 305 | such 306 | supposing 307 | than 308 | that 309 | the 310 | thee 311 | their 312 | them 313 | themselves 314 | then 315 | thence 316 | thenceforth 317 | there 318 | thereabout 319 | thereabouts 320 | thereafter 321 | thereby 322 | therefore 323 | therein 324 | thereof 325 | thereon 326 | thereto 327 | thereupon 328 | these 329 | they 330 | this 331 | those 332 | thou 333 | though 334 | thrice 335 | through 336 | throughout 337 | thru 338 | thus 339 | thy 340 | thyself 341 | till 342 | to 343 | together 344 | too 345 | toward 346 | towards 347 | ugh 348 | unable 349 | under 350 | underneath 351 | unless 352 | unlike 353 | until 354 | up 355 | upon 356 | upward 357 | upwards 358 | us 359 | use 360 | used 361 | using 362 | very 363 | via 364 | vs 365 | want 366 | was 367 | we 368 | week 369 | well 370 | were 371 | what 372 | whatever 373 | whatsoever 374 | when 375 | whence 376 | whenever 377 | whensoever 378 | where 379 | whereabouts 380 | whereafter 381 | whereas 382 | whereat 383 | whereby 384 | wherefore 385 | wherefrom 386 | wherein 387 | whereinto 388 | whereof 389 | whereon 390 | wheresoever 391 | whereto 392 | whereunto 393 | whereupon 394 | wherever 395 | wherewith 396 | whether 397 | whew 398 | which 399 | whichever 400 | whichsoever 401 | while 402 | whilst 403 | whither 404 | who 405 | whoa 406 | whoever 407 | whole 408 | whom 409 | whomever 410 | whomsoever 411 | whose 412 | whosoever 413 | why 414 | will 415 | wilt 416 | with 417 | within 418 | without 419 | worse 420 | worst 421 | would 422 | wow 423 | ye 424 | yet 425 | year 426 | yippee 427 | you 428 | your 429 | yours 430 | yourself 431 | yourselves -------------------------------------------------------------------------------- /src/main/java/org/nationaldataservice/elasticsearch/rocchio/RocchioExpandRestAction.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.nio.file.Files; 6 | import java.nio.file.Path; 7 | import java.nio.file.Paths; 8 | 9 | import org.apache.logging.log4j.Logger; 10 | import org.elasticsearch.client.node.NodeClient; 11 | import org.elasticsearch.common.inject.Inject; 12 | import org.elasticsearch.common.logging.ESLoggerFactory; 13 | import org.elasticsearch.common.settings.Settings; 14 | import org.elasticsearch.common.xcontent.XContentBuilder; 15 | import org.elasticsearch.common.xcontent.json.JsonXContent; 16 | import org.elasticsearch.rest.BaseRestHandler; 17 | import org.elasticsearch.rest.BytesRestResponse; 18 | import org.elasticsearch.rest.RestController; 19 | import org.elasticsearch.rest.RestRequest; 20 | import org.elasticsearch.rest.RestRequest.Method; 21 | import org.elasticsearch.rest.RestStatus; 22 | 23 | import edu.gslis.textrepresentation.FeatureVector; 24 | import joptsimple.internal.Strings; 25 | 26 | public class RocchioExpandRestAction extends BaseRestHandler { 27 | private final Logger logger = ESLoggerFactory.getLogger(RocchioExpandRestAction.class); 28 | 29 | @Inject 30 | public RocchioExpandRestAction(Settings settings, RestController controller) { 31 | super(settings); 32 | 33 | // Register your handlers here 34 | 
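// Two routes map to this handler: one with an explicit {type} in the path, and one without it (the type then falls back to the "dataset" default in prepareRequest)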
controller.registerHandler(Method.GET, "/{index}/{type}/_expand", this); 35 | controller.registerHandler(Method.GET, "/{index}/_expand", this); 36 | } 37 | 38 | /** 39 | * Helper method for throwing an error 40 | * 41 | * @param error 42 | * the String error message 43 | * @return a RestChannelConsumer to build up the error 44 | */ 45 | protected RestChannelConsumer throwError(String error) { 46 | return throwError(error, RestStatus.BAD_REQUEST); 47 | } 48 | 49 | /** 50 | * Helper method for throwing an error 51 | * 52 | * @param error 53 | * the String error message 54 | * @param status 55 | * the HTTP status to return 56 | * @return a RestChannelConsumer to build up the error 57 | */ 58 | protected RestChannelConsumer throwError(String error, RestStatus status) { 59 | this.logger.error("ERROR: " + error); 60 | return channel -> { 61 | XContentBuilder builder = JsonXContent.contentBuilder(); 62 | builder.startObject(); 63 | builder.field("error", error); 64 | builder.endObject(); 65 | channel.sendResponse(new BytesRestResponse(status, builder)); 66 | }; 67 | } 68 | 69 | @Override 70 | protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) throws IOException { 71 | this.logger.debug("Executing Rocchio expand action!"); 72 | 73 | // Required path parameter 74 | String index = request.param("index"); 75 | 76 | // Required query string parameter 77 | String query = request.param("query"); 78 | 79 | // Optional parameters, with sensible defaults 80 | String type = request.param("type", "dataset"); 81 | String field = request.param("field", "_all"); 82 | double alpha = Double.parseDouble(request.param("alpha", "0.5")); 83 | double beta = Double.parseDouble(request.param("beta", "0.5")); 84 | double k1 = Double.parseDouble(request.param("k1", "1.2")); 85 | double b = Double.parseDouble(request.param("b", "0.75")); 86 | int fbDocs = Integer.parseInt(request.param("fbDocs", "10")); 87 | int fbTerms = Integer.parseInt(request.param("fbTerms", "10")); 88 | 89 | // Optional stoplist - assumes a space-delimited string of stop words 90 | // TODO: Populate list of default stop words 91 | String stoplist = request.param("stoplist", ""); 92 | 93 | // Log the request with our full parameter set 94 | this.logger.info(String.format( 95 | "Starting RocchioExpand (index=%s, query=%s, type=%s, " 96 | + "field=%s, fbDocs=%d, fbTerms=%d, α=%.2f, β=%.2f, k1=%.2f, b=%.2f, stoplist=%s)", 97 | index, query, type, field, fbDocs, fbTerms, alpha, beta, k1, b, stoplist)); 98 | 99 | // TODO: Check that type has documents added to it? 100 | // TODO: Check that the documents in the type contain the desired field? 101 | // TODO: Check that term vectors/fields stats are available for the 102 | // desired index/type/field combination? 
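// The block below builds a Rocchio helper bound to this index/type/field, validates the request, expands the query, and serializes the expanded terms using Lucene's term^weight boost syntax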
103 | 104 | try { 105 | Rocchio rocchio = new Rocchio(client, index, type, field, alpha, beta, k1, b, stoplist); 106 | 107 | // Validate input parameters 108 | String shortCircuit = rocchio.validate(query, fbDocs, fbTerms); 109 | if (!Strings.isNullOrEmpty(shortCircuit)) { 110 | return throwError(shortCircuit); 111 | } 112 | 113 | // Expand the query 114 | this.logger.debug("Generating feedback query for (" + query + "," + fbDocs + "," + fbTerms + ")"); 115 | FeatureVector feedbackQuery = rocchio.expandQuery(query, fbDocs, fbTerms); 116 | 117 | // Format our expanded query with Lucene's boosting syntax 118 | this.logger.debug("Expanding query: " + feedbackQuery.toString()); 119 | StringBuffer expandedQuery = new StringBuffer(); 120 | String separator = ""; // start out with no separator 121 | 122 | for (String term : feedbackQuery.getFeatures()) { 123 | expandedQuery.append(separator + term + "^" + feedbackQuery.getFeatureWeight(term)); 124 | separator = " "; // add separator after first iteration 125 | } 126 | 127 | String fullQuery = expandedQuery.toString().trim(); 128 | 129 | // Return the expanded query (don't actually perform the search) 130 | this.logger.debug("Responding: " + expandedQuery.toString()); 131 | return channel -> { 132 | XContentBuilder builder = JsonXContent.contentBuilder(); 133 | builder.startObject(); 134 | 135 | builder.field("query", fullQuery); 136 | builder.endObject(); 137 | channel.sendResponse(new BytesRestResponse(RestStatus.OK, builder)); 138 | }; 139 | } catch (Exception e) { 140 | // FIXME: Catching generic Exception is bad practice 141 | // TODO: make this more specific for production 142 | String errorMessage = e.getMessage(); 143 | if (Strings.isNullOrEmpty(errorMessage)) { 144 | errorMessage = "An unknown error was encountered."; 145 | } 146 | return throwError(errorMessage); 147 | } 148 | } 149 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rocchio expansion for ElasticSearch 2 | 3 | NDS bioCADDIE 4 | 5 | This is a prototype plugin for ElasticSearch 5.x that adds Rocchio-based query expansion support using BM25 similarity. The plugin adds an ``_expand`` REST endpoint to ElasticSearch that returns a "[query string query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html)" with Lucene-style term weights. This plugin was developed as part of the NDS [bioCADDIE pilot](https://biocaddie.org/expansion-models-biomedical-data-search). 6 | 7 | ## Why Rocchio? 8 | Our original goal was to implement relevance model (RM) based expansion using Lucene's language modeling similarity implementations. Our investigations revealed that [Lucene's language modeling implementation is incomplete](https://issues.apache.org/jira/browse/LUCENE-5847) and may not be suitable for use with RM. Given Lucene's origins as a vector-space implementation and its current default BM25 scorer, we opted to instead implement Rocchio-style expansion. While Rocchio expansion was not originally intended for use with the BM25 retrieval model, it has proven effective.
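For background, the classic Rocchio update (sketched here from the standard textbook formulation; the exact weighting lives in `Rocchio.java`) computes the expanded query as a weighted combination of the original query vector and the centroid of the top-ranked feedback documents:

```
q' = alpha * q + (beta / |D|) * sum of d, over all d in D
```

where `alpha` and `beta` are the weights exposed as request parameters below, `D` is the set of the top `fbDocs` documents retrieved for the original query, each document vector `d` is built from BM25 term weights (parameters `k1` and `b`), and only the `fbTerms` highest-weighted terms are kept in the result.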
9 | 10 | ## REST Interface 11 | 12 | Endpoint: 13 | ``/{index}/_expand`` (or ``/{index}/{type}/_expand`` to specify the document type in the path) 14 | 15 | Parameters: 16 | * ``type``: Document type, defaults to ``dataset`` 17 | * ``field``: Field to search, defaults to ``_all`` 18 | * ``alpha``: Original query weight, defaults to 0.5 19 | * ``beta``: Feedback query weight, defaults to 0.5 20 | * ``k1``: BM25 k1 parameter, defaults to 1.2 21 | * ``b``: BM25 b parameter, defaults to 0.75 22 | * ``fbDocs``: Number of feedback documents, defaults to 10 23 | * ``fbTerms``: Number of feedback terms, defaults to 10 24 | * ``stoplist``: Additional stoplist terms (modifies the primary stoplist) 25 | * ``query``: Query to expand 26 | 27 | The expand endpoint returns a JSON object with the expanded query in "[query string query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html)" format with each expansion term and the associated expansion weight: 28 | ``` 29 | { 30 | "query": "term1^weight1 term2^weight2 ..." 31 | } 32 | ``` 33 | 34 | This query can be used with the standard ElasticSearch ``_search`` endpoint: 35 | ``` 36 | curl -XGET 'localhost:9200/biocaddie/_search?pretty' -H 'Content-Type: application/json' -d' 37 | { 38 | "query": { 39 | "query_string" : { 40 | "default_field" : "_all", 41 | "query" : "term1^weight1 term2^weight2" 42 | } 43 | } 44 | } 45 | ' 46 | ``` 47 | 48 | 49 | 50 | 51 | 52 | ## Prerequisites 53 | 54 | * ElasticSearch 5.3.2 (native or via Docker) 55 | * Git + Maven (native or via Docker) 56 | * An ElasticSearch index with stored term vectors (see [Setup](README.md#setup)) 57 | 58 | ## Installing from OSSRH 59 | You can install the plugin using the following command: 60 | ```bash 61 | bin/elasticsearch-plugin install https://oss.sonatype.org/content/repositories/snapshots/edu/illinois/lis/queryexpansion/5.3.2-SNAPSHOT/queryexpansion-5.3.2-20170726.231658-1.zip 62 | ``` 63 | 64 | NOTE: You can check https://oss.sonatype.org/content/repositories/snapshots/edu/illinois/lis/queryexpansion/5.3.2-SNAPSHOT for a link to the newest `.zip` file. 65 | 66 | ## Building From Source 67 | Clone this repository, build the plugin, and install the resulting `.zip`: 68 | ```bash 69 | git clone https://github.com/nds-org/elasticsearch-queryexpansion-plugin.git queryexpansion && cd queryexpansion 70 | mvn package 71 | bin/elasticsearch-plugin install file:///path/to/elasticsearch-queryexpansion-plugin/target/releases/queryexpansion-5.3.2-SNAPSHOT.zip 72 | ``` 73 | 74 | 75 | ## Example usage 76 | 77 | The repository includes several scripts demonstrating how to install and use the plugin via Docker: 78 | 79 | 1. [Setup](README.md#setup) 80 | 2. [Build](README.md#build) 81 | 3. [Load](README.md#load) 82 | 4. [Test](README.md#test) 83 | 84 | ### Setup 85 | The following steps demonstrate how to build an ElasticSearch index from the bioCADDIE test collection. 86 | 87 | Make sure that the biocaddie benchmark test dataset exists somewhere on disk: 88 | ```bash 89 | cd $HOME 90 | wget https://biocaddie.org/sites/default/files/update_json_folder.zip && unzip update_json_folder.zip 91 | ``` 92 | 93 | Start ElasticSearch natively, or run ElasticSearch 5.3.2 via Docker using the helper script: 94 | ```bash 95 | ./scripts/start.sh 96 | ``` 97 | 98 | Create an index with the required parameters (`store: true`, with term vectors enabled): 99 | ```bash 100 | ./scripts/create-index.sh 101 | ``` 102 | 103 | NOTE: You may need to modify *dataset_directory* in `./scripts/add-docs.sh` if your benchmark data is not located within `$HOME`.
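Under the hood, `add-docs.sh` simply PUTs each benchmark file as its own document. A single-document equivalent of what the script does (assuming the default `elastic:changeme` credentials used throughout the helper scripts) looks like:

```bash
curl -u elastic:changeme -XPUT --header 'Content-Type: application/json' \
     localhost:9200/biocaddie/dataset/1?pretty -d@$HOME/update_json_folder/1.json
```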
104 | 105 | Finally, use the helper script to add the documents to the index: 106 | ```bash 107 | ./scripts/add-docs.sh 108 | ``` 109 | 110 | NOTE: Indexing the full benchmark set can take a long time. If you only need a small subset of the documents, you can always `Ctrl+C` once you get the desired number of records indexed. 111 | 112 | ### Build 113 | The following helper script will build the plugin using Maven (or using Docker if Maven is not installed): 114 | ```bash 115 | ./scripts/build.sh 116 | ``` 117 | Either way, the build should produce a `target/releases/` directory containing the necessary `.zip` file. 118 | 119 | The `.zip` that ElasticSearch needs should be found at `./target/releases/rocchio-0.0.1-SNAPSHOT.zip`. 120 | 121 | ### Load 122 | Once the artifacts are built, we just need to install them and restart ElasticSearch. The following helper scripts assume that you are running ElasticSearch via Docker: 123 | ```bash 124 | ./scripts/install.sh 125 | ./scripts/restart.sh 126 | ``` 127 | 128 | ### Test 129 | You should now be able to test the new endpoint using the helper script or via raw `curl`: 130 | ```bash 131 | $ ./test.sh 132 | {"query":"sclerosis^2.798773920190095 study^0.4716440174771813 disease^0.584064093901503 or^0.3394485958568884 patients^0.79730633189081 multiple^1.941784058395449 was^0.4222225922753828 is^0.38702376034952857 to^0.4432445617796595 on^0.3817563584164061"} 133 | ``` 134 | 135 | You can check the container logs to see what happened under the covers: 136 | ```bash 137 | $ ./logs.sh 138 | ... 139 | [2017-07-01T04:54:54,007][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded module [reindex] 140 | [2017-07-01T04:54:54,008][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded module [transport-netty3] 141 | [2017-07-01T04:54:54,008][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded module [transport-netty4] 142 | [2017-07-01T04:54:54,009][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded plugin [queryexpansion] 143 | [2017-07-01T04:54:54,009][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded plugin [x-pack] 144 | [2017-07-01T04:55:00,722][INFO ][o.e.n.Node ] initialized 145 | [2017-07-01T04:55:00,744][INFO ][o.e.n.Node ] [lmIsnX7] starting ...
146 | [2017-07-01T04:55:01,467][WARN ][i.n.u.i.MacAddressUtil ] Failed to find a usable hardware address from the network interfaces; using random bytes: f8:2c:c0:8c:3e:88:3b:3b 147 | [2017-07-01T04:55:01,695][INFO ][o.e.t.TransportService ] [lmIsnX7] publish_address {127.0.0.1:9300}, bound_addresses {127.0.0.1:9300} 148 | [2017-07-01T04:55:02,082][INFO ][o.e.m.j.JvmGcMonitorService] [lmIsnX7] [gc][1] overhead, spent [260ms] collecting in the last [1s] 149 | [2017-07-01T04:55:05,179][INFO ][o.e.c.s.ClusterService ] [lmIsnX7] new_master {lmIsnX7}{lmIsnX7NRH2_Vmq6avBitQ}{iyWg9zTcQqCeF97xX-hdJQ}{127.0.0.1}{127.0.0.1:9300}, reason: zen-disco-elected-as-master ([0] nodes joined) 150 | [2017-07-01T04:55:05,305][INFO ][o.e.x.s.t.n.SecurityNetty4HttpServerTransport] [lmIsnX7] publish_address {172.17.0.2:9200}, bound_addresses {[::]:9200} 151 | [2017-07-01T04:55:05,318][INFO ][o.e.n.Node ] [lmIsnX7] started 152 | [2017-07-01T04:55:06,492][INFO ][o.e.l.LicenseService ] [lmIsnX7] license [0a8ce788-74ad-49d9-aa3c-3c46ab9100d8] mode [trial] - valid 153 | [2017-07-01T04:55:06,513][INFO ][o.e.g.GatewayService ] [lmIsnX7] recovered [4] indices into cluster_state 154 | [2017-07-01T04:55:08,078][INFO ][o.e.c.r.a.AllocationService] [lmIsnX7] Cluster health status changed from [RED] to [YELLOW] (reason: [shards started [[.monitoring-es-2-2017.07.01][0], [biocaddie][0]] ...]). 155 | [2017-07-01T04:55:13,088][INFO ][o.n.e.r.RocchioExpandRestAction] [lmIsnX7] Starting Rocchio (biocaddie,multiple sclerosis,dataset,_all,10,10,0.50,0.50,1.20,0.75) 156 | ... 157 | ``` 158 | 159 | ## Helper Scripts 160 | A few other helper scripts are included to ease testing: 161 | ```bash 162 | ./scripts/start.sh # Runs or starts your elasticsearch container 163 | ./scripts/stop.sh # Stops your elasticsearch container 164 | ./scripts/restart.sh # Restarts your elasticsearch container 165 | ./scripts/create-index.sh # Creates a test index with the proper settings to enable storing term vectors 166 | ./scripts/add-docs.sh [-v] # Adds documents from the biocaddie benchmark set to your index (assumes correct paths) 167 | ./scripts/delete-index.sh # Deletes your container's test index and the records within 168 | ./scripts/build.sh # Builds the elasticsearch plugin artifacts 169 | ./scripts/install.sh # Installs the elasticsearch plugin into your running container 170 | ./scripts/remove.sh # Removes your container's installed queryexpansion plugin 171 | ./rebuild.sh # Removes the current plugin, builds the artifacts, installs the new plugin, and restarts elasticsearch to facilitate rapid development and testing 172 | ./logs.sh # View your elasticsearch container logs (tail=100) 173 | ./test.sh [search] # Performs a test query against our REST API endpoint (only expands by default, but searches if first parameter is "search") 174 | ``` 175 | 176 | ## Deploying artifacts 177 | New artifacts can be deployed to OSSRH using the following command: 178 | ```bash 179 | GPG_TTY=$(tty) mvn clean deploy 180 | ``` 181 | -------------------------------------------------------------------------------- /src/test/java/org/nationaldataservice/elasticsearch/rocchio/test/unit/RocchioTest.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio.test.unit; 2 | 3 | import static org.junit.Assert.*; 4 | import static org.mockito.Mockito.*; 5 | import java.io.IOException; 6 | import java.util.HashMap; 7 | import java.util.LinkedHashMap; 8 | import java.util.Map; 9 | 10 | import org.elasticsearch.search.SearchHits;
11 | import org.elasticsearch.search.SearchHit; 12 | import org.apache.lucene.index.Fields; 13 | import org.apache.lucene.index.Terms; 14 | import org.apache.lucene.index.TermsEnum; 15 | import org.apache.lucene.util.BytesRef; 16 | import org.elasticsearch.action.ActionFuture; 17 | import org.elasticsearch.action.ListenableActionFuture; 18 | import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse; 19 | import org.elasticsearch.action.search.SearchRequestBuilder; 20 | import org.elasticsearch.action.search.SearchResponse; 21 | import org.elasticsearch.action.termvectors.MultiTermVectorsItemResponse; 22 | import org.elasticsearch.action.termvectors.MultiTermVectorsRequestBuilder; 23 | import org.elasticsearch.action.termvectors.MultiTermVectorsResponse; 24 | import org.elasticsearch.action.termvectors.TermVectorsResponse; 25 | import org.elasticsearch.client.AdminClient; 26 | import org.elasticsearch.client.Client; 27 | import org.elasticsearch.client.ClusterAdminClient; 28 | import org.elasticsearch.cluster.ClusterState; 29 | import org.elasticsearch.cluster.metadata.IndexMetaData; 30 | import org.elasticsearch.cluster.metadata.MappingMetaData; 31 | import org.elasticsearch.cluster.metadata.MetaData; 32 | import org.elasticsearch.common.collect.ImmutableOpenMap; 33 | import org.elasticsearch.index.query.QueryStringQueryBuilder; 34 | import org.junit.After; 35 | import org.junit.Before; 36 | import org.junit.Test; 37 | import org.junit.runner.RunWith; 38 | import org.mockito.runners.MockitoJUnitRunner; 39 | import org.nationaldataservice.elasticsearch.rocchio.Rocchio; 40 | 41 | import edu.gslis.textrepresentation.FeatureVector; 42 | 43 | /** 44 | * This is a simple unit test suite for the Rocchio ElasticSearch Plugin. Use 45 | * these test cases to verify correctness of the query expansion process. You 46 | * can also vary the parameters here to see how that affects the resulting 47 | * expansion. All ElasticSearch internals have been mocked with Mockito to 48 | * return fake data. 
49 | * 50 | * 51 | * @author lambert8 52 | * 53 | */ 54 | @RunWith(MockitoJUnitRunner.class) 55 | public class RocchioTest { 56 | /** The Rocchio instance to test */ 57 | private Rocchio rocchio; 58 | 59 | // The common test parameter set (individual tests can still use one-off 60 | // values) 61 | private static final String TEST_INDEX = "biocaddie"; 62 | private static final String TEST_QUERY = "rat"; 63 | private static final String TEST_TYPE = "dataset"; 64 | private static final String TEST_FIELD = "_all"; 65 | private static final int TEST_FB_TERMS = 10; 66 | private static final int TEST_FB_DOCS = 50; 67 | private static final double TEST_ALPHA = 0.5; 68 | private static final double TEST_BETA = 0.5; 69 | private static final double TEST_K1 = 1.2; 70 | private static final double TEST_B = 0.75; 71 | 72 | // Mock out all of the ElasticSearch internals 73 | private static final Client client = mock(Client.class); 74 | 75 | @SuppressWarnings("unchecked") 76 | private static final ActionFuture<ClusterStateResponse> clusterStateFuture = (ActionFuture<ClusterStateResponse>) mock(ActionFuture.class); 77 | private static final AdminClient adminClient = mock(AdminClient.class); 78 | private static final ClusterAdminClient clusterAdminClient = mock(ClusterAdminClient.class); 79 | private static final ClusterState clusterState = mock(ClusterState.class); 80 | private static final ClusterStateResponse clusterStateResponse = mock(ClusterStateResponse.class); 81 | private static final MetaData clusterMetadata = mock(MetaData.class); 82 | private static final IndexMetaData mockIndexMetaData = mock(IndexMetaData.class); 83 | 84 | @SuppressWarnings("unchecked") 85 | private static final ListenableActionFuture<MultiTermVectorsResponse> mockMtvFuture = mock(ListenableActionFuture.class); 86 | private static final MultiTermVectorsResponse mockMtvResponse = mock(MultiTermVectorsResponse.class); 87 | private static final TermVectorsResponse mockTvResponse = mock(TermVectorsResponse.class); 88 | private static final MultiTermVectorsItemResponse mockMtvItemResponse = mock(MultiTermVectorsItemResponse.class); 89 | private static final MultiTermVectorsRequestBuilder mockMtvBuilder = mock(MultiTermVectorsRequestBuilder.class); 90 | private static final Fields mockFields = mock(Fields.class); 91 | private static final Terms mockTerms = mock(Terms.class); 92 | private static final MultiTermVectorsItemResponse[] mockMtvItemResponses = { mockMtvItemResponse }; 93 | 94 | @SuppressWarnings("unchecked") 95 | private static final ListenableActionFuture<SearchResponse> mockSearchFuture = mock(ListenableActionFuture.class); 96 | private static final SearchRequestBuilder srBuilder = mock(SearchRequestBuilder.class); 97 | private static final SearchResponse mockSearchResponse = mock(SearchResponse.class); 98 | 99 | // These are used internally, but are overridden by later mocks (see TermsEnum iteration) 100 | private static final SearchHits hits = mock(SearchHits.class); 101 | private static final SearchHit hit1 = mock(SearchHit.class); 102 | private static final SearchHit hit2 = mock(SearchHit.class); 103 | private static final SearchHit hit3 = mock(SearchHit.class); 104 | private static final SearchHit[] hitsArray = { hit1, hit2, hit3 }; 105 | 106 | private static final TermsEnum mockIterator = mock(TermsEnum.class); 107 | 108 | // The index mapping metadata and sub-mappings 109 | private static final MappingMetaData mockTypeMetadata = mock(MappingMetaData.class); 110 | private static final ImmutableOpenMap<String, MappingMetaData> indexMappingMetadata; 111 | private static final LinkedHashMap<String, Object> fieldPropertiesMap = new LinkedHashMap<String, Object>(); 112 | private static final LinkedHashMap<String, Object> typePropertiesMap = new LinkedHashMap<String, Object>(); 113 | private static final LinkedHashMap<String, Object> typeMap = new LinkedHashMap<String, Object>(); 114 | private static final Map<String, MappingMetaData> typeMetadataMapping = new HashMap<>(); 115 | 116 | // FIXME: finish mocking out iterator and expand 117 | private static final BytesRef termRef = new BytesRef("rat"); 118 | 119 | /** Static initializer: set up all required test data and mocks */ 120 | static { 121 | // Build up our properties mapping: { "store": true } object 122 | fieldPropertiesMap.put("store", true); 123 | 124 | // Build up our test field mapping with the properties map 125 | typePropertiesMap.put(TEST_FIELD, fieldPropertiesMap); 126 | 127 | // Build up our test type mapping from the test field mapping 128 | typeMap.put("properties", typePropertiesMap); 129 | typeMap.put("_all", fieldPropertiesMap); 130 | 131 | // Build up our test type mapping of the type metadata 132 | typeMetadataMapping.put(TEST_TYPE, mockTypeMetadata); 133 | 134 | // Build up our index mapping from the type mapping 135 | indexMappingMetadata = new ImmutableOpenMap.Builder<String, MappingMetaData>().putAll(typeMetadataMapping).build(); 136 | 137 | try { 138 | // Mock out ElasticSearch index mapping verification 139 | when(client.admin()).thenReturn(adminClient); 140 | when(adminClient.cluster()).thenReturn(clusterAdminClient); 141 | when(clusterAdminClient.state(any())).thenReturn(clusterStateFuture); 142 | when(clusterStateFuture.actionGet()).thenReturn(clusterStateResponse); 143 | when(clusterStateResponse.getState()).thenReturn(clusterState); 144 | when(clusterState.getMetaData()).thenReturn(clusterMetadata); 145 | when(clusterMetadata.index(anyString())).thenReturn(mockIndexMetaData); 146 | when(mockIndexMetaData.getMappings()).thenReturn(indexMappingMetadata); 147 | when(mockTypeMetadata.getSourceAsMap()).thenReturn(typeMap); 148 | 149 | // Mock out ElasticSearch Search 150 | when(client.prepareSearch(anyString())).thenReturn(srBuilder); 151 | when(srBuilder.setQuery(any(QueryStringQueryBuilder.class))).thenReturn(srBuilder); 152 | when(srBuilder.setSize(anyInt())).thenReturn(srBuilder); 153 | when(srBuilder.execute()).thenReturn(mockSearchFuture); 154 | when(mockSearchFuture.actionGet()).thenReturn(mockSearchResponse); 155 | when(mockSearchResponse.getHits()).thenReturn(hits); 156 | when(hits.getHits()).thenReturn(hitsArray); 157 | when(hits.hits()).thenReturn(hitsArray); 158 | 159 | // These are used internally, but are likely 160 | // overridden by later mocks (see TermsEnum iteration) 161 | when(hits.totalHits()).thenReturn(Long.valueOf(3)); 162 | when(hits.getTotalHits()).thenReturn(Long.valueOf(3)); 163 | 164 | // Mock out ElasticSearch MultiTermVector Fields/Terms 165 | when(mockMtvBuilder.execute()).thenReturn(mockMtvFuture); 166 | when(mockMtvFuture.actionGet()).thenReturn(mockMtvResponse); 167 | when(mockMtvBuilder.add(any())).thenReturn(mockMtvBuilder); 168 | when(client.prepareMultiTermVectors()).thenReturn(mockMtvBuilder); 169 | when(mockMtvItemResponse.getResponse()).thenReturn(mockTvResponse); 170 | when(mockMtvResponse.getResponses()).thenReturn(mockMtvItemResponses); 171 | 172 | // FIXME: The two sections below return completely arbitrary values 173 | // and should be updated to something more sane 174 | // Mock out Lucene Fields/Terms 175 | when(mockTvResponse.getFields()).thenReturn(mockFields); 176 | when(mockFields.terms(TEST_FIELD)).thenReturn(mockTerms); 177 | when(mockTerms.getDocCount()).thenReturn(10); 178 |
when(mockTerms.getSumTotalTermFreq()).thenReturn(10L); 179 | when(mockTerms.iterator()).thenReturn(mockIterator); 180 | 181 | // Mock out Lucene TermsEnum iteration 182 | when(mockIterator.next()).thenReturn(termRef).thenReturn(null); 183 | when(mockIterator.totalTermFreq()).thenReturn(10L); 184 | when(mockIterator.docFreq()).thenReturn(10); 185 | when(mockIterator.term()).thenReturn(termRef); 186 | } catch (IOException e) { 187 | e.printStackTrace(); 188 | fail(); 189 | } 190 | }; 191 | 192 | @Before 193 | /** Set up our test Rocchio implementation */ 194 | public void setUp() throws IOException { 195 | this.rocchio = new Rocchio(client, TEST_INDEX, TEST_TYPE, TEST_FIELD, TEST_ALPHA, TEST_BETA, TEST_K1, TEST_B); 196 | } 197 | 198 | @After 199 | /** Tear down our test Rocchio implementation */ 200 | public void tearDown() { 201 | this.rocchio = null; 202 | } 203 | 204 | @Test 205 | /** Test that validate properly returns null if all parameters are valid */ 206 | public void testValidate() throws IOException { 207 | String shouldBeNull = rocchio.validate(TEST_QUERY, TEST_FB_DOCS, TEST_FB_TERMS); 208 | assertNull(shouldBeNull); 209 | } 210 | 211 | @Test 212 | /** Test that validate fails when query is null */ 213 | public void testValidateInvalidQuery() throws IOException { 214 | String errorMessage = rocchio.validate("", TEST_FB_DOCS, TEST_FB_TERMS); 215 | assertNotNull(errorMessage); 216 | assertEquals(Rocchio.NULL_QUERY_ERROR, errorMessage); 217 | } 218 | 219 | @Test 220 | /** Test that validate fails when fbDocs < 1 */ 221 | public void testValidateInvalidFeedbackDocuments() throws IOException { 222 | String errorMessage = rocchio.validate(TEST_QUERY, 0, TEST_FB_TERMS); 223 | assertNotNull(errorMessage); 224 | assertEquals(Rocchio.INVALID_FB_DOCS_ERROR, errorMessage); 225 | } 226 | 227 | @Test 228 | /** Test that validate fails when fbTerms < 1 */ 229 | public void testValidateInvalidFeedbackTerms() throws IOException { 230 | String errorMessage = rocchio.validate(TEST_QUERY, TEST_FB_DOCS, 0); 231 | assertNotNull(errorMessage); 232 | assertEquals(Rocchio.INVALID_FB_TERMS_ERROR, errorMessage); 233 | } 234 | 235 | @Test 236 | /** Test that we can expand a query against the test index */ 237 | public void testExpandQuery() throws IOException { 238 | // Expand the query 239 | FeatureVector feedbackQuery = rocchio.expandQuery(TEST_QUERY, TEST_FB_DOCS, TEST_FB_TERMS); 240 | 241 | // Verify expanded segments 242 | String[] segments = feedbackQuery.toString().trim().split(" "); 243 | assertEquals(2, segments.length); 244 | assertEquals("0.012976521", segments[0]); 245 | assertEquals("rat", segments[1]); 246 | } 247 | } 248 | -------------------------------------------------------------------------------- /src/test/ant/integration-tests.xml: -------------------------------------------------------------------------------- <!-- The XML markup of this Ant script was stripped when this dump was generated; only text fragments survived ("Starting up external cluster...", "Waiting for elasticsearch to become available on port @{port}...", "Installing plugin @{name}...", "External node started PID ${integ.pid}", "Shutting down external node PID ${integ.pid}"). Judging from those fragments, the script starts an external ElasticSearch node, installs the plugin zip, waits for the HTTP port to come up, runs the integration tests, and shuts the node down afterward. --> -------------------------------------------------------------------------------- /src/test/java/org/nationaldataservice/elasticsearch/rocchio/test/integration/RocchioIT.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio.test.integration; 2 | 3 | import org.junit.BeforeClass; 4 | import org.junit.Test; 5 | import static org.hamcrest.Matchers.*; 6 | import static org.junit.Assert.*; 7 | 8 | import java.util.HashMap; 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | import org.apache.http.entity.StringEntity; 13 | import org.apache.logging.log4j.Logger; 14 | import org.elasticsearch.client.Response; 15 | import org.elasticsearch.common.logging.ESLoggerFactory; 16 | 17 | /** 18 | * This is a simple integration test suite for the ElasticSearch Rocchio 19 | * Plugin. Use these test cases to verify correctness of the API endpoint and 20 | * input validation, to compare performance, for scale testing, etc.
21 | * Before the test suite runs, the test runner will: 22 | * 23 | *
 24 |  *    * Download ElasticSearch binaries
 25 |  *    * Install the ElasticSearch Rocchio Plugin
 26 |  *    * Start up an ElasticSearch cluster
 27 |  *    * Ensure that the TEST_INDEX has been created
 28 |  *    * Ensure that TEST_INDEX contains some test documents
 29 |  *    * Run the set of test cases
 30 |  *    * Tear down the cluster
 31 |  * 
32 | * 33 | * @see {@link AbstractITCase} 34 | * @see src/test/ant/integration-tests.xml 35 | * 36 | * @author lambert8 37 | * 38 | */ 39 | public class RocchioIT extends AbstractITCase { 40 | private static final Logger staticLogger = ESLoggerFactory.getLogger(RocchioIT.class); 41 | 42 | // The common test parameter set (individual tests can still use one-off 43 | // values) 44 | private static final String TEST_INDEX = "biocaddie"; 45 | private static final String TEST_TYPE = "dataset"; 46 | private static final int TEST_FB_TERMS = 10; 47 | private static final int TEST_FB_DOCS = 5; 48 | 49 | private final String defaultEndpointParameters = "fbTerms=" + TEST_FB_TERMS + "&fbDocs=" + TEST_FB_DOCS; 50 | private final String expandEndpoint = String.format("/%s/%s/_expand?%s", TEST_INDEX, TEST_TYPE, 51 | defaultEndpointParameters); 52 | 53 | // TODO: Improve expectations 54 | private final String EXPECTED_EXPANDED_QUERY_OBJECT = "{query=dorsal^0.09029725274935405 rat^0.7267361001145776 aging-associated^0.09029725274935405 root^0.09029725274935405 bladder^0.09029725274935405 effect^0.09029725274935405 ganglia^0.09029725274935405 oxidative^0.09029725274935405 urinary^0.09029725274935405 preventive^0.09029725274935405}"; 55 | private final String EXPECTED_EXPANDED_QUERY_STRING = "dorsal^0.09029725274935405 rat^0.7267361001145776 aging-associated^0.09029725274935405 root^0.09029725274935405 bladder^0.09029725274935405 effect^0.09029725274935405 ganglia^0.09029725274935405 oxidative^0.09029725274935405 urinary^0.09029725274935405 preventive^0.09029725274935405"; 56 | private final String EXPECTED_SEARCH_HITS = "{_shards={total=1, failed=0, successful=1}, hits={hits=[{_index=biocaddie, _type=dataset, _source={DOCNO=1, REPOSITORY=arrayexpress_020916, TITLE=The Sinorhizobium meliloti SyrM regulon: effects on global gene expression are mediated by syrA and nodD3 (SyrA), METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-04, description=We characterized transcriptomes of a strain overexpressing syrA. Our work shows that the syrA transcriptome shares similar gene expression changes to the syrM and nodD3 transcriptomes and that nodD3 and syrA may be the only targets directly activated by SyrM. We propose that most of the gene expression changes observed when nodD3 is overexpressed are due to NodD3 activation of syrM expression, which in turn stimulates SyrM activation of syrA expression. The subsequent increase in SyrA abundance alters activity of the ChvI-ExoS-ExoR circuit, resulting in broad changes in gene expression. Gene expression profiling of Sinorhizobium meliloti overexpressing syrA was performed using custom Affymetrix GeneChips, ID=520401, title=The Sinorhizobium meliloti SyrM regulon: effects on global gene expression are mediated by syrA and nodD3 (SyrA), experimentType=transcription profiling by array}, organism={experiment={species=Sinorhizobium meliloti}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=1, _score=1.0}, {_index=biocaddie, _type=dataset, _source={DOCNO=2, REPOSITORY=arrayexpress_020916, TITLE=RelA Nuclear factor-kappaB (NF-kB) Subunit binding Loci in Promoter Regions of PHM1-31 Myometrial Smooth Muscle Cells (Promoter), METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-05, description=A study to define the binding loci of RelA-containing NF-kappaB dimers in a human myometrial smooth muscle cell line after exposure to TNF. 
Monolayers of PHM1-31 cells were exposed to TNF (10ng/ml) for 1 hour or left unstimulated. The Chromatin immunoprecipitation (ChIP) assay was performed to recover RelA-bound chromatin or non-specifically bound chromatin with IgG. That chromatin was prepared and used to probe Affymetrix GeneChIP 1.0R Human Promoter arrays. Three biological replicates of each experiment were conducted. Datasets were subsequently analysed in Partek Genomics Suite V6.6 where baseline was normalised by subtraction of IgG values from conrresponding RelA-immunoprecipitated samples. Control samples immunoprecipitated with RelA were then compared with TNF-stimulated samples immunoprecipitated with RelA., ID=520482, title=RelA Nuclear factor-kappaB (NF-kB) Subunit binding Loci in Promoter Regions of PHM1-31 Myometrial Smooth Muscle Cells (Promoter), experimentType=ChIP-chip by tiling array}, organism={experiment={species=Homo sapiens}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=2, _score=1.0}, {_index=biocaddie, _type=dataset, _source={DOCNO=3, REPOSITORY=arrayexpress_020916, TITLE=Aging-associated inflammatory and oxidative changes in the rat urinary bladder and dorsal root ganglia - preventive effect of caloric restriction, METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-04, description=This SuperSeries is composed of the SubSeries listed below. Refer to individual Series, ID=520420, title=Aging-associated inflammatory and oxidative changes in the rat urinary bladder and dorsal root ganglia - preventive effect of caloric restriction, experimentType=transcription profiling by array}, organism={experiment={species=Rattus norvegicus}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=3, _score=1.0}, {_index=biocaddie, _type=dataset, _source={DOCNO=4, REPOSITORY=arrayexpress_020916, TITLE=Gene expression profile in Caco-2 cells treated with carnosine, METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-04, description=To reveal the effects of carnosine on Caco-2 cells, we have employed whole genome microarray to detect genes that showed significantly different expression when exposed to carnosine. Caco-2 cells were treated with 1 mM carnosine for 3 days. Caco-2 cells were treated with 1 mM carnosine for 3 days. Three independent experiments were performed., ID=520441, title=Gene expression profile in Caco-2 cells treated with carnosine, experimentType=transcription profiling by array}, organism={experiment={species=Homo sapiens}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=4, _score=1.0}, {_index=biocaddie, _type=dataset, _source={DOCNO=5, REPOSITORY=arrayexpress_020916, TITLE=Mecp2: an unexpected regulator of macrophage gene expression and function [ChIP-Seq], METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-04, description=Mutations in methyl-CpG-binding protein 2 (MeCP2), a major epigenetic regulator, are the predominant cause of Rett syndrome. We previously found that Mecp2-null microglia are deficient in phagocytic ability, and that engraftment of wild-type monocytes into the brain of Mecp2-deficient mice attenuates pathology. We have observed that Mecp2 deficiency is associated with increased levels of histone acetylation at the cis-regulatory regions of the Mecp2-regulated genes in macrophages. 
We hypothesized that Mecp2 recruits protein complexes containing histone deacetylases (HDACs) to repress the expression of its target genes. Our ChIP-Seq studies in bone-marrow derived macrophages revealed that Mecp2 co-localizes with Ncor2/Hdac3 protein complex at cis-regulatory regions of the target genes. These results suggest a role for Mecp2 in the recruitment and regulation of Ncor2/Hdac3 repressosome that plays a critical role in the regulation of inflammatory responses in macrophages. Examination of NCOR2 and HDAC3 genome-wide location in bone-marrow derived macrophages., ID=520444, title=Mecp2: an unexpected regulator of macrophage gene expression and function [ChIP-Seq], experimentType=ChIP-seq}, organism={experiment={species=Mus musculus}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=5, _score=1.0}], total=5, max_score=1.0}, took=1, timed_out=false}"; 57 | 58 | @BeforeClass 59 | public static void setUp() { 60 | // Ensure that the index exists 61 | staticLogger.info("Setting up test environment!"); 62 | createIndex(TEST_INDEX); 63 | 64 | // Add the test documents to the index 65 | for (int i = 1; i <= 5; i++) { 66 | addDocument(TEST_INDEX, TEST_TYPE, i, DOCUMENTS_JSON[i - 1]); 67 | } 68 | 69 | // Tests will fail if we don't wait for ES to index the new documents 70 | staticLogger.info("Waiting for ES to finish indexing documents..."); 71 | wait(3000); 72 | } 73 | 74 | @Test 75 | @SuppressWarnings("unchecked") 76 | public void testPluginIsLoaded() throws Exception { 77 | 78 | Response response = client.performRequest("GET", "/_nodes/plugins"); 79 | 80 | Map<String, Object> nodes = (Map<String, Object>) entityAsMap(response).get("nodes"); 81 | for (String nodeName : nodes.keySet()) { 82 | boolean pluginFound = false; 83 | Map<String, Object> node = (Map<String, Object>) nodes.get(nodeName); 84 | List<Map<String, Object>> plugins = (List<Map<String, Object>>) node.get("plugins"); 85 | for (Map<String, Object> plugin : plugins) { 86 | String pluginName = (String) plugin.get("name"); 87 | if (pluginName.equals("rocchio")) { 88 | pluginFound = true; 89 | break; 90 | } 91 | } 92 | assertThat(pluginFound, is(true)); 93 | } 94 | } 95 | 96 | @Test 97 | public void testExpandEndpoint() throws Exception { 98 | String query = "rat"; 99 | String params = "&query=" + query; 100 | String request = expandEndpoint + params; 101 | 102 | Response response = client.performRequest("GET", request); 103 | assertEquals(EXPECTED_EXPANDED_QUERY_OBJECT, entityAsMap(response).toString()); 104 | } 105 | 106 | // FIXME: Test case currently fails (see below) 107 | //@Test 108 | /** Compare the performance of an unexpanded search against a Rocchio-expanded search. */ 109 | public void testSearchPerformance() throws Exception { 110 | String indexRequest = "/" + TEST_INDEX; 111 | Response indicesResponse = client.performRequest("GET", indexRequest, contentTypeHeader); 112 | staticLogger.info(entityAsMap(indicesResponse).toString()); 113 | String query = "rat"; 114 | String searchEndpoint = "/" + TEST_INDEX + "/_search"; 115 | 116 | // Time a normal (unexpanded) search for our query 117 | long searchStart = System.nanoTime(); 118 | Response unexpandedSearchResponse = client.performRequest("GET", searchEndpoint + "?q=" + query, contentTypeHeader); 119 | long searchDuration = System.nanoTime() - searchStart; 120 | 121 | // Time a query expansion 122 | String expandParams = "&query=" + query; 123 | String expandRequest = expandEndpoint + expandParams; 124 | long expandStart = System.nanoTime(); 125 | Response expandResponse = client.performRequest("GET", expandRequest); 126 | long expandDuration = System.nanoTime() - expandStart; 127 | 128 | // Verify that 
expansion returns correctly 129 | String expandedQuery = entityAsMap(expandResponse).get("query").toString(); 130 | assertEquals(EXPECTED_EXPANDED_QUERY_STRING, expandedQuery); 131 | 132 | // FIXME: Test currently fails on this syntax, stating that " " is an 133 | // invalid character. I have attempted to use "+", as well as "%20", with 134 | // no luck yet. I even tried to send the query as the request body, 135 | // but struggled to find the correct syntax. 136 | //StringEntity expandedSearchRequestBody = new StringEntity("{\"query\":\"" + expandedQuery.trim() + "\"}"); 137 | String expandedSearchQueryString = "?q=" + expandedQuery.trim().replaceAll(" ", "+"); 138 | 139 | // Time an expanded search on the same query 140 | long expandedSearchStart = System.nanoTime(); 141 | Response expandedSearchResponse = client.performRequest("GET", searchEndpoint + expandedSearchQueryString, contentTypeHeader); 142 | long expandedSearchDuration = System.nanoTime() - expandedSearchStart; 143 | long fullExpansionDuration = expandDuration + expandedSearchDuration; 144 | 145 | // Log expansion results 146 | staticLogger.info(String.format("Original query: %s", query)); 147 | staticLogger.info(String.format("Expanded query: %s", expandedQuery)); 148 | 149 | // Log timings 150 | staticLogger.info(String.format("Query expansion took: %d ns", expandDuration)); 151 | staticLogger.info(String.format("Expanded search took: %d ns", expandedSearchDuration)); 152 | staticLogger.info(String.format("Full expansion + search took: %d ns", fullExpansionDuration)); 153 | staticLogger.info(String.format("Unexpanded search took: %d ns", searchDuration)); 154 | 155 | // Verify that expanded search returns as expected 156 | assertEquals(EXPECTED_SEARCH_HITS, entityAsMap(expandedSearchResponse).toString()); 157 | 158 | // TODO: Analyze expanded results for accuracy? 
159 | //staticLogger.info(String.format("Unexpanded search results: %s", entityAsMap(unexpandedSearchResponse))); 160 | //staticLogger.info(String.format("Expanded search results: %s", entityAsMap(expandedSearchResponse))); 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/test/java/org/nationaldataservice/elasticsearch/rocchio/test/integration/AbstractITCase.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio.test.integration; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.UnsupportedEncodingException; 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | 9 | import org.apache.http.Header; 10 | import org.apache.http.HttpHost; 11 | import org.apache.http.entity.StringEntity; 12 | import org.apache.http.message.BasicHeader; 13 | import org.apache.logging.log4j.Logger; 14 | import org.elasticsearch.client.Response; 15 | import org.elasticsearch.client.RestClient; 16 | import org.elasticsearch.cluster.ClusterModule; 17 | import org.elasticsearch.common.bytes.BytesReference; 18 | import org.elasticsearch.common.logging.ESLoggerFactory; 19 | import org.elasticsearch.common.xcontent.NamedXContentRegistry; 20 | import org.elasticsearch.common.xcontent.XContent; 21 | import org.elasticsearch.common.xcontent.XContentBuilder; 22 | import org.elasticsearch.common.xcontent.XContentParser; 23 | import org.elasticsearch.common.xcontent.XContentType; 24 | import org.junit.AfterClass; 25 | import org.junit.BeforeClass; 26 | import static org.hamcrest.Matchers.*; 27 | import static org.junit.Assert.*; 28 | import static org.junit.Assume.*; 29 | 30 | /** 31 | * This is a simple base class for the ElasticSearch Rocchio Plugin 32 | * integration test suite. Use these test cases to verify the correctness of the API endpoint, 33 | * validate input handling, compare performance, run tests at scale, etc. 34 | * 35 | * 36 | * @author lambert8 37 | * 38 | */ 39 | public abstract class AbstractITCase { 40 | protected static final Logger staticLogger = ESLoggerFactory.getLogger(AbstractITCase.class); 41 | protected final static int HTTP_TEST_PORT = 9400; 42 | protected static RestClient client; 43 | 44 | protected static final Header contentTypeHeader = new BasicHeader("Content-Type", "application/json"); 45 | 46 | // TODO: Split these out into separate files 47 | // TODO: Add more documents here to scale things out, or read in the full set from disk 48 | protected static final String INDEX_JSON = "{\"mappings\":{\"dataset\":{\"_all\":{\"type\":\"text\",\"term_vector\":\"with_positions_offsets_payloads\",\"store\":true,\"analyzer\":\"fulltext_analyzer\"}}},\"settings\":{\"index\":{\"number_of_shards\":1,\"number_of_replicas\":0},\"analysis\":{\"analyzer\":{\"fulltext_analyzer\":{\"type\":\"custom\",\"tokenizer\":\"whitespace\",\"filter\":[\"lowercase\",\"type_as_payload\"]}}}}}"; 49 | protected static final String[] DOCUMENTS_JSON = { 50 | "{\"DOCNO\":\"1\",\"METADATA\":{\"dataResource\":{\"keywords\":[],\"altNames\":[],\"acronyms\":[]},\"citation\":{\"count\":\"0\"},\"organism\":{\"experiment\":{\"species\":\"Sinorhizobium meliloti\"}},\"dataItem\":{\"description\":\"We characterized transcriptomes of a strain overexpressing syrA. Our work shows that the syrA transcriptome shares similar gene expression changes to the syrM and nodD3 transcriptomes and that nodD3 and syrA may be the only targets directly activated by SyrM. 
We propose that most of the gene expression changes observed when nodD3 is overexpressed are due to NodD3 activation of syrM expression, which in turn stimulates SyrM activation of syrA expression. The subsequent increase in SyrA abundance alters activity of the ChvI-ExoS-ExoR circuit, resulting in broad changes in gene expression. Gene expression profiling of Sinorhizobium meliloti overexpressing syrA was performed using custom Affymetrix GeneChips\",\"title\":\"The Sinorhizobium meliloti SyrM regulon: effects on global gene expression are mediated by syrA and nodD3 (SyrA)\",\"releaseDate\":\"2015-03-31\",\"lastUpdateDate\":\"2015-04-04\",\"dataTypes\":[\"organism\",\"dataItem\",\"citation\"],\"ID\":\"520401\",\"experimentType\":\"transcription profiling by array\"}},\"REPOSITORY\":\"arrayexpress_020916\",\"TITLE\":\"The Sinorhizobium meliloti SyrM regulon: effects on global gene expression are mediated by syrA and nodD3 (SyrA)\"}", 51 | "{\"DOCNO\":\"2\",\"METADATA\":{\"dataResource\":{\"keywords\":[],\"altNames\":[],\"acronyms\":[]},\"citation\":{\"count\":\"0\"},\"organism\":{\"experiment\":{\"species\":\"Homo sapiens\"}},\"dataItem\":{\"description\":\"A study to define the binding loci of RelA-containing NF-kappaB dimers in a human myometrial smooth muscle cell line after exposure to TNF. Monolayers of PHM1-31 cells were exposed to TNF (10ng/ml) for 1 hour or left unstimulated. The Chromatin immunoprecipitation (ChIP) assay was performed to recover RelA-bound chromatin or non-specifically bound chromatin with IgG. That chromatin was prepared and used to probe Affymetrix GeneChIP 1.0R Human Promoter arrays. Three biological replicates of each experiment were conducted. Datasets were subsequently analysed in Partek Genomics Suite V6.6 where baseline was normalised by subtraction of IgG values from conrresponding RelA-immunoprecipitated samples. Control samples immunoprecipitated with RelA were then compared with TNF-stimulated samples immunoprecipitated with RelA.\",\"title\":\"RelA Nuclear factor-kappaB (NF-kB) Subunit binding Loci in Promoter Regions of PHM1-31 Myometrial Smooth Muscle Cells (Promoter)\",\"releaseDate\":\"2015-03-31\",\"lastUpdateDate\":\"2015-04-05\",\"dataTypes\":[\"organism\",\"dataItem\",\"citation\"],\"ID\":\"520482\",\"experimentType\":\"ChIP-chip by tiling array\"}},\"REPOSITORY\":\"arrayexpress_020916\",\"TITLE\":\"RelA Nuclear factor-kappaB (NF-kB) Subunit binding Loci in Promoter Regions of PHM1-31 Myometrial Smooth Muscle Cells (Promoter)\"}", 52 | "{\"DOCNO\":\"3\",\"METADATA\":{\"dataResource\":{\"keywords\":[],\"altNames\":[],\"acronyms\":[]},\"citation\":{\"count\":\"0\"},\"organism\":{\"experiment\":{\"species\":\"Rattus norvegicus\"}},\"dataItem\":{\"description\":\"This SuperSeries is composed of the SubSeries listed below. 
Refer to individual Series\",\"title\":\"Aging-associated inflammatory and oxidative changes in the rat urinary bladder and dorsal root ganglia - preventive effect of caloric restriction\",\"releaseDate\":\"2015-03-31\",\"lastUpdateDate\":\"2015-04-04\",\"dataTypes\":[\"organism\",\"dataItem\",\"citation\"],\"ID\":\"520420\",\"experimentType\":\"transcription profiling by array\"}},\"REPOSITORY\":\"arrayexpress_020916\",\"TITLE\":\"Aging-associated inflammatory and oxidative changes in the rat urinary bladder and dorsal root ganglia - preventive effect of caloric restriction\"}", 53 | "{\"DOCNO\":\"4\",\"METADATA\":{\"dataResource\":{\"keywords\":[],\"altNames\":[],\"acronyms\":[]},\"citation\":{\"count\":\"0\"},\"organism\":{\"experiment\":{\"species\":\"Homo sapiens\"}},\"dataItem\":{\"description\":\"To reveal the effects of carnosine on Caco-2 cells, we have employed whole genome microarray to detect genes that showed significantly different expression when exposed to carnosine. Caco-2 cells were treated with 1 mM carnosine for 3 days. Caco-2 cells were treated with 1 mM carnosine for 3 days. Three independent experiments were performed.\",\"title\":\"Gene expression profile in Caco-2 cells treated with carnosine\",\"releaseDate\":\"2015-03-31\",\"lastUpdateDate\":\"2015-04-04\",\"dataTypes\":[\"organism\",\"dataItem\",\"citation\"],\"ID\":\"520441\",\"experimentType\":\"transcription profiling by array\"}},\"REPOSITORY\":\"arrayexpress_020916\",\"TITLE\":\"Gene expression profile in Caco-2 cells treated with carnosine\"}", 54 | "{\"DOCNO\":\"5\",\"METADATA\":{\"dataResource\":{\"keywords\":[],\"altNames\":[],\"acronyms\":[]},\"citation\":{\"count\":\"0\"},\"organism\":{\"experiment\":{\"species\":\"Mus musculus\"}},\"dataItem\":{\"description\":\"Mutations in methyl-CpG-binding protein 2 (MeCP2), a major epigenetic regulator, are the predominant cause of Rett syndrome. We previously found that Mecp2-null microglia are deficient in phagocytic ability, and that engraftment of wild-type monocytes into the brain of Mecp2-deficient mice attenuates pathology. We have observed that Mecp2 deficiency is associated with increased levels of histone acetylation at the cis-regulatory regions of the Mecp2-regulated genes in macrophages. We hypothesized that Mecp2 recruits protein complexes containing histone deacetylases (HDACs) to repress the expression of its target genes. Our ChIP-Seq studies in bone-marrow derived macrophages revealed that Mecp2 co-localizes with Ncor2/Hdac3 protein complex at cis-regulatory regions of the target genes. These results suggest a role for Mecp2 in the recruitment and regulation of Ncor2/Hdac3 repressosome that plays a critical role in the regulation of inflammatory responses in macrophages. 
Examination of NCOR2 and HDAC3 genome-wide location in bone-marrow derived macrophages.\",\"title\":\"Mecp2: an unexpected regulator of macrophage gene expression and function [ChIP-Seq]\",\"releaseDate\":\"2015-03-31\",\"lastUpdateDate\":\"2015-04-04\",\"dataTypes\":[\"organism\",\"dataItem\",\"citation\"],\"ID\":\"520444\",\"experimentType\":\"ChIP-seq\"}},\"REPOSITORY\":\"arrayexpress_020916\",\"TITLE\":\"Mecp2: an unexpected regulator of macrophage gene expression and function [ChIP-Seq]\"}" 55 | }; 56 | 57 | /** 58 | * Creates the specified index in ElasticSearch, applying the mappings 59 | * and settings defined by {@link #INDEX_JSON} (term vectors are stored 60 | * for the _all field, as required by the Rocchio expansion). 61 | * 62 | * If the index already exists, the resulting error is logged and 63 | * ignored. 64 | * 65 | * @param indexName 66 | * the name of the index to create 67 | * 68 | */ 69 | protected static void createIndex(String indexName) { 70 | try { 71 | // Create our expand / search indices 72 | String endpoint = String.format("/%s", indexName); 73 | Map<String, String> params = new HashMap<String, String>(); 74 | StringEntity requestBody = new StringEntity(INDEX_JSON); 75 | 76 | Response resp = client.performRequest("PUT", endpoint, params, requestBody, contentTypeHeader); 77 | staticLogger.debug("Response: " + resp.getStatusLine()); 78 | 79 | } catch (IOException e) { 80 | // The index probably already exists; log the error and continue 81 | staticLogger.error(e.getMessage(), e); 82 | 83 | if (e instanceof UnsupportedEncodingException) { 84 | staticLogger.error("Error encoding JSON: " + e.getMessage(), e); 85 | return; 86 | } 87 | } 88 | } 89 | 90 | /** 91 | * Adds a document to the specified ElasticSearch index / type 92 | * 93 | * @param indexName 94 | * the name of the index to add the document to 95 | * @param typeName 96 | * the type of the document to add 97 | * @param id 98 | * the id of the document to add 99 | * @param jsonDocument 100 | * the String JSON document to add 101 | */ 102 | protected static void addDocument(String indexName, String typeName, Integer id, String jsonDocument) { 103 | try { 104 | String documentEndpoint = String.format("/%s/%s/%d", indexName, typeName, id); 105 | StringEntity requestBody = new StringEntity(jsonDocument); 106 | Map<String, String> params = new HashMap<String, String>(); 107 | 108 | Response resp = client.performRequest("PUT", documentEndpoint, params, requestBody, contentTypeHeader); 109 | staticLogger.debug("Response: " + resp.getStatusLine()); 110 | 111 | } catch (IOException e) { 112 | // The document probably already exists; log the error and continue 113 | staticLogger.error(e.getMessage(), e); 114 | 115 | if (e instanceof UnsupportedEncodingException) { 116 | staticLogger.error("Error encoding JSON: " + e.getMessage(), e); 117 | return; 118 | } 119 | } 120 | } 121 | 122 | protected static void wait(int millis) { 123 | staticLogger.debug(String.format("Sleeping for %d milliseconds", millis)); 124 | try { 125 | Thread.sleep(millis); 126 | } catch (InterruptedException e) { 127 | staticLogger.error(e.getMessage(), e); 128 | } 129 | } 130 | 131 | /** 132 | * Create a new {@link XContentParser}. 133 | */ 134 | protected static XContentParser createParser(XContentBuilder builder) throws IOException { 135 | return builder.generator().contentType().xContent().createParser(xContentRegistry(), builder.bytes()); 136 | } 137 | 138 | /** 139 | * Create a new {@link XContentParser}. 140 | */ 141 | protected static XContentParser createParser(XContent xContent, String data) throws IOException { 142 | return xContent.createParser(xContentRegistry(), data); 143 | } 144 | 145 | /** 146 | * Create a new {@link XContentParser}. 
147 | */ 148 | protected static XContentParser createParser(XContent xContent, InputStream data) throws IOException { 149 | return xContent.createParser(xContentRegistry(), data); 150 | } 151 | 152 | /** 153 | * Create a new {@link XContentParser}. 154 | */ 155 | protected static XContentParser createParser(XContent xContent, byte[] data) throws IOException { 156 | return xContent.createParser(xContentRegistry(), data); 157 | } 158 | 159 | /** 160 | * Create a new {@link XContentParser}. 161 | */ 162 | protected static XContentParser createParser(XContent xContent, BytesReference data) throws IOException { 163 | return xContent.createParser(xContentRegistry(), data); 164 | } 165 | 166 | /** 167 | * The {@link NamedXContentRegistry} used when parsing responses in 168 | * these tests. 169 | */ 170 | protected static NamedXContentRegistry xContentRegistry() { 171 | return new NamedXContentRegistry(ClusterModule.getNamedXWriteables()); 172 | } 173 | 174 | public static Map<String, Object> entityAsMap(Response response) throws UnsupportedOperationException, IOException { 175 | XContentType xContentType = XContentType 176 | .fromMediaTypeOrFormat(response.getEntity().getContentType().getValue()); 177 | try (XContentParser parser = createParser(xContentType.xContent(), response.getEntity().getContent())) { 178 | return parser.map(); 179 | } 180 | } 181 | 182 | @BeforeClass 183 | public static void startRestClient() { 184 | client = RestClient.builder(new HttpHost("localhost", HTTP_TEST_PORT)).build(); 185 | try { 186 | Response response = client.performRequest("GET", "/"); 187 | Map responseMap = entityAsMap(response); 188 | assertThat(responseMap, hasEntry("tagline", "You Know, for Search")); 189 | staticLogger.info("Integration tests ready to start... Cluster is running."); 190 | } catch (IOException e) { 191 | // If the cluster is unreachable, skip the tests rather than fail 192 | staticLogger.warn("Integration tests are skipped: [{}]", e.getMessage()); 193 | assumeThat("Integration tests are skipped", e.getMessage(), not(containsString("Connection refused"))); 194 | staticLogger.error("Full error is", e); 195 | fail("Something wrong is happening. 
REST Client seemed to raise an exception."); 196 | } 197 | } 198 | 199 | @AfterClass 200 | public static void stopRestClient() throws IOException { 201 | if (client != null) { 202 | client.close(); 203 | client = null; 204 | } 205 | staticLogger.info("Stopping integration tests against an external cluster"); 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | edu.illinois.lis 6 | rocchio 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | ElasticSearch Rocchio Plugin 11 | A custom plugin for ElasticSearch to enable Rocchio Query Expansion 12 | https://github.com/nds-org/elasticsearch-queryexpansion-plugin 13 | 14 | 15 | git@github.com:nds-org/elasticsearch-queryexpansion-plugin.git 16 | scm:git:git:git@github.com:nds-org/elasticsearch-queryexpansion-plugin.git 17 | scm:git:git@github.com:nds-org/elasticsearch-queryexpansion-plugin.git 18 | 19 | 20 | 21 | 22 | MIT 23 | https://opensource.org/licenses/MIT 24 | 25 | 26 | 27 | 28 | 29 | Craig Willis 30 | willis8@illinois.edu 31 | National Data Service 32 | http://www.nationaldataservice.org/ 33 | 34 | 35 | 36 | Garrick Sherman 37 | gsherma2@illinois.edu 38 | University of Illinois 39 | http://ischool.illinois.edu/ 40 | 41 | 42 | 43 | Mike Lambert 44 | lambert8@illinois.edu 45 | National Data Service 46 | http://www.nationaldataservice.org/ 47 | 48 | 49 | 50 | 51 | org.sonatype.oss 52 | oss-parent 53 | 7 54 | 55 | 56 | 57 | 58 | ossrh 59 | https://oss.sonatype.org/content/repositories/snapshots 60 | 61 | 62 | ossrh 63 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 64 | 65 | 66 | 67 | 68 | UTF-8 69 | 70 | 71 | org.elasticsearch.distribution.zip 72 | 73 | 74 | 5.3.2 75 | 5.3.2 76 | 0.2.0-SNAPSHOT 77 | 4.11 78 | 2.6.2 79 | 1.15 80 | 2.3.0 81 | 1.8.3 82 | 6.4.2 83 | 1.4 84 | 85 | 86 | 9400 87 | 9500 88 | localhost:${integ.http.port} 89 | 90 | 91 | false 92 | false 93 | false 94 | 95 | 96 | 97 | 98 | org.apache.logging.log4j 99 | log4j-core 100 | ${log4j.version} 101 | provided 102 | 103 | 104 | 105 | 106 | junit 107 | junit 108 | ${junit.version} 109 | 110 | 111 | org.hamcrest 112 | hamcrest-core 113 | 114 | 115 | test 116 | 117 | 118 | 119 | org.elasticsearch 120 | elasticsearch 121 | ${elasticsearch.version} 122 | provided 123 | 124 | 125 | 126 | org.elasticsearch.test 127 | framework 128 | ${elasticsearch.version} 129 | test 130 | 131 | 132 | 133 | org.elasticsearch.client 134 | rest 135 | ${elasticsearch.client.version} 136 | test 137 | 138 | 139 | 140 | org.apache.lucene 141 | lucene-test-framework 142 | ${lucene.version} 143 | test 144 | 145 | 146 | 147 | commons-cli 148 | commons-cli 149 | ${commons-cli.version} 150 | 151 | 152 | 153 | org.apache.lucene 154 | lucene-sandbox 155 | ${lucene.version} 156 | provided 157 | 158 | 159 | 160 | org.apache.lucene 161 | lucene-analyzers-common 162 | ${lucene.version} 163 | provided 164 | 165 | 166 | 167 | org.apache.lucene 168 | lucene-core 169 | ${lucene.version} 170 | provided 171 | 172 | 173 | 174 | org.apache.lucene 175 | lucene-queries 176 | ${lucene.version} 177 | provided 178 | 179 | 180 | 181 | org.apache.lucene 182 | lucene-queryparser 183 | ${lucene.version} 184 | provided 185 | 186 | 187 | 188 | org.yaml 189 | snakeyaml 190 | ${snakeyaml.version} 191 | provided 192 | 193 | 194 | 195 | org.apache.xmlbeans 196 | xmlbeans 197 | ${xmlbeans.version} 198 | provided 199 | 200 | 201 | 202 | commons-beanutils 203 | commons-beanutils 204 | 
${beanutils.version} 205 | provided 206 | 207 | 208 | 209 | edu.illinois.lis 210 | ir-utils 211 | ${ir-utils.version} 212 | 213 | 214 | commons-collections 215 | commons-collections 216 | 217 | 218 | org.apache.geronimo.specs 219 | geronimo-stax-api_1.0_spec 220 | 221 | 222 | xml-apis 223 | xml-apis 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | src/main/resources 233 | false 234 | 235 | *.properties 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | org.apache.maven.plugins 244 | maven-compiler-plugin 245 | 3.3 246 | 247 | 1.8 248 | 1.8 249 | 250 | 251 | 252 | 253 | 254 | org.apache.maven.plugins 255 | maven-assembly-plugin 256 | 2.6 257 | 258 | false 259 | ${project.build.directory}/releases/ 260 | 261 | ${basedir}/src/main/assemblies/plugin.xml 262 | 263 | 264 | 265 | 266 | package 267 | 268 | single 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | org.apache.maven.plugins 277 | maven-source-plugin 278 | 3.0.1 279 | 280 | 281 | attach-sources 282 | package 283 | 284 | jar 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | org.apache.maven.plugins 293 | maven-javadoc-plugin 294 | 2.10.4 295 | 296 | private 297 | true 298 | 299 | 300 | 301 | attach-javadocs 302 | package 303 | 304 | jar 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | org.apache.maven.plugins 313 | maven-gpg-plugin 314 | 1.6 315 | 316 | 317 | sign-artifacts 318 | verify 319 | 320 | sign 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | org.apache.maven.plugins 329 | maven-dependency-plugin 330 | 2.10 331 | 332 | 333 | integ-setup-dependencies 334 | pre-integration-test 335 | 336 | copy 337 | 338 | 339 | ${skipIntegTests} 340 | 341 | 342 | ${elasticsearch.groupid} 343 | elasticsearch 344 | ${elasticsearch.version} 345 | zip 346 | 347 | 348 | true 349 | ${project.build.directory}/integration-tests/binaries 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | org.apache.maven.plugins 358 | maven-antrun-plugin 359 | 1.8 360 | 361 | 362 | 363 | integ-setup 364 | pre-integration-test 365 | 366 | run 367 | 368 | 369 | ${skipIntegTests} 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | integ-teardown 378 | post-integration-test 379 | 380 | run 381 | 382 | 383 | ${skipIntegTests} 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | org.apache.maven.plugins 395 | maven-surefire-plugin 396 | 2.19 397 | 398 | 399 | default-test 400 | none 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | com.carrotsearch.randomizedtesting 410 | junit4-maven-plugin 411 | 2.3.3 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | unit-tests 426 | test 427 | 428 | junit4 429 | 430 | true 431 | 432 | ${skipUnitTests} 433 | 434 | **/*Test.class 435 | 436 | 437 | **/*$* 438 | 439 | 440 | 441 | 442 | integration-tests 443 | integration-test 444 | 445 | junit4 446 | 447 | true 448 | 449 | ${skipIntegTests} 450 | 451 | **/*IT.class 452 | 453 | 454 | **/*$* 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | -------------------------------------------------------------------------------- /src/main/java/org/nationaldataservice/elasticsearch/rocchio/Rocchio.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.net.URISyntaxException; 8 | import java.nio.file.Files; 9 | import java.nio.file.Path; 10 | import java.nio.file.Paths; 11 | import java.util.HashMap; 12 | import 
java.util.LinkedHashMap; 13 | import java.util.Map; 14 | import java.util.function.Supplier; 15 | 16 | import org.apache.commons.cli.Options; 17 | import org.apache.commons.cli.ParseException; 18 | import org.apache.logging.log4j.Logger; 19 | import org.apache.lucene.index.Fields; 20 | import org.apache.lucene.index.Terms; 21 | import org.apache.lucene.index.TermsEnum; 22 | import org.elasticsearch.action.search.SearchResponse; 23 | import org.elasticsearch.action.termvectors.MultiTermVectorsItemResponse; 24 | import org.elasticsearch.action.termvectors.MultiTermVectorsRequestBuilder; 25 | import org.elasticsearch.action.termvectors.MultiTermVectorsResponse; 26 | import org.elasticsearch.action.termvectors.TermVectorsRequest; 27 | import org.elasticsearch.action.termvectors.TermVectorsResponse; 28 | import org.elasticsearch.client.Client; 29 | import org.elasticsearch.client.Requests; 30 | import org.elasticsearch.cluster.metadata.IndexMetaData; 31 | import org.elasticsearch.cluster.metadata.MappingMetaData; 32 | import org.elasticsearch.common.collect.ImmutableOpenMap; 33 | import org.elasticsearch.common.logging.ESLoggerFactory; 34 | import org.elasticsearch.index.query.QueryStringQueryBuilder; 35 | import org.elasticsearch.search.SearchHit; 36 | import org.elasticsearch.search.SearchHits; 37 | //import org.elasticsearch.transport.client.PreBuiltTransportClient; 38 | 39 | import edu.gslis.textrepresentation.FeatureVector; 40 | import edu.gslis.utils.Stopper; 41 | import joptsimple.internal.Strings; 42 | 43 | /** 44 | * Rocchio implementation for Lucene based on: 45 | * https://github.com/gtsherman/lucene/blob/master/src/main/java/org/retrievable/lucene/searching/expansion/Rocchio.java 46 | * 47 | */ 48 | public class Rocchio { 49 | private static final Logger staticLogger = ESLoggerFactory.getLogger(Rocchio.class); 50 | 51 | // FIXME: These bounds are initial guesses; see NDS-958 52 | private static final int ALPHA_BETA_MIN = 0; 53 | private static final int ALPHA_BETA_MAX = 1; 54 | private static final int K1_MIN = 0; 55 | private static final int K1_MAX = 2; 56 | private static final int B_MIN = 0; 57 | private static final int B_MAX = 1; 58 | 59 | // Error Strings returned from validate() 60 | public static final String NULL_INDEX_ERROR = "You must specify an index to expand against"; 61 | public static final String NULL_QUERY_ERROR = "You must specify a query to expand"; 62 | public static final String NULL_TYPE_ERROR = "You must specify a type"; 63 | public static final String NULL_FIELD_ERROR = "You must specify a field"; 64 | public static final String INVALID_FB_TERMS_ERROR = "Number of feedback terms (fbTerms) must be a positive integer"; 65 | public static final String INVALID_FB_DOCS_ERROR = "Number of feedback documents (fbDocs) must be a positive integer"; 66 | public static final String INVALID_ALPHA_ERROR = "Alpha value must be a real number between " + ALPHA_BETA_MIN + " and " + ALPHA_BETA_MAX; 67 | public static final String INVALID_BETA_ERROR = "Beta value must be a real number between " + ALPHA_BETA_MIN + " and " + ALPHA_BETA_MAX; 68 | public static final String INVALID_K1_ERROR = "K1 value must be a real number between " + K1_MIN + " and " + K1_MAX; 69 | public static final String INVALID_B_ERROR = "B value must be a real number between " + B_MIN + " and " + B_MAX; 70 | 71 | // Read default stoplist from src/main/resources/stoplist.all 72 | private static final Stopper DEFAULT_STOPPER; 73 | private static final String STOPLIST_NAME = "stoplist.all"; 74 | static { 75 | 
Stopper stopper = new Stopper(); 76 | 77 | ClassLoader loader = Rocchio.class.getClassLoader(); 78 | try (BufferedReader br = new BufferedReader(new InputStreamReader(loader.getResourceAsStream(STOPLIST_NAME)))) { 79 | String curr = null; 80 | while ((curr = br.readLine()) != null) { 81 | stopper.addStopword(curr); 82 | } 83 | } catch (IOException e) { 84 | staticLogger.error(String.format("%s was not found... continuing without a stoplist", STOPLIST_NAME), e); 85 | } 86 | 87 | DEFAULT_STOPPER = stopper; 88 | } 89 | 90 | // Error Strings returned from ensureTermVectors() 91 | /** 92 | * Returns a "nonexistent index" error message for the given index 93 | * 94 | * @param index 95 | * the {@link String} index name 96 | * @return a "nonexistent index" error message 97 | */ 98 | public static String NONEXISTENT_INDEX_ERROR(String index) { 99 | return "Index does not exist: " + index; 100 | } 101 | 102 | /** 103 | * Returns a "nonexistent type" error message for the given index/type 104 | * 105 | * @param index 106 | * the {@link String} index name 107 | * @param type 108 | * the {@link String} type name 109 | * @return a "nonexistent type" error message 110 | */ 111 | public static String NONEXISTENT_TYPE_ERROR(String index, String type) { 112 | return "No mapping found on index " + index + " for: " + type; 113 | } 114 | 115 | /** 116 | * Returns a "disabled term vectors" error message for the given index/type/field 117 | * 118 | * @param index 119 | * the {@link String} index name 120 | * @param type 121 | * the {@link String} type name 122 | * @param field 123 | * the {@link String} field name 124 | * @return a "disabled term vectors" error message 125 | */ 126 | public static String DISABLED_TERM_VECTORS_ERROR(String index, String type, String field) { 127 | return "Term vector storage on " + index + "." + type + "." + field + " has been disabled"; 128 | } 129 | 130 | /** 131 | * Returns an "unconfigured term vectors" error message for the given index/type/field 132 | * 133 | * @param index 134 | * the {@link String} index name 135 | * @param type 136 | * the {@link String} type name 137 | * @param field 138 | * the {@link String} field name 139 | * @return an "unconfigured term vectors" error message 140 | */ 141 | public static String UNCONFIGURED_TERM_VECTORS_ERROR(String index, String type, String field) { 142 | return "Term vector storage on index " + index + "." + type + "." 
+ field + " has not been configured"; 143 | } 144 | 145 | /** 146 | * Returns a "missing term vector field" error message for the given index/type 147 | * 148 | * @param index 149 | * the {@link String} index name 150 | * @param type 151 | * the {@link String} type name 152 | * @return a "missing term vector field" error message 153 | */ 154 | public static String MISSING_TERM_VECTOR_FIELD(String index, String type) { 155 | return "Error: no fields received for term vector - " + index + "/" + type; 156 | } 157 | 158 | /** 159 | * Returns a "missing field terms" error message for the given index/type/field 160 | * 161 | * @param index 162 | * the {@link String} index name 163 | * @param type 164 | * the {@link String} type name 165 | * @param field 166 | * the {@link String} field name 167 | * @return a "missing field terms" error message 168 | */ 169 | public static String MISSING_FIELD_TERMS(String index, String type, String field) { 170 | return "Error: no terms received for field - " + index + "/" + type + "/" + field; 171 | } 172 | 173 | private final Client client; // ElasticSearch client 174 | private final String index; // ElasticSearch index name 175 | private final String type; // Document type 176 | private final String field; // Field 177 | 178 | private final double alpha; // Rocchio alpha 179 | private final double beta; // Rocchio beta 180 | private final double k1; // BM25 k1 181 | private final double b; // BM25 b 182 | 183 | private Stopper stopper = null; 184 | 185 | // Global statistics (there's certainly a better way to handle this) 186 | long docCount = 0; // Number of documents in index 187 | double avgDocLen = 0; // Average document length, needed by BM25 188 | Map dfStats = new HashMap(); // Cached doc frequency stats 189 | 190 | /** 191 | * Instantiates a new instance of the Rocchio algorithm with the given client and parameters. 192 | * 193 | * @param client 194 | * the {@link Client} to use for the connection 195 | * @param index 196 | * the {@link String} index to expand against 197 | * @param type 198 | * the {@link String} type within the index 199 | * @param field 200 | * the {@link String} field on the type 201 | * @param alpha 202 | * the {@link double} Rocchio alpha parameter 203 | * @param beta 204 | * the {@link double} Rocchio beta parameter 205 | * @param k1 206 | * the {@link double} Rocchio k1 parameter 207 | * @param b 208 | * the {@link double} Rocchio b parameter 209 | * @param stoplist 210 | * the {@link String} list of stop words 211 | */ 212 | public Rocchio(Client client, String index, String type, String field, double alpha, double beta, double k1, 213 | double b, String stoplist) { 214 | this.client = client; 215 | this.index = index; 216 | this.type = type; 217 | this.field = field; 218 | this.alpha = alpha; 219 | this.beta = beta; 220 | this.k1 = k1; 221 | this.b = b; 222 | 223 | this.setStoplist(stoplist); 224 | } 225 | 226 | /** 227 | * Instantiates a new instance of the Rocchio algorithm with the given client and parameters. 
228 | * 229 | * @param client 230 | * the {@link Client} to use for the connection 231 | * @param index 232 | * the {@link String} index to expand against 233 | * @param type 234 | * the {@link String} type within the index 235 | * @param field 236 | * the {@link String} field on the type 237 | * @param alpha 238 | * the {@link double} Rocchio alpha parameter 239 | * @param beta 240 | * the {@link double} Rocchio beta parameter 241 | * @param k1 242 | * the {@link double} BM25 k1 parameter 243 | * @param b 244 | * the {@link double} BM25 b parameter 245 | */ 246 | public Rocchio(Client client, String index, String type, String field, double alpha, double beta, double k1, double b) { 247 | this(client, index, type, field, alpha, beta, k1, b, null); 248 | } 249 | 250 | // Assumes a space-delimited string 251 | private Stopper setStoplist(String stoplist) { 252 | this.stopper = new Stopper(DEFAULT_STOPPER); 253 | 254 | // Extend the default stoplist only if a custom one was provided 255 | if (!Strings.isNullOrEmpty(stoplist)) { 256 | String[] stopwords = stoplist.split(" "); 257 | for (String term : stopwords) { 258 | stopper.addStopword(term); 259 | } 260 | } 261 | 262 | return stopper; 263 | } 264 | 265 | private void fail(String errorMessage) { 266 | staticLogger.error(errorMessage); 267 | throw new IllegalStateException(errorMessage); 268 | } 269 | 270 | private void failIf(Supplier<Boolean> condition, String errorMessage) { 271 | if (condition.get()) { 272 | staticLogger.error("Condition failed: " + condition.toString()); 273 | fail(errorMessage); 274 | } 275 | } 276 | 277 | /** 278 | * Verifies that String and numeric values are within their allowed ranges, then ensures that term vectors are 279 | * properly enabled on the target index. 280 | * 281 | * @param query 282 | * the String query to expand 283 | * @param fbDocs 284 | * the int number of feedback documents 285 | * @param fbTerms 286 | * the int number of feedback terms 287 | * @return the String error message, or null if no errors are encountered 288 | * @throws IOException 289 | * if the indexMetaData fails to deserialize into a map 290 | */ 291 | public String validate(String query, int fbDocs, int fbTerms) throws IOException { 292 | if (Strings.isNullOrEmpty(query)) { 293 | return NULL_QUERY_ERROR; 294 | } else if (fbDocs < 1) { 295 | return INVALID_FB_DOCS_ERROR; 296 | } else if (fbTerms < 1) { 297 | return INVALID_FB_TERMS_ERROR; 298 | } else if (Strings.isNullOrEmpty(index)) { 299 | return NULL_INDEX_ERROR; 300 | } else if (Strings.isNullOrEmpty(type)) { 301 | return NULL_TYPE_ERROR; 302 | } else if (Strings.isNullOrEmpty(field)) { 303 | return NULL_FIELD_ERROR; 304 | } else if (ALPHA_BETA_MIN > alpha || alpha > ALPHA_BETA_MAX) { 305 | return INVALID_ALPHA_ERROR; 306 | } else if (ALPHA_BETA_MIN > beta || beta > ALPHA_BETA_MAX) { 307 | return INVALID_BETA_ERROR; 308 | } else if (K1_MIN > k1 || k1 > K1_MAX) { 309 | return INVALID_K1_ERROR; 310 | } else if (B_MIN > b || b > B_MAX) { 311 | return INVALID_B_ERROR; 312 | } 313 | return this.ensureTermVectors(); 314 | } 315 | 316 | /** 317 | * Returns an error message if term vectors are misconfigured. Otherwise, returns null. 318 | * 319 | * TODO: Some of this could potentially be called at plugin startup, if we know what index/type we plan to expand 320 | * against ahead of time... 
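 * (The checks below look for "store": true on either the "_all" mapping or the target field's mapping; the integration tests' INDEX_JSON mapping additionally sets "term_vector": "with_positions_offsets_payloads" so that full term vectors are available at expansion time.) 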
321 | * 322 | * @return the String error message, or null if no errors are encountered 323 | * 324 | * @throws IOException 325 | * if the indexMetaData fails to deserialize into a map 326 | */ 327 | @SuppressWarnings("unchecked") 328 | private String ensureTermVectors() throws IOException { 329 | // Verify that the index exists 330 | IndexMetaData indexMetaData = client.admin().cluster().state(Requests.clusterStateRequest()).actionGet() 331 | .getState().getMetaData().index(index); 332 | 333 | if (indexMetaData == null) { 334 | return NONEXISTENT_INDEX_ERROR(index); 335 | } 336 | 337 | // Verify that the index contains the desired type 338 | ImmutableOpenMap<String, MappingMetaData> indexMap = indexMetaData.getMappings(); 339 | if (!indexMap.containsKey(type)) { 340 | return NONEXISTENT_TYPE_ERROR(index, type); 341 | } 342 | 343 | // Grab the type and analyze it to locate the field 344 | MappingMetaData typeMetadata = indexMetaData.getMappings().get(type); 345 | Map<String, Object> typeMap = typeMetadata.getSourceAsMap(); 346 | 347 | LinkedHashMap<String, Object> fieldProperties, 348 | allFieldProperties = (LinkedHashMap<String, Object>) typeMap.get("_all"); 349 | if (!"_all".equals(field)) { 350 | // For a specific field, we need to drill down into "properties" 351 | LinkedHashMap<String, Object> typePropertiesMap = (LinkedHashMap<String, Object>) typeMap.get("properties"); 352 | fieldProperties = (LinkedHashMap<String, Object>) typePropertiesMap.get(field); 353 | } else { 354 | // For "_all", we can look for "store" directly on the "_all" mapping 355 | fieldProperties = allFieldProperties; 356 | } 357 | 358 | // Verify that "store" is present on either _all or our target field 359 | if (allFieldProperties != null && allFieldProperties.containsKey("store")) { 360 | // Verify that term vector storage is enabled for all fields 361 | boolean storeEnabled = (boolean) allFieldProperties.get("store"); 362 | if (!storeEnabled) { 363 | String errorMessage = DISABLED_TERM_VECTORS_ERROR(index, type, field); 364 | staticLogger.error(errorMessage); 365 | return errorMessage; 366 | } 367 | 368 | return null; 369 | } else if (fieldProperties != null && fieldProperties.containsKey("store")) { 370 | // Verify that term vector storage is enabled at the field level 371 | boolean storeEnabled = (boolean) fieldProperties.get("store"); 372 | if (!storeEnabled) { 373 | String errorMessage = DISABLED_TERM_VECTORS_ERROR(index, type, field); 374 | staticLogger.error(errorMessage); 375 | return errorMessage; 376 | } 377 | 378 | return null; 379 | } 380 | 381 | // TODO: NDS-958 - Check that type has documents added to it? 382 | // TODO: NDS-958 - Check that the documents in the type contain the desired field? 383 | // TODO: NDS-958 - Check that term vectors/fields stats are available for the desired index/type/field combination? 
384 | 385 | // If neither of the above triggered, then we didn't have the right term vectors initialized on our index 386 | String errorMessage = UNCONFIGURED_TERM_VECTORS_ERROR(index, type, field); 387 | staticLogger.error(errorMessage); 388 | return errorMessage; 389 | } 390 | 391 | /** 392 | * Run the query using the client (this assumes that the client has already been initialized and is ready to 393 | * execute) 394 | * 395 | * @param index 396 | * the String index to expand against 397 | * @param query 398 | * Query string 399 | * @param numDocs 400 | * Number of results to return 401 | * @return the {@link SearchResponse} object 402 | */ 403 | private SearchResponse runQuery(String index, String query, int numDocs) { 404 | QueryStringQueryBuilder queryStringQueryBuilder = new QueryStringQueryBuilder(query); 405 | return client.prepareSearch(index).setQuery(queryStringQueryBuilder).setSize(numDocs).execute().actionGet(); 406 | } 407 | 408 | /** 409 | * Given a set of SearchHits, construct the feedback vector 410 | * 411 | * @param hits 412 | * SearchHits 413 | * @param fbDocs 414 | * Number of feedback documents 415 | * @return FeatureVector based on feedback documents 416 | * @throws IOException 417 | * if the TermVector has no fields, or if its Fields contain no terms 418 | */ 419 | private FeatureVector getFeedbackVector(SearchHits hits, int fbDocs) throws IOException { 420 | FeatureVector summedDocVec = new FeatureVector(this.stopper); 421 | 422 | // Use the multi termvector request to get vectors for all documents at once 423 | MultiTermVectorsRequestBuilder mtbuilder = client.prepareMultiTermVectors(); 424 | for (SearchHit hit : hits.hits()) { 425 | String id = hit.getId(); 426 | TermVectorsRequest termVectorsRequest = new TermVectorsRequest(); 427 | termVectorsRequest.index(index).id(id).type(this.type).termStatistics(true).offsets(false).positions(false) 428 | .payloads(false); 429 | 430 | mtbuilder.add(termVectorsRequest); 431 | } 432 | MultiTermVectorsResponse mtvresponse = mtbuilder.execute().actionGet(); 433 | 434 | // Iterate over the returned document vectors. Construct the feedback vector. 435 | // Store the global document count and calculate the global average document length 436 | // Store document frequencies for encountered terms in dfStats map. 437 | for (MultiTermVectorsItemResponse item : mtvresponse.getResponses()) { 438 | FeatureVector docVec = new FeatureVector(this.stopper); 439 | 440 | TermVectorsResponse tv = item.getResponse(); 441 | failIf(() -> tv == null, MISSING_TERM_VECTOR_FIELD(index, type)); 442 | Fields fields = tv.getFields(); 443 | 444 | Terms terms = fields.terms(this.field); 445 | failIf(() -> terms == null, MISSING_FIELD_TERMS(index, type, field)); 446 | 447 | // These are global settings and will be the same for all TermVectorResponses. 448 | // TODO: There's a better way to handle this. 
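// (docCount and the derived avgDocLen feed the BM25 length normalization in computeBM25Weights below, while dfStats caches the per-term document frequencies used for the IDF component) 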
449 | long sumTotalTermFreq = terms.getSumTotalTermFreq(); // Total number of term occurrences in the index 450 | docCount = terms.getDocCount(); // Total number of documents in index 451 | avgDocLen = sumTotalTermFreq / (double) docCount; 452 | 453 | // Get the term frequency and document frequency for each term 454 | TermsEnum termsEnum = terms.iterator(); 455 | while (termsEnum.next() != null) { 456 | String term = termsEnum.term().utf8ToString(); 457 | long freq = termsEnum.totalTermFreq(); // Frequency for term t in this document 458 | long df = termsEnum.docFreq(); // Number of documents containing term t (document frequency) -- a global statistic 459 | dfStats.put(term, df); // Map storing global document frequencies for seen terms, used by BM25 460 | docVec.addTerm(term, freq); // Current document vector 461 | } 462 | 463 | // Add this document to the feedback document vector with BM25 weights 464 | computeBM25Weights(docVec, summedDocVec); 465 | } 466 | 467 | // Multiply the summed term vector by beta / |Dr| 468 | FeatureVector relDocTermVec = new FeatureVector(this.stopper); 469 | for (String term : summedDocVec.getFeatures()) { 470 | relDocTermVec.addTerm(term, summedDocVec.getFeatureWeight(term) * beta / fbDocs); 471 | } 472 | 473 | return relDocTermVec; 474 | } 475 | 476 | /** 477 | * Construct the query vector with BM25 weights 478 | * 479 | * @param query 480 | * Query string 481 | * @return FeatureVector 482 | */ 483 | public FeatureVector getQueryVector(String query) { 484 | // Create a query vector and scale by alpha 485 | FeatureVector rawQueryVec = new FeatureVector(this.stopper); 486 | rawQueryVec.addText(query); 487 | 488 | FeatureVector summedQueryVec = new FeatureVector(this.stopper); 489 | computeBM25Weights(rawQueryVec, summedQueryVec); 490 | 491 | FeatureVector queryTermVec = new FeatureVector(this.stopper); 492 | for (String term : rawQueryVec.getFeatures()) { 493 | queryTermVec.addTerm(term, summedQueryVec.getFeatureWeight(term) * alpha); 494 | } 495 | 496 | return queryTermVec; 497 | } 498 | 499 | /** 500 | * Expand the query. 
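 * This implements the positive-feedback form of the Rocchio update, q' = alpha * q + (beta / |Dr|) * sum(d in Dr) d, where q and the documents d in the feedback set Dr (the top fbDocs hits) are BM25-weighted term vectors; the combined vector is then clipped to the top fbTerms terms. 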
501 | * 502 | * @param query 503 | * Query string 504 | * @param fbDocs 505 | * Number of feedback documents 506 | * @param fbTerms 507 | * Number of feedback terms 508 | * @return Expanded feature vector 509 | * @throws IOException 510 | * if we fail to get the feedback vector 511 | */ 512 | public FeatureVector expandQuery(String query, int fbDocs, int fbTerms) throws IOException { 513 | // Run the initial query 514 | SearchHits hits = runQuery(this.index, query, fbDocs).getHits(); 515 | 516 | // Get the feedback document vector, weighted by beta 517 | FeatureVector feedbackVector = getFeedbackVector(hits, fbDocs); 518 | 519 | // Get the original query vector, weighted by alpha 520 | // Note, this is called after getFeedbackVector because it relies on dfStats 521 | FeatureVector queryVector = getQueryVector(query); 522 | 523 | // Combine the query and feedback vectors 524 | for (String term : queryVector.getFeatures()) { 525 | feedbackVector.addTerm(term, queryVector.getFeatureWeight(term)); 526 | } 527 | 528 | // Get top terms -- aka head 529 | feedbackVector.clip(fbTerms); 530 | 531 | return feedbackVector; 532 | } 533 | 534 | /** 535 | * Compute BM25 weights for the input vector and add to the output vector 536 | * 537 | * @param inputVector 538 | * the {@link FeatureVector} input 539 | * @param outputVector 540 | * the {@link FeatureVector} output 541 | */ 542 | private void computeBM25Weights(FeatureVector inputVector, FeatureVector outputVector) { 543 | for (String term : inputVector.getFeatures()) { 544 | long docOccur = dfStats.containsKey(term) ? dfStats.get(term) : 0; // default to 0 for terms never seen in the feedback documents 545 | 546 | double idf = Math.log((docCount + 1) / (docOccur + 0.5)); // following Indri 547 | double tf = inputVector.getFeatureWeight(term); 548 | 549 | double weight = (idf * k1 * tf) / (tf + k1 * (1 - b + b * inputVector.getLength() / avgDocLen)); 550 | outputVector.addTerm(term, weight); 551 | } 552 | } 553 | 554 | /** 555 | * Debug: Command line options for the main() method (see below) 556 | * 557 | * @return the CLI options 558 | */ 559 | public static Options createOptions() { 560 | Options options = new Options(); 561 | options.addOption("cluster", true, "ElasticSearch cluster name (default: biocaddie)"); 562 | options.addOption("host", true, "ElasticSearch host (default: localhost)"); 563 | options.addOption("port", true, "ElasticSearch transport port (default: 9300)"); 564 | options.addOption("index", true, "ElasticSearch index name (default: biocaddie)"); 565 | options.addOption("type", true, "ElasticSearch document type (default: dataset)"); 566 | options.addOption("field", true, "ElasticSearch field (default: _all)"); 567 | options.addOption("alpha", true, "Rocchio alpha (default: 0.5)"); 568 | options.addOption("beta", true, "Rocchio beta (default: 0.5)"); 569 | options.addOption("k1", true, "BM25 k1 (default: 1.2)"); 570 | options.addOption("b", true, "BM25 b (default: 0.75)"); 571 | options.addOption("query", true, "Query string"); 572 | options.addOption("auth", true, "Basic authentication string (default: elastic:biocaddie)"); 573 | return options; 574 | } 575 | 576 | /** 577 | * Debug: this main method will run Rocchio as a standalone command-line application. 578 | * 579 | * NOTE: You will need to add the following dependency to your {@code pom.xml}: 580 | * 581 | * 
582 |      *  <dependency>
583 |      *    <groupId>org.elasticsearch.client</groupId>
584 |      *    <artifactId>transport</artifactId>
585 |      *    <version>${elasticsearch.version}</version>
586 |      *  </dependency>
587 |      * 
588 | * 589 | * @param args 590 | * the command-line arguments 591 | * @throws IOException 592 | * if expandQuery throws an IOException, or if the host lookup fails (localhost shouldn't) 593 | * @throws ParseException 594 | * if the command-line arguments cannot be parsed 595 | */ 596 | public static void main(String[] args) throws IOException, ParseException { 597 | 598 | /* 599 | * Options options = createOptions(); CommandLineParser parser = new GnuParser(); CommandLine cl = 600 | * parser.parse(options, args); if (cl.hasOption("help")) { HelpFormatter formatter = new HelpFormatter(); 601 | * formatter.printHelp(Rocchio.class.getCanonicalName(), options); return; } 602 | * 603 | * // Get the many command line parameters String cluster = cl.getOptionValue("cluster", "elasticsearch"); 604 | * String host = cl.getOptionValue("host", "localhost"); int port = Integer.parseInt(cl.getOptionValue("port", 605 | * "9300")); double alpha = Double.parseDouble(cl.getOptionValue("alpha", "0.5")); double beta = 606 | * Double.parseDouble(cl.getOptionValue("beta", "0.5")); double k1 = Double.parseDouble(cl.getOptionValue("k1", 607 | * "1.2")); double b = Double.parseDouble(cl.getOptionValue("b", "0.75")); int fbTerms = 608 | * Integer.parseInt(cl.getOptionValue("fbTerms", "10")); int fbDocs = 609 | * Integer.parseInt(cl.getOptionValue("fbDocs", "10")); String index = cl.getOptionValue("index", "biocaddie"); 610 | * String type = cl.getOptionValue("type", "dataset"); String field = cl.getOptionValue("field", "_all"); 611 | * 612 | * String auth = cl.getOptionValue("auth", "elastic:biocaddie"); String query = cl.getOptionValue("query", 613 | * "multiple sclerosis"); 614 | * 615 | * // Connect to ElasticSearch Settings settings = Settings.builder().put("cluster.name", cluster).build(); 616 | * TransportClient transportClient = new PreBuiltTransportClient(settings); 617 | * transportClient.addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), port)); 618 | * Client client = transportClient.filterWithHeader(Collections.singletonMap( "Authorization", auth)); 619 | * 620 | * // Construct Rocchio Rocchio rocchio = new Rocchio(client, index, type, field, alpha, beta, k1, b); 621 | * 622 | * // Expand the query FeatureVector feedbackQuery = rocchio.expandQuery(query, fbDocs, fbTerms); 623 | * 624 | * // Dump the expanded query StringBuffer esQuery = new StringBuffer(); for (String term : 625 | * feedbackQuery.getFeatures()) { esQuery.append(term + "^" + feedbackQuery.getFeatureWeight(term) + " "); } 626 | * System.out.println(esQuery); 627 | * 628 | * transportClient.close(); 629 | */ 630 | } 631 | } --------------------------------------------------------------------------------
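For quick manual experimentation against a running cluster, the sketch below shows how the _expand endpoint can be exercised with the same low-level REST client the integration tests use. This is a minimal, hypothetical example (the class name ExpandClientExample is not part of the plugin): it assumes a cluster with the rocchio plugin installed and the biocaddie index populated, listening without authentication on localhost:9400 (the integration-test port from AbstractITCase); adjust the host, port, index, and parameters to match your setup.

import java.io.IOException;

import org.apache.http.HttpHost;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;

public class ExpandClientExample {
    public static void main(String[] args) throws IOException {
        // Connect to the (assumed) integration-test cluster
        RestClient client = RestClient.builder(new HttpHost("localhost", 9400, "http")).build();
        try {
            // Ask the Rocchio plugin to expand the query "rat" using 5 feedback
            // documents and 10 feedback terms (mirroring the parameters in RocchioIT)
            Response expand = client.performRequest("GET",
                    "/biocaddie/dataset/_expand?fbDocs=5&fbTerms=10&query=rat");

            // The response body is a JSON object whose "query" field holds the
            // weighted expansion string (term^weight term^weight ...)
            System.out.println(EntityUtils.toString(expand.getEntity()));
        } finally {
            client.close();
        }
    }
}

The "query" field of the returned JSON can then be URL-encoded and passed back to the standard _search endpoint as the q parameter to run the expanded search, as testSearchPerformance in RocchioIT attempts to do.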