├── scripts
│   ├── stop.sh
│   ├── restart.sh
│   ├── delete-index.sh
│   ├── remove.sh
│   ├── install.sh
│   ├── create-index.sh
│   ├── build.sh
│   ├── start.sh
│   └── add-docs.sh
├── data
│   ├── 1.json
│   ├── 2.json
│   └── biocaddie.json
├── logs.sh
├── rebuild.sh
├── .gitignore
├── src
│   ├── main
│   │   ├── resources
│   │   │   ├── plugin-descriptor.properties
│   │   │   └── stoplist.all
│   │   ├── assemblies
│   │   │   └── plugin.xml
│   │   └── java
│   │       └── org
│   │           └── nationaldataservice
│   │               └── elasticsearch
│   │                   └── rocchio
│   │                       ├── RocchioPlugin.java
│   │                       ├── RocchioException.java
│   │                       ├── RocchioExpandRestAction.java
│   │                       └── Rocchio.java
│   └── test
│       ├── java
│       │   └── org
│       │       └── nationaldataservice
│       │           └── elasticsearch
│       │               └── rocchio
│       │                   └── test
│       │                       ├── unit
│       │                       │   └── RocchioTest.java
│       │                       └── integration
│       │                           ├── RocchioIT.java
│       │                           └── AbstractITCase.java
│       └── ant
│           └── integration-tests.xml
├── test.sh
├── LICENSE
├── README.md
└── pom.xml
/scripts/stop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker stop elastic-qe-5.3.2 4 | -------------------------------------------------------------------------------- /data/1.json: -------------------------------------------------------------------------------- 1 | { 2 | "fullname": "Hello", 3 | "text": "World" 4 | } 5 | -------------------------------------------------------------------------------- /data/2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fullname": "Hello", 3 | "text": "World 2" 4 | } 5 | -------------------------------------------------------------------------------- /scripts/restart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker restart elastic-qe-5.3.2 4 | -------------------------------------------------------------------------------- /logs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker logs -f elastic-qe-5.3.2 --tail 100 4 | 5 | -------------------------------------------------------------------------------- /scripts/delete-index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -u elastic:changeme -XDELETE localhost:9200/biocaddie?pretty 4 | -------------------------------------------------------------------------------- /scripts/remove.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker exec -it elastic-qe-5.3.2 bin/elasticsearch-plugin remove rocchio 4 | -------------------------------------------------------------------------------- /rebuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | scripts/remove.sh; scripts/build.sh && scripts/install.sh && scripts/restart.sh && ./logs.sh 4 | -------------------------------------------------------------------------------- /scripts/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker exec -it elastic-qe-5.3.2 bin/elasticsearch-plugin install file:///plugin-src/target/releases/rocchio-0.0.1-SNAPSHOT.zip 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Eclipse project metadata 2 | .settings/ 3 | .classpath 4 | .project 5 | 6 | # Build output 7 | target/ 8 | 9 | # ElasticSearch data 10 | es-data/ 11 |
-------------------------------------------------------------------------------- /scripts/create-index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -u elastic:changeme -XPUT --header 'Content-Type: application/json' localhost:9200/biocaddie?pretty -d@data/biocaddie.json 4 | -------------------------------------------------------------------------------- /scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mvn clean package && exit 0 \ 4 | || echo "WARNING: No native Maven installed - using Docker instead" \ 5 | && docker run --rm -it -v $(pwd):/workspace -w /workspace maven:3 mvn clean package && exit 0 6 | 7 | exit 1 8 | -------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | description=${project.description}. 2 | version=${project.version} 3 | name=${project.artifactId} 4 | classname=org.nationaldataservice.elasticsearch.rocchio.RocchioPlugin 5 | java.version=1.8 6 | elasticsearch.version=${elasticsearch.version} -------------------------------------------------------------------------------- /scripts/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker start elastic-qe-5.3.2 && exit 0 || docker run --name=elastic-qe-5.3.2 -it -d -p 9200:9200 -v $(pwd):/plugin-src/ -v $HOME/es-5.3.2-data:/usr/share/elasticsearch/data -e "http.host=0.0.0.0" -e "transport.host=127.0.0.1" docker.elastic.co/elasticsearch/elasticsearch:5.3.2 && exit 0 4 | -------------------------------------------------------------------------------- /data/biocaddie.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "dataset": { 4 | "_all": { 5 | "type": "text", 6 | "term_vector": "with_positions_offsets_payloads", 7 | "store" : true, 8 | "analyzer" : "fulltext_analyzer" 9 | } 10 | } 11 | }, 12 | "settings": { 13 | "index" : { 14 | "number_of_shards" : 1, 15 | "number_of_replicas" : 0 16 | }, 17 | "analysis": { 18 | "analyzer": { 19 | "fulltext_analyzer": { 20 | "type": "custom", 21 | "tokenizer": "whitespace", 22 | "filter": [ 23 | "lowercase", 24 | "type_as_payload" 25 | ] 26 | } 27 | } 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Point to a specific instance of elasticsearch (defaults to Docker instance) 4 | TEST_HOST="localhost" 5 | TEST_PORT="9200" 6 | TEST_USERNAME="elastic" 7 | TEST_PASSWORD="changeme" 8 | 9 | # Specify expansion / search parameters 10 | TEST_INDEX="biocaddie" 11 | SEARCH_TYPE="dataset" 12 | TEST_QUERY="multiple+sclerosis" 13 | STOP_LIST="a+an+the+and+or+of+from+on+was+to+is+-+were+at+as+we" 14 | 15 | # Override additional parameters here 16 | ADDITIONAL_ARGS="&fbTerms=20&fbDocs=50" 17 | 18 | # Run Rocchio and return the expanded query (expansion only; no search is performed) 19 | curl -u "${TEST_USERNAME}:${TEST_PASSWORD}" ${TEST_HOST}:${TEST_PORT}/${TEST_INDEX}/${SEARCH_TYPE}/_expand'?pretty'${ADDITIONAL_ARGS}'&query='${TEST_QUERY} 20 | -------------------------------------------------------------------------------- /src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1
| <?xml version="1.0"?> 2 | <assembly> 3 |   <id>plugin</id> 4 |   <formats> 5 |     <format>zip</format> 6 |   </formats> 7 |   <includeBaseDirectory>false</includeBaseDirectory> 8 |   <files> 9 |     <file> 10 |       <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source> 11 |       <outputDirectory>elasticsearch</outputDirectory> 12 |       <filtered>true</filtered> 13 |     </file> 14 |   </files> 15 |   <dependencySets> 16 |     <dependencySet> 17 |       <outputDirectory>elasticsearch</outputDirectory> 18 |       <useProjectArtifact>true</useProjectArtifact> 19 |       <useTransitiveFiltering>true</useTransitiveFiltering> 20 |     </dependencySet> 21 |   </dependencySets> 22 | </assembly> -------------------------------------------------------------------------------- /scripts/add-docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Change this to match the path to your (unzipped) biocaddie benchmark dataset 5 | dataset_directory=$HOME/update_json_folder 6 | 7 | echo 'Started indexing!' 8 | for docid in {1..790000} 9 | do 10 | if [ "$1" == "-vvvv" ]; then 11 | echo "Indexing document: $docid" 12 | elif [ "$1" == "-vvv" -a "$(expr $docid % 10)" == "0" ]; then 13 | echo "Indexing document: $docid" 14 | elif [ "$1" == "-vv" -a "$(expr $docid % 100)" == "0" ]; then 15 | echo "Indexing document: $docid" 16 | elif [ "$1" == "-v" -a "$(expr $docid % 1000)" == "0" ]; then 17 | echo "Indexing document: $docid" 18 | elif [ "$1" != "-q" -a "$(expr $docid % 100000)" == "0" ]; then 19 | echo "Indexing document: $docid" 20 | fi 21 | 22 | curl --silent -u elastic:changeme -XPUT --header 'Content-Type: application/json' localhost:9200/biocaddie/dataset/$docid?pretty -d@$dataset_directory/$docid.json > /dev/null 23 | done 24 | 25 | echo 'Indexing complete!' 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 National Data Service 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /src/main/java/org/nationaldataservice/elasticsearch/rocchio/RocchioPlugin.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio; 2 | 3 | import java.util.Arrays; 4 | 5 | import java.util.List; 6 | import java.util.function.Supplier; 7 | 8 | import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; 9 | import org.elasticsearch.cluster.node.DiscoveryNodes; 10 | import org.elasticsearch.common.settings.ClusterSettings; 11 | import org.elasticsearch.common.settings.IndexScopedSettings; 12 | import org.elasticsearch.common.settings.Settings; 13 | import org.elasticsearch.common.settings.SettingsFilter; 14 | import org.elasticsearch.plugins.ActionPlugin; 15 | import org.elasticsearch.plugins.Plugin; 16 | import org.elasticsearch.rest.RestController; 17 | import org.elasticsearch.rest.RestHandler; 18 | 19 | public class RocchioPlugin extends Plugin implements ActionPlugin { 20 | @Override 21 | public List<RestHandler> getRestHandlers(Settings settings, RestController restController, 22 | ClusterSettings clusterSettings, IndexScopedSettings indexScopedSettings, SettingsFilter settingsFilter, 23 | IndexNameExpressionResolver indexNameExpressionResolver, Supplier<DiscoveryNodes> nodesInCluster) { 24 | return Arrays.asList(new RocchioExpandRestAction(settings, restController)); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/nationaldataservice/elasticsearch/rocchio/RocchioException.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio; 2 | 3 | 4 | public class RocchioException extends Exception { 5 | /** 6 | * Unique id to identify this {@link Exception} 7 | */ 8 | private static final long serialVersionUID = 5961496592606387768L; 9 | 10 | /** 11 | * An {@link Exception} encountered during {@link Rocchio} operations 12 | */ 13 | public RocchioException() { 14 | 15 | } 16 | 17 | /** 18 | * An {@link Exception} encountered during {@link Rocchio} operations 19 | * 20 | * @param message the {@link String} error message 21 | */ 22 | public RocchioException(String message) { 23 | super(message); 24 | } 25 | 26 | /** 27 | * An exception encountered during {@link Rocchio} operations 28 | * 29 | * @param cause the {@link Throwable} underlying cause 30 | */ 31 | public RocchioException(Throwable cause) { 32 | super(cause); 33 | } 34 | 35 | /** 36 | * An exception encountered during {@link Rocchio} operations 37 | * 38 | * @param message the {@link String} error message 39 | * @param cause the {@link Throwable} underlying cause 40 | */ 41 | public RocchioException(String message, Throwable cause) { 42 | super(message, cause); 43 | } 44 | 45 | /** 46 | * An exception encountered during {@link Rocchio} operations 47 | * 48 | * @param message the {@link String} error message 49 | * @param cause the {@link Throwable} underlying cause 50 | * @param enableSuppression a {@code boolean} indicating whether suppression is enabled 51 | * @param writableStackTrace a {@code boolean} indicating whether the stack trace is writable 52 | */ 53 | public RocchioException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 54 | super(message, cause, enableSuppression, writableStackTrace); 55 | } 56 | 57 | } 58 |
-------------------------------------------------------------------------------- /src/main/resources/stoplist.all: -------------------------------------------------------------------------------- 1 | category 2 | wikipedia 3 | http 4 | html 5 | www 6 | https 7 | com 8 | php 9 | htm 10 | free 11 | cfm 12 | asp 13 | jsp 14 | a 15 | about 16 | above 17 | according 18 | across 19 | after 20 | afterwards 21 | again 22 | against 23 | albeit 24 | all 25 | almost 26 | alone 27 | along 28 | already 29 | also 30 | although 31 | always 32 | am 33 | among 34 | amongst 35 | an 36 | and 37 | another 38 | any 39 | anybody 40 | anyhow 41 | anyone 42 | anything 43 | anyway 44 | anywhere 45 | apart 46 | are 47 | around 48 | as 49 | at 50 | av 51 | be 52 | became 53 | because 54 | become 55 | becomes 56 | becoming 57 | been 58 | before 59 | beforehand 60 | behind 61 | being 62 | below 63 | beside 64 | besides 65 | between 66 | beyond 67 | both 68 | but 69 | by 70 | can 71 | cannot 72 | canst 73 | certain 74 | cf 75 | choose 76 | contrariwise 77 | cos 78 | could 79 | cu 80 | day 81 | do 82 | does 83 | doesn't 84 | doing 85 | dost 86 | doth 87 | double 88 | down 89 | dual 90 | during 91 | each 92 | either 93 | else 94 | elsewhere 95 | enough 96 | et 97 | etc 98 | even 99 | ever 100 | every 101 | everybody 102 | everyone 103 | everything 104 | everywhere 105 | except 106 | excepted 107 | excepting 108 | exception 109 | exclude 110 | excluding 111 | exclusive 112 | far 113 | farther 114 | farthest 115 | few 116 | ff 117 | first 118 | for 119 | formerly 120 | forth 121 | forward 122 | from 123 | front 124 | further 125 | furthermore 126 | furthest 127 | get 128 | go 129 | had 130 | halves 131 | hardly 132 | has 133 | hast 134 | hath 135 | have 136 | he 137 | hence 138 | henceforth 139 | her 140 | here 141 | hereabouts 142 | hereafter 143 | hereby 144 | herein 145 | hereto 146 | hereupon 147 | hers 148 | herself 149 | him 150 | himself 151 | hindmost 152 | his 153 | hither 154 | hitherto 155 | how 156 | however 157 | howsoever 158 | i 159 | ie 160 | if 161 | in 162 | inasmuch 163 | inc 164 | include 165 | included 166 | including 167 | indeed 168 | indoors 169 | inside 170 | insomuch 171 | instead 172 | into 173 | inward 174 | inwards 175 | is 176 | it 177 | its 178 | itself 179 | just 180 | kind 181 | kg 182 | km 183 | last 184 | latter 185 | latterly 186 | less 187 | lest 188 | let 189 | like 190 | little 191 | ltd 192 | many 193 | may 194 | maybe 195 | me 196 | meantime 197 | meanwhile 198 | might 199 | moreover 200 | most 201 | mostly 202 | more 203 | mr 204 | mrs 205 | ms 206 | much 207 | must 208 | my 209 | myself 210 | namely 211 | need 212 | neither 213 | never 214 | nevertheless 215 | next 216 | no 217 | nobody 218 | none 219 | nonetheless 220 | noone 221 | nope 222 | nor 223 | not 224 | nothing 225 | notwithstanding 226 | now 227 | nowadays 228 | nowhere 229 | of 230 | off 231 | often 232 | ok 233 | on 234 | once 235 | one 236 | only 237 | onto 238 | or 239 | other 240 | others 241 | otherwise 242 | ought 243 | our 244 | ours 245 | ourselves 246 | out 247 | outside 248 | over 249 | own 250 | per 251 | perhaps 252 | plenty 253 | provide 254 | quite 255 | rather 256 | really 257 | round 258 | said 259 | sake 260 | same 261 | sang 262 | save 263 | saw 264 | see 265 | seeing 266 | seem 267 | seemed 268 | seeming 269 | seems 270 | seen 271 | seldom 272 | selves 273 | sent 274 | several 275 | shalt 276 | she 277 | should 278 | shown 279 | sideways 280 | since 281 | slept 282 | slew 283 | slung 284 | slunk 285 
| smote 286 | so 287 | some 288 | somebody 289 | somehow 290 | someone 291 | something 292 | sometime 293 | sometimes 294 | somewhat 295 | somewhere 296 | spake 297 | spat 298 | spoke 299 | spoken 300 | sprang 301 | sprung 302 | stave 303 | staves 304 | still 305 | such 306 | supposing 307 | than 308 | that 309 | the 310 | thee 311 | their 312 | them 313 | themselves 314 | then 315 | thence 316 | thenceforth 317 | there 318 | thereabout 319 | thereabouts 320 | thereafter 321 | thereby 322 | therefore 323 | therein 324 | thereof 325 | thereon 326 | thereto 327 | thereupon 328 | these 329 | they 330 | this 331 | those 332 | thou 333 | though 334 | thrice 335 | through 336 | throughout 337 | thru 338 | thus 339 | thy 340 | thyself 341 | till 342 | to 343 | together 344 | too 345 | toward 346 | towards 347 | ugh 348 | unable 349 | under 350 | underneath 351 | unless 352 | unlike 353 | until 354 | up 355 | upon 356 | upward 357 | upwards 358 | us 359 | use 360 | used 361 | using 362 | very 363 | via 364 | vs 365 | want 366 | was 367 | we 368 | week 369 | well 370 | were 371 | what 372 | whatever 373 | whatsoever 374 | when 375 | whence 376 | whenever 377 | whensoever 378 | where 379 | whereabouts 380 | whereafter 381 | whereas 382 | whereat 383 | whereby 384 | wherefore 385 | wherefrom 386 | wherein 387 | whereinto 388 | whereof 389 | whereon 390 | wheresoever 391 | whereto 392 | whereunto 393 | whereupon 394 | wherever 395 | wherewith 396 | whether 397 | whew 398 | which 399 | whichever 400 | whichsoever 401 | while 402 | whilst 403 | whither 404 | who 405 | whoa 406 | whoever 407 | whole 408 | whom 409 | whomever 410 | whomsoever 411 | whose 412 | whosoever 413 | why 414 | will 415 | wilt 416 | with 417 | within 418 | without 419 | worse 420 | worst 421 | would 422 | wow 423 | ye 424 | yet 425 | year 426 | yippee 427 | you 428 | your 429 | yours 430 | yourself 431 | yourselves -------------------------------------------------------------------------------- /src/main/java/org/nationaldataservice/elasticsearch/rocchio/RocchioExpandRestAction.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.nio.file.Files; 6 | import java.nio.file.Path; 7 | import java.nio.file.Paths; 8 | 9 | import org.apache.logging.log4j.Logger; 10 | import org.elasticsearch.client.node.NodeClient; 11 | import org.elasticsearch.common.inject.Inject; 12 | import org.elasticsearch.common.logging.ESLoggerFactory; 13 | import org.elasticsearch.common.settings.Settings; 14 | import org.elasticsearch.common.xcontent.XContentBuilder; 15 | import org.elasticsearch.common.xcontent.json.JsonXContent; 16 | import org.elasticsearch.rest.BaseRestHandler; 17 | import org.elasticsearch.rest.BytesRestResponse; 18 | import org.elasticsearch.rest.RestController; 19 | import org.elasticsearch.rest.RestRequest; 20 | import org.elasticsearch.rest.RestRequest.Method; 21 | import org.elasticsearch.rest.RestStatus; 22 | 23 | import edu.gslis.textrepresentation.FeatureVector; 24 | import joptsimple.internal.Strings; 25 | 26 | public class RocchioExpandRestAction extends BaseRestHandler { 27 | private final Logger logger = ESLoggerFactory.getLogger(RocchioExpandRestAction.class); 28 | 29 | @Inject 30 | public RocchioExpandRestAction(Settings settings, RestController controller) { 31 | super(settings); 32 | 33 | // Register your handlers here 34 | 
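// Two routes map to this handler: one with an explicit {type} in the path, and one without it (the type then falls back to the "dataset" default in prepareRequest)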
controller.registerHandler(Method.GET, "/{index}/{type}/_expand", this); 35 | controller.registerHandler(Method.GET, "/{index}/_expand", this); 36 | } 37 | 38 | /** 39 | * Helper method for throwing an error 40 | * 41 | * @param error 42 | * the String error message 43 | * @return a RestChannelConsumer to build up the error 44 | */ 45 | protected RestChannelConsumer throwError(String error) { 46 | return throwError(error, RestStatus.BAD_REQUEST); 47 | } 48 | 49 | /** 50 | * Helper method for throwing an error 51 | * 52 | * @param error 53 | * the String error message 54 | * @param status 55 | * the HTTP status to return 56 | * @return a RestChannelConsumer to build up the error 57 | */ 58 | protected RestChannelConsumer throwError(String error, RestStatus status) { 59 | this.logger.error("ERROR: " + error); 60 | return channel -> { 61 | XContentBuilder builder = JsonXContent.contentBuilder(); 62 | builder.startObject(); 63 | builder.field("error", error); 64 | builder.endObject(); 65 | channel.sendResponse(new BytesRestResponse(status, builder)); 66 | }; 67 | } 68 | 69 | @Override 70 | protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) throws IOException { 71 | this.logger.debug("Executing Rocchio expand action!"); 72 | 73 | // Required path parameter 74 | String index = request.param("index"); 75 | 76 | // Required query string parameter 77 | String query = request.param("query"); 78 | 79 | // Optional parameters, with sensible defaults 80 | String type = request.param("type", "dataset"); 81 | String field = request.param("field", "_all"); 82 | double alpha = Double.parseDouble(request.param("alpha", "0.5")); 83 | double beta = Double.parseDouble(request.param("beta", "0.5")); 84 | double k1 = Double.parseDouble(request.param("k1", "1.2")); 85 | double b = Double.parseDouble(request.param("b", "0.75")); 86 | int fbDocs = Integer.parseInt(request.param("fbDocs", "10")); 87 | int fbTerms = Integer.parseInt(request.param("fbTerms", "10")); 88 | 89 | // Optional stoplist - assumes a space-delimited string of stop words 90 | // TODO: Populate list of default stop words 91 | String stoplist = request.param("stoplist", ""); 92 | 93 | // Log the request with our full parameter set 94 | this.logger.info(String.format( 95 | "Starting RocchioExpand (index=%s, query=%s, type=%s, " 96 | + "field=%s, fbDocs=%d, fbTerms=%d, α=%.2f, β=%.2f, k1=%.2f, b=%.2f, stoplist=%s)", 97 | index, query, type, field, fbDocs, fbTerms, alpha, beta, k1, b, stoplist)); 98 | 99 | // TODO: Check that type has documents added to it? 100 | // TODO: Check that the documents in the type contain the desired field? 101 | // TODO: Check that term vectors/fields stats are available for the 102 | // desired index/type/field combination? 
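// The block below builds a Rocchio helper bound to this index/type/field, validates the request, expands the query, and serializes the expanded terms using Lucene's term^weight boost syntax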
103 | 104 | try { 105 | Rocchio rocchio = new Rocchio(client, index, type, field, alpha, beta, k1, b, stoplist); 106 | 107 | // Validate input parameters 108 | String shortCircuit = rocchio.validate(query, fbDocs, fbTerms); 109 | if (!Strings.isNullOrEmpty(shortCircuit)) { 110 | return throwError(shortCircuit); 111 | } 112 | 113 | // Expand the query 114 | this.logger.debug("Generating feedback query for (" + query + "," + fbDocs + "," + fbTerms + ")"); 115 | FeatureVector feedbackQuery = rocchio.expandQuery(query, fbDocs, fbTerms); 116 | 117 | // Format our expanded query with Lucene's boosting syntax 118 | this.logger.debug("Expanding query: " + feedbackQuery.toString()); 119 | StringBuffer expandedQuery = new StringBuffer(); 120 | String separator = ""; // start out with no separator 121 | 122 | for (String term : feedbackQuery.getFeatures()) { 123 | expandedQuery.append(separator + term + "^" + feedbackQuery.getFeatureWeight(term)); 124 | separator = " "; // add separator after first iteration 125 | } 126 | 127 | String fullQuery = expandedQuery.toString().trim(); 128 | 129 | // Return the expanded query (don't actually perform the search) 130 | this.logger.debug("Responding: " + expandedQuery.toString()); 131 | return channel -> { 132 | XContentBuilder builder = JsonXContent.contentBuilder(); 133 | builder.startObject(); 134 | 135 | builder.field("query", fullQuery); 136 | builder.endObject(); 137 | channel.sendResponse(new BytesRestResponse(RestStatus.OK, builder)); 138 | }; 139 | } catch (Exception e) { 140 | // FIXME: Catching generic Exception is bad practice 141 | // TODO: make this more specific for production 142 | String errorMessage = e.getMessage(); 143 | if (Strings.isNullOrEmpty(errorMessage)) { 144 | errorMessage = "An unknown error was encountered."; 145 | } 146 | return throwError(errorMessage); 147 | } 148 | } 149 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rocchio expansion for ElasticSearch 2 | 3 | NDS bioCADDIE 4 | 5 | This is a prototype plugin for ElasticSearch 5.x that adds Rocchio-based query expansion support using BM25 similarity. The plugin adds an ``_expand`` REST endpoint to ElasticSearch that returns a "[query string query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html)" with Lucene-style term weights. This plugin was developed as part of the NDS [bioCADDIE pilot](https://biocaddie.org/expansion-models-biomedical-data-search). 6 | 7 | ## Why Rocchio? 8 | Our original goal was to implement relevance model (RM) based expansion using Lucene's language modeling similarity implementations. Our investigations revealed that [Lucene's language modeling implementation is incomplete](https://issues.apache.org/jira/browse/LUCENE-5847) and may not be suitable for use with RM. Given Lucene's origins as a vector-space implementation and its current default BM25 scorer, we opted to instead implement Rocchio-style expansion. While Rocchio expansion was not originally intended for use with the BM25 retrieval model, it has proven effective.
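For background, the classic Rocchio update (sketched here from the standard textbook formulation; the exact weighting lives in `Rocchio.java`) computes the expanded query as a weighted combination of the original query vector and the centroid of the top-ranked feedback documents:

```
q' = alpha * q + (beta / |D|) * sum of d, over all d in D
```

where `alpha` and `beta` are the weights exposed as request parameters below, `D` is the set of the top `fbDocs` documents retrieved for the original query, each document vector `d` is built from BM25 term weights (parameters `k1` and `b`), and only the `fbTerms` highest-weighted terms are kept in the result.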
9 | 10 | ## REST Interface 11 | 12 | Endpoint: 13 | ``/{index}/_expand`` (or ``/{index}/{type}/_expand`` to specify the document type in the path) 14 | 15 | Parameters: 16 | * ``type``: Document type, defaults to ``dataset`` 17 | * ``field``: Field to search, defaults to ``_all`` 18 | * ``alpha``: Original query weight, defaults to 0.5 19 | * ``beta``: Feedback query weight, defaults to 0.5 20 | * ``k1``: BM25 k1 parameter, defaults to 1.2 21 | * ``b``: BM25 b parameter, defaults to 0.75 22 | * ``fbDocs``: Number of feedback documents, defaults to 10 23 | * ``fbTerms``: Number of feedback terms, defaults to 10 24 | * ``stoplist``: Additional stoplist terms (modifies the primary stoplist) 25 | * ``query``: Query to expand 26 | 27 | The expand endpoint returns a JSON object with the expanded query in "[query string query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html)" format with each expansion term and the associated expansion weight: 28 | ``` 29 | { 30 | "query": "term1^weight1 term2^weight2 ..." 31 | } 32 | ``` 33 | 34 | This query can be used with the standard ElasticSearch ``_search`` endpoint: 35 | ``` 36 | curl -XGET 'localhost:9200/biocaddie/_search?pretty' -H 'Content-Type: application/json' -d' 37 | { 38 | "query": { 39 | "query_string" : { 40 | "default_field" : "_all", 41 | "query" : "term1^weight1 term2^weight2" 42 | } 43 | } 44 | } 45 | ' 46 | ``` 47 | 48 | 49 | 50 | 51 | 52 | ## Prerequisites 53 | 54 | * ElasticSearch 5.3.2 (native or via Docker) 55 | * Git + Maven (native or via Docker) 56 | * An ElasticSearch index with stored term vectors (see [Setup](README.md#setup)) 57 | 58 | ## Installing from OSSRH 59 | You can install the plugin using the following command: 60 | ```bash 61 | bin/elasticsearch-plugin install https://oss.sonatype.org/content/repositories/snapshots/edu/illinois/lis/queryexpansion/5.3.2-SNAPSHOT/queryexpansion-5.3.2-20170726.231658-1.zip 62 | ``` 63 | 64 | NOTE: You can check https://oss.sonatype.org/content/repositories/snapshots/edu/illinois/lis/queryexpansion/5.3.2-SNAPSHOT for a link to the newest `.zip` file. 65 | 66 | ## Building From Source 67 | Clone this repository, build the plugin, and install the resulting `.zip`: 68 | ```bash 69 | git clone https://github.com/nds-org/elasticsearch-queryexpansion-plugin.git queryexpansion && cd queryexpansion 70 | mvn package 71 | bin/elasticsearch-plugin install file:///path/to/elasticsearch-queryexpansion-plugin/target/releases/queryexpansion-5.3.2-SNAPSHOT.zip 72 | ``` 73 | 74 | 75 | ## Example usage 76 | 77 | The repository includes several scripts demonstrating how to install and use the plugin via Docker: 78 | 79 | 1. [Setup](README.md#setup) 80 | 2. [Build](README.md#build) 81 | 3. [Load](README.md#load) 82 | 4. [Test](README.md#test) 83 | 84 | ### Setup 85 | The following steps demonstrate how to build an ElasticSearch index from the bioCADDIE test collection. 86 | 87 | Make sure that the biocaddie benchmark test dataset exists somewhere on disk: 88 | ```bash 89 | cd $HOME 90 | wget https://biocaddie.org/sites/default/files/update_json_folder.zip && unzip update_json_folder.zip 91 | ``` 92 | 93 | Start ElasticSearch natively, or run ElasticSearch 5.3.2 via Docker using the helper script: 94 | ```bash 95 | ./scripts/start.sh 96 | ``` 97 | 98 | Create an index with the required parameters (`store: true`, with term vectors enabled): 99 | ```bash 100 | ./scripts/create-index.sh 101 | ``` 102 | 103 | NOTE: You may need to modify *dataset_directory* in `./scripts/add-docs.sh` if your benchmark data is not located within `$HOME`.
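Under the hood, `add-docs.sh` simply PUTs each benchmark file as its own document. A single-document equivalent of what the script does (assuming the default `elastic:changeme` credentials used throughout the helper scripts) looks like:

```bash
curl -u elastic:changeme -XPUT --header 'Content-Type: application/json' \
     localhost:9200/biocaddie/dataset/1?pretty -d@$HOME/update_json_folder/1.json
```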
104 | 105 | Finally, use the helper script to add the documents to the index: 106 | ```bash 107 | ./scripts/add-docs.sh 108 | ``` 109 | 110 | NOTE: Indexing the full benchmark set can take a long time. If you only need a small subset of the documents, you can always `Ctrl+C` once you get the desired number of records indexed. 111 | 112 | ### Build 113 | The following helper script will build the plugin using Maven (or using Docker if Maven is not installed): 114 | ```bash 115 | ./scripts/build.sh 116 | ``` 117 | Either way, the build should produce a `target/releases/` directory containing the necessary `.zip` file. 118 | 119 | The `.zip` that ElasticSearch needs should be found at `./target/releases/rocchio-0.0.1-SNAPSHOT.zip`. 120 | 121 | ### Load 122 | Once the artifacts are built, we just need to install them and restart ElasticSearch. The following helper scripts assume that you are running ElasticSearch via Docker: 123 | ```bash 124 | ./scripts/install.sh 125 | ./scripts/restart.sh 126 | ``` 127 | 128 | ### Test 129 | You should now be able to test the new endpoint using the helper script or via raw `curl`: 130 | ```bash 131 | $ ./test.sh 132 | {"query":"sclerosis^2.798773920190095 study^0.4716440174771813 disease^0.584064093901503 or^0.3394485958568884 patients^0.79730633189081 multiple^1.941784058395449 was^0.4222225922753828 is^0.38702376034952857 to^0.4432445617796595 on^0.3817563584164061"} 133 | ``` 134 | 135 | You can check the container logs to see what happened under the covers: 136 | ```bash 137 | $ ./logs.sh 138 | ... 139 | [2017-07-01T04:54:54,007][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded module [reindex] 140 | [2017-07-01T04:54:54,008][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded module [transport-netty3] 141 | [2017-07-01T04:54:54,008][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded module [transport-netty4] 142 | [2017-07-01T04:54:54,009][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded plugin [queryexpansion] 143 | [2017-07-01T04:54:54,009][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded plugin [x-pack] 144 | [2017-07-01T04:55:00,722][INFO ][o.e.n.Node ] initialized 145 | [2017-07-01T04:55:00,744][INFO ][o.e.n.Node ] [lmIsnX7] starting ...
146 | [2017-07-01T04:55:01,467][WARN ][i.n.u.i.MacAddressUtil ] Failed to find a usable hardware address from the network interfaces; using random bytes: f8:2c:c0:8c:3e:88:3b:3b 147 | [2017-07-01T04:55:01,695][INFO ][o.e.t.TransportService ] [lmIsnX7] publish_address {127.0.0.1:9300}, bound_addresses {127.0.0.1:9300} 148 | [2017-07-01T04:55:02,082][INFO ][o.e.m.j.JvmGcMonitorService] [lmIsnX7] [gc][1] overhead, spent [260ms] collecting in the last [1s] 149 | [2017-07-01T04:55:05,179][INFO ][o.e.c.s.ClusterService ] [lmIsnX7] new_master {lmIsnX7}{lmIsnX7NRH2_Vmq6avBitQ}{iyWg9zTcQqCeF97xX-hdJQ}{127.0.0.1}{127.0.0.1:9300}, reason: zen-disco-elected-as-master ([0] nodes joined) 150 | [2017-07-01T04:55:05,305][INFO ][o.e.x.s.t.n.SecurityNetty4HttpServerTransport] [lmIsnX7] publish_address {172.17.0.2:9200}, bound_addresses {[::]:9200} 151 | [2017-07-01T04:55:05,318][INFO ][o.e.n.Node ] [lmIsnX7] started 152 | [2017-07-01T04:55:06,492][INFO ][o.e.l.LicenseService ] [lmIsnX7] license [0a8ce788-74ad-49d9-aa3c-3c46ab9100d8] mode [trial] - valid 153 | [2017-07-01T04:55:06,513][INFO ][o.e.g.GatewayService ] [lmIsnX7] recovered [4] indices into cluster_state 154 | [2017-07-01T04:55:08,078][INFO ][o.e.c.r.a.AllocationService] [lmIsnX7] Cluster health status changed from [RED] to [YELLOW] (reason: [shards started [[.monitoring-es-2-2017.07.01][0], [biocaddie][0]] ...]). 155 | [2017-07-01T04:55:13,088][INFO ][o.n.e.r.RocchioExpandRestAction] [lmIsnX7] Starting Rocchio (biocaddie,multiple sclerosis,dataset,_all,10,10,0.50,0.50,1.20,0.75) 156 | ... 157 | ``` 158 | 159 | ## Helper Scripts 160 | A few other helper scripts are included to ease testing: 161 | ```bash 162 | ./scripts/start.sh # Runs or starts your elasticsearch container 163 | ./scripts/stop.sh # Stops your elasticsearch container 164 | ./scripts/restart.sh # Restarts your elasticsearch container 165 | ./scripts/create-index.sh # Creates a test index with the proper settings to enable storing term vectors 166 | ./scripts/add-docs.sh [-v] # Adds documents from the biocaddie benchmark set to your index (assumes correct paths) 167 | ./scripts/delete-index.sh # Deletes your container's test index and the records within 168 | ./scripts/build.sh # Builds the elasticsearch plugin artifacts 169 | ./scripts/install.sh # Installs the elasticsearch plugin into your running container 170 | ./scripts/remove.sh # Removes your container's installed queryexpansion plugin 171 | ./rebuild.sh # Removes the current plugin, builds the artifacts, installs the new plugin, and restarts elasticsearch to facilitate rapid development and testing 172 | ./logs.sh # View your elasticsearch container logs (tail=100) 173 | ./test.sh [search] # Performs a test query against our REST API endpoint (only expands by default, but searches if first parameter is "search") 174 | ``` 175 | 176 | ## Deploying artifacts 177 | New artifacts can be deployed to OSSRH using the following command: 178 | ```bash 179 | GPG_TTY=$(tty) mvn clean deploy 180 | ``` 181 | -------------------------------------------------------------------------------- /src/test/java/org/nationaldataservice/elasticsearch/rocchio/test/unit/RocchioTest.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio.test.unit; 2 | 3 | import static org.junit.Assert.*; 4 | import static org.mockito.Mockito.*; 5 | import java.io.IOException; 6 | import java.util.HashMap; 7 | import java.util.LinkedHashMap; 8 | import java.util.Map; 9 | 10 | import org.elasticsearch.search.SearchHits;
11 | import org.elasticsearch.search.SearchHit; 12 | import org.apache.lucene.index.Fields; 13 | import org.apache.lucene.index.Terms; 14 | import org.apache.lucene.index.TermsEnum; 15 | import org.apache.lucene.util.BytesRef; 16 | import org.elasticsearch.action.ActionFuture; 17 | import org.elasticsearch.action.ListenableActionFuture; 18 | import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse; 19 | import org.elasticsearch.action.search.SearchRequestBuilder; 20 | import org.elasticsearch.action.search.SearchResponse; 21 | import org.elasticsearch.action.termvectors.MultiTermVectorsItemResponse; 22 | import org.elasticsearch.action.termvectors.MultiTermVectorsRequestBuilder; 23 | import org.elasticsearch.action.termvectors.MultiTermVectorsResponse; 24 | import org.elasticsearch.action.termvectors.TermVectorsResponse; 25 | import org.elasticsearch.client.AdminClient; 26 | import org.elasticsearch.client.Client; 27 | import org.elasticsearch.client.ClusterAdminClient; 28 | import org.elasticsearch.cluster.ClusterState; 29 | import org.elasticsearch.cluster.metadata.IndexMetaData; 30 | import org.elasticsearch.cluster.metadata.MappingMetaData; 31 | import org.elasticsearch.cluster.metadata.MetaData; 32 | import org.elasticsearch.common.collect.ImmutableOpenMap; 33 | import org.elasticsearch.index.query.QueryStringQueryBuilder; 34 | import org.junit.After; 35 | import org.junit.Before; 36 | import org.junit.Test; 37 | import org.junit.runner.RunWith; 38 | import org.mockito.runners.MockitoJUnitRunner; 39 | import org.nationaldataservice.elasticsearch.rocchio.Rocchio; 40 | 41 | import edu.gslis.textrepresentation.FeatureVector; 42 | 43 | /** 44 | * This is a simple unit test suite for the Rocchio ElasticSearch Plugin. Use 45 | * these test cases to verify correctness of the query expansion process. You 46 | * can also vary the parameters here to see how that affects the resulting 47 | * expansion. All ElasticSearch internals have been mocked with Mockito to 48 | * return fake data. 
49 | * 50 | * 51 | * @author lambert8 52 | * 53 | */ 54 | @RunWith(MockitoJUnitRunner.class) 55 | public class RocchioTest { 56 | /** The Rocchio instance to test */ 57 | private Rocchio rocchio; 58 | 59 | // The common test parameter set (individual tests can still use one-off 60 | // values) 61 | private static final String TEST_INDEX = "biocaddie"; 62 | private static final String TEST_QUERY = "rat"; 63 | private static final String TEST_TYPE = "dataset"; 64 | private static final String TEST_FIELD = "_all"; 65 | private static final int TEST_FB_TERMS = 10; 66 | private static final int TEST_FB_DOCS = 50; 67 | private static final double TEST_ALPHA = 0.5; 68 | private static final double TEST_BETA = 0.5; 69 | private static final double TEST_K1 = 1.2; 70 | private static final double TEST_B = 0.75; 71 | 72 | // Mock out all of the ElasticSearch internals 73 | private static final Client client = mock(Client.class); 74 | 75 | @SuppressWarnings("unchecked") 76 | private static final ActionFuture<ClusterStateResponse> clusterStateFuture = (ActionFuture<ClusterStateResponse>) mock(ActionFuture.class); 77 | private static final AdminClient adminClient = mock(AdminClient.class); 78 | private static final ClusterAdminClient clusterAdminClient = mock(ClusterAdminClient.class); 79 | private static final ClusterState clusterState = mock(ClusterState.class); 80 | private static final ClusterStateResponse clusterStateResponse = mock(ClusterStateResponse.class); 81 | private static final MetaData clusterMetadata = mock(MetaData.class); 82 | private static final IndexMetaData mockIndexMetaData = mock(IndexMetaData.class); 83 | 84 | @SuppressWarnings("unchecked") 85 | private static final ListenableActionFuture<MultiTermVectorsResponse> mockMtvFuture = mock(ListenableActionFuture.class); 86 | private static final MultiTermVectorsResponse mockMtvResponse = mock(MultiTermVectorsResponse.class); 87 | private static final TermVectorsResponse mockTvResponse = mock(TermVectorsResponse.class); 88 | private static final MultiTermVectorsItemResponse mockMtvItemResponse = mock(MultiTermVectorsItemResponse.class); 89 | private static final MultiTermVectorsRequestBuilder mockMtvBuilder = mock(MultiTermVectorsRequestBuilder.class); 90 | private static final Fields mockFields = mock(Fields.class); 91 | private static final Terms mockTerms = mock(Terms.class); 92 | private static final MultiTermVectorsItemResponse[] mockMtvItemResponses = { mockMtvItemResponse }; 93 | 94 | @SuppressWarnings("unchecked") 95 | private static final ListenableActionFuture<SearchResponse> mockSearchFuture = mock(ListenableActionFuture.class); 96 | private static final SearchRequestBuilder srBuilder = mock(SearchRequestBuilder.class); 97 | private static final SearchResponse mockSearchResponse = mock(SearchResponse.class); 98 | 99 | // These are used internally, but are overridden by later mocks (see TermsEnum iteration) 100 | private static final SearchHits hits = mock(SearchHits.class); 101 | private static final SearchHit hit1 = mock(SearchHit.class); 102 | private static final SearchHit hit2 = mock(SearchHit.class); 103 | private static final SearchHit hit3 = mock(SearchHit.class); 104 | private static final SearchHit[] hitsArray = { hit1, hit2, hit3 }; 105 | 106 | private static final TermsEnum mockIterator = mock(TermsEnum.class); 107 | 108 | // The index mapping metadata and sub-mappings 109 | private static final MappingMetaData mockTypeMetadata = mock(MappingMetaData.class); 110 | private static final ImmutableOpenMap<String, MappingMetaData> indexMappingMetadata; 111 | private static final LinkedHashMap<String, Object> fieldPropertiesMap = new LinkedHashMap<String, Object>(); 112 | private static final LinkedHashMap<String, Object> typePropertiesMap = new LinkedHashMap<String, Object>(); 113 | private static final LinkedHashMap<String, Object> typeMap = new LinkedHashMap<String, Object>(); 114 | private static final Map<String, MappingMetaData> typeMetadataMapping = new HashMap<>(); 115 | 116 | // FIXME: finish mocking out iterator and expand 117 | private static final BytesRef termRef = new BytesRef("rat"); 118 | 119 | /** Static initializer: set up all required test data and mocks */ 120 | static { 121 | // Build up our properties mapping: { "store": true } object 122 | fieldPropertiesMap.put("store", true); 123 | 124 | // Build up our test field mapping with the properties map 125 | typePropertiesMap.put(TEST_FIELD, fieldPropertiesMap); 126 | 127 | // Build up our test type mapping from the test field mapping 128 | typeMap.put("properties", typePropertiesMap); 129 | typeMap.put("_all", fieldPropertiesMap); 130 | 131 | // Build up our test type mapping of the type metadata 132 | typeMetadataMapping.put(TEST_TYPE, mockTypeMetadata); 133 | 134 | // Build up our index mapping from the type mapping 135 | indexMappingMetadata = new ImmutableOpenMap.Builder<String, MappingMetaData>().putAll(typeMetadataMapping).build(); 136 | 137 | try { 138 | // Mock out ElasticSearch index mapping verification 139 | when(client.admin()).thenReturn(adminClient); 140 | when(adminClient.cluster()).thenReturn(clusterAdminClient); 141 | when(clusterAdminClient.state(any())).thenReturn(clusterStateFuture); 142 | when(clusterStateFuture.actionGet()).thenReturn(clusterStateResponse); 143 | when(clusterStateResponse.getState()).thenReturn(clusterState); 144 | when(clusterState.getMetaData()).thenReturn(clusterMetadata); 145 | when(clusterMetadata.index(anyString())).thenReturn(mockIndexMetaData); 146 | when(mockIndexMetaData.getMappings()).thenReturn(indexMappingMetadata); 147 | when(mockTypeMetadata.getSourceAsMap()).thenReturn(typeMap); 148 | 149 | // Mock out ElasticSearch Search 150 | when(client.prepareSearch(anyString())).thenReturn(srBuilder); 151 | when(srBuilder.setQuery(any(QueryStringQueryBuilder.class))).thenReturn(srBuilder); 152 | when(srBuilder.setSize(anyInt())).thenReturn(srBuilder); 153 | when(srBuilder.execute()).thenReturn(mockSearchFuture); 154 | when(mockSearchFuture.actionGet()).thenReturn(mockSearchResponse); 155 | when(mockSearchResponse.getHits()).thenReturn(hits); 156 | when(hits.getHits()).thenReturn(hitsArray); 157 | when(hits.hits()).thenReturn(hitsArray); 158 | 159 | // These are used internally, but are likely 160 | // overridden by later mocks (see TermsEnum iteration) 161 | when(hits.totalHits()).thenReturn(Long.valueOf(3)); 162 | when(hits.getTotalHits()).thenReturn(Long.valueOf(3)); 163 | 164 | // Mock out ElasticSearch MultiTermVector Fields/Terms 165 | when(mockMtvBuilder.execute()).thenReturn(mockMtvFuture); 166 | when(mockMtvFuture.actionGet()).thenReturn(mockMtvResponse); 167 | when(mockMtvBuilder.add(any())).thenReturn(mockMtvBuilder); 168 | when(client.prepareMultiTermVectors()).thenReturn(mockMtvBuilder); 169 | when(mockMtvItemResponse.getResponse()).thenReturn(mockTvResponse); 170 | when(mockMtvResponse.getResponses()).thenReturn(mockMtvItemResponses); 171 | 172 | // FIXME: The two sections below return completely arbitrary values 173 | // and should be updated to something more sane 174 | // Mock out Lucene Fields/Terms 175 | when(mockTvResponse.getFields()).thenReturn(mockFields); 176 | when(mockFields.terms(TEST_FIELD)).thenReturn(mockTerms); 177 | when(mockTerms.getDocCount()).thenReturn(10); 178 |
when(mockTerms.getSumTotalTermFreq()).thenReturn(10L); 179 | when(mockTerms.iterator()).thenReturn(mockIterator); 180 | 181 | // Mock out Lucene TermsEnum iteration 182 | when(mockIterator.next()).thenReturn(termRef).thenReturn(null); 183 | when(mockIterator.totalTermFreq()).thenReturn(10L); 184 | when(mockIterator.docFreq()).thenReturn(10); 185 | when(mockIterator.term()).thenReturn(termRef); 186 | } catch (IOException e) { 187 | e.printStackTrace(); 188 | fail(); 189 | } 190 | }; 191 | 192 | @Before 193 | /** Set up our test Rocchio implementation */ 194 | public void setUp() throws IOException { 195 | this.rocchio = new Rocchio(client, TEST_INDEX, TEST_TYPE, TEST_FIELD, TEST_ALPHA, TEST_BETA, TEST_K1, TEST_B); 196 | } 197 | 198 | @After 199 | /** Tear down our test Rocchio implementation */ 200 | public void tearDown() { 201 | this.rocchio = null; 202 | } 203 | 204 | @Test 205 | /** Test that validate properly returns null if all parameters are valid */ 206 | public void testValidate() throws IOException { 207 | String shouldBeNull = rocchio.validate(TEST_QUERY, TEST_FB_DOCS, TEST_FB_TERMS); 208 | assertNull(shouldBeNull); 209 | } 210 | 211 | @Test 212 | /** Test that validate fails when query is null */ 213 | public void testValidateInvalidQuery() throws IOException { 214 | String errorMessage = rocchio.validate("", TEST_FB_DOCS, TEST_FB_TERMS); 215 | assertNotNull(errorMessage); 216 | assertEquals(Rocchio.NULL_QUERY_ERROR, errorMessage); 217 | } 218 | 219 | @Test 220 | /** Test that validate fails when fbDocs < 1 */ 221 | public void testValidateInvalidFeedbackDocuments() throws IOException { 222 | String errorMessage = rocchio.validate(TEST_QUERY, 0, TEST_FB_TERMS); 223 | assertNotNull(errorMessage); 224 | assertEquals(Rocchio.INVALID_FB_DOCS_ERROR, errorMessage); 225 | } 226 | 227 | @Test 228 | /** Test that validate fails when fbTerms < 1 */ 229 | public void testValidateInvalidFeedbackTerms() throws IOException { 230 | String errorMessage = rocchio.validate(TEST_QUERY, TEST_FB_DOCS, 0); 231 | assertNotNull(errorMessage); 232 | assertEquals(Rocchio.INVALID_FB_TERMS_ERROR, errorMessage); 233 | } 234 | 235 | @Test 236 | /** Test that we can expand a query against the test index */ 237 | public void testExpandQuery() throws IOException { 238 | // Expand the query 239 | FeatureVector feedbackQuery = rocchio.expandQuery(TEST_QUERY, TEST_FB_DOCS, TEST_FB_TERMS); 240 | 241 | // Verify expanded segments 242 | String[] segments = feedbackQuery.toString().trim().split(" "); 243 | assertEquals(2, segments.length); 244 | assertEquals("0.012976521", segments[0]); 245 | assertEquals("rat", segments[1]); 246 | } 247 | } 248 | -------------------------------------------------------------------------------- /src/test/ant/integration-tests.xml: -------------------------------------------------------------------------------- <!-- The XML markup of this Ant script was stripped when this dump was generated; only text fragments survived ("Starting up external cluster...", "Waiting for elasticsearch to become available on port @{port}...", "Installing plugin @{name}...", "External node started PID ${integ.pid}", "Shutting down external node PID ${integ.pid}"). Judging from those fragments, the script starts an external ElasticSearch node, installs the plugin zip, waits for the HTTP port to come up, runs the integration tests, and shuts the node down afterward. --> -------------------------------------------------------------------------------- /src/test/java/org/nationaldataservice/elasticsearch/rocchio/test/integration/RocchioIT.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio.test.integration; 2 | 3 | import org.junit.BeforeClass; 4 | import org.junit.Test; 5 | import static org.hamcrest.Matchers.*; 6 | import static org.junit.Assert.*; 7 | 8 | import java.util.HashMap; 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | import org.apache.http.entity.StringEntity; 13 | import org.apache.logging.log4j.Logger; 14 | import org.elasticsearch.client.Response; 15 | import org.elasticsearch.common.logging.ESLoggerFactory; 16 | 17 | /** 18 | * This is a simple integration test suite for the ElasticSearch Rocchio 19 | * Plugin. Use these test cases to verify correctness of the API endpoint and 20 | * input validation, to compare performance, for scale testing, etc.
21 | * Before the test suite runs, the test runner will: 22 | * 23 | *
 24 |  *    * Download ElasticSearch binaries
 25 |  *    * Install the ElasticSearch Rocchio Plugin
 26 |  *    * Start up an ElasticSearch cluster
 27 |  *    * Ensure that the TEST_INDEX has been created
 28 |  *    * Ensure that TEST_INDEX contains some test documents
 29 |  *    * Run the set of test cases
 30 |  *    * Tear down the cluster
 31 |  * 
32 | * 33 | * @see {@link AbstractITCase} 34 | * @see src/test/ant/integration-tests.xml 35 | * 36 | * @author lambert8 37 | * 38 | */ 39 | public class RocchioIT extends AbstractITCase { 40 | private static final Logger staticLogger = ESLoggerFactory.getLogger(RocchioIT.class); 41 | 42 | // The common test parameter set (individual tests can still use one-off 43 | // values) 44 | private static final String TEST_INDEX = "biocaddie"; 45 | private static final String TEST_TYPE = "dataset"; 46 | private static final int TEST_FB_TERMS = 10; 47 | private static final int TEST_FB_DOCS = 5; 48 | 49 | private final String defaultEndpointParameters = "fbTerms=" + TEST_FB_TERMS + "&fbDocs=" + TEST_FB_DOCS; 50 | private final String expandEndpoint = String.format("/%s/%s/_expand?%s", TEST_INDEX, TEST_TYPE, 51 | defaultEndpointParameters); 52 | 53 | // TODO: Improve expectations 54 | private final String EXPECTED_EXPANDED_QUERY_OBJECT = "{query=dorsal^0.09029725274935405 rat^0.7267361001145776 aging-associated^0.09029725274935405 root^0.09029725274935405 bladder^0.09029725274935405 effect^0.09029725274935405 ganglia^0.09029725274935405 oxidative^0.09029725274935405 urinary^0.09029725274935405 preventive^0.09029725274935405}"; 55 | private final String EXPECTED_EXPANDED_QUERY_STRING = "dorsal^0.09029725274935405 rat^0.7267361001145776 aging-associated^0.09029725274935405 root^0.09029725274935405 bladder^0.09029725274935405 effect^0.09029725274935405 ganglia^0.09029725274935405 oxidative^0.09029725274935405 urinary^0.09029725274935405 preventive^0.09029725274935405"; 56 | private final String EXPECTED_SEARCH_HITS = "{_shards={total=1, failed=0, successful=1}, hits={hits=[{_index=biocaddie, _type=dataset, _source={DOCNO=1, REPOSITORY=arrayexpress_020916, TITLE=The Sinorhizobium meliloti SyrM regulon: effects on global gene expression are mediated by syrA and nodD3 (SyrA), METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-04, description=We characterized transcriptomes of a strain overexpressing syrA. Our work shows that the syrA transcriptome shares similar gene expression changes to the syrM and nodD3 transcriptomes and that nodD3 and syrA may be the only targets directly activated by SyrM. We propose that most of the gene expression changes observed when nodD3 is overexpressed are due to NodD3 activation of syrM expression, which in turn stimulates SyrM activation of syrA expression. The subsequent increase in SyrA abundance alters activity of the ChvI-ExoS-ExoR circuit, resulting in broad changes in gene expression. Gene expression profiling of Sinorhizobium meliloti overexpressing syrA was performed using custom Affymetrix GeneChips, ID=520401, title=The Sinorhizobium meliloti SyrM regulon: effects on global gene expression are mediated by syrA and nodD3 (SyrA), experimentType=transcription profiling by array}, organism={experiment={species=Sinorhizobium meliloti}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=1, _score=1.0}, {_index=biocaddie, _type=dataset, _source={DOCNO=2, REPOSITORY=arrayexpress_020916, TITLE=RelA Nuclear factor-kappaB (NF-kB) Subunit binding Loci in Promoter Regions of PHM1-31 Myometrial Smooth Muscle Cells (Promoter), METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-05, description=A study to define the binding loci of RelA-containing NF-kappaB dimers in a human myometrial smooth muscle cell line after exposure to TNF. 
Monolayers of PHM1-31 cells were exposed to TNF (10ng/ml) for 1 hour or left unstimulated. The Chromatin immunoprecipitation (ChIP) assay was performed to recover RelA-bound chromatin or non-specifically bound chromatin with IgG. That chromatin was prepared and used to probe Affymetrix GeneChIP 1.0R Human Promoter arrays. Three biological replicates of each experiment were conducted. Datasets were subsequently analysed in Partek Genomics Suite V6.6 where baseline was normalised by subtraction of IgG values from conrresponding RelA-immunoprecipitated samples. Control samples immunoprecipitated with RelA were then compared with TNF-stimulated samples immunoprecipitated with RelA., ID=520482, title=RelA Nuclear factor-kappaB (NF-kB) Subunit binding Loci in Promoter Regions of PHM1-31 Myometrial Smooth Muscle Cells (Promoter), experimentType=ChIP-chip by tiling array}, organism={experiment={species=Homo sapiens}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=2, _score=1.0}, {_index=biocaddie, _type=dataset, _source={DOCNO=3, REPOSITORY=arrayexpress_020916, TITLE=Aging-associated inflammatory and oxidative changes in the rat urinary bladder and dorsal root ganglia - preventive effect of caloric restriction, METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-04, description=This SuperSeries is composed of the SubSeries listed below. Refer to individual Series, ID=520420, title=Aging-associated inflammatory and oxidative changes in the rat urinary bladder and dorsal root ganglia - preventive effect of caloric restriction, experimentType=transcription profiling by array}, organism={experiment={species=Rattus norvegicus}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=3, _score=1.0}, {_index=biocaddie, _type=dataset, _source={DOCNO=4, REPOSITORY=arrayexpress_020916, TITLE=Gene expression profile in Caco-2 cells treated with carnosine, METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-04, description=To reveal the effects of carnosine on Caco-2 cells, we have employed whole genome microarray to detect genes that showed significantly different expression when exposed to carnosine. Caco-2 cells were treated with 1 mM carnosine for 3 days. Caco-2 cells were treated with 1 mM carnosine for 3 days. Three independent experiments were performed., ID=520441, title=Gene expression profile in Caco-2 cells treated with carnosine, experimentType=transcription profiling by array}, organism={experiment={species=Homo sapiens}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=4, _score=1.0}, {_index=biocaddie, _type=dataset, _source={DOCNO=5, REPOSITORY=arrayexpress_020916, TITLE=Mecp2: an unexpected regulator of macrophage gene expression and function [ChIP-Seq], METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-04, description=Mutations in methyl-CpG-binding protein 2 (MeCP2), a major epigenetic regulator, are the predominant cause of Rett syndrome. We previously found that Mecp2-null microglia are deficient in phagocytic ability, and that engraftment of wild-type monocytes into the brain of Mecp2-deficient mice attenuates pathology. We have observed that Mecp2 deficiency is associated with increased levels of histone acetylation at the cis-regulatory regions of the Mecp2-regulated genes in macrophages. 
We hypothesized that Mecp2 recruits protein complexes containing histone deacetylases (HDACs) to repress the expression of its target genes. Our ChIP-Seq studies in bone-marrow derived macrophages revealed that Mecp2 co-localizes with Ncor2/Hdac3 protein complex at cis-regulatory regions of the target genes. These results suggest a role for Mecp2 in the recruitment and regulation of Ncor2/Hdac3 repressosome that plays a critical role in the regulation of inflammatory responses in macrophages. Examination of NCOR2 and HDAC3 genome-wide location in bone-marrow derived macrophages., ID=520444, title=Mecp2: an unexpected regulator of macrophage gene expression and function [ChIP-Seq], experimentType=ChIP-seq}, organism={experiment={species=Mus musculus}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=5, _score=1.0}], total=5, max_score=1.0}, took=1, timed_out=false}"; 57 | 58 | @BeforeClass 59 | public static void setUp() { 60 | // Ensure that the index exists 61 | staticLogger.info("Setting up test environment!"); 62 | createIndex(TEST_INDEX); 63 | 64 | // Add the test documents to the index 65 | for (int i = 1; i <= 5; i++) { 66 | addDocument(TEST_INDEX, TEST_TYPE, i, DOCUMENTS_JSON[i - 1]); 67 | } 68 | 69 | // Tests will fail if we don't wait for ES to index the new documents 70 | staticLogger.info("Waiting for ES to finish indexing documents..."); 71 | wait(3000); 72 | } 73 | 74 | @Test 75 | @SuppressWarnings("unchecked") 76 | public void testPluginIsLoaded() throws Exception { 77 | 78 | Response response = client.performRequest("GET", "/_nodes/plugins"); 79 | 80 | Map<String, Object> nodes = (Map<String, Object>) entityAsMap(response).get("nodes"); 81 | for (String nodeName : nodes.keySet()) { 82 | boolean pluginFound = false; 83 | Map<String, Object> node = (Map<String, Object>) nodes.get(nodeName); 84 | List<Map<String, Object>> plugins = (List<Map<String, Object>>) node.get("plugins"); 85 | for (Map<String, Object> plugin : plugins) { 86 | String pluginName = (String) plugin.get("name"); 87 | if (pluginName.equals("rocchio")) { 88 | pluginFound = true; 89 | break; 90 | } 91 | } 92 | assertThat(pluginFound, is(true)); 93 | } 94 | } 95 | 96 | @Test 97 | public void testExpandEndpoint() throws Exception { 98 | String query = "rat"; 99 | String params = "&query=" + query; 100 | String request = expandEndpoint + params; 101 | 102 | Response response = client.performRequest("GET", request); 103 | assertEquals(EXPECTED_EXPANDED_QUERY_OBJECT, entityAsMap(response).toString()); 104 | } 105 | 106 | // FIXME: Test case currently fails (see below) 107 | //@Test 108 | /** Compare the performance of an unexpanded search against a Rocchio-expanded search. */ 109 | public void testSearchPerformance() throws Exception { 110 | String indexRequest = "/" + TEST_INDEX; 111 | Response indicesResponse = client.performRequest("GET", indexRequest, contentTypeHeader); 112 | staticLogger.info(entityAsMap(indicesResponse).toString()); 113 | String query = "rat"; 114 | String searchEndpoint = "/" + TEST_INDEX + "/_search"; 115 | 116 | // Time a normal (unexpanded) search for our query 117 | long searchStart = System.nanoTime(); 118 | Response unexpandedSearchResponse = client.performRequest("GET", searchEndpoint + "?q=" + query, contentTypeHeader); 119 | long searchDuration = System.nanoTime() - searchStart; 120 | 121 | // Time a query expansion 122 | String expandParams = "&query=" + query; 123 | String expandRequest = expandEndpoint + expandParams; 124 | long expandStart = System.nanoTime(); 125 | Response expandResponse = client.performRequest("GET", expandRequest); 126 | long expandDuration = System.nanoTime() - expandStart; 127 | 128 | // Verify that 
expansion returns correctly 129 | String expandedQuery = entityAsMap(expandResponse).get("query").toString(); 130 | assertEquals(EXPECTED_EXPANDED_QUERY_STRING, expandedQuery); 131 | 132 | // FIXME: Test currently fails on this syntax, stating that " " is an 133 | // invalid character. I have attempted to use "+", as well as "%20", with 134 | // no luck yet. I even tried to send the query as the request body, 135 | // but struggled to find the correct syntax. 136 | //StringEntity expandedSearchRequestBody = new StringEntity("{\"query\":\"" + expandedQuery.trim() + "\"}"); 137 | String expandedSearchQueryString = "?q=" + expandedQuery.trim().replaceAll(" ", "+"); 138 | 139 | // Time an expanded search on the same query 140 | long expandedSearchStart = System.nanoTime(); 141 | Response expandedSearchResponse = client.performRequest("GET", searchEndpoint + expandedSearchQueryString, contentTypeHeader); 142 | long expandedSearchDuration = System.nanoTime() - expandedSearchStart; 143 | long fullExpansionDuration = expandDuration + expandedSearchDuration; 144 | 145 | // Log expansion results 146 | staticLogger.info(String.format("Original query: %s", query)); 147 | staticLogger.info(String.format("Expanded query: %s", expandedQuery)); 148 | 149 | // Log timings 150 | staticLogger.info(String.format("Query expansion took: %d ns", expandDuration)); 151 | staticLogger.info(String.format("Expanded search took: %d ns", expandedSearchDuration)); 152 | staticLogger.info(String.format("Full expansion + search took: %d ns", fullExpansionDuration)); 153 | staticLogger.info(String.format("Unexpanded search took: %d ns", searchDuration)); 154 | 155 | // Verify that expanded search returns as expected 156 | assertEquals(EXPECTED_SEARCH_HITS, entityAsMap(expandedSearchResponse).toString()); 157 | 158 | // TODO: Analyze expanded results for accuracy? 
159 | //staticLogger.info(String.format("Unexpanded search results: %s", entityAsMap(unexpandedSearchResponse))); 160 | //staticLogger.info(String.format("Expanded search results: %s", entityAsMap(expandedSearchResponse))); 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/test/java/org/nationaldataservice/elasticsearch/rocchio/test/integration/AbstractITCase.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio.test.integration; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.UnsupportedEncodingException; 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | 9 | import org.apache.http.Header; 10 | import org.apache.http.HttpHost; 11 | import org.apache.http.entity.StringEntity; 12 | import org.apache.http.message.BasicHeader; 13 | import org.apache.logging.log4j.Logger; 14 | import org.elasticsearch.client.Response; 15 | import org.elasticsearch.client.RestClient; 16 | import org.elasticsearch.cluster.ClusterModule; 17 | import org.elasticsearch.common.bytes.BytesReference; 18 | import org.elasticsearch.common.logging.ESLoggerFactory; 19 | import org.elasticsearch.common.xcontent.NamedXContentRegistry; 20 | import org.elasticsearch.common.xcontent.XContent; 21 | import org.elasticsearch.common.xcontent.XContentBuilder; 22 | import org.elasticsearch.common.xcontent.XContentParser; 23 | import org.elasticsearch.common.xcontent.XContentType; 24 | import org.junit.AfterClass; 25 | import org.junit.BeforeClass; 26 | import static org.hamcrest.Matchers.*; 27 | import static org.junit.Assert.*; 28 | import static org.junit.Assume.*; 29 | 30 | /** 31 | * This is a simple base class for the ElasticSearch Rocchio Plugin 32 | * integration test suite. Use these test cases to verify the correctness of the API endpoint, 33 | * validate input handling, compare performance, run tests at scale, etc. 34 | * 35 | * 36 | * @author lambert8 37 | * 38 | */ 39 | public abstract class AbstractITCase { 40 | protected static final Logger staticLogger = ESLoggerFactory.getLogger(AbstractITCase.class); 41 | protected final static int HTTP_TEST_PORT = 9400; 42 | protected static RestClient client; 43 | 44 | protected static final Header contentTypeHeader = new BasicHeader("Content-Type", "application/json"); 45 | 46 | // TODO: Split these out into separate files 47 | // TODO: Add more documents here to scale things out, or read in the full set from disk 48 | protected static final String INDEX_JSON = "{\"mappings\":{\"dataset\":{\"_all\":{\"type\":\"text\",\"term_vector\":\"with_positions_offsets_payloads\",\"store\":true,\"analyzer\":\"fulltext_analyzer\"}}},\"settings\":{\"index\":{\"number_of_shards\":1,\"number_of_replicas\":0},\"analysis\":{\"analyzer\":{\"fulltext_analyzer\":{\"type\":\"custom\",\"tokenizer\":\"whitespace\",\"filter\":[\"lowercase\",\"type_as_payload\"]}}}}}"; 49 | protected static final String[] DOCUMENTS_JSON = { 50 | "{\"DOCNO\":\"1\",\"METADATA\":{\"dataResource\":{\"keywords\":[],\"altNames\":[],\"acronyms\":[]},\"citation\":{\"count\":\"0\"},\"organism\":{\"experiment\":{\"species\":\"Sinorhizobium meliloti\"}},\"dataItem\":{\"description\":\"We characterized transcriptomes of a strain overexpressing syrA. Our work shows that the syrA transcriptome shares similar gene expression changes to the syrM and nodD3 transcriptomes and that nodD3 and syrA may be the only targets directly activated by SyrM. 
We propose that most of the gene expression changes observed when nodD3 is overexpressed are due to NodD3 activation of syrM expression, which in turn stimulates SyrM activation of syrA expression. The subsequent increase in SyrA abundance alters activity of the ChvI-ExoS-ExoR circuit, resulting in broad changes in gene expression. Gene expression profiling of Sinorhizobium meliloti overexpressing syrA was performed using custom Affymetrix GeneChips\",\"title\":\"The Sinorhizobium meliloti SyrM regulon: effects on global gene expression are mediated by syrA and nodD3 (SyrA)\",\"releaseDate\":\"2015-03-31\",\"lastUpdateDate\":\"2015-04-04\",\"dataTypes\":[\"organism\",\"dataItem\",\"citation\"],\"ID\":\"520401\",\"experimentType\":\"transcription profiling by array\"}},\"REPOSITORY\":\"arrayexpress_020916\",\"TITLE\":\"The Sinorhizobium meliloti SyrM regulon: effects on global gene expression are mediated by syrA and nodD3 (SyrA)\"}", 51 | "{\"DOCNO\":\"2\",\"METADATA\":{\"dataResource\":{\"keywords\":[],\"altNames\":[],\"acronyms\":[]},\"citation\":{\"count\":\"0\"},\"organism\":{\"experiment\":{\"species\":\"Homo sapiens\"}},\"dataItem\":{\"description\":\"A study to define the binding loci of RelA-containing NF-kappaB dimers in a human myometrial smooth muscle cell line after exposure to TNF. Monolayers of PHM1-31 cells were exposed to TNF (10ng/ml) for 1 hour or left unstimulated. The Chromatin immunoprecipitation (ChIP) assay was performed to recover RelA-bound chromatin or non-specifically bound chromatin with IgG. That chromatin was prepared and used to probe Affymetrix GeneChIP 1.0R Human Promoter arrays. Three biological replicates of each experiment were conducted. Datasets were subsequently analysed in Partek Genomics Suite V6.6 where baseline was normalised by subtraction of IgG values from conrresponding RelA-immunoprecipitated samples. Control samples immunoprecipitated with RelA were then compared with TNF-stimulated samples immunoprecipitated with RelA.\",\"title\":\"RelA Nuclear factor-kappaB (NF-kB) Subunit binding Loci in Promoter Regions of PHM1-31 Myometrial Smooth Muscle Cells (Promoter)\",\"releaseDate\":\"2015-03-31\",\"lastUpdateDate\":\"2015-04-05\",\"dataTypes\":[\"organism\",\"dataItem\",\"citation\"],\"ID\":\"520482\",\"experimentType\":\"ChIP-chip by tiling array\"}},\"REPOSITORY\":\"arrayexpress_020916\",\"TITLE\":\"RelA Nuclear factor-kappaB (NF-kB) Subunit binding Loci in Promoter Regions of PHM1-31 Myometrial Smooth Muscle Cells (Promoter)\"}", 52 | "{\"DOCNO\":\"3\",\"METADATA\":{\"dataResource\":{\"keywords\":[],\"altNames\":[],\"acronyms\":[]},\"citation\":{\"count\":\"0\"},\"organism\":{\"experiment\":{\"species\":\"Rattus norvegicus\"}},\"dataItem\":{\"description\":\"This SuperSeries is composed of the SubSeries listed below. 
Refer to individual Series\",\"title\":\"Aging-associated inflammatory and oxidative changes in the rat urinary bladder and dorsal root ganglia - preventive effect of caloric restriction\",\"releaseDate\":\"2015-03-31\",\"lastUpdateDate\":\"2015-04-04\",\"dataTypes\":[\"organism\",\"dataItem\",\"citation\"],\"ID\":\"520420\",\"experimentType\":\"transcription profiling by array\"}},\"REPOSITORY\":\"arrayexpress_020916\",\"TITLE\":\"Aging-associated inflammatory and oxidative changes in the rat urinary bladder and dorsal root ganglia - preventive effect of caloric restriction\"}", 53 | "{\"DOCNO\":\"4\",\"METADATA\":{\"dataResource\":{\"keywords\":[],\"altNames\":[],\"acronyms\":[]},\"citation\":{\"count\":\"0\"},\"organism\":{\"experiment\":{\"species\":\"Homo sapiens\"}},\"dataItem\":{\"description\":\"To reveal the effects of carnosine on Caco-2 cells, we have employed whole genome microarray to detect genes that showed significantly different expression when exposed to carnosine. Caco-2 cells were treated with 1 mM carnosine for 3 days. Caco-2 cells were treated with 1 mM carnosine for 3 days. Three independent experiments were performed.\",\"title\":\"Gene expression profile in Caco-2 cells treated with carnosine\",\"releaseDate\":\"2015-03-31\",\"lastUpdateDate\":\"2015-04-04\",\"dataTypes\":[\"organism\",\"dataItem\",\"citation\"],\"ID\":\"520441\",\"experimentType\":\"transcription profiling by array\"}},\"REPOSITORY\":\"arrayexpress_020916\",\"TITLE\":\"Gene expression profile in Caco-2 cells treated with carnosine\"}", 54 | "{\"DOCNO\":\"5\",\"METADATA\":{\"dataResource\":{\"keywords\":[],\"altNames\":[],\"acronyms\":[]},\"citation\":{\"count\":\"0\"},\"organism\":{\"experiment\":{\"species\":\"Mus musculus\"}},\"dataItem\":{\"description\":\"Mutations in methyl-CpG-binding protein 2 (MeCP2), a major epigenetic regulator, are the predominant cause of Rett syndrome. We previously found that Mecp2-null microglia are deficient in phagocytic ability, and that engraftment of wild-type monocytes into the brain of Mecp2-deficient mice attenuates pathology. We have observed that Mecp2 deficiency is associated with increased levels of histone acetylation at the cis-regulatory regions of the Mecp2-regulated genes in macrophages. We hypothesized that Mecp2 recruits protein complexes containing histone deacetylases (HDACs) to repress the expression of its target genes. Our ChIP-Seq studies in bone-marrow derived macrophages revealed that Mecp2 co-localizes with Ncor2/Hdac3 protein complex at cis-regulatory regions of the target genes. These results suggest a role for Mecp2 in the recruitment and regulation of Ncor2/Hdac3 repressosome that plays a critical role in the regulation of inflammatory responses in macrophages. 
Examination of NCOR2 and HDAC3 genome-wide location in bone-marrow derived macrophages.\",\"title\":\"Mecp2: an unexpected regulator of macrophage gene expression and function [ChIP-Seq]\",\"releaseDate\":\"2015-03-31\",\"lastUpdateDate\":\"2015-04-04\",\"dataTypes\":[\"organism\",\"dataItem\",\"citation\"],\"ID\":\"520444\",\"experimentType\":\"ChIP-seq\"}},\"REPOSITORY\":\"arrayexpress_020916\",\"TITLE\":\"Mecp2: an unexpected regulator of macrophage gene expression and function [ChIP-Seq]\"}" 55 | }; 56 | 57 | /** 58 | * Creates the specified index in ElasticSearch, applying the mappings 59 | * and settings defined by {@link #INDEX_JSON} (term vectors are stored 60 | * for the _all field, as required by the Rocchio expansion). 61 | * 62 | * If the index already exists, the resulting error is logged and 63 | * ignored. 64 | * 65 | * @param indexName 66 | * the name of the index to create 67 | * 68 | */ 69 | protected static void createIndex(String indexName) { 70 | try { 71 | // Create our expand / search indices 72 | String endpoint = String.format("/%s", indexName); 73 | Map<String, String> params = new HashMap<String, String>(); 74 | StringEntity requestBody = new StringEntity(INDEX_JSON); 75 | 76 | Response resp = client.performRequest("PUT", endpoint, params, requestBody, contentTypeHeader); 77 | staticLogger.debug("Response: " + resp.getStatusLine()); 78 | 79 | } catch (IOException e) { 80 | // The index probably already exists; log the error and continue 81 | staticLogger.error(e.getMessage(), e); 82 | 83 | if (e instanceof UnsupportedEncodingException) { 84 | staticLogger.error("Error encoding JSON: " + e.getMessage(), e); 85 | return; 86 | } 87 | } 88 | } 89 | 90 | /** 91 | * Adds a document to the specified ElasticSearch index / type 92 | * 93 | * @param indexName 94 | * the name of the index to add the document to 95 | * @param typeName 96 | * the type of the document to add 97 | * @param id 98 | * the id of the document to add 99 | * @param jsonDocument 100 | * the String JSON document to add 101 | */ 102 | protected static void addDocument(String indexName, String typeName, Integer id, String jsonDocument) { 103 | try { 104 | String documentEndpoint = String.format("/%s/%s/%d", indexName, typeName, id); 105 | StringEntity requestBody = new StringEntity(jsonDocument); 106 | Map<String, String> params = new HashMap<String, String>(); 107 | 108 | Response resp = client.performRequest("PUT", documentEndpoint, params, requestBody, contentTypeHeader); 109 | staticLogger.debug("Response: " + resp.getStatusLine()); 110 | 111 | } catch (IOException e) { 112 | // The document probably already exists; log the error and continue 113 | staticLogger.error(e.getMessage(), e); 114 | 115 | if (e instanceof UnsupportedEncodingException) { 116 | staticLogger.error("Error encoding JSON: " + e.getMessage(), e); 117 | return; 118 | } 119 | } 120 | } 121 | 122 | protected static void wait(int millis) { 123 | staticLogger.debug(String.format("Sleeping for %d milliseconds", millis)); 124 | try { 125 | Thread.sleep(millis); 126 | } catch (InterruptedException e) { 127 | staticLogger.error(e.getMessage(), e); 128 | } 129 | } 130 | 131 | /** 132 | * Create a new {@link XContentParser}. 133 | */ 134 | protected static XContentParser createParser(XContentBuilder builder) throws IOException { 135 | return builder.generator().contentType().xContent().createParser(xContentRegistry(), builder.bytes()); 136 | } 137 | 138 | /** 139 | * Create a new {@link XContentParser}. 140 | */ 141 | protected static XContentParser createParser(XContent xContent, String data) throws IOException { 142 | return xContent.createParser(xContentRegistry(), data); 143 | } 144 | 145 | /** 146 | * Create a new {@link XContentParser}. 
147 | */ 148 | protected static XContentParser createParser(XContent xContent, InputStream data) throws IOException { 149 | return xContent.createParser(xContentRegistry(), data); 150 | } 151 | 152 | /** 153 | * Create a new {@link XContentParser}. 154 | */ 155 | protected static XContentParser createParser(XContent xContent, byte[] data) throws IOException { 156 | return xContent.createParser(xContentRegistry(), data); 157 | } 158 | 159 | /** 160 | * Create a new {@link XContentParser}. 161 | */ 162 | protected static XContentParser createParser(XContent xContent, BytesReference data) throws IOException { 163 | return xContent.createParser(xContentRegistry(), data); 164 | } 165 | 166 | /** 167 | * The {@link NamedXContentRegistry} used when parsing responses in 168 | * these tests. 169 | */ 170 | protected static NamedXContentRegistry xContentRegistry() { 171 | return new NamedXContentRegistry(ClusterModule.getNamedXWriteables()); 172 | } 173 | 174 | public static Map<String, Object> entityAsMap(Response response) throws UnsupportedOperationException, IOException { 175 | XContentType xContentType = XContentType 176 | .fromMediaTypeOrFormat(response.getEntity().getContentType().getValue()); 177 | try (XContentParser parser = createParser(xContentType.xContent(), response.getEntity().getContent())) { 178 | return parser.map(); 179 | } 180 | } 181 | 182 | @BeforeClass 183 | public static void startRestClient() { 184 | client = RestClient.builder(new HttpHost("localhost", HTTP_TEST_PORT)).build(); 185 | try { 186 | Response response = client.performRequest("GET", "/"); 187 | Map responseMap = entityAsMap(response); 188 | assertThat(responseMap, hasEntry("tagline", "You Know, for Search")); 189 | staticLogger.info("Integration tests ready to start... Cluster is running."); 190 | } catch (IOException e) { 191 | // If the cluster is unreachable, skip the tests rather than fail 192 | staticLogger.warn("Integration tests are skipped: [{}]", e.getMessage()); 193 | assumeThat("Integration tests are skipped", e.getMessage(), not(containsString("Connection refused"))); 194 | staticLogger.error("Full error is", e); 195 | fail("Something wrong is happening. 
REST Client seemed to raise an exception."); 196 | } 197 | } 198 | 199 | @AfterClass 200 | public static void stopRestClient() throws IOException { 201 | if (client != null) { 202 | client.close(); 203 | client = null; 204 | } 205 | staticLogger.info("Stopping integration tests against an external cluster"); 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | edu.illinois.lis 6 | rocchio 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | ElasticSearch Rocchio Plugin 11 | A custom plugin for ElasticSearch to enable Rocchio Query Expansion 12 | https://github.com/nds-org/elasticsearch-queryexpansion-plugin 13 | 14 | 15 | git@github.com:nds-org/elasticsearch-queryexpansion-plugin.git 16 | scm:git:git:git@github.com:nds-org/elasticsearch-queryexpansion-plugin.git 17 | scm:git:git@github.com:nds-org/elasticsearch-queryexpansion-plugin.git 18 | 19 | 20 | 21 | 22 | MIT 23 | https://opensource.org/licenses/MIT 24 | 25 | 26 | 27 | 28 | 29 | Craig Willis 30 | willis8@illinois.edu 31 | National Data Service 32 | http://www.nationaldataservice.org/ 33 | 34 | 35 | 36 | Garrick Sherman 37 | gsherma2@illinois.edu 38 | University of Illinois 39 | http://ischool.illinois.edu/ 40 | 41 | 42 | 43 | Mike Lambert 44 | lambert8@illinois.edu 45 | National Data Service 46 | http://www.nationaldataservice.org/ 47 | 48 | 49 | 50 | 51 | org.sonatype.oss 52 | oss-parent 53 | 7 54 | 55 | 56 | 57 | 58 | ossrh 59 | https://oss.sonatype.org/content/repositories/snapshots 60 | 61 | 62 | ossrh 63 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 64 | 65 | 66 | 67 | 68 | UTF-8 69 | 70 | 71 | org.elasticsearch.distribution.zip 72 | 73 | 74 | 5.3.2 75 | 5.3.2 76 | 0.2.0-SNAPSHOT 77 | 4.11 78 | 2.6.2 79 | 1.15 80 | 2.3.0 81 | 1.8.3 82 | 6.4.2 83 | 1.4 84 | 85 | 86 | 9400 87 | 9500 88 | localhost:${integ.http.port} 89 | 90 | 91 | false 92 | false 93 | false 94 | 95 | 96 | 97 | 98 | org.apache.logging.log4j 99 | log4j-core 100 | ${log4j.version} 101 | provided 102 | 103 | 104 | 105 | 106 | junit 107 | junit 108 | ${junit.version} 109 | 110 | 111 | org.hamcrest 112 | hamcrest-core 113 | 114 | 115 | test 116 | 117 | 118 | 119 | org.elasticsearch 120 | elasticsearch 121 | ${elasticsearch.version} 122 | provided 123 | 124 | 125 | 126 | org.elasticsearch.test 127 | framework 128 | ${elasticsearch.version} 129 | test 130 | 131 | 132 | 133 | org.elasticsearch.client 134 | rest 135 | ${elasticsearch.client.version} 136 | test 137 | 138 | 139 | 140 | org.apache.lucene 141 | lucene-test-framework 142 | ${lucene.version} 143 | test 144 | 145 | 146 | 147 | commons-cli 148 | commons-cli 149 | ${commons-cli.version} 150 | 151 | 152 | 153 | org.apache.lucene 154 | lucene-sandbox 155 | ${lucene.version} 156 | provided 157 | 158 | 159 | 160 | org.apache.lucene 161 | lucene-analyzers-common 162 | ${lucene.version} 163 | provided 164 | 165 | 166 | 167 | org.apache.lucene 168 | lucene-core 169 | ${lucene.version} 170 | provided 171 | 172 | 173 | 174 | org.apache.lucene 175 | lucene-queries 176 | ${lucene.version} 177 | provided 178 | 179 | 180 | 181 | org.apache.lucene 182 | lucene-queryparser 183 | ${lucene.version} 184 | provided 185 | 186 | 187 | 188 | org.yaml 189 | snakeyaml 190 | ${snakeyaml.version} 191 | provided 192 | 193 | 194 | 195 | org.apache.xmlbeans 196 | xmlbeans 197 | ${xmlbeans.version} 198 | provided 199 | 200 | 201 | 202 | commons-beanutils 203 | commons-beanutils 204 | 
${beanutils.version} 205 | provided 206 | 207 | 208 | 209 | edu.illinois.lis 210 | ir-utils 211 | ${ir-utils.version} 212 | 213 | 214 | commons-collections 215 | commons-collections 216 | 217 | 218 | org.apache.geronimo.specs 219 | geronimo-stax-api_1.0_spec 220 | 221 | 222 | xml-apis 223 | xml-apis 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | src/main/resources 233 | false 234 | 235 | *.properties 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | org.apache.maven.plugins 244 | maven-compiler-plugin 245 | 3.3 246 | 247 | 1.8 248 | 1.8 249 | 250 | 251 | 252 | 253 | 254 | org.apache.maven.plugins 255 | maven-assembly-plugin 256 | 2.6 257 | 258 | false 259 | ${project.build.directory}/releases/ 260 | 261 | ${basedir}/src/main/assemblies/plugin.xml 262 | 263 | 264 | 265 | 266 | package 267 | 268 | single 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | org.apache.maven.plugins 277 | maven-source-plugin 278 | 3.0.1 279 | 280 | 281 | attach-sources 282 | package 283 | 284 | jar 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | org.apache.maven.plugins 293 | maven-javadoc-plugin 294 | 2.10.4 295 | 296 | private 297 | true 298 | 299 | 300 | 301 | attach-javadocs 302 | package 303 | 304 | jar 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | org.apache.maven.plugins 313 | maven-gpg-plugin 314 | 1.6 315 | 316 | 317 | sign-artifacts 318 | verify 319 | 320 | sign 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | org.apache.maven.plugins 329 | maven-dependency-plugin 330 | 2.10 331 | 332 | 333 | integ-setup-dependencies 334 | pre-integration-test 335 | 336 | copy 337 | 338 | 339 | ${skipIntegTests} 340 | 341 | 342 | ${elasticsearch.groupid} 343 | elasticsearch 344 | ${elasticsearch.version} 345 | zip 346 | 347 | 348 | true 349 | ${project.build.directory}/integration-tests/binaries 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | org.apache.maven.plugins 358 | maven-antrun-plugin 359 | 1.8 360 | 361 | 362 | 363 | integ-setup 364 | pre-integration-test 365 | 366 | run 367 | 368 | 369 | ${skipIntegTests} 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | integ-teardown 378 | post-integration-test 379 | 380 | run 381 | 382 | 383 | ${skipIntegTests} 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | org.apache.maven.plugins 395 | maven-surefire-plugin 396 | 2.19 397 | 398 | 399 | default-test 400 | none 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | com.carrotsearch.randomizedtesting 410 | junit4-maven-plugin 411 | 2.3.3 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | unit-tests 426 | test 427 | 428 | junit4 429 | 430 | true 431 | 432 | ${skipUnitTests} 433 | 434 | **/*Test.class 435 | 436 | 437 | **/*$* 438 | 439 | 440 | 441 | 442 | integration-tests 443 | integration-test 444 | 445 | junit4 446 | 447 | true 448 | 449 | ${skipIntegTests} 450 | 451 | **/*IT.class 452 | 453 | 454 | **/*$* 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | -------------------------------------------------------------------------------- /src/main/java/org/nationaldataservice/elasticsearch/rocchio/Rocchio.java: -------------------------------------------------------------------------------- 1 | package org.nationaldataservice.elasticsearch.rocchio; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.net.URISyntaxException; 8 | import java.nio.file.Files; 9 | import java.nio.file.Path; 10 | import java.nio.file.Paths; 11 | import java.util.HashMap; 12 | import 
java.util.LinkedHashMap; 13 | import java.util.Map; 14 | import java.util.function.Supplier; 15 | 16 | import org.apache.commons.cli.Options; 17 | import org.apache.commons.cli.ParseException; 18 | import org.apache.logging.log4j.Logger; 19 | import org.apache.lucene.index.Fields; 20 | import org.apache.lucene.index.Terms; 21 | import org.apache.lucene.index.TermsEnum; 22 | import org.elasticsearch.action.search.SearchResponse; 23 | import org.elasticsearch.action.termvectors.MultiTermVectorsItemResponse; 24 | import org.elasticsearch.action.termvectors.MultiTermVectorsRequestBuilder; 25 | import org.elasticsearch.action.termvectors.MultiTermVectorsResponse; 26 | import org.elasticsearch.action.termvectors.TermVectorsRequest; 27 | import org.elasticsearch.action.termvectors.TermVectorsResponse; 28 | import org.elasticsearch.client.Client; 29 | import org.elasticsearch.client.Requests; 30 | import org.elasticsearch.cluster.metadata.IndexMetaData; 31 | import org.elasticsearch.cluster.metadata.MappingMetaData; 32 | import org.elasticsearch.common.collect.ImmutableOpenMap; 33 | import org.elasticsearch.common.logging.ESLoggerFactory; 34 | import org.elasticsearch.index.query.QueryStringQueryBuilder; 35 | import org.elasticsearch.search.SearchHit; 36 | import org.elasticsearch.search.SearchHits; 37 | //import org.elasticsearch.transport.client.PreBuiltTransportClient; 38 | 39 | import edu.gslis.textrepresentation.FeatureVector; 40 | import edu.gslis.utils.Stopper; 41 | import joptsimple.internal.Strings; 42 | 43 | /** 44 | * Rocchio implementation for Lucene based on: 45 | * https://github.com/gtsherman/lucene/blob/master/src/main/java/org/retrievable/lucene/searching/expansion/Rocchio.java 46 | * 47 | */ 48 | public class Rocchio { 49 | private static final Logger staticLogger = ESLoggerFactory.getLogger(Rocchio.class); 50 | 51 | // FIXME: These bounds are initial guesses; see NDS-958 52 | private static final int ALPHA_BETA_MIN = 0; 53 | private static final int ALPHA_BETA_MAX = 1; 54 | private static final int K1_MIN = 0; 55 | private static final int K1_MAX = 2; 56 | private static final int B_MIN = 0; 57 | private static final int B_MAX = 1; 58 | 59 | // Error Strings returned from validate() 60 | public static final String NULL_INDEX_ERROR = "You must specify an index to expand against"; 61 | public static final String NULL_QUERY_ERROR = "You must specify a query to expand"; 62 | public static final String NULL_TYPE_ERROR = "You must specify a type"; 63 | public static final String NULL_FIELD_ERROR = "You must specify a field"; 64 | public static final String INVALID_FB_TERMS_ERROR = "Number of feedback terms (fbTerms) must be a positive integer"; 65 | public static final String INVALID_FB_DOCS_ERROR = "Number of feedback documents (fbDocs) must be a positive integer"; 66 | public static final String INVALID_ALPHA_ERROR = "Alpha value must be a real number between " + ALPHA_BETA_MIN + " and " + ALPHA_BETA_MAX; 67 | public static final String INVALID_BETA_ERROR = "Beta value must be a real number between " + ALPHA_BETA_MIN + " and " + ALPHA_BETA_MAX; 68 | public static final String INVALID_K1_ERROR = "K1 value must be a real number between " + K1_MIN + " and " + K1_MAX; 69 | public static final String INVALID_B_ERROR = "B value must be a real number between " + B_MIN + " and " + B_MAX; 70 | 71 | // Read default stoplist from src/main/resources/stoplist.all 72 | private static final Stopper DEFAULT_STOPPER; 73 | private static final String STOPLIST_NAME = "stoplist.all"; 74 | static { 75 | 
Stopper stopper = new Stopper(); 76 | 77 | ClassLoader loader = Rocchio.class.getClassLoader(); 78 | try (BufferedReader br = new BufferedReader(new InputStreamReader(loader.getResourceAsStream(STOPLIST_NAME)))) { 79 | String curr = null; 80 | while ((curr = br.readLine()) != null) { 81 | stopper.addStopword(curr); 82 | } 83 | } catch (IOException e) { 84 | staticLogger.error(String.format("%s was not found... continuing without a stoplist", STOPLIST_NAME), e); 85 | } 86 | 87 | DEFAULT_STOPPER = stopper; 88 | } 89 | 90 | // Error Strings returned from ensureTermVectors() 91 | /** 92 | * Returns a "nonexistent index" error message for the given index 93 | * 94 | * @param index 95 | * the {@link String} index name 96 | * @return a "nonexistent index" error message 97 | */ 98 | public static String NONEXISTENT_INDEX_ERROR(String index) { 99 | return "Index does not exist: " + index; 100 | } 101 | 102 | /** 103 | * Returns a "nonexistent type" error message for the given index/type 104 | * 105 | * @param index 106 | * the {@link String} index name 107 | * @param type 108 | * the {@link String} type name 109 | * @return a "nonexistent type" error message 110 | */ 111 | public static String NONEXISTENT_TYPE_ERROR(String index, String type) { 112 | return "No mapping found on index " + index + " for: " + type; 113 | } 114 | 115 | /** 116 | * Returns a "disabled term vectors" error message for the given index/type/field 117 | * 118 | * @param index 119 | * the {@link String} index name 120 | * @param type 121 | * the {@link String} type name 122 | * @param field 123 | * the {@link String} field name 124 | * @return a "disabled term vectors" error message 125 | */ 126 | public static String DISABLED_TERM_VECTORS_ERROR(String index, String type, String field) { 127 | return "Term vector storage on " + index + "." + type + "." + field + " has been disabled"; 128 | } 129 | 130 | /** 131 | * Returns an "unconfigured term vectors" error message for the given index/type/field 132 | * 133 | * @param index 134 | * the {@link String} index name 135 | * @param type 136 | * the {@link String} type name 137 | * @param field 138 | * the {@link String} field name 139 | * @return an "unconfigured term vectors" error message 140 | */ 141 | public static String UNCONFIGURED_TERM_VECTORS_ERROR(String index, String type, String field) { 142 | return "Term vector storage on index " + index + "." + type + "." 
+ field + " has not been configured"; 143 | } 144 | 145 | /** 146 | * Returns a "missing term vector field" error message for the given index/type 147 | * 148 | * @param index 149 | * the {@link String} index name 150 | * @param type 151 | * the {@link String} type name 152 | * @return a "missing term vector field" error message 153 | */ 154 | public static String MISSING_TERM_VECTOR_FIELD(String index, String type) { 155 | return "Error: no fields received for term vector - " + index + "/" + type; 156 | } 157 | 158 | /** 159 | * Returns a "missing field terms" error message for the given index/type/field 160 | * 161 | * @param index 162 | * the {@link String} index name 163 | * @param type 164 | * the {@link String} type name 165 | * @param field 166 | * the {@link String} field name 167 | * @return a "missing field terms" error message 168 | */ 169 | public static String MISSING_FIELD_TERMS(String index, String type, String field) { 170 | return "Error: no terms received for field - " + index + "/" + type + "/" + field; 171 | } 172 | 173 | private final Client client; // ElasticSearch client 174 | private final String index; // ElasticSearch index name 175 | private final String type; // Document type 176 | private final String field; // Field 177 | 178 | private final double alpha; // Rocchio alpha 179 | private final double beta; // Rocchio beta 180 | private final double k1; // BM25 k1 181 | private final double b; // BM25 b 182 | 183 | private Stopper stopper = null; 184 | 185 | // Global statistics (there's certainly a better way to handle this) 186 | long docCount = 0; // Number of documents in index 187 | double avgDocLen = 0; // Average document length, needed by BM25 188 | Map dfStats = new HashMap(); // Cached doc frequency stats 189 | 190 | /** 191 | * Instantiates a new instance of the Rocchio algorithm with the given client and parameters. 192 | * 193 | * @param client 194 | * the {@link Client} to use for the connection 195 | * @param index 196 | * the {@link String} index to expand against 197 | * @param type 198 | * the {@link String} type within the index 199 | * @param field 200 | * the {@link String} field on the type 201 | * @param alpha 202 | * the {@link double} Rocchio alpha parameter 203 | * @param beta 204 | * the {@link double} Rocchio beta parameter 205 | * @param k1 206 | * the {@link double} Rocchio k1 parameter 207 | * @param b 208 | * the {@link double} Rocchio b parameter 209 | * @param stoplist 210 | * the {@link String} list of stop words 211 | */ 212 | public Rocchio(Client client, String index, String type, String field, double alpha, double beta, double k1, 213 | double b, String stoplist) { 214 | this.client = client; 215 | this.index = index; 216 | this.type = type; 217 | this.field = field; 218 | this.alpha = alpha; 219 | this.beta = beta; 220 | this.k1 = k1; 221 | this.b = b; 222 | 223 | this.setStoplist(stoplist); 224 | } 225 | 226 | /** 227 | * Instantiates a new instance of the Rocchio algorithm with the given client and parameters. 
228 | * 229 | * @param client 230 | * the {@link Client} to use for the connection 231 | * @param index 232 | * the {@link String} index to expand against 233 | * @param type 234 | * the {@link String} type within the index 235 | * @param field 236 | * the {@link String} field on the type 237 | * @param alpha 238 | * the {@link double} Rocchio alpha parameter 239 | * @param beta 240 | * the {@link double} Rocchio beta parameter 241 | * @param k1 242 | * the {@link double} BM25 k1 parameter 243 | * @param b 244 | * the {@link double} BM25 b parameter 245 | */ 246 | public Rocchio(Client client, String index, String type, String field, double alpha, double beta, double k1, double b) { 247 | this(client, index, type, field, alpha, beta, k1, b, null); 248 | } 249 | 250 | // Assumes a space-delimited string 251 | private Stopper setStoplist(String stoplist) { 252 | this.stopper = new Stopper(DEFAULT_STOPPER); 253 | 254 | // Extend the default stoplist only if a custom one was provided 255 | if (!Strings.isNullOrEmpty(stoplist)) { 256 | String[] stopwords = stoplist.split(" "); 257 | for (String term : stopwords) { 258 | stopper.addStopword(term); 259 | } 260 | } 261 | 262 | return stopper; 263 | } 264 | 265 | private void fail(String errorMessage) { 266 | staticLogger.error(errorMessage); 267 | throw new IllegalStateException(errorMessage); 268 | } 269 | 270 | private void failIf(Supplier<Boolean> condition, String errorMessage) { 271 | if (condition.get()) { 272 | staticLogger.error("Condition failed: " + condition.toString()); 273 | fail(errorMessage); 274 | } 275 | } 276 | 277 | /** 278 | * Verifies that String and numeric values are within their allowed ranges, then ensures that term vectors are 279 | * properly enabled on the target index. 280 | * 281 | * @param query 282 | * the String query to expand 283 | * @param fbDocs 284 | * the int number of feedback documents 285 | * @param fbTerms 286 | * the int number of feedback terms 287 | * @return the String error message, or null if no errors are encountered 288 | * @throws IOException 289 | * if the indexMetaData fails to deserialize into a map 290 | */ 291 | public String validate(String query, int fbDocs, int fbTerms) throws IOException { 292 | if (Strings.isNullOrEmpty(query)) { 293 | return NULL_QUERY_ERROR; 294 | } else if (fbDocs < 1) { 295 | return INVALID_FB_DOCS_ERROR; 296 | } else if (fbTerms < 1) { 297 | return INVALID_FB_TERMS_ERROR; 298 | } else if (Strings.isNullOrEmpty(index)) { 299 | return NULL_INDEX_ERROR; 300 | } else if (Strings.isNullOrEmpty(type)) { 301 | return NULL_TYPE_ERROR; 302 | } else if (Strings.isNullOrEmpty(field)) { 303 | return NULL_FIELD_ERROR; 304 | } else if (ALPHA_BETA_MIN > alpha || alpha > ALPHA_BETA_MAX) { 305 | return INVALID_ALPHA_ERROR; 306 | } else if (ALPHA_BETA_MIN > beta || beta > ALPHA_BETA_MAX) { 307 | return INVALID_BETA_ERROR; 308 | } else if (K1_MIN > k1 || k1 > K1_MAX) { 309 | return INVALID_K1_ERROR; 310 | } else if (B_MIN > b || b > B_MAX) { 311 | return INVALID_B_ERROR; 312 | } 313 | return this.ensureTermVectors(); 314 | } 315 | 316 | /** 317 | * Returns an error message if term vectors are misconfigured. Otherwise, returns null. 318 | * 319 | * TODO: Some of this could potentially be called at plugin startup, if we know what index/type we plan to expand 320 | * against ahead of time... 
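 * (The checks below look for "store": true on either the "_all" mapping or the target field's mapping; the integration tests' INDEX_JSON mapping additionally sets "term_vector": "with_positions_offsets_payloads" so that full term vectors are available at expansion time.) 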
321 | * 322 | * @return the String error message, or null if no errors are encountered 323 | * 324 | * @throws IOException 325 | * if the indexMetaData fails to deserialize into a map 326 | */ 327 | @SuppressWarnings("unchecked") 328 | private String ensureTermVectors() throws IOException { 329 | // Verify that the index exists 330 | IndexMetaData indexMetaData = client.admin().cluster().state(Requests.clusterStateRequest()).actionGet() 331 | .getState().getMetaData().index(index); 332 | 333 | if (indexMetaData == null) { 334 | return NONEXISTENT_INDEX_ERROR(index); 335 | } 336 | 337 | // Verify that the index contains the desired type 338 | ImmutableOpenMap<String, MappingMetaData> indexMap = indexMetaData.getMappings(); 339 | if (!indexMap.containsKey(type)) { 340 | return NONEXISTENT_TYPE_ERROR(index, type); 341 | } 342 | 343 | // Grab the type and analyze it to locate the field 344 | MappingMetaData typeMetadata = indexMetaData.getMappings().get(type); 345 | Map<String, Object> typeMap = typeMetadata.getSourceAsMap(); 346 | 347 | LinkedHashMap<String, Object> fieldProperties, 348 | allFieldProperties = (LinkedHashMap<String, Object>) typeMap.get("_all"); 349 | if (!"_all".equals(field)) { 350 | // For a specific field, we need to drill down into "properties" 351 | LinkedHashMap<String, Object> typePropertiesMap = (LinkedHashMap<String, Object>) typeMap.get("properties"); 352 | fieldProperties = (LinkedHashMap<String, Object>) typePropertiesMap.get(field); 353 | } else { 354 | // For "_all", we can look for "store" directly on the "_all" mapping 355 | fieldProperties = allFieldProperties; 356 | } 357 | 358 | // Verify that "store" is present on either _all or our target field 359 | if (allFieldProperties != null && allFieldProperties.containsKey("store")) { 360 | // Verify that term vector storage is enabled for all fields 361 | boolean storeEnabled = (boolean) allFieldProperties.get("store"); 362 | if (!storeEnabled) { 363 | String errorMessage = DISABLED_TERM_VECTORS_ERROR(index, type, field); 364 | staticLogger.error(errorMessage); 365 | return errorMessage; 366 | } 367 | 368 | return null; 369 | } else if (fieldProperties != null && fieldProperties.containsKey("store")) { 370 | // Verify that term vector storage is enabled at the field level 371 | boolean storeEnabled = (boolean) fieldProperties.get("store"); 372 | if (!storeEnabled) { 373 | String errorMessage = DISABLED_TERM_VECTORS_ERROR(index, type, field); 374 | staticLogger.error(errorMessage); 375 | return errorMessage; 376 | } 377 | 378 | return null; 379 | } 380 | 381 | // TODO: NDS-958 - Check that type has documents added to it? 382 | // TODO: NDS-958 - Check that the documents in the type contain the desired field? 383 | // TODO: NDS-958 - Check that term vectors/fields stats are available for the desired index/type/field combination? 
384 | 385 | // If neither of the above triggered, then we didn't have the right term vectors initialized on our index 386 | String errorMessage = UNCONFIGURED_TERM_VECTORS_ERROR(index, type, field); 387 | staticLogger.error(errorMessage); 388 | return errorMessage; 389 | } 390 | 391 | /** 392 | * Run the query using the client (this assumes that the client has already been initialized and is ready to 393 | * execute) 394 | * 395 | * @param index 396 | * the String index to expand against 397 | * @param query 398 | * Query string 399 | * @param numDocs 400 | * Number of results to return 401 | * @return the {@link SearchResponse} object 402 | */ 403 | private SearchResponse runQuery(String index, String query, int numDocs) { 404 | QueryStringQueryBuilder queryStringQueryBuilder = new QueryStringQueryBuilder(query); 405 | return client.prepareSearch(index).setQuery(queryStringQueryBuilder).setSize(numDocs).execute().actionGet(); 406 | } 407 | 408 | /** 409 | * Given a set of SearchHits, construct the feedback vector 410 | * 411 | * @param hits 412 | * SearchHits 413 | * @param fbDocs 414 | * Number of feedback documents 415 | * @return FeatureVector based on feedback documents 416 | * @throws IOException 417 | * if the TermVector has no fields, or if its Fields contain no terms 418 | */ 419 | private FeatureVector getFeedbackVector(SearchHits hits, int fbDocs) throws IOException { 420 | FeatureVector summedDocVec = new FeatureVector(this.stopper); 421 | 422 | // Use the multi termvector request to get vectors for all documents at once 423 | MultiTermVectorsRequestBuilder mtbuilder = client.prepareMultiTermVectors(); 424 | for (SearchHit hit : hits.hits()) { 425 | String id = hit.getId(); 426 | TermVectorsRequest termVectorsRequest = new TermVectorsRequest(); 427 | termVectorsRequest.index(index).id(id).type(this.type).termStatistics(true).offsets(false).positions(false) 428 | .payloads(false); 429 | 430 | mtbuilder.add(termVectorsRequest); 431 | } 432 | MultiTermVectorsResponse mtvresponse = mtbuilder.execute().actionGet(); 433 | 434 | // Iterate over the returned document vectors. Construct the feedback vector. 435 | // Store the global document count and calculate the global average document length 436 | // Store document frequencies for encountered terms in dfStats map. 437 | for (MultiTermVectorsItemResponse item : mtvresponse.getResponses()) { 438 | FeatureVector docVec = new FeatureVector(this.stopper); 439 | 440 | TermVectorsResponse tv = item.getResponse(); 441 | failIf(() -> tv == null, MISSING_TERM_VECTOR_FIELD(index, type)); 442 | Fields fields = tv.getFields(); 443 | 444 | Terms terms = fields.terms(this.field); 445 | failIf(() -> terms == null, MISSING_FIELD_TERMS(index, type, field)); 446 | 447 | // These are global settings and will be the same for all TermVectorResponses. 448 | // TODO: There's a better way to handle this. 
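// (docCount and the derived avgDocLen feed the BM25 length normalization in computeBM25Weights below, while dfStats caches the per-term document frequencies used for the IDF component) 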
449 | long sumTotalTermFreq = terms.getSumTotalTermFreq(); // Total number of term occurrences in the index 450 | docCount = terms.getDocCount(); // Total number of documents in index 451 | avgDocLen = sumTotalTermFreq / (double) docCount; 452 | 453 | // Get the term frequency and document frequency for each term 454 | TermsEnum termsEnum = terms.iterator(); 455 | while (termsEnum.next() != null) { 456 | String term = termsEnum.term().utf8ToString(); 457 | long freq = termsEnum.totalTermFreq(); // Frequency for term t in this document 458 | long df = termsEnum.docFreq(); // Number of documents containing term t (document frequency) -- a global statistic 459 | dfStats.put(term, df); // Map storing global document frequencies for seen terms, used by BM25 460 | docVec.addTerm(term, freq); // Current document vector 461 | } 462 | 463 | // Add this document to the feedback document vector with BM25 weights 464 | computeBM25Weights(docVec, summedDocVec); 465 | } 466 | 467 | // Multiply the summed term vector by beta / |Dr| 468 | FeatureVector relDocTermVec = new FeatureVector(this.stopper); 469 | for (String term : summedDocVec.getFeatures()) { 470 | relDocTermVec.addTerm(term, summedDocVec.getFeatureWeight(term) * beta / fbDocs); 471 | } 472 | 473 | return relDocTermVec; 474 | } 475 | 476 | /** 477 | * Construct the query vector with BM25 weights 478 | * 479 | * @param query 480 | * Query string 481 | * @return FeatureVector 482 | */ 483 | public FeatureVector getQueryVector(String query) { 484 | // Create a query vector and scale by alpha 485 | FeatureVector rawQueryVec = new FeatureVector(this.stopper); 486 | rawQueryVec.addText(query); 487 | 488 | FeatureVector summedQueryVec = new FeatureVector(this.stopper); 489 | computeBM25Weights(rawQueryVec, summedQueryVec); 490 | 491 | FeatureVector queryTermVec = new FeatureVector(this.stopper); 492 | for (String term : rawQueryVec.getFeatures()) { 493 | queryTermVec.addTerm(term, summedQueryVec.getFeatureWeight(term) * alpha); 494 | } 495 | 496 | return queryTermVec; 497 | } 498 | 499 | /** 500 | * Expand the query. 
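 * This implements the positive-feedback form of the Rocchio update, q' = alpha * q + (beta / |Dr|) * sum(d in Dr) d, where q and the documents d in the feedback set Dr (the top fbDocs hits) are BM25-weighted term vectors; the combined vector is then clipped to the top fbTerms terms. 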
501 | * 502 | * @param query 503 | * Query string 504 | * @param fbDocs 505 | * Number of feedback documents 506 | * @param fbTerms 507 | * Number of feedback terms 508 | * @return Expanded feature vector 509 | * @throws IOException 510 | * if we fail to get the feedback vector 511 | */ 512 | public FeatureVector expandQuery(String query, int fbDocs, int fbTerms) throws IOException { 513 | // Run the initial query 514 | SearchHits hits = runQuery(this.index, query, fbDocs).getHits(); 515 | 516 | // Get the feedback document vector, weighted by beta 517 | FeatureVector feedbackVector = getFeedbackVector(hits, fbDocs); 518 | 519 | // Get the original query vector, weighted by alpha 520 | // Note, this is called after getFeedbackVector because it relies on dfStats 521 | FeatureVector queryVector = getQueryVector(query); 522 | 523 | // Combine the query and feedback vectors 524 | for (String term : queryVector.getFeatures()) { 525 | feedbackVector.addTerm(term, queryVector.getFeatureWeight(term)); 526 | } 527 | 528 | // Get top terms -- aka head 529 | feedbackVector.clip(fbTerms); 530 | 531 | return feedbackVector; 532 | } 533 | 534 | /** 535 | * Compute BM25 weights for the input vector and add to the output vector 536 | * 537 | * @param inputVector 538 | * the {@link FeatureVector} input 539 | * @param outputVector 540 | * the {@link FeatureVector} output 541 | */ 542 | private void computeBM25Weights(FeatureVector inputVector, FeatureVector outputVector) { 543 | for (String term : inputVector.getFeatures()) { 544 | long docOccur = dfStats.containsKey(term) ? dfStats.get(term) : 0; // default to 0 for terms never seen in the feedback documents 545 | 546 | double idf = Math.log((docCount + 1) / (docOccur + 0.5)); // following Indri 547 | double tf = inputVector.getFeatureWeight(term); 548 | 549 | double weight = (idf * k1 * tf) / (tf + k1 * (1 - b + b * inputVector.getLength() / avgDocLen)); 550 | outputVector.addTerm(term, weight); 551 | } 552 | } 553 | 554 | /** 555 | * Debug: Command line options for the main() method (see below) 556 | * 557 | * @return the CLI options 558 | */ 559 | public static Options createOptions() { 560 | Options options = new Options(); 561 | options.addOption("cluster", true, "ElasticSearch cluster name (default: biocaddie)"); 562 | options.addOption("host", true, "ElasticSearch host (default: localhost)"); 563 | options.addOption("port", true, "ElasticSearch transport port (default: 9300)"); 564 | options.addOption("index", true, "ElasticSearch index name (default: biocaddie)"); 565 | options.addOption("type", true, "ElasticSearch document type (default: dataset)"); 566 | options.addOption("field", true, "ElasticSearch field (default: _all)"); 567 | options.addOption("alpha", true, "Rocchio alpha (default: 0.5)"); 568 | options.addOption("beta", true, "Rocchio beta (default: 0.5)"); 569 | options.addOption("k1", true, "BM25 k1 (default: 1.2)"); 570 | options.addOption("b", true, "BM25 b (default: 0.75)"); 571 | options.addOption("query", true, "Query string"); 572 | options.addOption("auth", true, "Basic authentication string (default: elastic:biocaddie)"); 573 | return options; 574 | } 575 | 576 | /** 577 | * Debug: this main method will run Rocchio as a standalone command-line application. 578 | * 579 | * NOTE: You will need to add the following dependency to your {@code pom.xml}: 580 | * 581 | * 
582 |      *  <dependency>
583 |      *    <groupId>org.elasticsearch.client</groupId>
584 |      *    <artifactId>transport</artifactId>
585 |      *    <version>${elasticsearch.version}</version>
586 |      *  </dependency>
587 |      * 
588 | * 589 | * @param args 590 | * the command-line arguments 591 | * @throws IOException 592 | * if expandQuery throws an IOException, or if the host lookup fails (localhost shouldn't) 593 | * @throws ParseException 594 | * if the command-line arguments cannot be parsed 595 | */ 596 | public static void main(String[] args) throws IOException, ParseException { 597 | 598 | /* 599 | * Options options = createOptions(); CommandLineParser parser = new GnuParser(); CommandLine cl = 600 | * parser.parse(options, args); if (cl.hasOption("help")) { HelpFormatter formatter = new HelpFormatter(); 601 | * formatter.printHelp(Rocchio.class.getCanonicalName(), options); return; } 602 | * 603 | * // Get the many command line parameters String cluster = cl.getOptionValue("cluster", "elasticsearch"); 604 | * String host = cl.getOptionValue("host", "localhost"); int port = Integer.parseInt(cl.getOptionValue("port", 605 | * "9300")); double alpha = Double.parseDouble(cl.getOptionValue("alpha", "0.5")); double beta = 606 | * Double.parseDouble(cl.getOptionValue("beta", "0.5")); double k1 = Double.parseDouble(cl.getOptionValue("k1", 607 | * "1.2")); double b = Double.parseDouble(cl.getOptionValue("b", "0.75")); int fbTerms = 608 | * Integer.parseInt(cl.getOptionValue("fbTerms", "10")); int fbDocs = 609 | * Integer.parseInt(cl.getOptionValue("fbDocs", "10")); String index = cl.getOptionValue("index", "biocaddie"); 610 | * String type = cl.getOptionValue("type", "dataset"); String field = cl.getOptionValue("field", "_all"); 611 | * 612 | * String auth = cl.getOptionValue("auth", "elastic:biocaddie"); String query = cl.getOptionValue("query", 613 | * "multiple sclerosis"); 614 | * 615 | * // Connect to ElasticSearch Settings settings = Settings.builder().put("cluster.name", cluster).build(); 616 | * TransportClient transportClient = new PreBuiltTransportClient(settings); 617 | * transportClient.addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), port)); 618 | * Client client = transportClient.filterWithHeader(Collections.singletonMap( "Authorization", auth)); 619 | * 620 | * // Construct Rocchio Rocchio rocchio = new Rocchio(client, index, type, field, alpha, beta, k1, b); 621 | * 622 | * // Expand the query FeatureVector feedbackQuery = rocchio.expandQuery(query, fbDocs, fbTerms); 623 | * 624 | * // Dump the expanded query StringBuffer esQuery = new StringBuffer(); for (String term : 625 | * feedbackQuery.getFeatures()) { esQuery.append(term + "^" + feedbackQuery.getFeatureWeight(term) + " "); } 626 | * System.out.println(esQuery); 627 | * 628 | * transportClient.close(); 629 | */ 630 | } 631 | } --------------------------------------------------------------------------------
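For quick manual experimentation against a running cluster, the sketch below shows how the _expand endpoint can be exercised with the same low-level REST client the integration tests use. This is a minimal, hypothetical example (the class name ExpandClientExample is not part of the plugin): it assumes a cluster with the rocchio plugin installed and the biocaddie index populated, listening without authentication on localhost:9400 (the integration-test port from AbstractITCase); adjust the host, port, index, and parameters to match your setup.

import java.io.IOException;

import org.apache.http.HttpHost;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;

public class ExpandClientExample {
    public static void main(String[] args) throws IOException {
        // Connect to the (assumed) integration-test cluster
        RestClient client = RestClient.builder(new HttpHost("localhost", 9400, "http")).build();
        try {
            // Ask the Rocchio plugin to expand the query "rat" using 5 feedback
            // documents and 10 feedback terms (mirroring the parameters in RocchioIT)
            Response expand = client.performRequest("GET",
                    "/biocaddie/dataset/_expand?fbDocs=5&fbTerms=10&query=rat");

            // The response body is a JSON object whose "query" field holds the
            // weighted expansion string (term^weight term^weight ...)
            System.out.println(EntityUtils.toString(expand.getEntity()));
        } finally {
            client.close();
        }
    }
}

The "query" field of the returned JSON can then be URL-encoded and passed back to the standard _search endpoint as the q parameter to run the expanded search, as testSearchPerformance in RocchioIT attempts to do.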