├── scripts
│   ├── stop.sh
│   ├── restart.sh
│   ├── delete-index.sh
│   ├── remove.sh
│   ├── install.sh
│   ├── create-index.sh
│   ├── build.sh
│   ├── start.sh
│   └── add-docs.sh
├── data
│   ├── 1.json
│   ├── 2.json
│   └── biocaddie.json
├── logs.sh
├── rebuild.sh
├── .gitignore
├── src
│   ├── main
│   │   ├── resources
│   │   │   ├── plugin-descriptor.properties
│   │   │   └── stoplist.all
│   │   ├── assemblies
│   │   │   └── plugin.xml
│   │   └── java
│   │       └── org
│   │           └── nationaldataservice
│   │               └── elasticsearch
│   │                   └── rocchio
│   │                       ├── RocchioPlugin.java
│   │                       ├── RocchioException.java
│   │                       ├── RocchioExpandRestAction.java
│   │                       └── Rocchio.java
│   └── test
│       ├── java
│       │   └── org
│       │       └── nationaldataservice
│       │           └── elasticsearch
│       │               └── rocchio
│       │                   └── test
│       │                       ├── unit
│       │                       │   └── RocchioTest.java
│       │                       └── integration
│       │                           ├── RocchioIT.java
│       │                           └── AbstractITCase.java
│       └── ant
│           └── integration-tests.xml
├── test.sh
├── LICENSE
├── README.md
└── pom.xml
/scripts/stop.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | docker stop elastic-qe-5.3.2
4 |
--------------------------------------------------------------------------------
/data/1.json:
--------------------------------------------------------------------------------
1 | {
2 | "fullname": "Hello",
3 | "text": "World"
4 | }
5 |
--------------------------------------------------------------------------------
/data/2.json:
--------------------------------------------------------------------------------
1 | {
2 | "fullname": "Hello",
3 | "text": "World 2"
4 | }
5 |
--------------------------------------------------------------------------------
/scripts/restart.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | docker restart elastic-qe-5.3.2
4 |
--------------------------------------------------------------------------------
/logs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | docker logs -f elastic-qe-5.3.2 --tail 100
4 |
5 |
--------------------------------------------------------------------------------
/scripts/delete-index.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | curl -u elastic:changeme -XDELETE localhost:9200/biocaddie?pretty
4 |
--------------------------------------------------------------------------------
/scripts/remove.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | docker exec -it elastic-qe-5.3.2 bin/elasticsearch-plugin remove rocchio
4 |
--------------------------------------------------------------------------------
/rebuild.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | scripts/remove.sh; scripts/build.sh && scripts/install.sh && scripts/restart.sh && ./logs.sh
4 |
--------------------------------------------------------------------------------
/scripts/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | docker exec -it elastic-qe-5.3.2 bin/elasticsearch-plugin install file:///plugin-src/target/releases/rocchio-0.0.1-SNAPSHOT.zip
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Eclipse project metadata
2 | .settings/
3 | .classpath
4 | .project
5 |
6 | # Build output
7 | target/
8 |
9 | # ElasticSearch data
10 | es-data/
11 |
--------------------------------------------------------------------------------
/scripts/create-index.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | curl -u elastic:changeme -XPUT --header 'Content-Type: application/json' localhost:9200/biocaddie?pretty -d@data/biocaddie.json
4 |
--------------------------------------------------------------------------------
/scripts/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mvn clean package && exit 0 \
4 | || echo "WARNING: No native Maven installed - using Docker instead" \
5 |     && docker run --rm -it -v "$(pwd)":/workspace -w /workspace maven:3 mvn clean package && exit 0
6 |
7 | exit 1
8 |
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | description=${project.description}.
2 | version=${project.version}
3 | name=${project.artifactId}
4 | classname=org.nationaldataservice.elasticsearch.rocchio.RocchioPlugin
5 | java.version=1.8
6 | elasticsearch.version=${elasticsearch.version}
--------------------------------------------------------------------------------
/scripts/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | docker start elastic-qe-5.3.2 && exit 0 || docker run --name=elastic-qe-5.3.2 -it -d -p 9200:9200 -v $(pwd):/plugin-src/ -v $HOME/es-5.3.2-data:/usr/share/elasticsearch/data -e "http.host=0.0.0.0" -e "transport.host=127.0.0.1" docker.elastic.co/elasticsearch/elasticsearch:5.3.2 && exit 0
4 |
--------------------------------------------------------------------------------
/data/biocaddie.json:
--------------------------------------------------------------------------------
1 | {
2 | "mappings": {
3 | "dataset": {
4 | "_all": {
5 | "type": "text",
6 | "term_vector": "with_positions_offsets_payloads",
7 | "store" : true,
8 | "analyzer" : "fulltext_analyzer"
9 | }
10 | }
11 | },
12 | "settings": {
13 | "index" : {
14 | "number_of_shards" : 1,
15 | "number_of_replicas" : 0
16 | },
17 | "analysis": {
18 | "analyzer": {
19 | "fulltext_analyzer": {
20 | "type": "custom",
21 | "tokenizer": "whitespace",
22 | "filter": [
23 | "lowercase",
24 | "type_as_payload"
25 | ]
26 | }
27 | }
28 | }
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Point to a specific instance of elasticsearch (defaults to Docker instance)
4 | TEST_HOST="localhost"
5 | TEST_PORT="9200"
6 | TEST_USERNAME="elastic"
7 | TEST_PASSWORD="changeme"
8 |
9 | # Specify expansion / search parameters
10 | TEST_INDEX="biocaddie"
11 | SEARCH_TYPE="dataset"
12 | TEST_QUERY="multiple+sclerosis"
13 | STOP_LIST="a+an+the+and+or+of+from+on+was+to+is+-+were+at+as+we"
14 |
15 | # Override additional parameters here
16 | ADDITIONAL_ARGS="&fbTerms=20&fbDocs=50"
17 |
18 | # Otherwise, just run Rocchio and return the expanded query
19 | curl -u "${TEST_USERNAME}:${TEST_PASSWORD}" ${TEST_HOST}:${TEST_PORT}/${TEST_INDEX}/${SEARCH_TYPE}/_expand'?pretty'${ADDITIONAL_ARGS}'&query='${TEST_QUERY}
20 |
--------------------------------------------------------------------------------
/src/main/assemblies/plugin.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | plugin
4 |
5 | zip
6 |
7 | false
8 |
9 |
10 | ${project.basedir}/src/main/resources/plugin-descriptor.properties
11 | elasticsearch
12 | true
13 |
14 |
15 |
16 |
17 | elasticsearch
18 | true
19 | true
20 |
21 |
22 |
--------------------------------------------------------------------------------
/scripts/add-docs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # Change this to match the path to your (unzipped) biocaddie benchmark dataset
5 | dataset_directory=$HOME/update_json_folder
6 |
7 | echo 'Started indexing!'
8 | for docid in {1..790000}
9 | do
10 | if [ "$1" == "-vvvv" ]; then
11 | echo "Indexing document: $docid"
12 | elif [ "$1" == "-vvv" -a "$(expr $docid % 10)" == "0" ]; then
13 | echo "Indexing document: $docid"
14 | elif [ "$1" == "-vv" -a "$(expr $docid % 100)" == "0" ]; then
15 | echo "Indexing document: $docid"
16 | elif [ "$1" == "-v" -a "$(expr $docid % 1000)" == "0" ]; then
17 | echo "Indexing document: $docid"
18 | elif [ "$1" != "-q" -a "$(expr $docid % 100000)" == "0" ]; then
19 | echo "Indexing document: $docid"
20 | fi
21 |
22 | curl --silent -u elastic:changeme -XPUT --header 'Content-Type: application/json' localhost:9200/biocaddie/dataset/$docid?pretty -d@$dataset_directory/$docid.json > /dev/null
23 | done
24 |
25 | echo 'Indexing complete!'
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 National Data Service
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/main/java/org/nationaldataservice/elasticsearch/rocchio/RocchioPlugin.java:
--------------------------------------------------------------------------------
1 | package org.nationaldataservice.elasticsearch.rocchio;
2 |
3 | import java.util.Arrays;
4 |
5 | import java.util.List;
6 | import java.util.function.Supplier;
7 |
8 | import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
9 | import org.elasticsearch.cluster.node.DiscoveryNodes;
10 | import org.elasticsearch.common.settings.ClusterSettings;
11 | import org.elasticsearch.common.settings.IndexScopedSettings;
12 | import org.elasticsearch.common.settings.Settings;
13 | import org.elasticsearch.common.settings.SettingsFilter;
14 | import org.elasticsearch.plugins.ActionPlugin;
15 | import org.elasticsearch.plugins.Plugin;
16 | import org.elasticsearch.rest.RestController;
17 | import org.elasticsearch.rest.RestHandler;
18 |
19 | public class RocchioPlugin extends Plugin implements ActionPlugin {
20 | @Override
21 | public List getRestHandlers(Settings settings, RestController restController,
22 | ClusterSettings clusterSettings, IndexScopedSettings indexScopedSettings, SettingsFilter settingsFilter,
23 | IndexNameExpressionResolver indexNameExpressionResolver, Supplier nodesInCluster) {
24 | return Arrays.asList(new RocchioExpandRestAction(settings, restController));
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/java/org/nationaldataservice/elasticsearch/rocchio/RocchioException.java:
--------------------------------------------------------------------------------
1 | package org.nationaldataservice.elasticsearch.rocchio;
2 |
3 |
4 | public class RocchioException extends Exception {
5 | /**
6 | * Unique id to identify this {@link Exception}
7 | */
8 | private static final long serialVersionUID = 5961496592606387768L;
9 |
10 | /**
11 | * An {@link Exception} encountered during {@link Rocchio} operations
12 | */
13 | public RocchioException() {
14 |
15 | }
16 |
17 | /**
18 | * An {@link Exception} encountered during {@link Rocchio} operations
19 | *
20 | * @param message the {@link String} error message
21 | */
22 | public RocchioException(String message) {
23 | super(message);
24 | }
25 |
26 | /**
27 | * An exception encountered during {@link Rocchio} operations
28 | *
29 | * @param cause the {@link Throwable} underlying cause
30 | */
31 | public RocchioException(Throwable cause) {
32 | super(cause);
33 | }
34 |
35 | /**
36 | * An exception encountered during {@link Rocchio} operations
37 | *
38 | * @param message the {@link String} error message
39 | * @param cause the {@link Throwable} underlying cause
40 | */
41 | public RocchioException(String message, Throwable cause) {
42 | super(message, cause);
43 | }
44 |
45 | /**
46 | * An exception encountered during {@link Rocchio} operations
47 | *
48 | * @param message the {@link String} error message
49 | * @param cause the {@link Throwable} underlying cause
50 | * @param enableSuppression a {@link boolean} indicating whether suppression is enabled
51 | * @param writableStackTrace a {@link boolean} indicating whether the stackTrace is writeable
52 | */
53 | public RocchioException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) {
54 | super(message, cause, enableSuppression, writableStackTrace);
55 | }
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/resources/stoplist.all:
--------------------------------------------------------------------------------
1 | category
2 | wikipedia
3 | http
4 | html
5 | www
6 | https
7 | com
8 | php
9 | htm
10 | free
11 | cfm
12 | asp
13 | jsp
14 | a
15 | about
16 | above
17 | according
18 | across
19 | after
20 | afterwards
21 | again
22 | against
23 | albeit
24 | all
25 | almost
26 | alone
27 | along
28 | already
29 | also
30 | although
31 | always
32 | am
33 | among
34 | amongst
35 | an
36 | and
37 | another
38 | any
39 | anybody
40 | anyhow
41 | anyone
42 | anything
43 | anyway
44 | anywhere
45 | apart
46 | are
47 | around
48 | as
49 | at
50 | av
51 | be
52 | became
53 | because
54 | become
55 | becomes
56 | becoming
57 | been
58 | before
59 | beforehand
60 | behind
61 | being
62 | below
63 | beside
64 | besides
65 | between
66 | beyond
67 | both
68 | but
69 | by
70 | can
71 | cannot
72 | canst
73 | certain
74 | cf
75 | choose
76 | contrariwise
77 | cos
78 | could
79 | cu
80 | day
81 | do
82 | does
83 | doesn't
84 | doing
85 | dost
86 | doth
87 | double
88 | down
89 | dual
90 | during
91 | each
92 | either
93 | else
94 | elsewhere
95 | enough
96 | et
97 | etc
98 | even
99 | ever
100 | every
101 | everybody
102 | everyone
103 | everything
104 | everywhere
105 | except
106 | excepted
107 | excepting
108 | exception
109 | exclude
110 | excluding
111 | exclusive
112 | far
113 | farther
114 | farthest
115 | few
116 | ff
117 | first
118 | for
119 | formerly
120 | forth
121 | forward
122 | from
123 | front
124 | further
125 | furthermore
126 | furthest
127 | get
128 | go
129 | had
130 | halves
131 | hardly
132 | has
133 | hast
134 | hath
135 | have
136 | he
137 | hence
138 | henceforth
139 | her
140 | here
141 | hereabouts
142 | hereafter
143 | hereby
144 | herein
145 | hereto
146 | hereupon
147 | hers
148 | herself
149 | him
150 | himself
151 | hindmost
152 | his
153 | hither
154 | hitherto
155 | how
156 | however
157 | howsoever
158 | i
159 | ie
160 | if
161 | in
162 | inasmuch
163 | inc
164 | include
165 | included
166 | including
167 | indeed
168 | indoors
169 | inside
170 | insomuch
171 | instead
172 | into
173 | inward
174 | inwards
175 | is
176 | it
177 | its
178 | itself
179 | just
180 | kind
181 | kg
182 | km
183 | last
184 | latter
185 | latterly
186 | less
187 | lest
188 | let
189 | like
190 | little
191 | ltd
192 | many
193 | may
194 | maybe
195 | me
196 | meantime
197 | meanwhile
198 | might
199 | moreover
200 | most
201 | mostly
202 | more
203 | mr
204 | mrs
205 | ms
206 | much
207 | must
208 | my
209 | myself
210 | namely
211 | need
212 | neither
213 | never
214 | nevertheless
215 | next
216 | no
217 | nobody
218 | none
219 | nonetheless
220 | noone
221 | nope
222 | nor
223 | not
224 | nothing
225 | notwithstanding
226 | now
227 | nowadays
228 | nowhere
229 | of
230 | off
231 | often
232 | ok
233 | on
234 | once
235 | one
236 | only
237 | onto
238 | or
239 | other
240 | others
241 | otherwise
242 | ought
243 | our
244 | ours
245 | ourselves
246 | out
247 | outside
248 | over
249 | own
250 | per
251 | perhaps
252 | plenty
253 | provide
254 | quite
255 | rather
256 | really
257 | round
258 | said
259 | sake
260 | same
261 | sang
262 | save
263 | saw
264 | see
265 | seeing
266 | seem
267 | seemed
268 | seeming
269 | seems
270 | seen
271 | seldom
272 | selves
273 | sent
274 | several
275 | shalt
276 | she
277 | should
278 | shown
279 | sideways
280 | since
281 | slept
282 | slew
283 | slung
284 | slunk
285 | smote
286 | so
287 | some
288 | somebody
289 | somehow
290 | someone
291 | something
292 | sometime
293 | sometimes
294 | somewhat
295 | somewhere
296 | spake
297 | spat
298 | spoke
299 | spoken
300 | sprang
301 | sprung
302 | stave
303 | staves
304 | still
305 | such
306 | supposing
307 | than
308 | that
309 | the
310 | thee
311 | their
312 | them
313 | themselves
314 | then
315 | thence
316 | thenceforth
317 | there
318 | thereabout
319 | thereabouts
320 | thereafter
321 | thereby
322 | therefore
323 | therein
324 | thereof
325 | thereon
326 | thereto
327 | thereupon
328 | these
329 | they
330 | this
331 | those
332 | thou
333 | though
334 | thrice
335 | through
336 | throughout
337 | thru
338 | thus
339 | thy
340 | thyself
341 | till
342 | to
343 | together
344 | too
345 | toward
346 | towards
347 | ugh
348 | unable
349 | under
350 | underneath
351 | unless
352 | unlike
353 | until
354 | up
355 | upon
356 | upward
357 | upwards
358 | us
359 | use
360 | used
361 | using
362 | very
363 | via
364 | vs
365 | want
366 | was
367 | we
368 | week
369 | well
370 | were
371 | what
372 | whatever
373 | whatsoever
374 | when
375 | whence
376 | whenever
377 | whensoever
378 | where
379 | whereabouts
380 | whereafter
381 | whereas
382 | whereat
383 | whereby
384 | wherefore
385 | wherefrom
386 | wherein
387 | whereinto
388 | whereof
389 | whereon
390 | wheresoever
391 | whereto
392 | whereunto
393 | whereupon
394 | wherever
395 | wherewith
396 | whether
397 | whew
398 | which
399 | whichever
400 | whichsoever
401 | while
402 | whilst
403 | whither
404 | who
405 | whoa
406 | whoever
407 | whole
408 | whom
409 | whomever
410 | whomsoever
411 | whose
412 | whosoever
413 | why
414 | will
415 | wilt
416 | with
417 | within
418 | without
419 | worse
420 | worst
421 | would
422 | wow
423 | ye
424 | yet
425 | year
426 | yippee
427 | you
428 | your
429 | yours
430 | yourself
431 | yourselves
--------------------------------------------------------------------------------
/src/main/java/org/nationaldataservice/elasticsearch/rocchio/RocchioExpandRestAction.java:
--------------------------------------------------------------------------------
1 | package org.nationaldataservice.elasticsearch.rocchio;
2 |
3 | import java.io.IOException;
4 | import java.net.URISyntaxException;
5 | import java.nio.file.Files;
6 | import java.nio.file.Path;
7 | import java.nio.file.Paths;
8 |
9 | import org.apache.logging.log4j.Logger;
10 | import org.elasticsearch.client.node.NodeClient;
11 | import org.elasticsearch.common.inject.Inject;
12 | import org.elasticsearch.common.logging.ESLoggerFactory;
13 | import org.elasticsearch.common.settings.Settings;
14 | import org.elasticsearch.common.xcontent.XContentBuilder;
15 | import org.elasticsearch.common.xcontent.json.JsonXContent;
16 | import org.elasticsearch.rest.BaseRestHandler;
17 | import org.elasticsearch.rest.BytesRestResponse;
18 | import org.elasticsearch.rest.RestController;
19 | import org.elasticsearch.rest.RestRequest;
20 | import org.elasticsearch.rest.RestRequest.Method;
21 | import org.elasticsearch.rest.RestStatus;
22 |
23 | import edu.gslis.textrepresentation.FeatureVector;
24 | import joptsimple.internal.Strings;
25 |
26 | public class RocchioExpandRestAction extends BaseRestHandler {
27 | private final Logger logger = ESLoggerFactory.getLogger(RocchioExpandRestAction.class);
28 |
29 | @Inject
30 | public RocchioExpandRestAction(Settings settings, RestController controller) {
31 | super(settings);
32 |
33 | // Register your handlers here
34 | controller.registerHandler(Method.GET, "/{index}/{type}/_expand", this);
35 | controller.registerHandler(Method.GET, "/{index}/_expand", this);
36 | }
37 |
38 | /**
39 | * Helper method for throwing an error
40 | *
41 | * @param error
42 | * the String error message
43 | * @return a RestChannelConsumer to build up the error
44 | */
45 | protected RestChannelConsumer throwError(String error) {
46 | return throwError(error, RestStatus.BAD_REQUEST);
47 | }
48 |
49 | /**
50 | * Helper method for throwing an error
51 | *
52 | * @param error
53 | * the String error message
54 | * @param status
55 | * the HTTP status to return
56 | * @return a RestChannelConsumer to build up the error
57 | */
58 | protected RestChannelConsumer throwError(String error, RestStatus status) {
59 | this.logger.error("ERROR: " + error);
60 | return channel -> {
61 | XContentBuilder builder = JsonXContent.contentBuilder();
62 | builder.startObject();
63 | builder.field("error", error);
64 | builder.endObject();
65 | channel.sendResponse(new BytesRestResponse(status, builder));
66 | };
67 | }
68 |
69 | @Override
70 | protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) throws IOException {
71 | this.logger.debug("Executing Rocchio expand action!");
72 |
73 | // Required path parameter
74 | String index = request.param("index");
75 |
76 | // Required query string parameter
77 | String query = request.param("query");
78 |
79 | // Optional parameters, with sensible defaults
80 | String type = request.param("type", "dataset");
81 | String field = request.param("field", "_all");
82 | double alpha = Double.parseDouble(request.param("alpha", "0.5"));
83 | double beta = Double.parseDouble(request.param("beta", "0.5"));
84 | double k1 = Double.parseDouble(request.param("k1", "1.2"));
85 | double b = Double.parseDouble(request.param("b", "0.75"));
86 | int fbDocs = Integer.parseInt(request.param("fbDocs", "10"));
87 | int fbTerms = Integer.parseInt(request.param("fbTerms", "10"));
88 |
89 | // Optional stoplist - assumes a space-delimited string of stop words
90 | // TODO: Populate list of default stop words
91 | String stoplist = request.param("stoplist", "");
92 |
93 | // Log the request with our full parameter set
94 | this.logger.info(String.format(
95 | "Starting RocchioExpand (index=%s, query=%s, type=%s, "
96 | + "field=%s, fbDocs=%d, fbTerms=%d, α=%.2f, β=%.2f, k1=%.2f, b=%.2f, stoplist=%s)",
97 | index, query, type, field, fbDocs, fbTerms, alpha, beta, k1, b, stoplist));
98 |
99 | // TODO: Check that type has documents added to it?
100 | // TODO: Check that the documents in the type contain the desired field?
101 | // TODO: Check that term vectors/fields stats are available for the
102 | // desired index/type/field combination?
103 |
104 | try {
105 | Rocchio rocchio = new Rocchio(client, index, type, field, alpha, beta, k1, b, stoplist);
106 |
107 | // Validate input parameters
108 | String shortCircuit = rocchio.validate(query, fbDocs, fbTerms);
109 | if (!Strings.isNullOrEmpty(shortCircuit)) {
110 | return throwError(shortCircuit);
111 | }
112 |
113 | // Expand the query
114 | this.logger.debug("Generating feedback query for (" + query + "," + fbDocs + "," + fbTerms + ")");
115 | FeatureVector feedbackQuery = rocchio.expandQuery(query, fbDocs, fbTerms);
116 |
117 | // Format our expanded query with Lucene's boosting syntax
118 | this.logger.debug("Expanding query: " + feedbackQuery.toString());
119 | StringBuffer expandedQuery = new StringBuffer();
120 | String separator = ""; // start out with no separator
121 |
122 | for (String term : feedbackQuery.getFeatures()) {
123 | expandedQuery.append(separator + term + "^" + feedbackQuery.getFeatureWeight(term));
124 | separator = " "; // add separator after first iteration
125 | }
126 |
127 | String fullQuery = expandedQuery.toString().trim();
128 |
129 | // Return the expanded query (don't actually perform the search)
130 | this.logger.debug("Responding: " + expandedQuery.toString());
131 | return channel -> {
132 | XContentBuilder builder = JsonXContent.contentBuilder();
133 | builder.startObject();
134 |
135 | builder.field("query", fullQuery);
136 | builder.endObject();
137 | channel.sendResponse(new BytesRestResponse(RestStatus.OK, builder));
138 | };
139 | } catch (Exception e) {
140 | // FIXME: Catching generic Exception is bad practice
141 | // TODO: make this more specific for production
142 | String errorMessage = e.getMessage();
143 | if (Strings.isNullOrEmpty(errorMessage)) {
144 | errorMessage = "An unknown error was encountered.";
145 | }
146 | return throwError(errorMessage);
147 | }
148 | }
149 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Rocchio expansion for ElasticSearch
2 |
3 |
4 |
5 | This is a prototype plugin for ElasticSearch 5.x to add Rocchio-based query expansion support using BM25 similarity. This plugin adds an ``_expand`` REST endpoint to ElasticSearch that returns a "[query string query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html)" with Lucene-style term weights. This plugin was developed as part of the NDS [bioCADDIE pilot](https://biocaddie.org/expansion-models-biomedical-data-search).
6 |
7 | ## Why Rocchio?
8 | Our original goal was to implement relevance model (RM) based expansion using Lucene's language modeling similarity implementations. Our investigation revealed that [Lucene's language modeling implementation is incomplete](https://issues.apache.org/jira/browse/LUCENE-5847) and may not be suitable for use with RM. Given Lucene's origins as a vector-space implementation and its current default BM25 scorer, we opted instead to implement Rocchio-style expansion. While Rocchio expansion was not originally intended for use with the BM25 retrieval model, it has proven effective in practice.
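
In rough terms, the expanded query is a weighted combination of the original query vector and the centroid of the top feedback documents. This is the textbook Rocchio formulation expressed with the ``alpha`` and ``beta`` parameters described below; the plugin's per-term weights also incorporate BM25 statistics, so treat this as a sketch rather than the exact scoring code:

```latex
\vec{q}_{expanded} = \alpha \, \vec{q}_{original} + \frac{\beta}{|D_{fb}|} \sum_{d \in D_{fb}} \vec{d}
```

Here ``D_fb`` is the set of top-ranked feedback documents (``fbDocs``), and only the ``fbTerms`` highest-weighted terms are retained in the expanded query.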
9 |
10 | ## REST Interface
11 |
12 | Endpoint:
13 | ``/index/_expand``
14 |
15 | Parameters:
16 | * ``type``: Document type, defaults to ``dataset``
17 | * ``field``: Field to search, defaults to ``_all``
18 | * ``alpha``: Original query weight, defaults to 0.5
19 | * ``beta``: Feedback query weight, defaults to 0.5
20 | * ``k1``: BM25 k1 parameter, defaults to 1.2
21 | * ``b``: BM25 b parameter, defaults to 0.75
22 | * ``fbDocs``: Number of feedback documents, defaults to 10
23 | * ``fbTerms``: Number of feedback terms, defaults to 10
24 | * ``stoplist``: Additional stoplist terms (modifies primary stoplist)
25 | * ``query``: Query to expand
26 |
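For example, assuming the Docker-based setup used by the helper scripts in this repository (host ``localhost:9200``, X-Pack credentials ``elastic:changeme``, and the ``biocaddie`` index built below), an expansion request might look like:

```bash
# Expand the query "multiple sclerosis" using the top 10 feedback documents and 10 feedback terms
curl -u elastic:changeme \
  'localhost:9200/biocaddie/dataset/_expand?pretty&fbDocs=10&fbTerms=10&query=multiple+sclerosis'
```
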
27 | The expand endpoint returns a JSON object with the expanded query in "[query string query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html)" format with each expansion term and the associated expansion weight:
28 | ```
29 | {
30 | "query": "term1^weight1 term2^weight2 ..."
31 | }
32 | ```
33 |
34 | This query can be used with the standard ElasticSearch ``_search`` endpoint:
35 | ```
36 | curl -XGET 'localhost:9200/biocaddie/_search?pretty' -H 'Content-Type: application/json' -d'
37 | {
38 | "query": {
39 | "query_string" : {
40 | "default_field" : "_all",
41 | "query" : "term1^weight1 term2^weight2"
42 | }
43 | }
44 | }
45 | '
46 | ```
47 |
48 |
49 |
50 |
51 |
52 | ## Prerequisites
53 |
54 | * ElasticSearch 5.3.2 (native or via Docker)
55 | * Git + Maven (native or via Docker)
56 | * ElasticSearch index
57 |
58 | ## Installing from OSSRH
59 | You can install the plugin using the following command:
60 | ```bash
61 | bin/elasticsearch-plugin install https://oss.sonatype.org/content/repositories/snapshots/edu/illinois/lis/queryexpansion/5.3.2-SNAPSHOT/queryexpansion-5.3.2-20170726.231658-1.zip
62 | ```
63 |
64 | NOTE: You can check https://oss.sonatype.org/content/repositories/snapshots/edu/illinois/lis/queryexpansion/5.3.2-SNAPSHOT for a link to the newest `.zip` file.
65 |
66 | ## Building From Source
67 | Clone this repository:
68 | ```bash
69 | git clone https://github.com/nds-org/elasticsearch-queryexpansion-plugin.git queryexpansion && cd queryexpansion
70 | mvn package
71 | bin/elasticsearch-plugin install file:///path/to/elasticsearch-queryexpansion-plugin/target/releases/queryexpansion-5.3.2-SNAPSHOT.zip
72 | ```
73 |
74 |
75 | ## Example usage
76 |
77 | The repository includes several scripts demonstrating how to install and use the plugin via Docker:
78 |
79 | 1. [Setup](README.md#setup)
80 | 2. [Build](README.md#build)
81 | 3. [Load](README.md#load)
82 | 4. [Test](README.md#test)
83 |
84 | ### Setup
85 | The following steps demonstrate how to build an ElasticSearch index from the bioCADDIE test collection.
86 |
87 | Make sure that the biocaddie benchmark test dataset exists somewhere on disk:
88 | ```bash
89 | cd $HOME
90 | wget https://biocaddie.org/sites/default/files/update_json_folder.zip && unzip update_json_folder.zip
91 | ```
92 |
93 | Start ElasticSearch or run ElasticSearch 5.3.2 via Docker using the helper script:
94 | ```bash
95 | ./scripts/start.sh
96 | ```
97 |
98 | Create an index with the required mapping settings (`store: true` and stored term vectors on the `_all` field):
99 | ```bash
100 | ./scripts/create-index.sh
101 | ```
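
The relevant portion of `data/biocaddie.json` stores term vectors for the `_all` field, which the plugin needs in order to build feedback vectors:

```
"_all": {
    "type": "text",
    "term_vector": "with_positions_offsets_payloads",
    "store" : true,
    "analyzer" : "fulltext_analyzer"
}
```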
102 |
103 | NOTE: You may need to modify *dataset_directory* in `./scripts/add-docs.sh` if your benchmark data is not located within `$HOME`.
104 |
105 | Finally, use the helper script to add the documents to the index:
106 | ```bash
107 | ./scripts/add-docs.sh
108 | ```
109 |
110 | NOTE: Indexing the full benchmark set can take a long time. If you only need a small subset of the documents, you can always `Ctrl+C` once you get the desired number of records indexed.
111 |
112 | ### Build
113 | The following helper script will build the plugin using Maven (or using Docker if Maven is not installed):
114 | ```bash
115 | ./scripts/build.sh
116 | ```
117 | Either way, the build should produce a `target/releases/` directory with the necessary `.zip` file.
118 |
119 | The `.zip` that ElasticSearch needs should be found at `./target/releases/rocchio-0.0.1-SNAPSHOT.zip`.
120 |
121 | ### Load
122 | Once the artifacts are built, we just need to install them and restart ElasticSearch. The following helper scripts assume that you are running ElasticSearch via Docker:
123 | ```bash
124 | ./scripts/install.sh
125 | ./scripts/restart.sh
126 | ```
127 |
128 | ### Test
129 | You should now be able to test the new endpoint using the helper script or via raw `curl`:
130 | ```bash
131 | $ ./test.sh
132 | {"query":"sclerosis^2.798773920190095 study^0.4716440174771813 disease^0.584064093901503 or^0.3394485958568884 patients^0.79730633189081 multiple^1.941784058395449 was^0.4222225922753828 is^0.38702376034952857 to^0.4432445617796595 on^0.3817563584164061"}
133 | ```
134 |
135 | You can check the container logs to see what happened under the covers:
136 | ```bash
137 | $ ./logs.sh
138 | ...
139 | [2017-07-01T04:54:54,007][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded module [reindex]
140 | [2017-07-01T04:54:54,008][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded module [transport-netty3]
141 | [2017-07-01T04:54:54,008][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded module [transport-netty4]
142 | [2017-07-01T04:54:54,009][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded plugin [queryexpansion]
143 | [2017-07-01T04:54:54,009][INFO ][o.e.p.PluginsService ] [lmIsnX7] loaded plugin [x-pack]
144 | [2017-07-01T04:55:00,722][INFO ][o.e.n.Node ] initialized
145 | [2017-07-01T04:55:00,744][INFO ][o.e.n.Node ] [lmIsnX7] starting ...
146 | [2017-07-01T04:55:01,467][WARN ][i.n.u.i.MacAddressUtil ] Failed to find a usable hardware address from the network interfaces; using random bytes: f8:2c:c0:8c:3e:88:3b:3b
147 | [2017-07-01T04:55:01,695][INFO ][o.e.t.TransportService ] [lmIsnX7] publish_address {127.0.0.1:9300}, bound_addresses {127.0.0.1:9300}
148 | [2017-07-01T04:55:02,082][INFO ][o.e.m.j.JvmGcMonitorService] [lmIsnX7] [gc][1] overhead, spent [260ms] collecting in the last [1s]
149 | [2017-07-01T04:55:05,179][INFO ][o.e.c.s.ClusterService ] [lmIsnX7] new_master {lmIsnX7}{lmIsnX7NRH2_Vmq6avBitQ}{iyWg9zTcQqCeF97xX-hdJQ}{127.0.0.1}{127.0.0.1:9300}, reason: zen-disco-elected-as-master ([0] nodes joined)
150 | [2017-07-01T04:55:05,305][INFO ][o.e.x.s.t.n.SecurityNetty4HttpServerTransport] [lmIsnX7] publish_address {172.17.0.2:9200}, bound_addresses {[::]:9200}
151 | [2017-07-01T04:55:05,318][INFO ][o.e.n.Node ] [lmIsnX7] started
152 | [2017-07-01T04:55:06,492][INFO ][o.e.l.LicenseService ] [lmIsnX7] license [0a8ce788-74ad-49d9-aa3c-3c46ab9100d8] mode [trial] - valid
153 | [2017-07-01T04:55:06,513][INFO ][o.e.g.GatewayService ] [lmIsnX7] recovered [4] indices into cluster_state
154 | [2017-07-01T04:55:08,078][INFO ][o.e.c.r.a.AllocationService] [lmIsnX7] Cluster health status changed from [RED] to [YELLOW] (reason: [shards started [[.monitoring-es-2-2017.07.01][0], [biocaddie][0]] ...]).
155 | [2017-07-01T04:55:13,088][INFO ][o.n.e.r.RocchioExpandRestAction] [lmIsnX7] Starting Rocchio (biocaddie,multiple sclerosis,dataset,_all,10,10,0.50,0.50,1.20,0.75)
156 | ...
157 | ```
158 |
159 | ## Helper Scripts
160 | A few other helper scripts are included to ease testing:
161 | ```bash
162 | ./scripts/start.sh # Runs or starts your elasticsearch container
163 | ./scripts/stop.sh # Stops your elasticsearch container
164 | ./scripts/restart.sh
165 | ./scripts/create-index.sh # Creates a test index with the proper settings to enable storing term vectors
166 | ./scripts/add-docs.sh [-v] # Adds documents from the biocaddie benchmark set to your index (assumes correct paths)
167 | ./scripts/delete-index.sh # Deletes your container's test index and the records within
168 | ./scripts/build.sh # Builds up elasticsearch plugin artifacts
169 | ./scripts/install.sh # Installs the elasticsearch plugin into your running container
170 | ./scripts/remove.sh # Removes your container's installed queryexpansion plugin
171 | ./rebuild.sh # Removes the current plugin, builds the artifacts, installs the new plugin, and restarts elasticsearch to facilitate rapid development and testing
172 | ./logs.sh # View your elasticsearch container logs (tail=100)
173 | ./test.sh [search] # Performs a test query against our REST API endpoint (only expands by default, but searches if first parameter is "search")
174 | ```
175 |
176 | ## Deploying artifacts
177 | New artifacts can be deployed to OSSRH using the following command:
178 | ```bash
179 | GPG_TTY=$(tty) mvn clean deploy
180 | ```
181 |
--------------------------------------------------------------------------------
/src/test/java/org/nationaldataservice/elasticsearch/rocchio/test/unit/RocchioTest.java:
--------------------------------------------------------------------------------
1 | package org.nationaldataservice.elasticsearch.rocchio.test.unit;
2 |
3 | import static org.junit.Assert.*;
4 | import static org.mockito.Mockito.*;
5 | import java.io.IOException;
6 | import java.util.HashMap;
7 | import java.util.LinkedHashMap;
8 | import java.util.Map;
9 |
10 | import org.elasticsearch.search.SearchHits;
11 | import org.elasticsearch.search.SearchHit;
12 | import org.apache.lucene.index.Fields;
13 | import org.apache.lucene.index.Terms;
14 | import org.apache.lucene.index.TermsEnum;
15 | import org.apache.lucene.util.BytesRef;
16 | import org.elasticsearch.action.ActionFuture;
17 | import org.elasticsearch.action.ListenableActionFuture;
18 | import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
19 | import org.elasticsearch.action.search.SearchRequestBuilder;
20 | import org.elasticsearch.action.search.SearchResponse;
21 | import org.elasticsearch.action.termvectors.MultiTermVectorsItemResponse;
22 | import org.elasticsearch.action.termvectors.MultiTermVectorsRequestBuilder;
23 | import org.elasticsearch.action.termvectors.MultiTermVectorsResponse;
24 | import org.elasticsearch.action.termvectors.TermVectorsResponse;
25 | import org.elasticsearch.client.AdminClient;
26 | import org.elasticsearch.client.Client;
27 | import org.elasticsearch.client.ClusterAdminClient;
28 | import org.elasticsearch.cluster.ClusterState;
29 | import org.elasticsearch.cluster.metadata.IndexMetaData;
30 | import org.elasticsearch.cluster.metadata.MappingMetaData;
31 | import org.elasticsearch.cluster.metadata.MetaData;
32 | import org.elasticsearch.common.collect.ImmutableOpenMap;
33 | import org.elasticsearch.index.query.QueryStringQueryBuilder;
34 | import org.junit.After;
35 | import org.junit.Before;
36 | import org.junit.Test;
37 | import org.junit.runner.RunWith;
38 | import org.mockito.runners.MockitoJUnitRunner;
39 | import org.nationaldataservice.elasticsearch.rocchio.Rocchio;
40 |
41 | import edu.gslis.textrepresentation.FeatureVector;
42 |
43 | /**
44 | * This is a simple unit test suite for the Rocchio ElasticSearch Plugin. Use
45 | * these test cases to verify correctness of the query expansion process. You
46 | * can also vary the parameters here to see how that affects the resulting
47 | * expansion. All ElasticSearch internals have been mocked with Mockito to
48 | * return fake data.
49 | *
50 | *
51 | * @author lambert8
52 | *
53 | */
54 | @RunWith(MockitoJUnitRunner.class)
55 | public class RocchioTest {
56 | /** The Rocchio instance to test */
57 | private Rocchio rocchio;
58 |
59 | // The common test parameter set (individual tests can still use one-off
60 | // values)
61 | private static final String TEST_INDEX = "biocaddie";
62 | private static final String TEST_QUERY = "rat";
63 | private static final String TEST_TYPE = "dataset";
64 | private static final String TEST_FIELD = "_all";
65 | private static final int TEST_FB_TERMS = 10;
66 | private static final int TEST_FB_DOCS = 50;
67 | private static final double TEST_ALPHA = 0.5;
68 | private static final double TEST_BETA = 0.5;
69 | private static final double TEST_K1 = 1.2;
70 | private static final double TEST_B = 0.75;
71 |
72 | // Mock out all of the ElasticSearch internals
73 | private static final Client client = mock(Client.class);
74 |
75 | @SuppressWarnings("unchecked")
76 | private static final ActionFuture clusterStateFuture = (ActionFuture) mock(ActionFuture.class);
77 | private static final AdminClient adminClient = mock(AdminClient.class);
78 | private static final ClusterAdminClient clusterAdminClient = mock(ClusterAdminClient.class);
79 | private static final ClusterState clusterState = mock(ClusterState.class);
80 | private static final ClusterStateResponse clusterStateResponse = mock(ClusterStateResponse.class);
81 | private static final MetaData clusterMetadata = mock(MetaData.class);
82 | private static final IndexMetaData mockIndexMetaData = mock(IndexMetaData.class);
83 |
84 | @SuppressWarnings("unchecked")
85 | private static final ListenableActionFuture mockMtvFuture = mock(ListenableActionFuture.class);
86 | private static final MultiTermVectorsResponse mockMtvResponse = mock(MultiTermVectorsResponse.class);
87 | private static final TermVectorsResponse mockTvResponse = mock(TermVectorsResponse.class);
88 | private static final MultiTermVectorsItemResponse mockMtvItemResponse = mock(MultiTermVectorsItemResponse.class);
89 | private static final MultiTermVectorsRequestBuilder mockMtvBuilder = mock(MultiTermVectorsRequestBuilder.class);
90 | private static final Fields mockFields = mock(Fields.class);
91 | private static final Terms mockTerms = mock(Terms.class);
92 | private static final MultiTermVectorsItemResponse[] mockMtvItemResponses = { mockMtvItemResponse };
93 |
94 | @SuppressWarnings("unchecked")
95 | private static final ListenableActionFuture mockSearchFuture = mock(ListenableActionFuture.class);
96 | private static final SearchRequestBuilder srBuilder = mock(SearchRequestBuilder.class);
97 | private static final SearchResponse mockSearchResponse = mock(SearchResponse.class);
98 |
99 | // These are used internally, but are overridden by later mocks (see TermsEnum iteration)
100 | private static final SearchHits hits = mock(SearchHits.class);
101 | private static final SearchHit hit1 = mock(SearchHit.class);
102 | private static final SearchHit hit2 = mock(SearchHit.class);
103 | private static final SearchHit hit3 = mock(SearchHit.class);
104 | private static final SearchHit[] hitsArray = { hit1, hit2, hit3 };
105 |
106 | private static final TermsEnum mockIterator = mock(TermsEnum.class);
107 |
108 | // The index mapping metadata and sub-mappings
109 | private static final MappingMetaData mockTypeMetadata = mock(MappingMetaData.class);
110 | private static final ImmutableOpenMap indexMappingMetadata;
111 | private static final LinkedHashMap fieldPropertiesMap = new LinkedHashMap();
112 | private static final LinkedHashMap typePropertiesMap = new LinkedHashMap();
113 | private static final LinkedHashMap typeMap = new LinkedHashMap();
114 | private static final Map typeMetadataMapping = new HashMap<>();
115 |
116 | // FIXME: finish mocking out iterator and expand
117 | private static final BytesRef termRef = new BytesRef("rat");
118 |
119 | /** Static initializer: set up all required test data and mocks */
120 | static {
121 | // Build up our properties mapping: { "store": true } object
122 | fieldPropertiesMap.put("store", true);
123 |
124 | // Build up our test field mapping with the properties map
125 | typePropertiesMap.put(TEST_FIELD, fieldPropertiesMap);
126 |
127 | // Build up our test type mapping from the test field mapping
128 | typeMap.put("properties", typePropertiesMap);
129 | typeMap.put("_all", fieldPropertiesMap);
130 |
131 | // Build up our test type mapping of the type metadata
132 | typeMetadataMapping.put(TEST_TYPE, mockTypeMetadata);
133 |
134 | // Build up our index mapping from the type mapping
135 | indexMappingMetadata = new ImmutableOpenMap.Builder().putAll(typeMetadataMapping).build();
136 |
137 | try {
138 | // Mock out ElasticSearch index mapping verification
139 | when(client.admin()).thenReturn(adminClient);
140 | when(adminClient.cluster()).thenReturn(clusterAdminClient);
141 | when(clusterAdminClient.state(any())).thenReturn(clusterStateFuture);
142 | when(clusterStateFuture.actionGet()).thenReturn(clusterStateResponse);
143 | when(clusterStateResponse.getState()).thenReturn(clusterState);
144 | when(clusterState.getMetaData()).thenReturn(clusterMetadata);
145 | when(clusterMetadata.index(anyString())).thenReturn(mockIndexMetaData);
146 | when(mockIndexMetaData.getMappings()).thenReturn(indexMappingMetadata);
147 | when(mockTypeMetadata.getSourceAsMap()).thenReturn(typeMap);
148 |
149 | // Mock out ElasticSearch Search
150 | when(client.prepareSearch(anyString())).thenReturn(srBuilder);
151 | when(srBuilder.setQuery(any(QueryStringQueryBuilder.class))).thenReturn(srBuilder);
152 | when(srBuilder.setSize(anyInt())).thenReturn(srBuilder);
153 | when(srBuilder.execute()).thenReturn(mockSearchFuture);
154 | when(mockSearchFuture.actionGet()).thenReturn(mockSearchResponse);
155 | when(mockSearchResponse.getHits()).thenReturn(hits);
156 | when(hits.getHits()).thenReturn(hitsArray);
157 | when(hits.hits()).thenReturn(hitsArray);
158 |
159 | // These are used internally, but are likely
160 | // overridden by later mocks (see TermsEnum iteration)
161 | when(hits.totalHits()).thenReturn(Long.valueOf(3));
162 | when(hits.getTotalHits()).thenReturn(Long.valueOf(3));
163 |
164 | // Mock out ElasticSearch MultiTermVector Fields/Terms
165 | when(mockMtvBuilder.execute()).thenReturn(mockMtvFuture);
166 | when(mockMtvFuture.actionGet()).thenReturn(mockMtvResponse);
167 | when(mockMtvBuilder.add(any())).thenReturn(mockMtvBuilder);
168 | when(client.prepareMultiTermVectors()).thenReturn(mockMtvBuilder);
169 | when(mockMtvItemResponse.getResponse()).thenReturn(mockTvResponse);
170 | when(mockMtvResponse.getResponses()).thenReturn(mockMtvItemResponses);
171 |
172 | // FIXME: The two sections below return completely arbitrary values
173 | // and should be updated to something more sane
174 | // Mock out Lucene Fields/Terms
175 | when(mockTvResponse.getFields()).thenReturn(mockFields);
176 | when(mockFields.terms(TEST_FIELD)).thenReturn(mockTerms);
177 | when(mockTerms.getDocCount()).thenReturn(10);
178 | when(mockTerms.getSumTotalTermFreq()).thenReturn(10L);
179 | when(mockTerms.iterator()).thenReturn(mockIterator);
180 |
181 | // Mock out Lucene TermsEnum iteration
182 | when(mockIterator.next()).thenReturn(termRef).thenReturn(null);
183 | when(mockIterator.totalTermFreq()).thenReturn(10L);
184 | when(mockIterator.docFreq()).thenReturn(10);
185 | when(mockIterator.term()).thenReturn(termRef);
186 | } catch (IOException e) {
187 | e.printStackTrace();
188 | fail();
189 | }
190 | };
191 |
192 | @Before
193 | /** Set up our test Rocchio implementation */
194 | public void setUp() throws IOException {
195 | this.rocchio = new Rocchio(client, TEST_INDEX, TEST_TYPE, TEST_FIELD, TEST_ALPHA, TEST_BETA, TEST_K1, TEST_B);
196 | }
197 |
198 | @After
199 | /** Tear down our test Rocchio implementation */
200 | public void tearDown() {
201 | this.rocchio = null;
202 | }
203 |
204 | @Test
205 | /** Test that validate properly returns null if all parameters are valid */
206 | public void testValidate() throws IOException {
207 | String shouldBeNull = rocchio.validate(TEST_QUERY, TEST_FB_DOCS, TEST_FB_TERMS);
208 | assertNull(shouldBeNull);
209 | }
210 |
211 | @Test
212 | /** Test that validate fails when query is null */
213 | public void testValidateInvalidQuery() throws IOException {
214 | String errorMessage = rocchio.validate("", TEST_FB_DOCS, TEST_FB_TERMS);
215 | assertNotNull(errorMessage);
216 | assertEquals(Rocchio.NULL_QUERY_ERROR, errorMessage);
217 | }
218 |
219 | @Test
220 | /** Test that validate fails when fbDocs < 1 */
221 | public void testValidateInvalidFeedbackDocuments() throws IOException {
222 | String errorMessage = rocchio.validate(TEST_QUERY, 0, TEST_FB_TERMS);
223 | assertNotNull(errorMessage);
224 | assertEquals(Rocchio.INVALID_FB_DOCS_ERROR, errorMessage);
225 | }
226 |
227 | @Test
228 | /** Test that validate fails when fbTerms < 1 */
229 | public void testValidateInvalidFeedbackTerms() throws IOException {
230 | String errorMessage = rocchio.validate(TEST_QUERY, TEST_FB_DOCS, 0);
231 | assertNotNull(errorMessage);
232 | assertEquals(Rocchio.INVALID_FB_TERMS_ERROR, errorMessage);
233 | }
234 |
235 | @Test
236 | /** Test that we can expand a query against the test index */
237 | public void testExpandQuery() throws IOException {
238 | // Expand the query
239 | FeatureVector feedbackQuery = rocchio.expandQuery(TEST_QUERY, TEST_FB_DOCS, TEST_FB_TERMS);
240 |
241 | // Verify expanded segments
242 | String[] segments = feedbackQuery.toString().trim().split(" ");
243 | assertEquals(2, segments.length);
244 | assertEquals("0.012976521", segments[0]);
245 | assertEquals("rat", segments[1]);
246 | }
247 | }
248 |
--------------------------------------------------------------------------------
/src/test/ant/integration-tests.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<!-- Ant build file driving the integration-test lifecycle. Most element markup was lost
     in this listing; the recoverable fragments show targets and macros that:
     wait for the node ("Waiting for elasticsearch to become available on port @{port}..."),
     start an external cluster ("Starting up external cluster...", "External node started PID ${integ.pid}"),
     branch on the server version via an inline script that compares version strings
     ("running Elasticsearch 5.0.0 or superior" / "running Elasticsearch < 5.0.0"),
     install the plugin under test ("Installing plugin @{name}..."),
     and shut the node down ("Shutting down external node PID ${integ.pid}"). -->
--------------------------------------------------------------------------------
/src/test/java/org/nationaldataservice/elasticsearch/rocchio/test/integration/RocchioIT.java:
--------------------------------------------------------------------------------
1 | package org.nationaldataservice.elasticsearch.rocchio.test.integration;
2 |
3 | import org.junit.BeforeClass;
4 | import org.junit.Test;
5 | import static org.hamcrest.Matchers.*;
6 | import static org.junit.Assert.*;
7 |
8 | import java.util.HashMap;
9 | import java.util.List;
10 | import java.util.Map;
11 |
12 | import org.apache.http.entity.StringEntity;
13 | import org.apache.logging.log4j.Logger;
14 | import org.elasticsearch.client.Response;
15 | import org.elasticsearch.common.logging.ESLoggerFactory;
16 |
17 | /**
18 | * This is a simple integration test suite for the ElasticSearch Rocchio
19 | * Plugin. Use these test cases to verify correctness of the API endpoint and its
20 | * input validation, or to compare performance and do scale testing.
21 | * Before the test suite runs, the test runner will:
22 | *
23 | *
24 | * * Download ElasticSearch binaries
25 | * * Install the ElasticSearch Rocchio Plugin
26 | * * Start up an ElasticSearch cluster
27 | * * Ensure that the TEST_INDEX has been created
28 | * * Ensure that TEST_INDEX contains some test documents
29 | * * Run the set of test cases
30 | * * Tear down the cluster
31 | *
32 | *
33 | * @see {@link AbstractITCase}
34 | * @see src/test/ant/integration-tests.xml
35 | *
36 | * @author lambert8
37 | *
38 | */
39 | public class RocchioIT extends AbstractITCase {
40 | private static final Logger staticLogger = ESLoggerFactory.getLogger(RocchioIT.class);
41 |
42 | // The common test parameter set (individual tests can still use one-off
43 | // values)
44 | private static final String TEST_INDEX = "biocaddie";
45 | private static final String TEST_TYPE = "dataset";
46 | private static final int TEST_FB_TERMS = 10;
47 | private static final int TEST_FB_DOCS = 5;
48 |
49 | private final String defaultEndpointParameters = "fbTerms=" + TEST_FB_TERMS + "&fbDocs=" + TEST_FB_DOCS;
50 | private final String expandEndpoint = String.format("/%s/%s/_expand?%s", TEST_INDEX, TEST_TYPE,
51 | defaultEndpointParameters);
52 |
53 | // TODO: Improve expectations
54 | private final String EXPECTED_EXPANDED_QUERY_OBJECT = "{query=dorsal^0.09029725274935405 rat^0.7267361001145776 aging-associated^0.09029725274935405 root^0.09029725274935405 bladder^0.09029725274935405 effect^0.09029725274935405 ganglia^0.09029725274935405 oxidative^0.09029725274935405 urinary^0.09029725274935405 preventive^0.09029725274935405}";
55 | private final String EXPECTED_EXPANDED_QUERY_STRING = "dorsal^0.09029725274935405 rat^0.7267361001145776 aging-associated^0.09029725274935405 root^0.09029725274935405 bladder^0.09029725274935405 effect^0.09029725274935405 ganglia^0.09029725274935405 oxidative^0.09029725274935405 urinary^0.09029725274935405 preventive^0.09029725274935405";
56 | private final String EXPECTED_SEARCH_HITS = "{_shards={total=1, failed=0, successful=1}, hits={hits=[{_index=biocaddie, _type=dataset, _source={DOCNO=1, REPOSITORY=arrayexpress_020916, TITLE=The Sinorhizobium meliloti SyrM regulon: effects on global gene expression are mediated by syrA and nodD3 (SyrA), METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-04, description=We characterized transcriptomes of a strain overexpressing syrA. Our work shows that the syrA transcriptome shares similar gene expression changes to the syrM and nodD3 transcriptomes and that nodD3 and syrA may be the only targets directly activated by SyrM. We propose that most of the gene expression changes observed when nodD3 is overexpressed are due to NodD3 activation of syrM expression, which in turn stimulates SyrM activation of syrA expression. The subsequent increase in SyrA abundance alters activity of the ChvI-ExoS-ExoR circuit, resulting in broad changes in gene expression. Gene expression profiling of Sinorhizobium meliloti overexpressing syrA was performed using custom Affymetrix GeneChips, ID=520401, title=The Sinorhizobium meliloti SyrM regulon: effects on global gene expression are mediated by syrA and nodD3 (SyrA), experimentType=transcription profiling by array}, organism={experiment={species=Sinorhizobium meliloti}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=1, _score=1.0}, {_index=biocaddie, _type=dataset, _source={DOCNO=2, REPOSITORY=arrayexpress_020916, TITLE=RelA Nuclear factor-kappaB (NF-kB) Subunit binding Loci in Promoter Regions of PHM1-31 Myometrial Smooth Muscle Cells (Promoter), METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-05, description=A study to define the binding loci of RelA-containing NF-kappaB dimers in a human myometrial smooth muscle cell line after exposure to TNF. Monolayers of PHM1-31 cells were exposed to TNF (10ng/ml) for 1 hour or left unstimulated. The Chromatin immunoprecipitation (ChIP) assay was performed to recover RelA-bound chromatin or non-specifically bound chromatin with IgG. That chromatin was prepared and used to probe Affymetrix GeneChIP 1.0R Human Promoter arrays. Three biological replicates of each experiment were conducted. Datasets were subsequently analysed in Partek Genomics Suite V6.6 where baseline was normalised by subtraction of IgG values from conrresponding RelA-immunoprecipitated samples. Control samples immunoprecipitated with RelA were then compared with TNF-stimulated samples immunoprecipitated with RelA., ID=520482, title=RelA Nuclear factor-kappaB (NF-kB) Subunit binding Loci in Promoter Regions of PHM1-31 Myometrial Smooth Muscle Cells (Promoter), experimentType=ChIP-chip by tiling array}, organism={experiment={species=Homo sapiens}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=2, _score=1.0}, {_index=biocaddie, _type=dataset, _source={DOCNO=3, REPOSITORY=arrayexpress_020916, TITLE=Aging-associated inflammatory and oxidative changes in the rat urinary bladder and dorsal root ganglia - preventive effect of caloric restriction, METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-04, description=This SuperSeries is composed of the SubSeries listed below. 
Refer to individual Series, ID=520420, title=Aging-associated inflammatory and oxidative changes in the rat urinary bladder and dorsal root ganglia - preventive effect of caloric restriction, experimentType=transcription profiling by array}, organism={experiment={species=Rattus norvegicus}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=3, _score=1.0}, {_index=biocaddie, _type=dataset, _source={DOCNO=4, REPOSITORY=arrayexpress_020916, TITLE=Gene expression profile in Caco-2 cells treated with carnosine, METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-04, description=To reveal the effects of carnosine on Caco-2 cells, we have employed whole genome microarray to detect genes that showed significantly different expression when exposed to carnosine. Caco-2 cells were treated with 1 mM carnosine for 3 days. Caco-2 cells were treated with 1 mM carnosine for 3 days. Three independent experiments were performed., ID=520441, title=Gene expression profile in Caco-2 cells treated with carnosine, experimentType=transcription profiling by array}, organism={experiment={species=Homo sapiens}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=4, _score=1.0}, {_index=biocaddie, _type=dataset, _source={DOCNO=5, REPOSITORY=arrayexpress_020916, TITLE=Mecp2: an unexpected regulator of macrophage gene expression and function [ChIP-Seq], METADATA={dataItem={dataTypes=[organism, dataItem, citation], releaseDate=2015-03-31, lastUpdateDate=2015-04-04, description=Mutations in methyl-CpG-binding protein 2 (MeCP2), a major epigenetic regulator, are the predominant cause of Rett syndrome. We previously found that Mecp2-null microglia are deficient in phagocytic ability, and that engraftment of wild-type monocytes into the brain of Mecp2-deficient mice attenuates pathology. We have observed that Mecp2 deficiency is associated with increased levels of histone acetylation at the cis-regulatory regions of the Mecp2-regulated genes in macrophages. We hypothesized that Mecp2 recruits protein complexes containing histone deacetylases (HDACs) to repress the expression of its target genes. Our ChIP-Seq studies in bone-marrow derived macrophages revealed that Mecp2 co-localizes with Ncor2/Hdac3 protein complex at cis-regulatory regions of the target genes. These results suggest a role for Mecp2 in the recruitment and regulation of Ncor2/Hdac3 repressosome that plays a critical role in the regulation of inflammatory responses in macrophages. Examination of NCOR2 and HDAC3 genome-wide location in bone-marrow derived macrophages., ID=520444, title=Mecp2: an unexpected regulator of macrophage gene expression and function [ChIP-Seq], experimentType=ChIP-seq}, organism={experiment={species=Mus musculus}}, citation={count=0}, dataResource={altNames=[], acronyms=[], keywords=[]}}}, _id=5, _score=1.0}], total=5, max_score=1.0}, took=1, timed_out=false}";
57 |
58 | @BeforeClass
59 | public static void setUp() {
60 | // Ensure that the index exists
61 | staticLogger.info("Setting up test environment!");
62 | createIndex(TEST_INDEX);
63 |
64 | // Ensure that documents have been added to the index
65 | for (int i = 1; i <= 5; i++) {
66 | addDocument(TEST_INDEX, TEST_TYPE, i, DOCUMENTS_JSON[i - 1]);
67 | }
68 |
69 | // Tests will fail if we don't wait for ES to index the new documents
70 | staticLogger.info("Waiting for ES to finish indexing documents...");
71 | wait(3000);
72 | }
73 |
74 | @Test
75 | @SuppressWarnings("unchecked")
76 | public void testPluginIsLoaded() throws Exception {
77 |
78 | Response response = client.performRequest("GET", "/_nodes/plugins");
79 |
80 | Map nodes = (Map) entityAsMap(response).get("nodes");
81 | for (String nodeName : nodes.keySet()) {
82 | boolean pluginFound = false;
83 | Map node = (Map) nodes.get(nodeName);
84 | List