67 | * However, if you use this during indexing, you must follow it with
68 | * {@link FlattenGraphFilter} to squash tokens on top of one another
69 | * like {@link SynonymFilter}, because the indexer can't directly
70 | * consume a graph. To get fully correct positional queries when your
71 | * synonym replacements are multiple tokens, you should instead apply
72 | * synonyms using this {@code TokenFilter} at query time and translate
73 | * the resulting graph to a {@code TermAutomatonQuery} e.g. using
74 | * {@code TokenStreamToTermAutomatonQuery}.
75 | *
76 | * NOTE: this cannot consume an incoming graph; results will
77 | * be undefined.
78 | *
79 | * @lucene.experimental */
80 |
81 | public final class DynamicSynonymGraphFilter extends AbsSynonymFilter {
82 |
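// Illustrative wiring sketch (tokenizer and the indexing flag here are
// placeholders, not part of this class): at index time this filter must be
// followed by FlattenGraphFilter, while query-time consumers can take the
// graph directly:
//
//   TokenStream ts = new DynamicSynonymGraphFilter(tokenizer, synonymMap, true);
//   if (indexing) {
//     ts = new FlattenGraphFilter(ts); // squash the graph for the indexer
//   }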
83 | public static final String TYPE_SYNONYM = "SYNONYM";
84 |
85 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
86 | private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
87 | private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
88 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
89 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
90 |
91 | private SynonymMap synonyms;
92 | private final boolean ignoreCase;
93 |
94 | private FST&lt;BytesRef&gt; fst;
95 |
96 | private FST.BytesReader fstReader;
97 | private FST.Arc&lt;BytesRef&gt; scratchArc;
98 | private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
99 | private final BytesRef scratchBytes = new BytesRef();
100 | private final CharsRefBuilder scratchChars = new CharsRefBuilder();
101 | private final LinkedList&lt;BufferedOutputToken&gt; outputBuffer = new LinkedList&lt;&gt;();
102 |
103 | private int nextNodeOut;
104 | private int lastNodeOut;
105 | private int maxLookaheadUsed;
106 |
107 | // For testing:
108 | private int captureCount;
109 |
110 | private boolean liveToken;
111 |
112 | // Start/end offset of the current match:
113 | private int matchStartOffset;
114 | private int matchEndOffset;
115 |
116 | // True once the input TokenStream is exhausted:
117 | private boolean finished;
118 |
119 | private int lookaheadNextRead;
120 | private int lookaheadNextWrite;
121 |
122 | private RollingBuffer&lt;BufferedInputToken&gt; lookahead = new RollingBuffer&lt;BufferedInputToken&gt;() {
123 | @Override
124 | protected BufferedInputToken newInstance() {
125 | return new BufferedInputToken();
126 | }
127 | };
128 |
129 | static class BufferedInputToken implements RollingBuffer.Resettable {
130 | final CharsRefBuilder term = new CharsRefBuilder();
131 | State state;
132 | int startOffset = -1;
133 | int endOffset = -1;
134 |
135 | @Override
136 | public void reset() {
137 | state = null;
138 | term.clear();
139 |
140 | // Intentionally invalid to ferret out bugs:
141 | startOffset = -1;
142 | endOffset = -1;
143 | }
144 | }
145 |
146 | static class BufferedOutputToken {
147 | final String term;
148 |
149 | // Non-null if this was an incoming token:
150 | final State state;
151 |
152 | final int startNode;
153 | final int endNode;
154 |
155 | public BufferedOutputToken(State state, String term, int startNode, int endNode) {
156 | this.state = state;
157 | this.term = term;
158 | this.startNode = startNode;
159 | this.endNode = endNode;
160 | }
161 | }
162 |
163 | /**
164 | * Apply previously built synonyms to incoming tokens.
165 | * @param input input tokenstream
166 | * @param synonyms synonym map
167 | * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
168 | * Note, if you set this to true, it's your responsibility to lowercase
169 | * the input entries when you create the {@link SynonymMap}
170 | */
171 | public DynamicSynonymGraphFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
172 | super(input);
173 | update(synonyms);
174 | this.ignoreCase = ignoreCase;
175 | }
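// The constructor installs the initial map via update(SynonymMap); the same
// method is invoked again by the plugin whenever the synonym source is
// reloaded (see update(SynonymMap) below).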
176 |
177 | @Override
178 | public boolean incrementToken() throws IOException {
179 | //System.out.println("\nS: incrToken lastNodeOut=" + lastNodeOut + " nextNodeOut=" + nextNodeOut);
180 |
181 | assert lastNodeOut <= nextNodeOut;
182 |
183 | if (outputBuffer.isEmpty() == false) {
184 | // We still have pending outputs from a prior synonym match:
185 | releaseBufferedToken();
186 | //System.out.println(" syn: ret buffered=" + this);
187 | assert liveToken == false;
188 | return true;
189 | }
190 |
191 | // Try to parse a new synonym match at the current token:
192 |
193 | if (parse()) {
194 | // A new match was found:
195 | releaseBufferedToken();
196 | //System.out.println(" syn: after parse, ret buffered=" + this);
197 | assert liveToken == false;
198 | return true;
199 | }
200 |
201 | if (lookaheadNextRead == lookaheadNextWrite) {
202 |
203 | // Fast path: parse pulled one token, but it didn't match
204 | // the start for any synonym, so we now return it "live" w/o having
205 | // cloned all of its atts:
206 | if (finished) {
207 | //System.out.println(" syn: ret END");
208 | return false;
209 | }
210 |
211 | assert liveToken;
212 | liveToken = false;
213 |
214 | // NOTE: no need to change posInc since it's relative, i.e. whatever
215 | // node our output is upto will just increase by the incoming posInc.
216 | // We also don't need to change posLen, but only because we cannot
217 | // consume a graph, so the incoming token can never span a future
218 | // synonym match.
219 |
220 | } else {
221 | // We still have buffered lookahead tokens from a previous
222 | // parse attempt that required lookahead; just replay them now:
223 | //System.out.println(" restore buffer");
224 | assert lookaheadNextRead < lookaheadNextWrite: "read=" + lookaheadNextRead + " write=" + lookaheadNextWrite;
225 | BufferedInputToken token = lookahead.get(lookaheadNextRead);
226 | lookaheadNextRead++;
227 |
228 | restoreState(token.state);
229 |
230 | lookahead.freeBefore(lookaheadNextRead);
231 |
232 | //System.out.println(" after restore offset=" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
233 | assert liveToken == false;
234 | }
235 |
236 | lastNodeOut += posIncrAtt.getPositionIncrement();
237 | nextNodeOut = lastNodeOut + posLenAtt.getPositionLength();
238 |
239 | //System.out.println(" syn: ret lookahead=" + this);
240 |
241 | return true;
242 | }
243 |
244 | private void releaseBufferedToken() throws IOException {
245 | //System.out.println(" releaseBufferedToken");
246 |
247 | BufferedOutputToken token = outputBuffer.pollFirst();
248 |
249 | if (token.state != null) {
250 | // This is an original input token (keepOrig=true case):
251 | //System.out.println(" hasState");
252 | restoreState(token.state);
253 | //System.out.println(" startOffset=" + offsetAtt.startOffset() + " endOffset=" + offsetAtt.endOffset());
254 | } else {
255 | clearAttributes();
256 | //System.out.println(" no state");
257 | termAtt.append(token.term);
258 |
259 | // We better have a match already:
260 | assert matchStartOffset != -1;
261 |
262 | offsetAtt.setOffset(matchStartOffset, matchEndOffset);
263 | //System.out.println(" startOffset=" + matchStartOffset + " endOffset=" + matchEndOffset);
264 | typeAtt.setType(TYPE_SYNONYM);
265 | }
266 |
267 | //System.out.println(" lastNodeOut=" + lastNodeOut);
268 | //System.out.println(" term=" + termAtt);
269 |
270 | posIncrAtt.setPositionIncrement(token.startNode - lastNodeOut);
271 | lastNodeOut = token.startNode;
272 | posLenAtt.setPositionLength(token.endNode - token.startNode);
273 | }
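// Position math example for the code above: if the last emitted token started
// at node s (lastNodeOut == s) and this buffered token also starts at s, its
// increment is s - s == 0, i.e. it is stacked on the same position; a token
// starting at s + 1 gets increment 1. The position length is simply the node
// span endNode - startNode.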
274 |
275 | /** Scans the next input token(s) to see if a synonym matches. Returns true
276 | * if a match was found. */
277 | private boolean parse() throws IOException {
278 | // System.out.println(Thread.currentThread().getName() + ": S: parse: " + System.identityHashCode(this));
279 |
280 | // Holds the longest match we've seen so far:
281 | BytesRef matchOutput = null;
282 | int matchInputLength = 0;
283 |
284 | BytesRef pendingOutput = fst.outputs.getNoOutput();
285 | fst.getFirstArc(scratchArc);
286 |
287 | assert scratchArc.output() == fst.outputs.getNoOutput();
288 |
289 | // How many tokens in the current match
290 | int matchLength = 0;
291 | boolean doFinalCapture = false;
292 |
293 | int lookaheadUpto = lookaheadNextRead;
294 | matchStartOffset = -1;
295 |
296 | byToken:
297 | while (true) {
298 | //System.out.println(" cycle lookaheadUpto=" + lookaheadUpto + " maxPos=" + lookahead.getMaxPos());
299 |
300 | // Pull next token's chars:
301 | final char[] buffer;
302 | final int bufferLen;
303 | final int inputEndOffset;
304 |
305 | if (lookaheadUpto <= lookahead.getMaxPos()) {
306 | // Still in our lookahead buffer
307 | BufferedInputToken token = lookahead.get(lookaheadUpto);
308 | lookaheadUpto++;
309 | buffer = token.term.chars();
310 | bufferLen = token.term.length();
311 | inputEndOffset = token.endOffset;
312 | //System.out.println(" use buffer now max=" + lookahead.getMaxPos());
313 | if (matchStartOffset == -1) {
314 | matchStartOffset = token.startOffset;
315 | }
316 | } else {
317 |
318 | // We used up our lookahead buffer of input tokens
319 | // -- pull next real input token:
320 |
321 | assert finished || liveToken == false;
322 |
323 | if (finished) {
324 | //System.out.println(" break: finished");
325 | break;
326 | } else if (input.incrementToken()) {
327 | //System.out.println(" input.incrToken");
328 | liveToken = true;
329 | buffer = termAtt.buffer();
330 | bufferLen = termAtt.length();
331 | if (matchStartOffset == -1) {
332 | matchStartOffset = offsetAtt.startOffset();
333 | }
334 | inputEndOffset = offsetAtt.endOffset();
335 |
336 | lookaheadUpto++;
337 | } else {
338 | // No more input tokens
339 | finished = true;
340 | //System.out.println(" break: now set finished");
341 | break;
342 | }
343 | }
344 |
345 | matchLength++;
346 | //System.out.println(" cycle term=" + new String(buffer, 0, bufferLen));
347 |
348 | // Run each char in this token through the FST:
349 | int bufUpto = 0;
350 | while (bufUpto < bufferLen) {
351 | final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
352 | if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
353 | break byToken;
354 | }
355 |
356 | // Accum the output
357 | pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output());
358 | bufUpto += Character.charCount(codePoint);
359 | }
360 |
361 | assert bufUpto == bufferLen;
362 |
363 | // OK, entire token matched; now see if this is a final
364 | // state in the FST (a match):
365 | if (scratchArc.isFinal()) {
366 | matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput());
367 | matchInputLength = matchLength;
368 | matchEndOffset = inputEndOffset;
369 | //System.out.println(" ** match");
370 | }
371 |
372 | // See if the FST can continue matching (ie, needs to
373 | // see the next input token):
374 | if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
375 | // No further rules can match here; we're done
376 | // searching for matching rules starting at the
377 | // current input position.
378 | break;
379 | } else {
380 | // More matching is possible -- accum the output (if
381 | // any) of the WORD_SEP arc:
382 | pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output());
383 | doFinalCapture = true;
384 | if (liveToken) {
385 | capture();
386 | }
387 | }
388 | }
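// Note on the loop above: SynonymMap stores multi-word rules with the words
// joined by SynonymMap.WORD_SEPARATOR ('\u0000'), so matching "a b" walks the
// arcs for 'a', then the separator arc, then 'b'; every final state passed
// along the way records the longest (greedy) match seen so far.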
389 |
390 | if (doFinalCapture && liveToken && finished == false) {
391 | // Must capture the final token if we captured any prior tokens:
392 | capture();
393 | }
394 |
395 | if (matchOutput != null) {
396 |
397 | if (liveToken) {
398 | // Single input token synonym; we must buffer it now:
399 | capture();
400 | }
401 |
402 | // There is a match!
403 | bufferOutputTokens(matchOutput, matchInputLength);
404 | lookaheadNextRead += matchInputLength;
405 | //System.out.println(" precmatch; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos());
406 | lookahead.freeBefore(lookaheadNextRead);
407 | //System.out.println(" match; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos());
408 | return true;
409 | } else {
410 | //System.out.println(" no match; lookaheadNextRead=" + lookaheadNextRead);
411 | return false;
412 | }
413 |
414 | //System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
415 | }
416 |
417 | /** Expands the output graph into the necessary tokens, adding
418 | * synonyms as side paths parallel to the input tokens, and
419 | * buffers them in the output token buffer. */
420 | private void bufferOutputTokens(BytesRef bytes, int matchInputLength) {
421 | bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
422 |
423 | final int code = bytesReader.readVInt();
424 | final boolean keepOrig = (code & 0x1) == 0;
425 | //System.out.println(" buffer: keepOrig=" + keepOrig + " matchInputLength=" + matchInputLength);
426 |
427 | // How many nodes along all paths; we need this to assign the
428 | // node ID for the final end node where all paths merge back:
429 | int totalPathNodes;
430 | if (keepOrig) {
431 | assert matchInputLength > 0;
432 | totalPathNodes = matchInputLength - 1;
433 | } else {
434 | totalPathNodes = 0;
435 | }
436 |
437 | // How many synonyms we will insert over this match:
438 | final int count = code >>> 1;
439 |
440 | // TODO: we could encode this instead into the FST:
441 |
442 | // 1st pass: count how many new nodes we need
443 | List&lt;List&lt;String&gt;&gt; paths = new ArrayList&lt;&gt;();
444 | for(int outputIDX=0;outputIDX&lt;count;outputIDX++) {
445 | int wordID = bytesReader.readVInt();
446 | synonyms.words.get(wordID, scratchBytes);
447 | scratchChars.copyUTF8Bytes(scratchBytes);
448 | int lastStart = 0;
449 |
450 | List&lt;String&gt; path = new ArrayList&lt;&gt;();
451 | paths.add(path);
452 | int chEnd = scratchChars.length();
453 | for(int chUpto=0; chUpto<=chEnd; chUpto++) {
454 | if (chUpto == chEnd || scratchChars.charAt(chUpto) == SynonymMap.WORD_SEPARATOR) {
455 | path.add(new String(scratchChars.chars(), lastStart, chUpto - lastStart));
456 | lastStart = 1 + chUpto;
457 | }
458 | }
459 |
460 | assert path.size() > 0;
461 | totalPathNodes += path.size() - 1;
462 | }
463 | //System.out.println(" totalPathNodes=" + totalPathNodes);
464 |
465 | // 2nd pass: buffer tokens for the graph fragment
466 |
467 | // NOTE: totalPathNodes will be 0 in the case where the matched
468 | // input is a single token and all outputs are also a single token
469 |
470 | // We "spawn" a side-path for each of the outputs for this matched
471 | // synonym, all ending back at this end node:
472 |
473 | int startNode = nextNodeOut;
474 |
475 | int endNode = startNode + totalPathNodes + 1;
476 | //System.out.println(" " + paths.size() + " new side-paths");
477 |
478 | // First, fanout all tokens departing start node for these new side paths:
479 | int newNodeCount = 0;
480 | for(List&lt;String&gt; path : paths) {
481 | int pathEndNode;
482 | //System.out.println(" path size=" + path.size());
483 | if (path.size() == 1) {
484 | // Single token output, so there are no intermediate nodes:
485 | pathEndNode = endNode;
486 | } else {
487 | pathEndNode = nextNodeOut + newNodeCount + 1;
488 | newNodeCount += path.size() - 1;
489 | }
490 | outputBuffer.add(new BufferedOutputToken(null, path.get(0), startNode, pathEndNode));
491 | }
492 |
493 | // We must do the original tokens last, else the offsets "go backwards":
494 | if (keepOrig) {
495 | BufferedInputToken token = lookahead.get(lookaheadNextRead);
496 | int inputEndNode;
497 | if (matchInputLength == 1) {
498 | // Single token matched input, so there are no intermediate nodes:
499 | inputEndNode = endNode;
500 | } else {
501 | inputEndNode = nextNodeOut + newNodeCount + 1;
502 | }
503 |
504 | //System.out.println(" keepOrig first token: " + token.term);
505 |
506 | outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), startNode, inputEndNode));
507 | }
508 |
509 | nextNodeOut = endNode;
510 |
511 | // Do full side-path for each syn output:
512 | for(int pathID=0;pathID&lt;paths.size();pathID++) {
513 | List&lt;String&gt; path = paths.get(pathID);
514 | if (path.size() > 1) {
515 | int lastNode = outputBuffer.get(pathID).endNode;
516 | for(int i=1;i&lt;path.size()-1;i++) {
517 | outputBuffer.add(new BufferedOutputToken(null, path.get(i), lastNode, lastNode+1));
518 | lastNode++;
519 | }
520 | outputBuffer.add(new BufferedOutputToken(null, path.get(path.size()-1), lastNode, endNode));
521 | }
522 | }
523 |
524 | if (keepOrig && matchInputLength > 1) {
525 | // Do full "side path" with the original tokens:
526 | int lastNode = outputBuffer.get(paths.size()).endNode;
527 | for(int i=1;i&lt;matchInputLength-1;i++) {
528 | BufferedInputToken token = lookahead.get(lookaheadNextRead + i);
529 | outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), lastNode, lastNode+1));
530 | lastNode++;
531 | }
532 | BufferedInputToken token = lookahead.get(lookaheadNextRead + matchInputLength - 1);
533 | outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), lastNode, endNode));
534 | }
535 | }
536 |
537 | /**
538 | * Buffers the current input token into the lookahead buffer.
539 | */
540 | private void capture() {
541 | assert liveToken;
542 | liveToken = false;
543 | BufferedInputToken token = lookahead.get(lookaheadNextWrite);
544 | lookaheadNextWrite++;
545 |
546 | token.state = captureState();
547 | token.startOffset = offsetAtt.startOffset();
548 | token.endOffset = offsetAtt.endOffset();
549 |
550 | assert token.term.length() == 0;
551 | token.term.append(termAtt);
552 |
553 | captureCount++;
554 | maxLookaheadUsed = Math.max(maxLookaheadUsed, lookahead.getBufferSize());
555 | }
556 |
557 | @Override
558 | public void reset() throws IOException {
559 | super.reset();
560 | lookahead.reset();
561 | lookaheadNextWrite = 0;
562 | lookaheadNextRead = 0;
563 | captureCount = 0;
564 | lastNodeOut = -1;
565 | nextNodeOut = 0;
566 | matchStartOffset = -1;
567 | matchEndOffset = -1;
568 | finished = false;
569 | liveToken = false;
570 | outputBuffer.clear();
571 | maxLookaheadUsed = 0;
572 | }
573 |
574 | /**
575 | * Swaps in a rebuilt {@link SynonymMap}; called when the dynamic
576 | * synonym source is reloaded.
577 | */
578 | void update(SynonymMap synonymMap) {
579 | this.synonyms = synonymMap;
580 | this.fst = synonyms.fst;
581 | if (fst == null) {
582 | throw new IllegalArgumentException("fst must be non-null");
583 | }
584 | this.fstReader = fst.getBytesReader();
585 |
586 | // Use a fresh scratch arc for the new FST:
587 | scratchArc = new FST.Arc&lt;&gt;();
588 |
589 | }
590 |
591 | // for testing
592 | int getCaptureCount() {
593 | return captureCount;
594 | }
595 |
596 | // for testing
597 | int getMaxLookaheadUsed() {
598 | return maxLookaheadUsed;
599 | }
600 | }
601 |
--------------------------------------------------------------------------------
/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java:
--------------------------------------------------------------------------------
1 | package com.bellszhu.elasticsearch.plugin.synonym.analysis;
2 |
3 | /*
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | import java.io.IOException;
21 | import java.util.Arrays;
22 |
23 | import org.apache.lucene.analysis.TokenStream;
24 | import org.apache.lucene.analysis.synonym.SynonymMap;
25 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
27 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
28 | import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
29 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
30 | import org.apache.lucene.store.ByteArrayDataInput;
31 | import org.apache.lucene.util.ArrayUtil;
32 | import org.apache.lucene.util.AttributeSource;
33 | import org.apache.lucene.util.BytesRef;
34 | import org.apache.lucene.util.CharsRef;
35 | import org.apache.lucene.util.CharsRefBuilder;
36 | import org.apache.lucene.util.RamUsageEstimator;
37 | import org.apache.lucene.util.fst.FST;
38 |
39 | /**
40 | * Matches single or multi word synonyms in a token stream. This token stream
41 | * cannot properly handle position increments != 1, ie, you should place this
42 | * filter before filtering out stop words.
43 | *
44 | *
45 | * Note that with the current implementation, parsing is greedy, so whenever
46 | * multiple parses would apply, the rule starting the earliest and parsing the
47 | * most tokens wins. For example if you have these rules:
48 | *
49 | *
50 | * a -> x
51 | * a b -> y
52 | * b c d -> z
53 | *
54 | *
55 | * Then input a b c d e parses to y b c
56 | * d, ie the 2nd rule "wins" because it started earliest and matched the
57 | * most input tokens of other rules starting at that point.
58 | *
59 | *
60 | *
61 | * A future improvement to this filter could allow non-greedy parsing, such that
62 | * the 3rd rule would win, and also separately allow multiple parses, such that
63 | * all 3 rules would match, perhaps even on a rule by rule basis.
64 | *
65 | *
66 | *
67 | * NOTE: when a match occurs, the output tokens associated with the
68 | * matching rule are "stacked" on top of the input stream (if the rule had
69 | * keepOrig=true) and also on top of another matched rule's output
70 | * tokens. This is not a correct solution, as really the output should be an
71 | * arbitrary graph/lattice. For example, with the above match, you would expect
72 | * an exact PhraseQuery"y b
73 | * c" to match the parsed tokens, but it will fail to do so. This
74 | * limitation is necessary because Lucene's TokenStream (and index) cannot yet
75 | * represent an arbitrary graph.
76 | *
77 | *
78 | *
79 | * NOTE: If multiple incoming tokens arrive on the same position, only
80 | * the first token at that position is used for parsing. Subsequent tokens
81 | * simply pass through and are not parsed. A future improvement would be to
82 | * allow these tokens to also be matched.
83 | *
84 | */
85 |
86 | // TODO: maybe we should resolve token -> wordID then run
87 | // FST on wordIDs, for better perf?
88 |
89 | // TODO: a more efficient approach would be Aho/Corasick's
90 | // algorithm
91 | // http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
92 | // It improves over the current approach here
93 | // because it does not fully re-start matching at every
94 | // token. For example if one pattern is "a b c x"
95 | // and another is "b c d" and the input is "a b c d", on
96 | // trying to parse "a b c x" but failing when you got to x,
97 | // rather than starting over again your really should
98 | // immediately recognize that "b c d" matches at the next
99 | // input. I suspect this won't matter that much in
100 | // practice, but it's possible on some set of synonyms it
101 | // will. We'd have to modify Aho/Corasick to enforce our
102 | // conflict resolving (eg greedy matching) because that algo
103 | // finds all matches. This really amounts to adding a .*
104 | // closure to the FST and then determinizing it.
105 | //
106 | // Another possible solution is described at
107 | // http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
108 |
109 | public final class DynamicSynonymFilter extends AbsSynonymFilter {
110 |
111 | private static final String TYPE_SYNONYM = "SYNONYM";
112 | private final boolean ignoreCase;
113 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
114 | private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
115 | private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
116 |
117 | // TODO: we should set PositionLengthAttr too...
118 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
119 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
120 | private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
121 | private final BytesRef scratchBytes = new BytesRef();
122 | private final CharsRefBuilder scratchChars = new CharsRefBuilder();
123 | private SynonymMap synonyms;
124 | private int rollBufferSize;
125 |
126 | private int captureCount;
127 | // How many future input tokens have already been matched
128 | // to a synonym; because the matching is "greedy" we don't
129 | // try to do any more matching for such tokens:
130 | private int inputSkipCount;
131 |
132 | // Rolling buffer, holding pending input tokens we had to
133 | // clone because we needed to look ahead, indexed by
134 | // position:
135 | private PendingInput[] futureInputs;
136 | // Rolling buffer, holding stack of pending synonym
137 | // outputs, indexed by position:
138 | private PendingOutputs[] futureOutputs;
139 |
140 | // Where (in rolling buffers) to write next input saved state:
141 | private int nextWrite;
142 |
143 | // Where (in rolling buffers) to read next input saved state:
144 | private int nextRead;
145 |
146 | // True once we've read last token
147 | private boolean finished;
148 |
149 | private FST.Arc&lt;BytesRef&gt; scratchArc;
150 |
151 | private FST&lt;BytesRef&gt; fst;
152 |
153 | private FST.BytesReader fstReader;
154 | /*
155 | * This is the core of this TokenFilter: it locates the synonym matches and
156 | * buffers up the results into futureInputs/Outputs.
157 | *
158 | * NOTE: this calls input.incrementToken and does not capture the state if
159 | * no further tokens were checked. So caller must then forward state to our
160 | * caller, or capture:
161 | */
162 | private int lastStartOffset;
163 | private int lastEndOffset;
164 |
165 | /**
166 | * @param input input tokenstream
167 | * @param synonyms synonym map
168 | * @param ignoreCase case-folds input for matching with
169 | * {@link Character#toLowerCase(int)}. Note, if you set this to
170 | * true, it's your responsibility to lowercase the input entries
171 | * when you create the {@link SynonymMap}
172 | */
173 | DynamicSynonymFilter(TokenStream input, SynonymMap synonyms,
174 | boolean ignoreCase) {
175 | super(input);
176 | this.ignoreCase = ignoreCase;
177 | update(synonyms);
178 | }
179 |
180 | private void capture() {
181 | captureCount++;
182 | final PendingInput input = futureInputs[nextWrite];
183 |
184 | input.state = captureState();
185 | input.consumed = false;
186 | input.term.copyChars(termAtt.buffer(), 0, termAtt.length());
187 |
188 | nextWrite = rollIncr(nextWrite);
189 |
190 | // Buffer head should never catch up to tail:
191 | assert nextWrite != nextRead;
192 | }
193 |
194 | private void parse() throws IOException {
195 |
196 | assert inputSkipCount == 0;
197 |
198 | int curNextRead = nextRead;
199 |
200 | // Holds the longest match we've seen so far:
201 | BytesRef matchOutput = null;
202 | int matchInputLength = 0;
203 | int matchEndOffset = -1;
204 |
205 | BytesRef pendingOutput = fst.outputs.getNoOutput();
206 | fst.getFirstArc(scratchArc);
207 |
208 | assert scratchArc.output() == fst.outputs.getNoOutput();
209 |
210 | int tokenCount = 0;
211 |
212 | byToken:
213 | while (true) {
214 |
215 | // Pull next token's chars:
216 | final char[] buffer;
217 | final int bufferLen;
218 |
219 | int inputEndOffset = 0;
220 |
221 | if (curNextRead == nextWrite) {
222 |
223 | // We used up our lookahead buffer of input tokens
224 | // -- pull next real input token:
225 | if (finished) {
226 | break;
227 | } else {
228 | assert futureInputs[nextWrite].consumed;
229 | // Not correct: a syn match whose output is longer
230 | // than its input can set future inputs keepOrig
231 | // to true:
232 | if (input.incrementToken()) {
233 | buffer = termAtt.buffer();
234 | bufferLen = termAtt.length();
235 | final PendingInput input = futureInputs[nextWrite];
236 | lastStartOffset = input.startOffset = offsetAtt
237 | .startOffset();
238 | lastEndOffset = input.endOffset = offsetAtt.endOffset();
239 | inputEndOffset = input.endOffset;
240 | if (nextRead != nextWrite) {
241 | capture();
242 | } else {
243 | input.consumed = false;
244 | }
245 |
246 | } else {
247 | // No more input tokens
248 | finished = true;
249 | break;
250 | }
251 | }
252 | } else {
253 | // Still in our lookahead
254 | buffer = futureInputs[curNextRead].term.chars();
255 | bufferLen = futureInputs[curNextRead].term.length();
256 | inputEndOffset = futureInputs[curNextRead].endOffset;
257 | }
258 |
259 | tokenCount++;
260 |
261 | // Run each char in this token through the FST:
262 | int bufUpto = 0;
263 | while (bufUpto < bufferLen) {
264 | final int codePoint = Character.codePointAt(buffer, bufUpto,
265 | bufferLen);
266 | if (fst.findTargetArc(
267 | ignoreCase ? Character.toLowerCase(codePoint)
268 | : codePoint, scratchArc, scratchArc, fstReader) == null) {
269 | break byToken;
270 | }
271 |
272 | // Accum the output
273 | pendingOutput = fst.outputs.add(pendingOutput,
274 | scratchArc.output());
275 | bufUpto += Character.charCount(codePoint);
276 | }
277 |
278 | // OK, entire token matched; now see if this is a final
279 | // state:
280 | if (scratchArc.isFinal()) {
281 | matchOutput = fst.outputs.add(pendingOutput,
282 | scratchArc.nextFinalOutput());
283 | matchInputLength = tokenCount;
284 | matchEndOffset = inputEndOffset;
285 | }
286 |
287 | // See if the FST wants to continue matching (ie, needs to
288 | // see the next input token):
289 | if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc,
290 | scratchArc, fstReader) == null) {
291 | // No further rules can match here; we're done
292 | // searching for matching rules starting at the
293 | // current input position.
294 | break;
295 | } else {
296 | // More matching is possible -- accum the output (if
297 | // any) of the WORD_SEP arc:
298 | pendingOutput = fst.outputs.add(pendingOutput,
299 | scratchArc.output());
300 | if (nextRead == nextWrite) {
301 | capture();
302 | }
303 | }
304 |
305 | curNextRead = rollIncr(curNextRead);
306 | }
307 |
308 | if (nextRead == nextWrite && !finished) {
309 | nextWrite = rollIncr(nextWrite);
310 | }
311 |
312 | if (matchOutput != null) {
313 | inputSkipCount = matchInputLength;
314 | addOutput(matchOutput, matchInputLength, matchEndOffset);
315 | } else if (nextRead != nextWrite) {
316 | // Even though we had no match here, we set to 1
317 | // because we need to skip current input token before
318 | // trying to match again:
319 | inputSkipCount = 1;
320 | } else {
321 | assert finished;
322 | }
323 |
324 | }
325 |
326 | // Interleaves all output tokens onto the futureOutputs:
327 | private void addOutput(BytesRef bytes, int matchInputLength,
328 | int matchEndOffset) {
329 | bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
330 |
331 | final int code = bytesReader.readVInt();
332 | final boolean keepOrig = (code & 0x1) == 0;
333 | final int count = code >>> 1;
334 | for (int outputIDX = 0; outputIDX < count; outputIDX++) {
335 | synonyms.words.get(bytesReader.readVInt(), scratchBytes);
336 | scratchChars.copyUTF8Bytes(scratchBytes);
337 | int lastStart = 0;
338 | final int chEnd = lastStart + scratchChars.length();
339 | int outputUpto = nextRead;
340 | for (int chIDX = lastStart; chIDX <= chEnd; chIDX++) {
341 | if (chIDX == chEnd
342 | || scratchChars.charAt(chIDX) == SynonymMap.WORD_SEPARATOR) {
343 | final int outputLen = chIDX - lastStart;
344 | // Caller is not allowed to have empty string in
345 | // the output:
346 | assert outputLen > 0 : "output contains empty string: "
347 | + scratchChars;
348 | final int endOffset;
349 | final int posLen;
350 | if (chIDX == chEnd && lastStart == 0) {
351 | // This rule had a single output token, so, we set
352 | // this output's endOffset to the current
353 | // endOffset (ie, endOffset of the last input
354 | // token it matched):
355 | endOffset = matchEndOffset;
356 | posLen = keepOrig ? matchInputLength : 1;
357 | } else {
358 | // This rule has more than one output token; we
359 | // can't pick any particular endOffset for this
360 | // case, so, we inherit the endOffset for the
361 | // input token which this output overlaps:
362 | endOffset = -1;
363 | posLen = 1;
364 | }
365 | futureOutputs[outputUpto].add(scratchChars.chars(),
366 | lastStart, outputLen, endOffset, posLen);
367 | lastStart = 1 + chIDX;
368 | outputUpto = rollIncr(outputUpto);
369 | assert futureOutputs[outputUpto].posIncr == 1 : "outputUpto="
370 | + outputUpto + " vs nextWrite=" + nextWrite;
371 | }
372 | }
373 | }
374 |
375 | int upto = nextRead;
376 | for (int idx = 0; idx < matchInputLength; idx++) {
377 | futureInputs[upto].keepOrig |= keepOrig;
378 | futureInputs[upto].matched = true;
379 | upto = rollIncr(upto);
380 | }
381 | }
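// Example: for "a b -> y" with keepOrig, "y" is buffered at the match start
// with posLen == matchInputLength (2) and both input positions get
// keepOrig/matched set, so the stream emits "a" with "y" stacked on the same
// position (posIncr 0), then "b".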
382 |
383 | // ++ mod rollBufferSize
384 | private int rollIncr(int count) {
385 | count++;
386 | if (count == rollBufferSize) {
387 | return 0;
388 | } else {
389 | return count;
390 | }
391 | }
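// e.g. with rollBufferSize == 4 the sequence of positions is 0, 1, 2, 3, 0, 1, ...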
392 |
393 | @Override
394 | public boolean incrementToken() throws IOException {
395 |
396 | while (true) {
397 |
398 | // First play back any buffered future inputs/outputs
399 | // w/o running parsing again:
400 | while (inputSkipCount != 0) {
401 |
402 | // At each position, we first output the original
403 | // token
404 |
405 | // TODO: maybe just a PendingState class, holding
406 | // both input & outputs?
407 | final PendingInput input = futureInputs[nextRead];
408 | final PendingOutputs outputs = futureOutputs[nextRead];
409 |
410 | if (!input.consumed && (input.keepOrig || !input.matched)) {
411 | if (input.state != null) {
412 | // Return a previously saved token (because we
413 | // had to lookahead):
414 | restoreState(input.state);
415 | } else {
416 | // Pass-through case: return token we just pulled
417 | // but didn't capture:
418 | assert inputSkipCount == 1 : "inputSkipCount="
419 | + inputSkipCount + " nextRead=" + nextRead;
420 | }
421 | input.reset();
422 | if (outputs.count > 0) {
423 | outputs.posIncr = 0;
424 | } else {
425 | nextRead = rollIncr(nextRead);
426 | inputSkipCount--;
427 | }
428 | return true;
429 | } else if (outputs.upto < outputs.count) {
430 | // Still have pending outputs to replay at this
431 | // position
432 | input.reset();
433 | final int posIncr = outputs.posIncr;
434 | final CharsRef output = outputs.pullNext();
435 | clearAttributes();
436 | termAtt.copyBuffer(output.chars, output.offset,
437 | output.length);
438 | typeAtt.setType(TYPE_SYNONYM);
439 | int endOffset = outputs.getLastEndOffset();
440 | if (endOffset == -1) {
441 | endOffset = input.endOffset;
442 | }
443 | offsetAtt.setOffset(input.startOffset, endOffset);
444 | posIncrAtt.setPositionIncrement(posIncr);
445 | posLenAtt.setPositionLength(outputs.getLastPosLength());
446 | if (outputs.count == 0) {
447 | // Done with the buffered input and all outputs at
448 | // this position
449 | nextRead = rollIncr(nextRead);
450 | inputSkipCount--;
451 | }
452 | return true;
453 | } else {
454 | // Done with the buffered input and all outputs at
455 | // this position
456 | input.reset();
457 | nextRead = rollIncr(nextRead);
458 | inputSkipCount--;
459 | }
460 | }
461 |
462 | if (finished && nextRead == nextWrite) {
463 | // End case: if any output syns went beyond end of
464 | // input stream, enumerate them now:
465 | final PendingOutputs outputs = futureOutputs[nextRead];
466 | if (outputs.upto < outputs.count) {
467 | final int posIncr = outputs.posIncr;
468 | final CharsRef output = outputs.pullNext();
469 | futureInputs[nextRead].reset();
470 | if (outputs.count == 0) {
471 | nextWrite = nextRead = rollIncr(nextRead);
472 | }
473 | clearAttributes();
474 | // Keep offset from last input token:
475 | offsetAtt.setOffset(lastStartOffset, lastEndOffset);
476 | termAtt.copyBuffer(output.chars, output.offset,
477 | output.length);
478 | typeAtt.setType(TYPE_SYNONYM);
479 | posIncrAtt.setPositionIncrement(posIncr);
480 | return true;
481 | } else {
482 | return false;
483 | }
484 | }
485 |
486 | // Find new synonym matches:
487 | parse();
488 | }
489 | }
490 |
491 | @Override
492 | public void reset() throws IOException {
493 |
494 | super.reset();
495 | captureCount = 0;
496 | finished = false;
497 | inputSkipCount = 0;
498 | nextRead = nextWrite = 0;
499 |
500 | // In normal usage these resets would not be needed,
501 | // since they reset-as-they-are-consumed, but the app
502 | // may not consume all input tokens (or we might hit an
503 | // exception), in which case we have leftover state
504 | // here:
505 | for (PendingInput input : futureInputs) {
506 | input.reset();
507 | }
508 | for (PendingOutputs output : futureOutputs) {
509 | output.reset();
510 | }
511 | }
512 |
513 | void update(SynonymMap synonymMap) {
514 | this.synonyms = synonymMap;
515 | this.fst = synonyms.fst;
516 | if (fst == null) {
517 | throw new IllegalArgumentException("fst must be non-null");
518 | }
519 | this.fstReader = fst.getBytesReader();
520 |
521 | // Must be 1+ so that when roll buffer is at full
522 | // lookahead we can distinguish this full buffer from
523 | // the empty buffer:
524 | rollBufferSize = 1 + synonyms.maxHorizontalContext;
525 |
526 | futureInputs = new PendingInput[rollBufferSize];
527 | futureOutputs = new PendingOutputs[rollBufferSize];
528 | for (int pos = 0; pos < rollBufferSize; pos++) {
529 | futureInputs[pos] = new PendingInput();
530 | futureOutputs[pos] = new PendingOutputs();
531 | }
532 |
533 | scratchArc = new FST.Arc<>();
534 | }
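// Hot-reload entry point: the plugin rebuilds a SynonymMap from the (local or
// remote) synonym source and calls update() on each live filter, swapping in
// the new FST; the rolling buffers are re-sized to 1 + maxHorizontalContext
// of the new map.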
535 |
536 | // Hold all buffered (read ahead) stacked input tokens for
537 | // a future position. When multiple tokens are at the
538 | // same position, we only store (and match against) the
539 | // term for the first token at the position, but capture
540 | // state for (and enumerate) all other tokens at this
541 | // position:
542 | private static class PendingInput {
543 | final CharsRefBuilder term = new CharsRefBuilder();
544 | AttributeSource.State state;
545 | boolean keepOrig;
546 | boolean matched;
547 | boolean consumed = true;
548 | int startOffset;
549 | int endOffset;
550 |
551 | void reset() {
552 | state = null;
553 | consumed = true;
554 | keepOrig = false;
555 | matched = false;
556 | }
557 | }
558 |
559 | // Holds pending output synonyms for one future position:
560 | private static class PendingOutputs {
561 | CharsRefBuilder[] outputs;
562 | int[] endOffsets;
563 | int[] posLengths;
564 | int upto;
565 | int count;
566 | int posIncr = 1;
567 | int lastEndOffset;
568 | int lastPosLength;
569 |
570 | PendingOutputs() {
571 | outputs = new CharsRefBuilder[1];
572 | endOffsets = new int[1];
573 | posLengths = new int[1];
574 | }
575 |
576 | void reset() {
577 | upto = count = 0;
578 | posIncr = 1;
579 | }
580 |
581 | CharsRef pullNext() {
582 | assert upto < count;
583 | lastEndOffset = endOffsets[upto];
584 | lastPosLength = posLengths[upto];
585 | final CharsRefBuilder result = outputs[upto++];
586 | posIncr = 0;
587 | if (upto == count) {
588 | reset();
589 | }
590 | return result.get();
591 | }
592 |
593 | int getLastEndOffset() {
594 | return lastEndOffset;
595 | }
596 |
597 | int getLastPosLength() {
598 | return lastPosLength;
599 | }
600 |
601 | void add(char[] output, int offset, int len, int endOffset,
602 | int posLength) {
603 | if (count == outputs.length) {
604 | outputs = Arrays.copyOf(outputs, ArrayUtil.oversize(1 + count,
605 | RamUsageEstimator.NUM_BYTES_OBJECT_REF));
606 | }
607 | if (count == endOffsets.length) {
608 | final int[] next = new int[ArrayUtil.oversize(1 + count,
609 | Integer.BYTES)];
610 | System.arraycopy(endOffsets, 0, next, 0, count);
611 | endOffsets = next;
612 | }
613 | if (count == posLengths.length) {
614 | final int[] next = new int[ArrayUtil.oversize(1 + count,
615 | Integer.BYTES)];
616 | System.arraycopy(posLengths, 0, next, 0, count);
617 | posLengths = next;
618 | }
619 | if (outputs[count] == null) {
620 | outputs[count] = new CharsRefBuilder();
621 | }
622 | outputs[count].copyChars(output, offset, len);
623 | // endOffset can be -1, in which case we should simply
624 | // use the endOffset of the input token, or X >= 0, in
625 | // which case we use X as the endOffset for this output
626 | endOffsets[count] = endOffset;
627 | posLengths[count] = posLength;
628 | count++;
629 | }
630 | }
631 |
632 | }
633 |
--------------------------------------------------------------------------------