├── .gitignore ├── .travis.yml ├── CHANGES.md ├── DevNotes.txt ├── LICENSE.txt ├── NOTICE.txt ├── QUICK_START.md ├── README.md ├── checkstyle-suppressions.xml ├── checkstyle.xml ├── pom.xml └── src ├── main └── java │ └── org │ └── opensextant │ └── solrtexttagger │ ├── ConcatenateFilter.java │ ├── ConcatenateFilterFactory.java │ ├── HtmlOffsetCorrector.java │ ├── OffsetCorrector.java │ ├── TagClusterReducer.java │ ├── TagLL.java │ ├── Tagger.java │ ├── TaggerRequestHandler.java │ ├── TaggingAttribute.java │ ├── TaggingAttributeImpl.java │ ├── TermPrefixCursor.java │ ├── XmlOffsetCorrector.java │ └── package-info.java └── test ├── java └── org │ └── opensextant │ └── solrtexttagger │ ├── AbstractTaggerTest.java │ ├── ConcatenateFilterTest.java │ ├── EmbeddedSolrNoSerializeTest.java │ ├── HtmlInterpolationTest.java │ ├── RandomizedTaggerTest.java │ ├── Tagger2Test.java │ ├── TaggerTest.java │ ├── TaggingAttributeTest.java │ ├── WordLengthTaggingFilter.java │ ├── WordLengthTaggingFilterFactory.java │ └── XmlInterpolationTest.java └── resources ├── META-INF └── services │ └── org.apache.lucene.analysis.util.TokenFilterFactory ├── logback.xml ├── solr └── collection1 │ └── conf │ ├── schema.xml │ └── solrconfig.xml └── taggingattribute └── collection1 └── conf ├── schema.xml └── solrconfig.xml /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | 3 | sudo: false 4 | 5 | script: mvn -Drandomized.multiplier=10 -Dsolr.version=$SOLR_VERSION -Dlog.level=WARN clean verify 6 | 7 | jdk: 8 | - oraclejdk8 9 | - oraclejdk9 10 | 11 | env: 12 | # see pom.xml for notes on previous versions 13 | - SOLR_VERSION=7.0.1 14 | - SOLR_VERSION=7.1.0 15 | - SOLR_VERSION=7.2.1 16 | 17 | notifications: 18 | email: 19 | - dsmiley@apache.org 20 | 
-------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | This file records changes to the SolrTextTagger. It has Solr & Java version compatibility info too. 2 | 3 | NOTE: There are three independent versions of the tagger: the one in Apache Solr 7.4.0, the GitHub latest 2.x and GitHub latest 1.x. 4 | 2.x does not support synonyms (posInc=0) analysis but the others do. Only 2.x supports htmlOffsetAdjust. 5 | 6 | The [.travis.yml file](.travis.yml) shows the current testing version matrix 7 | on master. Older releases will show older tested releases working at 8 | those times. 9 | 10 | The TaggerHandler in Apache Solr 7.4.0 is based on 2.6-SNAPSHOT, and has other changes. 11 | 12 | ## Version 2.6-SNAPSHOT (unreleased) 13 | 14 | * Performance: Avoid calling terms.iterator() when not needed 15 | * Notice: Lucene's postingsFormat="Memory" option will be removed imminently. 16 | So use "FST50" which is nearly as good. 17 | 18 | ## Version 2.5, March 27th, 2018 19 | 20 | Compatible with Solr 7.0, 7.1, 7.2, 7.3, ... 21 | 22 | ## Version 2.4, February 11th, 2017 23 | 24 | Compatible with Solr 6.3, 6.4.1, ... ? 25 | 26 | Compiled for Java 1.8. 27 | 28 | * #61 'fq' is now multi-valued 29 | 30 | ## Version 2.3, July 20th, 2016 31 | 32 | Compatible with Solr 5.3 thru 6.2.1 33 | 34 | Compiled for Java 1.7. 35 | 36 | ## Version 2.2, December 16th, 2015 37 | 38 | Compatible with Solr 5.2 39 | 40 | Compiled for Java 1.7. 41 | 42 | ## Version 2.1, August 12th, 2015 43 | 44 | Compatible with Solr 5.0 thru 5.1. 45 | 46 | Compiled for Java 1.7. 47 | 48 | ## Version 2.0, January 26th, 2015 49 | 50 | Compatible with Solr 4.3 thru 4.10. 51 | 52 | Compiled for Java 1.6. 53 | 54 | This is a major release that fundamentally changes the underlying engine from working directly off 55 | of an FST to one working off a Lucene TermsEnum configured to be backed by an FST. 
The 56 | schema and configuration has changed some accordingly, but the tagger request API hasn't changed. 57 | The tagger's codebase shrunk too as Lucene manages more of the complexity. 58 | The internal name entries are now encoded as a char delimited phrase _instead of_ a word dictionary 59 | with word ID phrases. This approach reduced the memory and disk requirements substantially 60 | from 1.x. 40% less? 61 | 62 | IMPORTANT: One feature *not* yet ported from 1.x is support for index-time expanding synonyms 63 | and the catenate options of WordDelimiterFilter (or other analysis resulting in tokens at the 64 | same position). Consequently, don't do those things in your index analysis chain :-/ 65 | 66 | * 'xmlOffsetAdjust' option. See README.md 67 | 68 | * 'htmlOffsetAdjust' option. See README.md 69 | 70 | * 'nonTaggableTags' option. See README.md 71 | 72 | * Removed deprecated NoSerializeEmbeddedSolrServer & EmbeddedSolrUpdater (\#21) 73 | 74 | ## Version 1.2 (and prior), October 2nd 2013 75 | 76 | Compatible with Solr 4.2 thru 4.4; later 4.x releases may or may not work. 77 | 78 | Compiled for Java 1.6. 79 | 80 | * Supports index-time expanding synonyms and the catenate options of WordDelimiterFilter, or most 81 | other analysis at index time wherein tokens are generated at the same position. 82 | Multi-word synonyms are not supported unless you normalize at index & query to a single-word 83 | variant (i.e. "domain name system" -> "dns"). 84 | Internally, this is done by PhraseBuilder and is tested in PosIncPosLenTaggerTest. 85 | Thanks to Rupert Westenthaler! 
(\#10) 86 | -------------------------------------------------------------------------------- /DevNotes.txt: -------------------------------------------------------------------------------- 1 | ### Running EmbeddedSolrUpdater 2 | 3 | export JAVA_OPTS="-Dsolr.solr.home=../Gazetteer/SolrHome -Dsolr.data.dir=/Volumes/Speedy/data" 4 | ./updateSolr.sh '/update?update.contentType=text/csv&optimize=true&separator=%09&trim=on&f.SOURCE_FEATURE_ID.map=1.0:1&f.SOURCE_NAME_ID.map=1.0:1' '/tag?build=true' < /Volumes/Speedy/Merged.txt 5 | 6 | ### Run tagger (not embedded) 7 | curl -XPOST 'http://localhost:8983/solr/tag?overlaps=ALL&tagsLimit=5000&fl=*&wt=json&indent=2' -H 'Content-Type:text/plain' -d 'We drove to Byrds Creek. Then we' 8 | or -d '@myfile.txt' 9 | 10 | curl -XPOST 'http://localhost:8983/solr/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name&wt=json&indent=2' -H 'Content-Type:text/plain' -d 'We drove to Byrds Creek. Then we' 11 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | OpenSextant's Solr Text Tagger 2 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 3 | 4 | This software uses the Apache License 2.0. See LICENSE.txt. 5 | 6 | This product includes software developed by 7 | The MITRE Corporation (http://www.mitre.org/). 8 | 9 | This software was produced for the U. S. Government 10 | under Contract No. W15P7T-11-C-F600, and is 11 | subject to the Rights in Noncommercial Computer Software 12 | and Noncommercial Computer Software Documentation 13 | Clause 252.227-7014 (JUN 1995) -------------------------------------------------------------------------------- /QUICK_START.md: -------------------------------------------------------------------------------- 1 | First, understand you must use a version of this "SolrTextTagger" that is compatible with Solr. 
2 | Unfortunately, Solr (more often actually Lucene) makes small changes that necessitate an adjustment 3 | in the tagger thus requiring more tagger releases that often have no additional features. 4 | View the [CHANGES.md](CHANGES.md) file for information on what versions are compatible with what Solr versions. 5 | 6 | # Get Java 7 | 8 | Get Java, preferably the JDK, AKA the Java SE Development Kit which includes a compiler and other 9 | useful tools. I'll assume v1.8, the latest version. If you already have v1.7, that's fine but be 10 | aware Solr 6 requires Java v1.8. There are multiple ways to get Java, including multiple vendors. 11 | Try [Oracle's download page](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html). 12 | If you just have the Java "JRE" (no compiler) then that's probably fine. 13 | 14 | # Get Apache Solr 15 | 16 | Go to [Solr's download page](http://www.apache.org/dyn/closer.lua/lucene/solr/) and download either the 17 | ".zip" or the ".tgz" depending on which you prefer, then expand it. We'll call the expanded directory 18 | SOLR_DIST_DIR. As of this writing, the latest version is v5.4.1. 19 | 20 | # Get the SolrTextTagger 21 | 22 | The OpenSextant SolrTextTagger is a plug-in to Apache Solr. A Plug-in is a '.jar' file (possibly 23 | requiring other dependent '.jar' files) that is placed somewhere that Solr will see it. To get the 24 | text tagger's Jar, you can either download a 25 | [pre-built one](http://search.maven.org/#search%7Cga%7C1%7Ca%3A%22solr-text-tagger%22) from Maven 26 | central if it's an official release, or build it yourself if you have a Java compiler and Maven. 27 | There's also a "SNAPSHOT" (unreleased) 'jar' on Sonatype's maven repository. 28 | You can find that [here](https://oss.sonatype.org/content/repositories/snapshots/org/opensextant/solr-text-tagger/2.3-SNAPSHOT/). 29 | Remember to consult [CHANGES.md](CHANGES.md) on which version to use based on which Solr version you chose. 
(Hint: you'll need 2.3 if you are running Solr 5.4 or 5.3). 30 | 31 | Optional: If you intend to use the `htmlOffsetAdjust` option then you'll need to get the Jericho 32 | HTML parser too, such as from Maven central. 33 | 34 | ## Install the Tagger 35 | 36 | The easiest method is simply to put the '.jar' file into SOLR_DIST_DIR/server/solr/lib/. The 37 | lib dir won't exist initially so create it. 38 | If you need Jericho too then put it here as well. 39 | 40 | # Run Solr 41 | 42 | Start Solr on port 8983 (Solr's default port): 43 | 44 | bin/solr start 45 | 46 | # Create and Configure a Solr Collection 47 | 48 | Note that there are 2 ways we could go about this. Solr's classic approach involves editing some 49 | config files (schema.xml, solrconfig.xml), which I might have pre-created for these quick-start instructions. 50 | The newer approach is to use Solr's API to modify the configuration. We'll choose the latter, even 51 | though I'm most fond of the former. 52 | 53 | Create a Solr collection named "geonames". Since we don't specify a configuration template (-d) we 54 | get a so-called "data-driven" configuration. It's good for experimentation and getting going fast 55 | but not for production or being optimal. 56 | 57 | bin/solr create -c geonames 58 | 59 | ## Configuring 60 | 61 | We need to configure the schema first. The "data driven" mode we're using allows us to keep this step fairly 62 | minimal -- we just need to declare a field type, 2 fields, and a copy-field. 63 | The critical part up-front is to define the "tag" field type. There are many many ways to configure 64 | text analysis; and we're not going to get into those choices here. But an important bit is the 65 | ConcatenateFilterFactory at the end of the index analyzer chain. Another important bit for 66 | performance is postingsFormat=FST50 (resulting in compact FST based in-memory data structures vs. 67 | going to disk every time). 
68 | 69 | Schema configuration: 70 | 71 | ```` 72 | curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/geonames/schema -d '{ 73 | "add-field-type":{ 74 | "name":"tag", 75 | "class":"solr.TextField", 76 | "postingsFormat":"FST50", 77 | "omitNorms":true, 78 | "indexAnalyzer":{ 79 | "tokenizer":{ 80 | "class":"solr.StandardTokenizerFactory" }, 81 | "filters":[ 82 | {"class":"solr.EnglishPossessiveFilterFactory"}, 83 | {"class":"solr.ASCIIFoldingFilterFactory"}, 84 | {"class":"solr.LowerCaseFilterFactory"}, 85 | {"class":"org.opensextant.solrtexttagger.ConcatenateFilterFactory"} 86 | ]}, 87 | "queryAnalyzer":{ 88 | "tokenizer":{ 89 | "class":"solr.StandardTokenizerFactory" }, 90 | "filters":[ 91 | {"class":"solr.EnglishPossessiveFilterFactory"}, 92 | {"class":"solr.ASCIIFoldingFilterFactory"}, 93 | {"class":"solr.LowerCaseFilterFactory"} 94 | ]} 95 | }, 96 | 97 | "add-field":{ "name":"name", "type":"text_general"}, 98 | 99 | "add-field":{ "name":"name_tag", "type":"tag", "stored":false }, 100 | 101 | "add-copy-field":{ "source":"name", "dest":[ "name_tag" ]} 102 | }' 103 | ```` 104 | 105 | Configure a custom Solr Request Handler: 106 | 107 | ```` 108 | curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/geonames/config -d '{ 109 | "add-requesthandler" : { 110 | "name": "/tag", 111 | "class":"org.opensextant.solrtexttagger.TaggerRequestHandler", 112 | "defaults":{ "field":"name_tag" } 113 | } 114 | }' 115 | ```` 116 | 117 | # Load Some Sample Data 118 | 119 | We'll go with some Geonames.org data in CSV format. Solr is quite flexible in loading data in a 120 | variety of formats. This [cities1000.zip](http://download.geonames.org/export/dump/cities1000.zip) 121 | should be almost 7MB file expanding to a cities1000.txt file around 22.2MB containing 145k lines, 122 | each a city in the world of at least 1000 population. 
122 | 123 | ```` 124 | curl -X POST --data-binary @/path/to/cities1000.txt -H 'Content-type:application/csv' \ 125 | 'http://localhost:8983/solr/geonames/update?commit=true&optimize=true&separator=%09&encapsulator=%00&fieldnames=id,name,,alternative_names,latitude,longitude,,,countrycode,,,,,,population,elevation,,timezone,lastupdate' 126 | ```` 127 | 128 | That might take around 35 seconds; it depends. It can be a lot faster if the schema were tuned 129 | to only have what we truly need (no text search if not needed). 130 | 131 | In that command we said optimize=true to put the index in a state that will make tagging faster. 132 | The encapsulator=%00 is a bit of a hack to disable the default double-quote. 133 | 134 | # Tag Time! 135 | 136 | This is a trivial example tagging a small piece of text. For more options, see the Usage section 137 | in the readme. 138 | 139 | ```` 140 | curl -X POST \ 141 | 'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name,countrycode&wt=json&indent=on' \ 142 | -H 'Content-Type:text/plain' -d 'Hello New York City' 143 | ```` 144 | 145 | The response should be this (the QTime may vary): 146 | ```` 147 | { 148 | "responseHeader":{ 149 | "status":0, 150 | "QTime":1}, 151 | "tagsCount":1, 152 | "tags":[[ 153 | "startOffset",6, 154 | "endOffset",19, 155 | "ids",["5128581"]]], 156 | "response":{"numFound":1,"start":0,"docs":[ 157 | { 158 | "id":"5128581", 159 | "name":["New York City"], 160 | "countrycode":["US"]}] 161 | }} 162 | ```` 163 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Solr Text Tagger 2 | 3 | This project implements a "naive" text tagger based on Apache Lucene / Solr, using 4 | Lucene FST (Finite State Transducer) technology under the hood for remarkable low-memory properties. 
It is "naive" because it does simple text word based substring tagging without consideration 5 | of any natural language context. It operates on the results of how you 6 | configure text analysis in Lucene and so it's quite flexible to match things 7 | like phonetics for sounds-like tagging if you wanted to. For more information, see the presentation 8 | video/slides referenced below. 9 | 10 | The tagger can be used for finding entities/concepts in large text, or for doing likewise in queries 11 | to enhance query-understanding. 12 | 13 | For a list of changes with version of this tagger, to include Solr & Java version compatibility, 14 | see [CHANGES.md](CHANGES.md) 15 | 16 | ### Note: the STT is included in Apache Solr 7.4.0 !!! 17 | 18 | Solr 7.4.0 now includes the Solr Text Tagger. It's [documented in the Solr Reference Guide](https://builds.apache.org/job/Solr-reference-guide-master/javadoc/the-tagger-handler.html). As-such, you likely should just use the one in Solr and not the one here. That said, `htmlOffsetAdjust` is not implemented there. Issues #82 and #81 document some information about the differences and contain further links. 19 | 20 | ## Resources / References 21 | 22 | * [SoDA](https://github.com/elsevierlabs-os/soda) "Solr Dictionary Annotator" is an open-source system that uses this tagger extensively. You might want to use that instead of the tagger directly. In addition to more features added on top of the tagger, it has extensive cloud scaling documentation. 
24 | * [How-To blog post by Mikołaj Kania](http://mikolajkania.com/2017/03/30/extract-entities-with-solr-text-tagger/) 25 | * [Dictionary Based Annotation at scale with Spark, SolrTextTagger, and OpenNLP (video)](https://www.youtube.com/watch?v=gOe0aYAS8Do) 26 | ([slides](http://www.slideshare.net/sujitpal/sseu-2015soda)) 27 | -- a presentation by Sujit Pal at Spark Summit Europe 2015 28 | * [Text Tagging with Finite State Transducers (video)](http://www.youtube.com/watch?v=3kQyYbTyXfc) 29 | ([slides](http://lucenerevolution.org/wp-content/uploads/2014/08/Text-Tagging-with-Finite-State-Transducers.pdf)) -- a presentation at Lucene Revolution 2013 by David Smiley (first release about the tagger) 30 | * [Fuzzy String Matching with SolrTextTagger](http://sujitpal.blogspot.com/2014/02/fuzzy-string-matching-with.html) -- a blog post by Sujit Pal 31 | * [Tulip](http://dl.acm.org/citation.cfm?id=2634351) -- a winner of the [ERD'14 challenge](https://pdfs.semanticscholar.org/91cf/c37d4853bb7214d18ca091f9bfede8b301a0.pdf) uses the Text Tagger. 32 | 33 | Pertaining to Lucene's Finite State Transducers: 34 | 35 | * https://docs.google.com/presentation/d/1Z7OYvKc5dHAXiVdMpk69uulpIT6A7FGfohjHx8fmHBU/edit#slide=id.p 36 | * http://blog.mikemccandless.com/2010/12/using-finite-state-transducers-in.html 37 | * http://blog.mikemccandless.com/2011/01/finite-state-transducers-part-2.html 38 | 39 | ## Contributors: 40 | 41 | * David Smiley 42 | * Rupert Westenthaler (notably the PhraseBuilder in the 1.1 branch) 43 | 44 | ## Quick Start 45 | 46 | See the [QUICK_START.md](QUICK_START.md) file for a set of instructions to get you going ASAP. 47 | 48 | ## Build Instructions 49 | 50 | The build requires Java (v8 or v9) and Maven. 
51 | 52 | To compile and run tests, use: 53 | 54 | %> mvn test 55 | 56 | To compile, test, and build the jar (placed in target/), use 57 | 58 | %> mvn package 59 | 60 | ## Configuration 61 | 62 | A Solr schema.xml needs 2 things 63 | 64 | * A unique key field (see ``). Setting docValues=true on this field is recommended. 65 | * A name/lookup field indexed with Shingling or more likely ConcatenateFilter. 66 | 67 | If you want to support typical keyword search on the names, not just tagging, then index 68 | the names in an additional field with a typical analysis configuration to your preference. 69 | 70 | For tagging, the name field's index analyzer needs to end in either shingling for "partial" 71 | (i.e. sub name phrase) matching of a name, or more likely using ConcatenateFilter for 72 | complete name matching. ConcatenateFilter acts similar to shingling but it 73 | concatenates all tokens into one final token with a space separator. 74 | The query time analysis should _not_ have Shingling or ConcatenateFilter. 75 | 76 | Prior to shingling or the ConcatenateFilter, preceding text analysis should result in 77 | consecutive positions (i.e. the position increment of each term must always be 78 | 1). As-such, Synonyms and some configurations of WordDelimiterFilter are not supported. 79 | On the other hand, if the input text 80 | has a position increment greater than one (e.g. stop word) then it is handled properly as if an 81 | unknown word was there. Support for synonyms or any other filters producing posInc=0 is a feature 82 | that has largely been overcome in the 1.1 version but it has yet to be ported to 2.x; see 83 | [Issue #20, RE the PhraseBuilder](https://github.com/OpenSextant/SolrTextTagger/issues/20) 84 | 85 | To make the tagger work as fast as possible, configure the name field with 86 | postingsFormat="FST50";. In doing so, all the terms/postings are placed into an efficient FST 87 | data structure. 
88 | 89 | Here is a sample field type config that should work quite well: 90 | 91 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | A Solr solrconfig.xml needs a special request handler, configured like this. 110 | 111 | 112 | 113 | name_tag 114 | PUT SOME SOLR QUERY HERE; OPTIONAL 115 | 116 | 117 | 118 | * `field`: The field that represents the corpus to match on, as described above. 119 | * `fq`: (optional) A query that matches a subset of documents for name matching. 120 | 121 | Also, to enable custom so-called postings formats, ensure that your solrconfig.xml has a 122 | codecFactory defined like this: 123 | 124 | 125 | 126 | ## Usage 127 | 128 | For tagging, you HTTP POST data to Solr similar to how the ExtractingRequestHandler 129 | (Tika) is invoked. A request invoked via the "curl" program could look like this: 130 | 131 | curl -XPOST \ 132 | 'http://localhost:8983/solr/collection1/tag?overlaps=NO_SUB&tagsLimit=5000&fl=*' \ 133 | -H 'Content-Type:text/plain' -d @/mypath/myfile.txt 134 | 135 | ### The tagger request-time parameters are 136 | 137 | * `overlaps`: choose the algorithm to determine which overlapping tags should be 138 | retained, versus being pruned away. Options are: 139 | * `ALL`: Emit all tags. 140 | * `NO_SUB`: Don't emit a tag that is completely within another tag (i.e. no subtag). 141 | * `LONGEST_DOMINANT_RIGHT`: Given a cluster of overlapping tags, emit the longest 142 | one (by character length). If there is a tie, pick the right-most. Remove 143 | any tags overlapping with this tag then repeat the algorithm to potentially 144 | find other tags that can be emitted in the cluster. 145 | * `matchText`: A boolean indicating whether to return the matched text in the tag 146 | response. This will trigger the tagger to fully buffer the input before tagging. 147 | * `tagsLimit`: The maximum number of tags to return in the response. Tagging 148 | effectively stops after this point. 
By default this is 1000. 149 | * `rows`: Solr's standard param to say the maximum number of documents to return, 150 | but defaulting to 10000 for a tag request. 151 | * `skipAltTokens`: A boolean flag used to suppress errors that can occur if, for 152 | example, you enable synonym expansion at query time in the analyzer, which you 153 | normally shouldn't do. Let this default to false unless you know that such 154 | tokens can't be avoided. 155 | * `ignoreStopwords`: A boolean flag that causes stopwords (or any condition causing positions to 156 | skip like >255 char words) to be ignored as if they weren't there. Otherwise, the behavior is to treat 157 | them as breaks in tagging on the presumption your indexed text-analysis configuration doesn't have 158 | a StopWordFilter. By default the indexed analysis chain is checked for the presence of a 159 | StopWordFilter and if found then ignoreStopwords is true if unspecified. You probably shouldn't 160 | have a StopWordFilter configured and probably won't need to set this param either. 161 | * `xmlOffsetAdjust`: A boolean indicating that the input is XML and furthermore that the offsets of 162 | returned tags should be adjusted as necessary to allow for the client to insert an open and closing 163 | element at the positions. If it isn't possible to do so then the tag will be omitted. You are 164 | expected to configure HTMLStripCharFilter in the schema when using this option. 165 | This will trigger the tagger to fully buffer the input before tagging. 166 | * `htmlOffsetAdjust`: Similar to xmlOffsetAdjust except for HTML content that may have various issues 167 | that would never work with an XML parser. There needn't be a top level element, and some tags 168 | are known to self-close (e.g. BR). The tagger uses the Jericho HTML Parser for this feature 169 | (ASL & LGPL & EPL licensed). 170 | * `nonTaggableTags`: (only with htmlOffsetAdjust) Omits tags that would enclose one of these HTML 171 | elements.
Comma delimited, lower-case. For example 'a' (anchor) would be a likely choice so that 172 | links the application inserts don't overlap other links. 173 | * `fl`: Solr's standard param for listing the fields to return. 174 | * Most other standard parameters for working with Solr response formatting: 175 | `echoParams`, `wt`, `indent`, etc. 176 | 177 | ### Output 178 | 179 | The output is broken down into two parts, first an array of tags, and then 180 | Solr documents referenced by those tags. Each tag has the starting character 181 | offset, an ending character (+1) offset, and the Solr unique key field value. 182 | The Solr documents part of the response is Solr's standard search results 183 | format. 184 | 185 | ## Advanced Tips 186 | 187 | * For reducing tagging latency even further, consider embedding Solr with 188 | EmbeddedSolrServer. See EmbeddedSolrNoSerializeTest. 189 | -------------------------------------------------------------------------------- /checkstyle-suppressions.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 24 | 25 | 28 | 29 | 30 | 31 | 34 | -------------------------------------------------------------------------------- /checkstyle.xml: -------------------------------------------------------------------------------- 1 | 2 | 23 | 24 | 27 | 28 | 55 | 56 | 57 | 58 | 59 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 
| 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 22 | 23 | 24 | 4.0.0 25 | 26 | org.opensextant 27 | solr-text-tagger 28 | 2.6-SNAPSHOT 29 | jar 30 | 31 | Solr Text Tagger 32 | A text tagger based on Lucene / Solr 33 | https://github.com/OpenSextant/SolrTextTagger/ 34 | 2012 35 | 36 | 37 | MITRE 38 | 39 | 40 | 41 | scm:git:https://github.com/OpenSextant/SolrTextTagger.git 42 | scm:git:https://github.com/OpenSextant/SolrTextTagger.git 43 | https://github.com/OpenSextant/SolrTextTagger.git 44 | HEAD 45 | 46 | 47 | 48 | 49 | Apache 2 50 | http://www.apache.org/licenses/LICENSE-2.0.txt 51 | repo 52 | 53 | 54 | 55 | 56 | 57 | David Smiley 58 | dsmiley@apache.org 59 | 60 | 61 | 62 | 63 | 64 | UTF-8 65 | 66 | 7.2.1 67 | 68 | 69 | 70 | 71 | 72 | org.apache.solr 73 | solr-test-framework 74 | ${solr.version} 75 | test 76 | 77 | 78 | org.apache.lucene 79 | lucene-test-framework 80 | ${solr.version} 81 | test 82 | 83 | 84 | 85 | org.apache.solr 86 | solr-core 87 | ${solr.version} 88 | 89 | 90 | org.slf4j 91 | slf4j-jdk14 92 | 93 | 94 | org.slf4j 95 | slf4j-log4j12 96 | 97 | 98 | log4j 99 | log4j 100 | 101 | 102 | 103 | 104 | 105 | org.apache.lucene 106 | lucene-core 107 | ${solr.version} 108 | 109 | 110 | 112 | 113 | org.codehaus.woodstox 114 | woodstox-core-asl 115 | 4.4.1 116 | true 117 | 118 | 119 | 120 | net.htmlparser.jericho 121 | jericho-html 122 | 3.4 123 | true 124 
| 125 | 126 | 127 | 128 | org.slf4j 129 | slf4j-api 130 | 1.7.7 131 | 132 | 133 | ch.qos.logback 134 | logback-classic 135 | 1.1.7 136 | runtime 137 | true 138 | 139 | 140 | 141 | 142 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | org.apache.maven.plugins 153 | maven-compiler-plugin 154 | 3.1 155 | 156 | 1.8 157 | 1.8 158 | 159 | 160 | 161 | 164 | 165 | org.apache.maven.plugins 166 | maven-surefire-plugin 167 | 2.19.1 168 | 169 | 170 | NativePRNG 171 | 172 | 173 | 174 | 175 | 176 | org.apache.maven.plugins 177 | maven-jar-plugin 178 | 2.4 179 | 180 | 181 | 182 | true 183 | true 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | org.apache.maven.plugins 194 | maven-checkstyle-plugin 195 | 2.12.1 196 | 197 | checkstyle.xml 198 | true 199 | true 200 | 201 | 202 | 203 | compile 204 | 205 | check 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 229 | 230 | org.codehaus.mojo 231 | findbugs-maven-plugin 232 | 3.0.3 233 | 234 | true 235 | 236 | 237 | 238 | 239 | 240 | org.apache.maven.plugins 241 | maven-site-plugin 242 | 3.3 243 | 244 | 245 | 249 | 250 | 251 | org.apache.maven.plugins 252 | maven-release-plugin 253 | 2.5 254 | 255 | true 256 | false 257 | release 258 | deploy 259 | 260 | 261 | 262 | 263 | org.sonatype.plugins 264 | nexus-staging-maven-plugin 265 | 1.6.6 266 | true 267 | 268 | ossrh 269 | https://oss.sonatype.org/ 270 | true 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | org.apache.maven.plugins 283 | maven-project-info-reports-plugin 284 | 2.7 285 | 286 | 287 | false 288 | 289 | 290 | 291 | 292 | org.apache.maven.plugins 293 | maven-javadoc-plugin 294 | 2.9.1 295 | 296 | 297 | 298 | javadoc 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | release 311 | 312 | 313 | 314 | 315 | org.apache.maven.plugins 316 | maven-source-plugin 317 | 2.4 318 | 319 | 320 | attach-sources 321 | 322 | jar-no-fork 323 | 324 | 325 | 326 | 327 | 
328 | 329 | org.apache.maven.plugins 330 | maven-javadoc-plugin 331 | 2.9.1 332 | 333 | 334 | attach-javadocs 335 | 336 | jar 337 | 338 | 339 | 340 | 341 | 342 | 343 | org.apache.maven.plugins 344 | maven-gpg-plugin 345 | 1.6 346 | 347 | 348 | sign-artifacts 349 | verify 350 | 351 | sign 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | ossrh 364 | https://oss.sonatype.org/content/repositories/snapshots 365 | 366 | 367 | ossrh 368 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 369 | 370 | 371 | 372 | 373 | 374 | apache.snapshots 375 | Apache Snapshot Repository 376 | https://repository.apache.org/snapshots 377 | 378 | false 379 | 380 | 381 | 382 | 383 | 384 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/ConcatenateFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.analysis.TokenFilter; 26 | import org.apache.lucene.analysis.TokenStream; 27 | import org.apache.lucene.analysis.shingle.ShingleFilter; 28 | import org.apache.lucene.analysis.tokenattributes.*; 29 | 30 | import java.io.IOException; 31 | 32 | /** 33 | * Concatenate all tokens, separated by a provided character, 34 | * defaulting to a single space. It always produces exactly one token, and it's designed to be the 35 | * last token filter in an analysis chain. 36 | */ 37 | public class ConcatenateFilter extends TokenFilter { 38 | 39 | /* 40 | For a very different approach that could accept synonyms or anything except position gaps (e.g. 41 | not stopwords), 42 | consider using o.a.l.analysis.TokenStreamToAutomaton 43 | with o.a.l.util.automaton.SpecialOperations.getFiniteStrings(). 44 | For gaps (stopwords), we could perhaps index a special token at those gaps and then have the 45 | tagger deal with them -- also doable. 46 | */ 47 | 48 | private char separator = ' '; 49 | 50 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 51 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 52 | private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); 53 | private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); 54 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); 55 | 56 | private boolean done; 57 | private StringBuilder buf = new StringBuilder(128); 58 | 59 | /** 60 | * Construct a token stream filtering the given input. 
61 | */ 62 | protected ConcatenateFilter(TokenStream input) { 63 | super(input); 64 | } 65 | 66 | public void setTokenSeparator(char separator) { 67 | this.separator = separator; 68 | } 69 | 70 | @Override 71 | public void reset() throws IOException { 72 | input.reset(); 73 | done = false; 74 | } 75 | 76 | @Override 77 | public final boolean incrementToken() throws IOException { 78 | if (done) 79 | return false; 80 | done = true; 81 | 82 | buf.setLength(0); 83 | boolean firstTerm = true; 84 | while (input.incrementToken()) { 85 | if (!firstTerm) { 86 | buf.append(separator); 87 | } 88 | //TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now. #13 89 | buf.append(termAtt); 90 | firstTerm = false; 91 | } 92 | input.end();//call here so we can see end of stream offsets 93 | 94 | termAtt.setEmpty().append(buf); 95 | //Setting the other attributes ultimately won't have much effect but lets be thorough 96 | offsetAtt.setOffset(0, offsetAtt.endOffset()); 97 | posIncrAtt.setPositionIncrement(1); 98 | posLenAtt.setPositionLength(1);//or do we add up the positions? Probably not used any way. 99 | typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE);//"shingle" 100 | 101 | return true; 102 | } 103 | 104 | @Override 105 | public void end() throws IOException { 106 | //we already called input.end() in incrementToken 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/ConcatenateFilterFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 
9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.analysis.TokenStream; 26 | import org.apache.lucene.analysis.util.TokenFilterFactory; 27 | 28 | import java.util.Map; 29 | 30 | /** 31 | * @see ConcatenateFilter 32 | */ 33 | public class ConcatenateFilterFactory extends TokenFilterFactory { 34 | 35 | private final String tokenSeparator; 36 | 37 | /** 38 | * Initialize this factory via a set of key-value pairs. 39 | */ 40 | public ConcatenateFilterFactory(Map args) { 41 | super(args); 42 | tokenSeparator = get(args, "tokenSeparator", " "); 43 | if (tokenSeparator.length() != 1) 44 | throw new IllegalArgumentException("tokenSeparator should be 1 char: "+tokenSeparator); 45 | if (!args.isEmpty()) { 46 | throw new IllegalArgumentException("Unknown parameters: " + args); 47 | } 48 | } 49 | 50 | @Override 51 | public TokenStream create(TokenStream input) { 52 | ConcatenateFilter filter = new ConcatenateFilter(input); 53 | filter.setTokenSeparator(tokenSeparator.charAt(0)); 54 | return filter; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/HtmlOffsetCorrector.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. 
W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import net.htmlparser.jericho.EndTagType; 26 | import net.htmlparser.jericho.Segment; 27 | import net.htmlparser.jericho.StartTag; 28 | import net.htmlparser.jericho.StartTagType; 29 | import net.htmlparser.jericho.StreamedSource; 30 | import net.htmlparser.jericho.Tag; 31 | 32 | import java.util.Collections; 33 | import java.util.Set; 34 | 35 | /** 36 | * Corrects offsets to adjust for HTML formatted data. The goal is such that the caller should be 37 | * able to insert a start HTML tag at the start offset and a corresponding end HTML tag at the end 38 | * offset of the tagger, and have it be valid HTML (assuming it was "valid" in the first place). 39 | * See {@link #correctPair(int, int)}. 40 | * 41 | * This will work on HTML that has numerous problems that browsers deal with, as well as XML. 42 | * 43 | * Not thread-safe. 44 | */ 45 | public class HtmlOffsetCorrector extends OffsetCorrector { 46 | 47 | /** 48 | * Initialize based on the document text. 49 | * 50 | * @param docText non-null structured content. 
51 | * @param nonTaggableTags HTML element names that should not be "taggable" (be a part of any 52 | * tag). These must be lower-case. 53 | */ 54 | protected HtmlOffsetCorrector(String docText, Set nonTaggableTags) { 55 | super(docText, nonTaggableTags != null); 56 | if (nonTaggableTags == null) 57 | nonTaggableTags = Collections.emptySet(); 58 | 59 | int tagCounter = 1;//document implicit tag, and counting 60 | int thisTag = 0;//document implicit tag 61 | 62 | tagInfo.add(-1);//parent 63 | tagInfo.add(-1, 0);//StartTag 64 | tagInfo.add(docText.length(), docText.length()+1);//EndTag 65 | parentChangeOffsets.add(-1); 66 | parentChangeIds.add(thisTag); 67 | 68 | StreamedSource source = new StreamedSource(docText); 69 | source.setCoalescing(false); 70 | 71 | int nonTaggablesInProgress = 0; 72 | 73 | for (Segment segment : source) { 74 | if (segment instanceof Tag) { 75 | Tag tag = (Tag) segment; 76 | if (tag.getTagType() == StartTagType.NORMAL) { 77 | final StartTag startTag = (StartTag) tag; 78 | 79 | //TODO Consider "implicitly terminating tags", which is dependent on the current tag. 80 | 81 | if (!startTag.isEmptyElementTag() && !startTag.isEndTagForbidden() && !startTag.isSyntacticalEmptyElementTag()) {//e.g. not "
" 82 | tagInfo.ensureCapacity(tagInfo.size() + 5); 83 | final int parentTag = thisTag; 84 | tagInfo.add(parentTag); 85 | tagInfo.add(tag.getBegin(), tag.getEnd()); 86 | tagInfo.add(-1, -1);//these 2 will be populated when we get to the close tag 87 | thisTag = tagCounter++; 88 | 89 | parentChangeOffsets.add(tag.getBegin()); 90 | parentChangeIds.add(thisTag); 91 | 92 | //non-taggable tracking: 93 | if (nonTaggableTags.contains(tag.getName())) {//always lower-case 94 | if (nonTaggablesInProgress++ == 0) 95 | nonTaggableOffsets.add(tag.getBegin()); 96 | } 97 | } 98 | } else if (tag.getTagType() == EndTagType.NORMAL) { 99 | //TODO validate we're closing the tag we think we're closing. 100 | tagInfo.set(5 * thisTag + 3, tag.getBegin()); 101 | tagInfo.set(5 * thisTag + 4, tag.getEnd()); 102 | thisTag = getParentTag(thisTag); 103 | 104 | parentChangeOffsets.add(tag.getEnd()); 105 | parentChangeIds.add(thisTag); 106 | 107 | //non-taggable tracking: 108 | if (nonTaggableTags.contains(tag.getName())) { 109 | if (nonTaggablesInProgress-- == 1) 110 | nonTaggableOffsets.add(tag.getEnd() - 1); 111 | } 112 | } 113 | } 114 | //else we don't care 115 | }//for segment 116 | 117 | parentChangeOffsets.add(docText.length()+1); 118 | parentChangeIds.add(-1); 119 | 120 | assert nonTaggableTags.isEmpty() || nonTaggableOffsets.size() % 2 == 0;//null or even 121 | } 122 | 123 | } 124 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/OffsetCorrector.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 
9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import com.carrotsearch.hppc.IntArrayList; 26 | 27 | import java.util.Arrays; 28 | 29 | public abstract class OffsetCorrector { 30 | 31 | //TODO support a streaming style of consuming input text so that we need not take a 32 | // String. Trickier because we need to keep more information as we parse to know when tags 33 | // are adjacent with/without whitespace 34 | 35 | //Data structure requirements: 36 | // Given a character offset: 37 | // * determine what tagId is it's parent. 38 | // * determine if it is adjacent to the parent open tag, ignoring whitespace 39 | // * determine if it is adjacent to the parent close tag, ignoring whitespace 40 | // Given a tagId: 41 | // * What is it's parent tagId 42 | // * What's the char offset of the start and end of the open tag 43 | // * What's the char offset of the start and end of the close tag 44 | 45 | /** Document text. */ 46 | protected final String docText; 47 | 48 | /** Array of tag info comprised of 5 int fields: 49 | * [int parentTag, int openStartOff, int openEndOff, int closeStartOff, int closeEndOff]. 50 | * It's size indicates how many tags there are. Tag's are ID'ed sequentially from 0. 
*/ 51 | protected final IntArrayList tagInfo; 52 | 53 | /** offsets of parent tag id change (ascending order) */ 54 | protected final IntArrayList parentChangeOffsets; 55 | /** tag id; parallel array to parentChangeOffsets */ 56 | protected final IntArrayList parentChangeIds; 57 | 58 | protected final int[] offsetPair = new int[] { -1, -1};//non-thread-safe state 59 | 60 | /** Disjoint start and end span offsets (inclusive) of non-taggable sections. Null if none. */ 61 | protected final IntArrayList nonTaggableOffsets; 62 | 63 | /** 64 | * Initialize based on the document text. 65 | * @param docText non-null structured content. 66 | * @param hasNonTaggable if there may be "non-taggable" tags to track 67 | */ 68 | protected OffsetCorrector(String docText, boolean hasNonTaggable) { 69 | this.docText = docText; 70 | final int guessNumElements = Math.max(docText.length() / 20, 4); 71 | 72 | tagInfo = new IntArrayList(guessNumElements * 5); 73 | parentChangeOffsets = new IntArrayList(guessNumElements * 2); 74 | parentChangeIds = new IntArrayList(guessNumElements * 2); 75 | nonTaggableOffsets = hasNonTaggable ? new IntArrayList(guessNumElements / 5) : null; 76 | } 77 | 78 | /** Corrects the start and end offset pair. It will return null if it can't 79 | * due to a failure to keep the offsets balance-able, or if it spans "non-taggable" tags. 80 | * The start (left) offset is pulled left as needed over whitespace and opening tags. The end 81 | * (right) offset is pulled right as needed over whitespace and closing tags. It's returned as 82 | * a 2-element array. 83 | *

Note that the returned array is internally reused; just use it to examine the response. 84 | */ 85 | public int[] correctPair(int leftOffset, int rightOffset) { 86 | rightOffset = correctEndOffsetForCloseElement(rightOffset); 87 | if (spansNonTaggable(leftOffset, rightOffset)) 88 | return null; 89 | 90 | int startTag = lookupTag(leftOffset); 91 | //offsetPair[0] = Math.max(offsetPair[0], getOpenStartOff(startTag)); 92 | int endTag = lookupTag(rightOffset-1); 93 | //offsetPair[1] = Math.min(offsetPair[1], getCloseStartOff(endTag)); 94 | 95 | // Find the ancestor tag enclosing offsetPair. And bump out left offset along the way. 96 | int iTag = startTag; 97 | for (; !tagEnclosesOffset(iTag, rightOffset); iTag = getParentTag(iTag)) { 98 | //Ensure there is nothing except whitespace thru OpenEndOff 99 | int tagOpenEndOff = getOpenEndOff(iTag); 100 | if (hasNonWhitespace(tagOpenEndOff, leftOffset)) 101 | return null; 102 | leftOffset = getOpenStartOff(iTag); 103 | } 104 | final int ancestorTag = iTag; 105 | // Bump out rightOffset until we get to ancestorTag. 106 | for (iTag = endTag; iTag != ancestorTag; iTag = getParentTag(iTag)) { 107 | //Ensure there is nothing except whitespace thru CloseStartOff 108 | int tagCloseStartOff = getCloseStartOff(iTag); 109 | if (hasNonWhitespace(rightOffset, tagCloseStartOff)) 110 | return null; 111 | rightOffset = getCloseEndOff(iTag); 112 | } 113 | 114 | offsetPair[0] = leftOffset; 115 | offsetPair[1] = rightOffset; 116 | return offsetPair; 117 | } 118 | 119 | /** Correct endOffset for adjacent element at the right side. E.g. offsetPair might point to: 120 | *

121 |    *   foo</tag>
122 |    * 
123 | * and this method pulls the end offset left to the '<'. This is necessary for use with 124 | * {@link org.apache.lucene.analysis.charfilter.HTMLStripCharFilter}. 125 | * 126 | * See https://issues.apache.org/jira/browse/LUCENE-5734 */ 127 | protected int correctEndOffsetForCloseElement(int endOffset) { 128 | if (docText.charAt(endOffset-1) == '>') { 129 | final int newEndOffset = docText.lastIndexOf('<', endOffset - 2); 130 | if (newEndOffset > offsetPair[0])//just to be sure 131 | return newEndOffset; 132 | } 133 | return endOffset; 134 | } 135 | 136 | protected boolean hasNonWhitespace(int start, int end) { 137 | for (int i = start; i < end; i++) { 138 | if (!Character.isWhitespace(docText.charAt(i))) 139 | return true; 140 | } 141 | return false; 142 | } 143 | 144 | protected boolean tagEnclosesOffset(int tag, int off) { 145 | return off >= getOpenStartOff(tag) && off < getCloseEndOff(tag); 146 | } 147 | 148 | protected int getParentTag(int tag) { return tagInfo.get(tag * 5 + 0); } 149 | protected int getOpenStartOff(int tag) { return tagInfo.get(tag * 5 + 1); } 150 | protected int getOpenEndOff(int tag) { return tagInfo.get(tag * 5 + 2); } 151 | protected int getCloseStartOff(int tag) { return tagInfo.get(tag * 5 + 3); } 152 | protected int getCloseEndOff(int tag) { return tagInfo.get(tag * 5 + 4); } 153 | 154 | protected int lookupTag(int off) { 155 | int idx = Arrays.binarySearch(parentChangeOffsets.buffer, 0, parentChangeOffsets.size(), off); 156 | if (idx < 0) 157 | idx = (-idx - 1) - 1;//round down 158 | return parentChangeIds.get(idx); 159 | } 160 | 161 | protected boolean spansNonTaggable(int startOff, int endOff) { 162 | if (nonTaggableOffsets == null) 163 | return false; 164 | int idx = Arrays.binarySearch(nonTaggableOffsets.buffer, 0, nonTaggableOffsets.size(), startOff); 165 | //if tag start coincides with first or last char of non-taggable span then result is true. 
166 | // (probably never happens since those characters are actual element markup) 167 | if (idx >= 0) 168 | return true; 169 | idx = -idx - 1;//modify for where we would insert 170 | //if idx is odd then our span intersects a non-taggable span; return true 171 | if ((idx & 1) == 1) 172 | return true; 173 | //it's non-taggable if the next non-taggable start span is before our endOff 174 | if (idx == nonTaggableOffsets.size()) 175 | return false; 176 | return nonTaggableOffsets.get(idx) < endOff; 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/TagClusterReducer.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | public interface TagClusterReducer { 26 | /** 27 | * Reduces the linked-list to only those tags that should be emitted 28 | * @param head not null; 1-element array to head which isn't null either 29 | */ 30 | void reduce(TagLL[] head); 31 | 32 | static final TagClusterReducer ALL = new TagClusterReducer() { 33 | @Override 34 | public void reduce(TagLL[] head) { 35 | } 36 | }; 37 | 38 | static final TagClusterReducer NO_SUB = new TagClusterReducer() { 39 | @Override 40 | public void reduce(TagLL[] head) { 41 | //loop forward over all tags 42 | for (TagLL tag = head[0].nextTag; tag != null; tag = tag.nextTag) { 43 | //loop backwards over prev tags from this tag 44 | for (TagLL tPrev = tag.prevTag; tPrev != null; tPrev = tPrev.prevTag) { 45 | assert tPrev.startOffset <= tag.startOffset; 46 | //if a previous tag's endOffset is <= this one's, tForward can be removed 47 | if (tPrev.endOffset >= tag.endOffset) { 48 | tag.removeLL(); 49 | break; 50 | } else if (tPrev.startOffset == tag.startOffset) { 51 | tPrev.removeLL(); 52 | //continue; 'tag' is still valid 53 | } 54 | } 55 | } 56 | } 57 | }; 58 | 59 | static final TagClusterReducer LONGEST_DOMINANT_RIGHT = new TagClusterReducer() { 60 | @Override 61 | public void reduce(TagLL[] head) { 62 | 63 | //--Optimize for common single-tag case 64 | if (head[0].nextTag == null) 65 | return; 66 | 67 | while (true) { 68 | //--Find longest not already marked 69 | TagLL longest = null; 70 | for (TagLL t = head[0]; t != null; t = t.nextTag) { 71 | if (!t.mark && (longest == null || t.charLen() >= longest.charLen())) 72 | longest = t; 73 | } 74 | if (longest == null) 75 | break; 76 | //--Mark longest (so we return it eventually) 77 | longest.mark = true; 78 | //--Remove tags overlapping this longest 79 | for (TagLL t = head[0]; t != null; t = t.nextTag) { 80 | if (t.mark) 81 | continue; 82 | 83 | if (t.overlaps(longest)) { 84 | t.removeLL(); 85 | } else if 
(t.startOffset >= longest.endOffset) { 86 | break;//no subsequent can possibly overlap 87 | } 88 | } 89 | }//loop 90 | 91 | //all-remaining should be marked 92 | // for (TagLL t = head; t != null; t = t.nextTag) { 93 | // assert t.mark; 94 | //// if (!t.mark) { 95 | //// t.removeLL(); 96 | //// if (head == t) 97 | //// head = t.nextTag; 98 | //// } 99 | // } 100 | assert head[0].mark; 101 | } 102 | }; 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/TagLL.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.util.BytesRef; 26 | 27 | import java.io.IOException; 28 | 29 | /** 30 | * This is a Tag -- a startOffset, endOffset and value. 31 | *

32 | * A Tag starts without a value in an 33 | * "advancing" state. {@link #advance(org.apache.lucene.util.BytesRef, int)} 34 | * is called with subsequent words and then eventually it won't advance any 35 | * more, and value is set (could be null). 36 | *

37 | * A Tag is also a doubly-linked-list (hence the LL in the name). All tags share 38 | * a reference to the head via a 1-element array, which is potentially modified 39 | * if any of the linked-list methods are called. Tags in the list should have 40 | * equal or increasing start offsets. 41 | */ 42 | public class TagLL{ 43 | 44 | private final TagLL[] head;//a shared pointer to the head; 1 element 45 | TagLL prevTag, nextTag; // linked list 46 | 47 | private TermPrefixCursor cursor; 48 | 49 | final int startOffset;//inclusive 50 | int endOffset;//exclusive 51 | Object value;//null means unset 52 | 53 | /** optional boolean used by some TagClusterReducer's */ 54 | boolean mark = false; 55 | 56 | TagLL(TagLL[] head, TermPrefixCursor cursor, int startOffset, int endOffset, Object value) { 57 | this.head = head; 58 | this.cursor = cursor; 59 | this.startOffset = startOffset; 60 | this.endOffset = endOffset; 61 | this.value = value; 62 | } 63 | 64 | /** 65 | * Advances this tag with "word" at offset "offset". If this tag is not in 66 | * an advancing state then it does nothing. If it is advancing and prior to 67 | * advancing further it sees a value, then a non-advancing tag may be inserted 68 | * into the LL as side-effect. If this returns false (it didn't advance) and 69 | * if there is no value, then it will also be removed. 70 | * 71 | * 72 | * @param word The next word or null if at an end 73 | * @param offset The last character in word's offset in the underlying 74 | * stream. If word is null then it's meaningless. 75 | * 76 | * @return Whether it advanced or not. 
77 | * 78 | * @throws java.io.IOException 79 | */ 80 | boolean advance(BytesRef word, int offset) throws IOException { 81 | if (!isAdvancing()) 82 | return false; 83 | 84 | Object iVal = cursor.getDocIds(); 85 | 86 | if (word != null && cursor.advance(word)) { 87 | 88 | if (iVal != null) { 89 | addBeforeLL(new TagLL(head, null, startOffset, endOffset, iVal)); 90 | } 91 | 92 | assert offset >= endOffset; 93 | endOffset = offset; 94 | return true; 95 | } else { 96 | this.value = iVal; 97 | this.cursor = null; 98 | if (iVal == null) 99 | removeLL(); 100 | return false; 101 | } 102 | } 103 | 104 | /** Removes this tag from the chain, connecting prevTag and nextTag. Does not 105 | * modify "this" object's pointers, so the caller can refer to nextTag after 106 | * removing it. */ 107 | public void removeLL() { 108 | if (head[0] == this) 109 | head[0] = nextTag; 110 | if (prevTag != null) { 111 | prevTag.nextTag = nextTag; 112 | } 113 | if (nextTag != null) { 114 | nextTag.prevTag = prevTag; 115 | } 116 | } 117 | 118 | void addBeforeLL(TagLL tag) { 119 | assert tag.startOffset <= startOffset; 120 | if (prevTag != null) { 121 | assert prevTag.startOffset <= tag.startOffset; 122 | prevTag.nextTag = tag; 123 | tag.prevTag = prevTag; 124 | } else { 125 | assert head[0] == this; 126 | head[0] = tag; 127 | } 128 | prevTag = tag; 129 | tag.nextTag = this; 130 | } 131 | 132 | void addAfterLL(TagLL tag) { 133 | assert tag.startOffset >= startOffset; 134 | if (nextTag != null) { 135 | assert nextTag.startOffset >= tag.startOffset; 136 | nextTag.prevTag = tag; 137 | tag.nextTag = nextTag; 138 | } 139 | nextTag = tag; 140 | tag.prevTag = this; 141 | } 142 | 143 | public int charLen() { 144 | return endOffset - startOffset; 145 | } 146 | 147 | public TagLL getNextTag() { 148 | return nextTag; 149 | } 150 | 151 | public TagLL getPrevTag() { 152 | return prevTag; 153 | } 154 | 155 | public int getStartOffset() { 156 | return startOffset; 157 | } 158 | public int getEndOffset() { 159 | 
return endOffset; 160 | } 161 | public boolean overlaps(TagLL other) { 162 | //don't use >= or <= because startOffset is inclusive while endOffset is exclusive 163 | if (startOffset < other.startOffset) 164 | return endOffset > other.startOffset; 165 | else 166 | return startOffset < other.endOffset; 167 | } 168 | 169 | boolean isAdvancing() { 170 | return cursor != null; 171 | } 172 | 173 | @Override 174 | public String toString() { 175 | return (prevTag != null ? '*' : '-') + "|" + (nextTag != null ? '*' : '-') + 176 | " " + startOffset + " to " + endOffset + (isAdvancing() ? '+' : " #" + value); 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/Tagger.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
package org.opensextant.solrtexttagger;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Tags maximum string of words in a corpus. This is a callback-style API
 * in which you implement {@link #tagCallback(int, int, Object)}.
 *
 * This class should be independently usable outside Solr.
 */
public abstract class Tagger {
  private final Logger log = LoggerFactory.getLogger(Tagger.class);

  private final TokenStream tokenStream;
  private final TermToBytesRefAttribute byteRefAtt;
  private final PositionIncrementAttribute posIncAtt;
  private final OffsetAttribute offsetAtt;
  private final TaggingAttribute taggingAtt;

  private final TagClusterReducer tagClusterReducer;
  private final Terms terms;
  private final Bits liveDocs;
  private final boolean skipAltTokens;
  private final boolean ignoreStopWords;

  // Optional cache shared with each TermPrefixCursor (see process()); null unless
  // enableDocIdsCache() was called.
  // NOTE(review): generic type parameters appear to have been stripped from this
  // dump (raw Map); presumably Map<BytesRef, IntsRef> — confirm against upstream.
  private Map docIdsCache;

  /** Whether the WARNING about skipped tokens was already logged. */
  private boolean loggedSkippedAltTokenWarning = false;

  /**
   * Note: resets the tokenStream as a side effect; the caller retains ownership
   * of the stream and is responsible for closing it.
   */
  public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
                TagClusterReducer tagClusterReducer, boolean skipAltTokens,
                boolean ignoreStopWords) throws IOException {
    this.terms = terms;
    this.liveDocs = liveDocs;
    this.tokenStream = tokenStream;
    this.skipAltTokens = skipAltTokens;
    this.ignoreStopWords = ignoreStopWords;
    byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
    tokenStream.reset();

    this.tagClusterReducer = tagClusterReducer;
  }

  /** Enables the docIds cache handed to each TermPrefixCursor; no-op if initSize is not positive. */
  public void enableDocIdsCache(int initSize) {
    if (initSize > 0)
      docIdsCache = new HashMap<>(initSize);
  }

  /**
   * Consumes the token stream, maintaining a cluster of candidate tags, and invokes
   * {@link #tagCallback(int, int, Object)} for each emitted tag. Calls
   * {@link TokenStream#end()} when done, but deliberately not close() (caller owns it).
   */
  public void process() throws IOException {
    if (terms == null)
      return;

    //a shared pointer to the head used by this method and each Tag instance.
    final TagLL[] head = new TagLL[1];

    TermPrefixCursor cursor = null;//re-used

    //boolean switch used to log warnings in case tokens where skipped during tagging.
    boolean skippedTokens = false;

    while (tokenStream.incrementToken()) {
      if (log.isTraceEnabled()) {
        log.trace("Token: {}, posInc: {}, offset: [{},{}]",
            byteRefAtt, posIncAtt.getPositionIncrement(),
            offsetAtt.startOffset(), offsetAtt.endOffset());
      }
      //check for posInc < 1 (alternate Tokens, such as expanded Synonyms)
      if (posIncAtt.getPositionIncrement() < 1) {
        //(a) Deal with this as a configuration issue and throw an exception
        if (!skipAltTokens) {
          //TODO throw UnsupportedTokenException when PhraseBuilder is ported
          throw new IllegalStateException("Query Analyzer generates alternate "
              + "Tokens (posInc == 0). Please adapt your Analyzer configuration or "
              + "enable '" + TaggerRequestHandler.SKIP_ALT_TOKENS + "' to skip such "
              + "tokens. NOTE: enabling '" + TaggerRequestHandler.SKIP_ALT_TOKENS
              + "' might result in wrong tagging results if the index time analyzer "
              + "is not configured accordingly. For detailed information see "
              + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
        } else {
          //(b) In case the index time analyser had indexed all variants (users
          // need to ensure that) processing of alternate tokens can be skipped
          // as anyways all alternatives will be contained in the FST.
          skippedTokens = true;
          log.trace(" ... ignored token");
          continue;
        }
      }
      //-- If PositionIncrement > 1 (stopwords): flush the current cluster, since a
      //   gap means no tag can span it.
      if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) {
        log.trace(" - posInc > 1 ... mark cluster as done");
        advanceTagsAndProcessClusterIfDone(head, null);
      }

      final BytesRef term;
      //NOTE: we need to lookup tokens if
      // * the LookupAtt is true OR
      // * there are still advancing tags (to find the longest possible match)
      if(taggingAtt.isTaggable() || head[0] != null){
        //-- Lookup the term id from the next token
        term = byteRefAtt.getBytesRef();
        if (term.length == 0) {
          throw new IllegalArgumentException("term: " + term.utf8ToString() + " analyzed to a zero-length token");
        }
      } else { //no current cluster AND lookup == false ...
        term = null; //skip this token
      }

      //-- Process tag: advance all existing tags with this term (null flushes)
      advanceTagsAndProcessClusterIfDone(head, term);

      //-- only create new Tags for Tokens we need to lookup
      if (taggingAtt.isTaggable() && term != null) {

        //determine if the terms index has a term starting with the provided term
        // TODO create a pool of these cursors to reuse them more? could be trivial impl
        if (cursor == null)// (else the existing cursor will be re-used)
          cursor = new TermPrefixCursor(terms.iterator(), liveDocs, docIdsCache);
        if (cursor.advance(term)) {
          TagLL newTail = new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null);
          cursor = null;//because the new tag now "owns" this instance
          //and add it to the end
          if (head[0] == null) {
            head[0] = newTail;
          } else {
            for (TagLL t = head[0]; true; t = t.nextTag) {
              if (t.nextTag == null) {
                t.addAfterLL(newTail);
                break;
              }
            }
          }
        }
      }//if termId >= 0
    }//end while(incrementToken())

    //-- Finish all tags (null term forces the remaining cluster to complete)
    advanceTagsAndProcessClusterIfDone(head, null);
    assert head[0] == null;

    if(!loggedSkippedAltTokenWarning && skippedTokens){
      loggedSkippedAltTokenWarning = true; //only log once
      log.warn("The Tagger skipped some alternate tokens (tokens with posInc == 0) "
          + "while processing text. This may cause problems with some Analyzer "
          + "configurations (e.g. query time synonym expansion). For details see "
          + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
    }

    tokenStream.end();
    //tokenStream.close(); caller closes because caller acquired it
  }

  /**
   * Advances every tag in the cluster with "term" (null means no more input for the
   * cluster). If no tag advanced and the cluster is non-empty, the cluster is done:
   * it is reduced and each surviving tag is reported via tagCallback, then cleared.
   */
  private void advanceTagsAndProcessClusterIfDone(TagLL[] head, BytesRef term) throws IOException {
    //-- Advance tags
    final int endOffset = term != null ? offsetAtt.endOffset() : -1;
    boolean anyAdvance = false;
    for (TagLL t = head[0]; t != null; t = t.nextTag) {
      anyAdvance |= t.advance(term, endOffset);
    }

    //-- Process cluster if done
    if (!anyAdvance && head[0] != null) {
      tagClusterReducer.reduce(head);
      for (TagLL t = head[0]; t != null; t = t.nextTag) {
        assert t.value != null;
        tagCallback(t.startOffset, t.endOffset, t.value);
      }
      head[0] = null;
    }
  }

  /**
   * Invoked by {@link #process()} for each tag found. endOffset is always &gt;= the endOffset
   * given in the previous call.
   *
   * @param startOffset The character offset of the original stream where the tag starts.
   * @param endOffset One more than the character offset of the original stream where the tag ends.
   * @param docIdsKey A reference to the matching docIds that can be resolved via {@link #lookupDocIds(Object)}.
   */
  protected abstract void tagCallback(int startOffset, int endOffset, Object docIdsKey);

  /**
   * Returns a sorted array of integer docIds given the corresponding key.
   *
   * @param docIdsKey The lookup key.
   * @return Not null
   */
  protected IntsRef lookupDocIds(Object docIdsKey) {
    return (IntsRef) docIdsKey;
  }
}
9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import com.google.common.io.CharStreams; 26 | import org.apache.lucene.analysis.Analyzer; 27 | import org.apache.lucene.analysis.TokenStream; 28 | import org.apache.lucene.analysis.core.StopFilterFactory; 29 | import org.apache.lucene.analysis.util.TokenFilterFactory; 30 | import org.apache.lucene.index.LeafReaderContext; 31 | import org.apache.lucene.index.ReaderUtil; 32 | import org.apache.lucene.index.Terms; 33 | import org.apache.lucene.queries.function.FunctionValues; 34 | import org.apache.lucene.queries.function.ValueSource; 35 | import org.apache.lucene.search.DocIdSetIterator; 36 | import org.apache.lucene.search.IndexSearcher; 37 | import org.apache.lucene.search.Query; 38 | import org.apache.lucene.util.BitSetIterator; 39 | import org.apache.lucene.util.Bits; 40 | import org.apache.lucene.util.FixedBitSet; 41 | import org.apache.lucene.util.IntsRef; 42 | import org.apache.solr.analysis.TokenizerChain; 43 | import org.apache.solr.common.SolrException; 44 | import org.apache.solr.common.params.CommonParams; 45 | import org.apache.solr.common.params.MapSolrParams; 46 | import org.apache.solr.common.params.SolrParams; 47 | import org.apache.solr.common.util.ContentStream; 48 | import org.apache.solr.common.util.NamedList; 49 | import org.apache.solr.handler.RequestHandlerBase; 50 | import 
org.apache.solr.request.SolrQueryRequest; 51 | import org.apache.solr.response.SolrQueryResponse; 52 | import org.apache.solr.schema.FieldType; 53 | import org.apache.solr.schema.SchemaField; 54 | import org.apache.solr.search.BitDocSet; 55 | import org.apache.solr.search.DocList; 56 | import org.apache.solr.search.DocSet; 57 | import org.apache.solr.search.DocSlice; 58 | import org.apache.solr.search.QParser; 59 | import org.apache.solr.search.SolrIndexSearcher; 60 | import org.apache.solr.search.SolrReturnFields; 61 | import org.apache.solr.search.SyntaxError; 62 | import org.slf4j.Logger; 63 | import org.slf4j.LoggerFactory; 64 | 65 | import javax.xml.stream.XMLStreamException; 66 | import java.io.IOException; 67 | import java.io.Reader; 68 | import java.io.StringReader; 69 | import java.util.ArrayList; 70 | import java.util.Collections; 71 | import java.util.HashMap; 72 | import java.util.HashSet; 73 | import java.util.Iterator; 74 | import java.util.List; 75 | import java.util.Locale; 76 | import java.util.Map; 77 | import java.util.Set; 78 | 79 | /** 80 | * Scans posted text, looking for matching strings in the Solr index. 81 | * The public static final String members are request parameters. 82 | */ 83 | public class TaggerRequestHandler extends RequestHandlerBase { 84 | 85 | /** Request parameter. */ 86 | public static final String OVERLAPS = "overlaps"; 87 | /** Request parameter. */ 88 | public static final String TAGS_LIMIT = "tagsLimit"; 89 | /** Request parameter. */ 90 | public static final String MATCH_TEXT = "matchText"; 91 | /** Request parameter. */ 92 | public static final String SKIP_ALT_TOKENS = "skipAltTokens"; 93 | /** Request parameter. */ 94 | public static final String IGNORE_STOPWORDS = "ignoreStopwords"; 95 | /** Request parameter. */ 96 | public static final String XML_OFFSET_ADJUST = "xmlOffsetAdjust"; 97 | /** Request parameter. */ 98 | public static final String HTML_OFFSET_ADJUST = "htmlOffsetAdjust"; 99 | /** Request parameter. 
*/ 100 | public static final String NON_TAGGABLE_TAGS = "nonTaggableTags"; 101 | 102 | private final Logger log = LoggerFactory.getLogger(getClass()); 103 | 104 | @Override 105 | public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { 106 | setTopInitArgsAsInvariants(req); 107 | 108 | //--Read params 109 | final String indexedField = req.getParams().get("field"); 110 | if (indexedField == null) 111 | throw new RuntimeException("required param 'field'"); 112 | 113 | final TagClusterReducer tagClusterReducer = 114 | chooseTagClusterReducer(req.getParams().get(OVERLAPS)); 115 | final int rows = req.getParams().getInt(CommonParams.ROWS, 10000); 116 | final int tagsLimit = req.getParams().getInt(TAGS_LIMIT, 1000); 117 | final boolean addMatchText = req.getParams().getBool(MATCH_TEXT, false); 118 | final SchemaField idSchemaField = req.getSchema().getUniqueKeyField(); 119 | if (idSchemaField == null) { 120 | throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The tagger requires a" + 121 | "uniqueKey in the schema.");//TODO this could be relaxes 122 | } 123 | final boolean skipAltTokens = req.getParams().getBool(SKIP_ALT_TOKENS, false); 124 | final boolean ignoreStopWords = req.getParams().getBool(IGNORE_STOPWORDS, 125 | fieldHasIndexedStopFilter(indexedField, req)); 126 | final boolean htmlOffsetAdjust = req.getParams().getBool(HTML_OFFSET_ADJUST, false); 127 | final boolean xmlOffsetAdjust = req.getParams().getBool(XML_OFFSET_ADJUST, false); 128 | final String nonTaggableTags = req.getParams().get(NON_TAGGABLE_TAGS); 129 | 130 | //--Get posted data 131 | Reader inputReader = null; 132 | Iterable streams = req.getContentStreams(); 133 | if (streams != null) { 134 | Iterator iter = streams.iterator(); 135 | if (iter.hasNext()) { 136 | inputReader = iter.next().getReader(); 137 | } 138 | if (iter.hasNext()) { 139 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 140 | getClass().getSimpleName()+" does not 
support multiple ContentStreams"); 141 | } 142 | } 143 | if (inputReader == null) { 144 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 145 | getClass().getSimpleName()+" requires text to be POSTed to it"); 146 | } 147 | final String inputString;//only populated if needed 148 | if (addMatchText || xmlOffsetAdjust || htmlOffsetAdjust) { 149 | //Read the input fully into a String buffer that we'll need later, 150 | // then replace the input with a reader wrapping the buffer. 151 | inputString = CharStreams.toString(inputReader); 152 | inputReader.close(); 153 | inputReader = new StringReader(inputString); 154 | } else { 155 | inputString = null;//not used 156 | } 157 | 158 | final OffsetCorrector offsetCorrector = 159 | initOffsetCorrector(htmlOffsetAdjust, xmlOffsetAdjust, inputString, nonTaggableTags); 160 | final SolrIndexSearcher searcher = req.getSearcher(); 161 | final FixedBitSet matchDocIdsBS = new FixedBitSet(searcher.maxDoc()); 162 | final List tags = new ArrayList(2000); 163 | 164 | try { 165 | Analyzer analyzer = req.getSchema().getField(indexedField).getType().getQueryAnalyzer(); 166 | try (TokenStream tokenStream = analyzer.tokenStream("", inputReader)) { 167 | Terms terms = searcher.getSlowAtomicReader().terms(indexedField); 168 | if (terms == null) 169 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 170 | "field " + indexedField + " has no indexed data"); 171 | Tagger tagger = new Tagger(terms, computeDocCorpus(req), tokenStream, tagClusterReducer, 172 | skipAltTokens, ignoreStopWords) { 173 | @SuppressWarnings("unchecked") 174 | @Override 175 | protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) { 176 | if (tags.size() >= tagsLimit) 177 | return; 178 | if (offsetCorrector != null) { 179 | int[] offsetPair = offsetCorrector.correctPair(startOffset, endOffset); 180 | if (offsetPair == null) { 181 | log.debug("Discarded offsets [{}, {}] because couldn't balance XML.", 182 | startOffset, endOffset); 
183 | return; 184 | } 185 | startOffset = offsetPair[0]; 186 | endOffset = offsetPair[1]; 187 | } 188 | 189 | NamedList tag = new NamedList(); 190 | tag.add("startOffset", startOffset); 191 | tag.add("endOffset", endOffset); 192 | if (addMatchText) 193 | tag.add("matchText", inputString.substring(startOffset, endOffset)); 194 | //below caches, and also flags matchDocIdsBS 195 | tag.add("ids", lookupSchemaDocIds(docIdsKey)); 196 | tags.add(tag); 197 | } 198 | 199 | Map docIdsListCache = new HashMap<>(2000); 200 | 201 | ValueSourceAccessor uniqueKeyCache = new ValueSourceAccessor(searcher, 202 | idSchemaField.getType().getValueSource(idSchemaField, null)); 203 | 204 | @SuppressWarnings("unchecked") 205 | private List lookupSchemaDocIds(Object docIdsKey) { 206 | List schemaDocIds = docIdsListCache.get(docIdsKey); 207 | if (schemaDocIds != null) 208 | return schemaDocIds; 209 | IntsRef docIds = lookupDocIds(docIdsKey); 210 | //translate lucene docIds to schema ids 211 | schemaDocIds = new ArrayList(docIds.length); 212 | for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) { 213 | int docId = docIds.ints[i]; 214 | assert i == docIds.offset || docIds.ints[i - 1] < docId : "not sorted?"; 215 | matchDocIdsBS.set(docId);//also, flip docid in bitset 216 | try { 217 | schemaDocIds.add(uniqueKeyCache.objectVal(docId));//translates here 218 | } catch (IOException e) { 219 | throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); 220 | } 221 | } 222 | assert !schemaDocIds.isEmpty(); 223 | 224 | docIdsListCache.put(docIds, schemaDocIds); 225 | return schemaDocIds; 226 | } 227 | 228 | }; 229 | tagger.enableDocIdsCache(2000);//TODO configurable 230 | tagger.process(); 231 | } 232 | } finally { 233 | inputReader.close(); 234 | } 235 | rsp.add("tagsCount",tags.size()); 236 | rsp.add("tags", tags); 237 | 238 | rsp.setReturnFields(new SolrReturnFields( req )); 239 | 240 | //Solr's standard name for matching docs in response 241 | rsp.add("response", 
getDocList(rows, matchDocIdsBS)); 242 | } 243 | 244 | private OffsetCorrector initOffsetCorrector(boolean htmlOffsetAdjust, boolean xmlOffsetAdjust, 245 | String inputString, String nonTaggableTags) { 246 | OffsetCorrector offsetCorrector; 247 | if (htmlOffsetAdjust) { 248 | Set nonTaggableTagSet = null; 249 | if (nonTaggableTags != null) { 250 | //comma delimited list 251 | nonTaggableTags = nonTaggableTags.toLowerCase(Locale.ROOT); 252 | final String[] strings = nonTaggableTags.split(","); 253 | nonTaggableTagSet = new HashSet<>(strings.length); 254 | Collections.addAll(nonTaggableTagSet, strings); 255 | } 256 | try { 257 | offsetCorrector = new HtmlOffsetCorrector(inputString, nonTaggableTagSet); 258 | } catch (Exception e) { 259 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 260 | "Expecting HTML but wasn't: " + e, e); 261 | } 262 | } else if (xmlOffsetAdjust) { 263 | if (nonTaggableTags != null) 264 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 265 | NON_TAGGABLE_TAGS+" not supported for xml"); 266 | try { 267 | offsetCorrector = new XmlOffsetCorrector(inputString); 268 | } catch (XMLStreamException e) { 269 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 270 | "Expecting XML but wasn't: " + e, e); 271 | } 272 | } else { 273 | offsetCorrector = null; 274 | } 275 | return offsetCorrector; 276 | } 277 | 278 | private DocList getDocList(int rows, FixedBitSet matchDocIdsBS) throws IOException { 279 | //Now we must supply a Solr DocList and add it to the response. 280 | // Typically this is gotten via a SolrIndexSearcher.search(), but in this case we 281 | // know exactly what documents to return, the order doesn't matter nor does 282 | // scoring. 283 | // Ideally an implementation of DocList could be directly implemented off 284 | // of a BitSet, but there are way too many methods to implement for a minor 285 | // payoff. 
286 | int matchDocs = matchDocIdsBS.cardinality(); 287 | int[] docIds = new int[ Math.min(rows, matchDocs) ]; 288 | DocIdSetIterator docIdIter = new BitSetIterator(matchDocIdsBS, 1); 289 | for (int i = 0; i < docIds.length; i++) { 290 | docIds[i] = docIdIter.nextDoc(); 291 | } 292 | return new DocSlice(0, docIds.length, docIds, null, matchDocs, 1f); 293 | } 294 | 295 | private TagClusterReducer chooseTagClusterReducer(String overlaps) { 296 | TagClusterReducer tagClusterReducer; 297 | if (overlaps == null || overlaps.equals("NO_SUB")) { 298 | tagClusterReducer = TagClusterReducer.NO_SUB; 299 | } else if (overlaps.equals("ALL")) { 300 | tagClusterReducer = TagClusterReducer.ALL; 301 | } else if (overlaps.equals("LONGEST_DOMINANT_RIGHT")) { 302 | tagClusterReducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT; 303 | } else { 304 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 305 | "unknown tag overlap mode: "+overlaps); 306 | } 307 | return tagClusterReducer; 308 | } 309 | 310 | /** 311 | * The set of documents matching the provided 'fq' (filter query). Don't include deleted docs 312 | * either. If null is returned, then all docs are available. 
  /**
   * The set of documents matching the provided 'fq' (filter query). Don't include deleted docs
   * either. If null is returned, then all docs are available.
   */
  private Bits computeDocCorpus(SolrQueryRequest req) throws SyntaxError, IOException {
    final String[] corpusFilterQueries = req.getParams().getParams("fq");
    final SolrIndexSearcher searcher = req.getSearcher();
    final Bits docBits;
    if (corpusFilterQueries != null && corpusFilterQueries.length > 0) {
      // NOTE(review): raw type as found; generics (presumably List<Query>) appear
      // stripped by extraction — confirm against upstream source.
      List filterQueries = new ArrayList(corpusFilterQueries.length);
      for (String corpusFilterQuery : corpusFilterQueries) {
        QParser qParser = QParser.getParser(corpusFilterQuery, null, req);
        try {
          filterQueries.add(qParser.parse());
        } catch (SyntaxError e) {
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
        }
      }

      final DocSet docSet = searcher.getDocSet(filterQueries);//hopefully in the cache
      //note: before Solr 4.7 we could call docSet.getBits() but no longer.
      if (docSet instanceof BitDocSet) {
        docBits = ((BitDocSet)docSet).getBits();
      } else {
        //not bit-backed; adapt the DocSet to the Bits interface via exists()
        docBits = new Bits() {

          @Override
          public boolean get(int index) {
            return docSet.exists(index);
          }

          @Override
          public int length() {
            return searcher.maxDoc();
          }
        };
      }
    } else {
      //no filters: restrict only to live (non-deleted) docs; may be null (= all live)
      docBits = searcher.getSlowAtomicReader().getLiveDocs();
    }
    return docBits;
  }

  /**
   * True if the field's index-time analyzer chain contains a StopFilterFactory.
   * Used as the default for the ignoreStopwords request parameter.
   */
  private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) {
    FieldType fieldType = req.getSchema().getFieldType(field);
    Analyzer analyzer = fieldType.getIndexAnalyzer();//index analyzer
    if (analyzer instanceof TokenizerChain) {
      TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
      TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories();
      for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
        if (tokenFilterFactory instanceof StopFilterFactory)
          return true;
      }
    }
    return false;
  }
This request handler supports configuration options defined at the top level as well as 369 | * those in typical Solr 'defaults', 'appends', and 'invariants'. The top level ones are treated 370 | * as invariants. 371 | */ 372 | private void setTopInitArgsAsInvariants(SolrQueryRequest req) { 373 | // First convert top level initArgs to SolrParams 374 | HashMap map = new HashMap<>(initArgs.size()); 375 | for (int i=0; i readerContexts; 395 | private final ValueSource valueSource; 396 | private final Map fContext; 397 | private final FunctionValues[] functionValuesPerSeg; 398 | private final int[] functionValuesDocIdPerSeg; 399 | 400 | ValueSourceAccessor(IndexSearcher searcher, ValueSource valueSource) { 401 | readerContexts = searcher.getIndexReader().leaves(); 402 | this.valueSource = valueSource; 403 | fContext = ValueSource.newContext(searcher); 404 | functionValuesPerSeg = new FunctionValues[readerContexts.size()]; 405 | functionValuesDocIdPerSeg = new int[readerContexts.size()]; 406 | } 407 | 408 | Object objectVal(int topDocId) throws IOException { 409 | // lookup segment level stuff: 410 | int segIdx = ReaderUtil.subIndex(topDocId, readerContexts); 411 | LeafReaderContext rcontext = readerContexts.get(segIdx); 412 | int segDocId = topDocId - rcontext.docBase; 413 | // unfortunately Lucene 7.0 requires forward only traversal (with no reset method). 414 | // So we need to track our last docId (per segment) and re-fetch the FunctionValues. 
:-( 415 | FunctionValues functionValues = functionValuesPerSeg[segIdx]; 416 | if (functionValues == null || segDocId < functionValuesDocIdPerSeg[segIdx]) { 417 | functionValues = functionValuesPerSeg[segIdx] = valueSource.getValues(fContext, rcontext); 418 | } 419 | functionValuesDocIdPerSeg[segIdx] = segDocId; 420 | 421 | // get value: 422 | return functionValues.objectVal(segDocId); 423 | } 424 | } 425 | 426 | } 427 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/TaggingAttribute.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.analysis.TokenFilter; 26 | import org.apache.lucene.analysis.TokenStream; 27 | import org.apache.lucene.util.Attribute; 28 | 29 | /** 30 | * Attribute used by the {@link Tagger} to decide if a token can start a 31 | * new {@link TagLL tag}. 32 | *

33 | * By default this Attribute will return true, but it might be 34 | * reset by some {@link TokenFilter} added to the {@link TokenStream} used 35 | * to analyze the parsed text. Typically this will be done based on NLP 36 | * processing results (e.g. to only lookup Named Entities). 37 | *

38 | * NOTE: that all Tokens are used to advance existing {@link TagLL tags}. 39 | * 40 | * @author Rupert Westenthaler 41 | */ 42 | public interface TaggingAttribute extends Attribute { 43 | 44 | /** 45 | * By default this Attribute will be initialised with true. 46 | * This ensures that all tokens are taggable by default (especially if 47 | * the {@link TaggingAttribute} is not set by any component in the configured 48 | * {@link TokenStream} 49 | */ 50 | public static final boolean DEFAULT_TAGGABLE = true; 51 | 52 | /** 53 | * Getter for the taggable state of the current Token 54 | * 55 | * @return the state 56 | */ 57 | public boolean isTaggable(); 58 | 59 | /** 60 | * Setter for the taggable state. Typically called by code within 61 | * {@link TokenFilter#incrementToken()}. 62 | * 63 | * @param lookup the state 64 | */ 65 | public void setTaggable(boolean lookup); 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/TaggingAttributeImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.util.AttributeImpl; 26 | import org.apache.lucene.util.AttributeReflector; 27 | 28 | /** 29 | * Implementation of the {@link TaggingAttribute} 30 | * 31 | * @author Rupert Westenthaler 32 | */ 33 | public class TaggingAttributeImpl extends AttributeImpl implements TaggingAttribute { 34 | 35 | /** 36 | * the private field initialised with {@link TaggingAttribute#DEFAULT_TAGGABLE} 37 | */ 38 | private boolean taggable = TaggingAttribute.DEFAULT_TAGGABLE; 39 | 40 | /* 41 | * (non-Javadoc) 42 | * @see org.opensextant.solrtexttagger.LookupAttribute#isLookup() 43 | */ 44 | @Override 45 | public boolean isTaggable() { 46 | return taggable; 47 | } 48 | 49 | /* 50 | * (non-Javadoc) 51 | * @see org.opensextant.solrtexttagger.LookupAttribute#setLookup(boolean) 52 | */ 53 | @Override 54 | public void setTaggable(boolean lookup) { 55 | this.taggable = lookup; 56 | } 57 | 58 | /* 59 | * (non-Javadoc) 60 | * @see org.apache.lucene.util.AttributeImpl#clear() 61 | */ 62 | @Override 63 | public void clear() { 64 | taggable = DEFAULT_TAGGABLE; 65 | } 66 | 67 | /* 68 | * (non-Javadoc) 69 | * @see org.apache.lucene.util.AttributeImpl#copyTo(org.apache.lucene.util.AttributeImpl) 70 | */ 71 | @Override 72 | public void copyTo(AttributeImpl target) { 73 | ((TaggingAttribute) target).setTaggable(taggable); 74 | } 75 | 76 | @Override 77 | public void reflectWith(AttributeReflector reflector) { 78 | reflector.reflect(TaggingAttribute.class, "taggable", isTaggable()); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/TermPrefixCursor.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. 
Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.index.PostingsEnum; 26 | import org.apache.lucene.index.TermsEnum; 27 | import org.apache.lucene.util.Bits; 28 | import org.apache.lucene.util.BytesRef; 29 | import org.apache.lucene.util.BytesRefBuilder; 30 | import org.apache.lucene.util.IntsRef; 31 | 32 | import java.io.IOException; 33 | import java.util.Map; 34 | 35 | /** 36 | * Cursor into the terms that advances by prefix. 37 | */ 38 | class TermPrefixCursor { 39 | 40 | //Note: this could be a lot more efficient if MemoryPostingsFormat supported ordinal lookup. 41 | // Maybe that could be added to Lucene. 42 | 43 | // TODO add bloom filter of hashcode of first ~ 6 bytes to avoid lookup into terms dict? 
44 | 45 | private static final byte SEPARATOR_CHAR = ' '; 46 | private static final IntsRef EMPTY_INTSREF = new IntsRef(); 47 | 48 | private final TermsEnum termsEnum; 49 | private final Bits liveDocs; 50 | private final Map docIdsCache; 51 | 52 | private BytesRef prefixBuf;//we append to this 53 | private BytesRefBuilder prefixBufBuilder = new BytesRefBuilder(); 54 | private boolean prefixBufOnLoan;//if true, PB is loaned; needs to be copied 55 | private PostingsEnum postingsEnum; 56 | private IntsRef docIds; 57 | 58 | TermPrefixCursor(TermsEnum termsEnum, Bits liveDocs, Map docIdsCache) { 59 | this.termsEnum = termsEnum; 60 | this.liveDocs = liveDocs; 61 | this.docIdsCache = docIdsCache; 62 | } 63 | 64 | /** Appends the separator char (if not the first) plus the given word to the prefix buffer, 65 | * then seeks to it. If the seek fails, false is returned and this cursor 66 | * can be re-used as if in a new state. The {@code word} BytesRef is considered temporary, 67 | * and is not saved within this class. */ 68 | boolean advance(BytesRef word) throws IOException { 69 | if (prefixBuf == null) { // first advance 70 | //set prefixBuf to word temporary. When advance() completes, we either null out or copy. 71 | prefixBuf = word; 72 | prefixBufOnLoan = true; 73 | if (seekPrefix()) {//... 
and we have to 74 | ensureBufIsACopy(); 75 | return true; 76 | } else { 77 | prefixBuf = null;//just to be darned sure 'word' isn't referenced here 78 | return false; 79 | } 80 | 81 | } else { // subsequent advance 82 | //append to existing 83 | assert !prefixBufOnLoan; 84 | 85 | prefixBufBuilder.append(SEPARATOR_CHAR); 86 | prefixBufBuilder.append(word); 87 | prefixBuf = prefixBufBuilder.get(); 88 | if (seekPrefix()) { 89 | return true; 90 | } else { 91 | prefixBuf = null; 92 | return false; 93 | } 94 | } 95 | } 96 | 97 | private void ensureBufIsACopy() { 98 | if (!prefixBufOnLoan) 99 | return; 100 | 101 | prefixBufBuilder.clear(); 102 | prefixBufBuilder.copyBytes(prefixBuf); 103 | prefixBuf = prefixBufBuilder.get(); 104 | prefixBufOnLoan = false; 105 | } 106 | 107 | /** Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char. 108 | * Sets docIds. **/ 109 | private boolean seekPrefix() throws IOException { 110 | TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf); 111 | 112 | docIds = null;//invalidate 113 | switch (seekStatus) { 114 | case END: 115 | return false; 116 | 117 | case FOUND: 118 | postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); 119 | docIds = postingsEnumToIntsRef(postingsEnum, liveDocs); 120 | if (docIds.length > 0) { 121 | return true; 122 | } 123 | 124 | //Pretend we didn't find it; go to next term 125 | docIds = null; 126 | if (termsEnum.next() == null) { // case END 127 | return false; 128 | } 129 | //fall through to NOT_FOUND 130 | 131 | case NOT_FOUND: 132 | //termsEnum must start with prefixBuf to continue 133 | BytesRef teTerm = termsEnum.term(); 134 | 135 | if (teTerm.length > prefixBuf.length) { 136 | for (int i = 0; i < prefixBuf.length; i++) { 137 | if (prefixBuf.bytes[prefixBuf.offset + i] != teTerm.bytes[teTerm.offset + i]) 138 | return false; 139 | } 140 | if (teTerm.bytes[teTerm.offset + prefixBuf.length] != SEPARATOR_CHAR) 141 | return false; 142 | return true; 143 | 
} 144 | return false; 145 | } 146 | throw new IllegalStateException(seekStatus.toString()); 147 | } 148 | 149 | /** Returns an IntsRef either cached or reading postingsEnum. Not null. 150 | * @param postingsEnum*/ 151 | private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException { 152 | // (The cache can have empty IntsRefs) 153 | 154 | //lookup prefixBuf in a cache 155 | if (docIdsCache != null) { 156 | docIds = docIdsCache.get(prefixBuf); 157 | if (docIds != null) { 158 | return docIds; 159 | } 160 | } 161 | 162 | //read postingsEnum 163 | docIds = new IntsRef(termsEnum.docFreq()); 164 | int docId; 165 | while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) { 166 | if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) { 167 | continue; 168 | } 169 | docIds.ints[docIds.length++] = docId; 170 | } 171 | if (docIds.length == 0) 172 | docIds = EMPTY_INTSREF; 173 | 174 | //cache 175 | if (docIdsCache != null) { 176 | ensureBufIsACopy(); 177 | //clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to 178 | docIdsCache.put(prefixBuf.clone(), docIds); 179 | } 180 | return docIds; 181 | } 182 | 183 | /** The docIds of the last call to advance, if it returned true. It might be null, but 184 | * its length won't be 0. Treat as immutable. */ 185 | IntsRef getDocIds() { 186 | assert docIds == null || docIds.length != 0; 187 | return docIds; 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/XmlOffsetCorrector.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. 
W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import com.ctc.wstx.stax.WstxInputFactory; 26 | import org.apache.commons.io.input.ClosedInputStream; 27 | import org.codehaus.stax2.LocationInfo; 28 | import org.codehaus.stax2.XMLInputFactory2; 29 | import org.codehaus.stax2.XMLStreamReader2; 30 | 31 | import javax.xml.stream.XMLResolver; 32 | import javax.xml.stream.XMLStreamException; 33 | import javax.xml.stream.events.XMLEvent; 34 | import java.io.InputStream; 35 | import java.io.StringReader; 36 | 37 | /** 38 | * Corrects offsets to adjust for XML formatted data. The goal is such that the caller should be 39 | * able to insert a start XML tag at the start offset and a corresponding end XML tag at the end 40 | * offset of the tagger, and have it be valid XML. See {@link #correctPair(int, int)}. 41 | * 42 | * This will not work on invalid XML. 43 | * 44 | * Not thread-safe. 45 | */ 46 | public class XmlOffsetCorrector extends OffsetCorrector { 47 | 48 | //TODO use StAX without hard requirement on woodstox. 
xmlStreamReader.getLocation().getCharacterOffset() 49 | 50 | private static final XMLInputFactory2 XML_INPUT_FACTORY; 51 | static { 52 | // note: similar code in Solr's EmptyEntityResolver 53 | XML_INPUT_FACTORY = new WstxInputFactory(); 54 | XML_INPUT_FACTORY.setXMLResolver(new XMLResolver() { 55 | @Override 56 | public InputStream resolveEntity(String publicId, String systemId, String baseURI, String namespace) { 57 | return ClosedInputStream.CLOSED_INPUT_STREAM; 58 | } 59 | }); 60 | // TODO disable DTD? 61 | // XML_INPUT_FACTORY.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE) 62 | XML_INPUT_FACTORY.configureForSpeed(); 63 | } 64 | 65 | /** 66 | * Initialize based on the document text. 67 | * @param docText non-null XML content. 68 | * @throws XMLStreamException If there's a problem parsing the XML. 69 | */ 70 | public XmlOffsetCorrector(String docText) throws XMLStreamException { 71 | super(docText, false); 72 | 73 | int tagCounter = 0; 74 | int thisTag = -1; 75 | 76 | //note: we *could* add a virtual outer tag to guarantee all text is in the context of a tag, 77 | // but we shouldn't need to because there is no findable text outside the top element. 
78 | 79 | final XMLStreamReader2 xmlStreamReader = 80 | (XMLStreamReader2) XML_INPUT_FACTORY.createXMLStreamReader(new StringReader(docText)); 81 | 82 | while (xmlStreamReader.hasNext()) { 83 | int eventType = xmlStreamReader.next(); 84 | switch (eventType) { 85 | case XMLEvent.START_ELEMENT: { 86 | tagInfo.ensureCapacity(tagInfo.size() + 5); 87 | final int parentTag = thisTag; 88 | final LocationInfo info = xmlStreamReader.getLocationInfo(); 89 | tagInfo.add(parentTag); 90 | tagInfo.add((int) info.getStartingCharOffset(), (int) info.getEndingCharOffset()); 91 | tagInfo.add(-1, -1);//these 2 will be populated when we get to the close tag 92 | thisTag = tagCounter++; 93 | 94 | parentChangeOffsets.add((int) info.getStartingCharOffset()); 95 | parentChangeIds.add(thisTag); 96 | break; 97 | } 98 | case XMLEvent.END_ELEMENT: { 99 | final LocationInfo info = xmlStreamReader.getLocationInfo(); 100 | tagInfo.set(5 * thisTag + 3, (int) info.getStartingCharOffset()); 101 | tagInfo.set(5 * thisTag + 4, (int) info.getEndingCharOffset()); 102 | thisTag = getParentTag(thisTag); 103 | 104 | parentChangeOffsets.add((int) info.getEndingCharOffset()); 105 | parentChangeIds.add(thisTag); 106 | break; 107 | } 108 | default: //do nothing 109 | } 110 | } 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 
12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | /** 24 | * The classes in this package implement OpenSextant's Solr-based tagger. 25 | */ 26 | package org.opensextant.solrtexttagger; -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/AbstractTaggerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.commons.lang.builder.CompareToBuilder; 26 | import org.apache.commons.lang.builder.EqualsBuilder; 27 | import org.apache.lucene.document.Document; 28 | import org.apache.solr.SolrTestCaseJ4; 29 | import org.apache.solr.common.params.CommonParams; 30 | import org.apache.solr.common.params.ModifiableSolrParams; 31 | import org.apache.solr.common.params.SolrParams; 32 | import org.apache.solr.common.util.ContentStream; 33 | import org.apache.solr.common.util.ContentStreamBase; 34 | import org.apache.solr.common.util.NamedList; 35 | import org.apache.solr.request.SolrQueryRequest; 36 | import org.apache.solr.request.SolrQueryRequestBase; 37 | import org.apache.solr.response.SolrQueryResponse; 38 | import org.apache.solr.search.DocIterator; 39 | import org.apache.solr.search.DocList; 40 | import org.apache.solr.search.SolrIndexSearcher; 41 | import org.junit.Rule; 42 | import org.junit.rules.TestWatcher; 43 | import org.junit.runner.Description; 44 | import org.slf4j.Logger; 45 | import org.slf4j.LoggerFactory; 46 | 47 | import java.io.IOException; 48 | import java.util.Arrays; 49 | import java.util.Collections; 50 | import java.util.HashMap; 51 | import java.util.List; 52 | import java.util.Map; 53 | import java.util.TreeSet; 54 | 55 | /** 56 | * @author David Smiley - dsmiley@apache.org 57 | */ 58 | public abstract class AbstractTaggerTest extends SolrTestCaseJ4 { 59 | 60 | protected final Logger log = LoggerFactory.getLogger(getClass()); 61 | 62 | @Rule 63 | public TestWatcher watchman = new TestWatcher() { 64 | @Override 65 | protected void starting(Description description) { 66 | log.info("{} being run...", description.getDisplayName()); 67 | } 68 | }; 69 | 70 | protected final ModifiableSolrParams baseParams = new ModifiableSolrParams(); 71 | 72 | //populated in buildNames; tested in assertTags 73 | protected static List NAMES; 74 | 75 | @Override 76 | public void setUp() throws 
Exception { 77 | super.setUp(); 78 | baseParams.clear(); 79 | baseParams.set(CommonParams.QT, "/tag"); 80 | baseParams.set(CommonParams.WT, "xml"); 81 | } 82 | 83 | protected void assertTags(String doc, String... tags) throws Exception { 84 | TestTag[] tts = new TestTag[tags.length]; 85 | for (int i = 0; i < tags.length; i++) { 86 | tts[i] = tt(doc, tags[i]); 87 | } 88 | assertTags(reqDoc(doc), tts); 89 | } 90 | 91 | protected static void buildNames(String... names) throws Exception { 92 | deleteByQueryAndGetVersion("*:*", null); 93 | NAMES = Arrays.asList(names); 94 | //Collections.sort(NAMES); 95 | int i = 0; 96 | for (String n : NAMES) { 97 | assertU(adoc("id", ""+(i++), "name", n)); 98 | } 99 | assertU(commit()); 100 | } 101 | 102 | protected String lookupByName(String name) { 103 | for (String n : NAMES) { 104 | if (n.equalsIgnoreCase(name)) 105 | return n; 106 | } 107 | return null; 108 | } 109 | 110 | protected TestTag tt(String doc, String substring) { 111 | int startOffset = -1, endOffset; 112 | int substringIndex = 0; 113 | for(int i = 0; i <= substringIndex; i++) { 114 | startOffset = doc.indexOf(substring,++startOffset); 115 | assert startOffset >= 0 : "The test itself is broken"; 116 | } 117 | endOffset = startOffset+substring.length();//1 greater (exclusive) 118 | return new TestTag(startOffset, endOffset, substring, lookupByName(substring)); 119 | } 120 | 121 | /** Asserts the tags. Will call req.close(). */ 122 | protected void assertTags(SolrQueryRequest req, TestTag... 
eTags) throws Exception { 123 | try { 124 | SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get(CommonParams.QT), req); 125 | TestTag[] aTags = pullTagsFromResponse(req, rsp); 126 | 127 | String message; 128 | if (aTags.length > 10) 129 | message = null; 130 | else 131 | message = Arrays.asList(aTags).toString(); 132 | Arrays.sort(eTags); 133 | assertSortedArrayEquals(message, eTags, aTags); 134 | 135 | } finally { 136 | req.close(); 137 | } 138 | } 139 | 140 | @SuppressWarnings("unchecked") 141 | protected TestTag[] pullTagsFromResponse(SolrQueryRequest req, SolrQueryResponse rsp ) throws IOException { 142 | NamedList rspValues = rsp.getValues(); 143 | Map matchingNames = new HashMap<>(); 144 | SolrIndexSearcher searcher = req.getSearcher(); 145 | DocList docList = (DocList) rspValues.get("response"); 146 | DocIterator iter = docList.iterator(); 147 | while (iter.hasNext()) { 148 | int docId = iter.next(); 149 | Document doc = searcher.doc(docId); 150 | String id = doc.getField("id").stringValue(); 151 | String name = lookupByName(doc.get("name")); 152 | assertEquals("looking for "+name, NAMES.indexOf(name)+"", id); 153 | matchingNames.put(id, name); 154 | } 155 | 156 | //build TestTag[] aTags from response ('a' is actual) 157 | List mTagsList = (List) rspValues.get("tags"); 158 | TestTag[] aTags = new TestTag[mTagsList.size()]; 159 | int mt_i = 0; 160 | for (NamedList map : mTagsList) { 161 | List foundIds = (List) map.get("ids"); 162 | for (String id : foundIds) { 163 | aTags[mt_i++] = new TestTag( 164 | ((Number)map.get("startOffset")).intValue(), 165 | ((Number)map.get("endOffset")).intValue(), 166 | null, 167 | matchingNames.get(id)); 168 | } 169 | } 170 | return aTags; 171 | } 172 | 173 | /** REMEMBER to close() the result req object. */ 174 | protected SolrQueryRequest reqDoc(String doc, String... moreParams) { 175 | return reqDoc(doc, params(moreParams)); 176 | } 177 | 178 | /** REMEMBER to close() the result req object. 
*/ 179 | protected SolrQueryRequest reqDoc(String doc, SolrParams moreParams) { 180 | log.debug("Test doc: "+doc); 181 | SolrParams params = SolrParams.wrapDefaults(moreParams, baseParams); 182 | SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), params) {}; 183 | Iterable stream = Collections.singleton((ContentStream)new ContentStreamBase.StringStream(doc)); 184 | req.setContentStreams(stream); 185 | return req; 186 | } 187 | 188 | /** Asserts the sorted arrays are equals, with a helpful error message when not. 189 | * @param message 190 | * @param expecteds 191 | * @param actuals 192 | */ 193 | public void assertSortedArrayEquals(String message, Object[] expecteds, Object[] actuals) { 194 | AssertionError error = null; 195 | try { 196 | assertArrayEquals(null, expecteds, actuals); 197 | } catch (AssertionError e) { 198 | error = e; 199 | } 200 | if (error == null) 201 | return; 202 | TreeSet expectedRemaining = new TreeSet<>(Arrays.asList(expecteds)); 203 | expectedRemaining.removeAll(Arrays.asList(actuals)); 204 | if (!expectedRemaining.isEmpty()) 205 | fail(message+": didn't find expected "+expectedRemaining.first()+" (of "+expectedRemaining.size()+"); "+ error); 206 | TreeSet actualsRemaining = new TreeSet<>(Arrays.asList(actuals)); 207 | actualsRemaining.removeAll(Arrays.asList(expecteds)); 208 | fail(message+": didn't expect "+actualsRemaining.first()+" (of "+actualsRemaining.size()+"); "+ error); 209 | } 210 | 211 | class TestTag implements Comparable { 212 | final int startOffset, endOffset; 213 | final String substring; 214 | final String docName; 215 | 216 | TestTag(int startOffset, int endOffset, String substring, String docName) { 217 | this.startOffset = startOffset; 218 | this.endOffset = endOffset; 219 | this.substring = substring; 220 | this.docName = docName; 221 | } 222 | 223 | @Override 224 | public String toString() { 225 | return "TestTag{" + 226 | "[" + startOffset + "-" + endOffset + "]" + 227 | " doc=" + NAMES.indexOf(docName) 
+ ":'" + docName + "'" + 228 | (docName.equals(substring) || substring == null ? "" : " substr="+substring)+ 229 | '}'; 230 | } 231 | 232 | @Override 233 | public boolean equals(Object obj) { 234 | TestTag that = (TestTag) obj; 235 | return new EqualsBuilder() 236 | .append(this.startOffset, that.startOffset) 237 | .append(this.endOffset, that.endOffset) 238 | .append(this.docName, that.docName) 239 | .isEquals(); 240 | } 241 | 242 | @Override 243 | public int hashCode() { 244 | return startOffset;//cheesy but acceptable 245 | } 246 | 247 | @Override 248 | public int compareTo(Object o) { 249 | TestTag that = (TestTag) o; 250 | return new CompareToBuilder() 251 | .append(this.startOffset, that.startOffset) 252 | .append(this.endOffset, that.endOffset) 253 | .append(this.docName,that.docName) 254 | .toComparison(); 255 | } 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/ConcatenateFilterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.analysis.BaseTokenStreamTestCase; 26 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 27 | 28 | import java.io.IOException; 29 | import java.io.StringReader; 30 | 31 | public class ConcatenateFilterTest extends BaseTokenStreamTestCase { 32 | 33 | public void testTypical() throws IOException { 34 | String NYC = "new york city"; 35 | WhitespaceTokenizer stream = new WhitespaceTokenizer(); 36 | stream.setReader(new StringReader(NYC)); 37 | ConcatenateFilter filter = new ConcatenateFilter(stream); 38 | try { 39 | assertTokenStreamContents(filter, new String[]{NYC}, 40 | new int[]{0}, new int[]{NYC.length()}, new String[]{"shingle"}, 41 | new int[]{1}, null, NYC.length(), true); 42 | } catch (AssertionError e) { 43 | //assertTokenStreamContents tries to test if tokenStream.end() was implemented correctly. 44 | // It's manner of checking this is imperfect and incompatible with 45 | // ConcatenateFilter. Specifically it modifies a special attribute *after* incrementToken(), 46 | // which is weird. To the best of my ability, end() appears to be implemented correctly. 47 | if (!e.getMessage().equals("super.end()/clearAttributes() was not called correctly in end()")) 48 | throw e; 49 | } 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/EmbeddedSolrNoSerializeTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 
9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.document.Field; 26 | import org.apache.solr.SolrTestCaseJ4; 27 | import org.apache.solr.client.solrj.SolrServerException; 28 | import org.apache.solr.client.solrj.StreamingResponseCallback; 29 | import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; 30 | import org.apache.solr.client.solrj.request.QueryRequest; 31 | import org.apache.solr.client.solrj.response.QueryResponse; 32 | import org.apache.solr.common.SolrDocument; 33 | import org.apache.solr.common.SolrDocumentList; 34 | import org.apache.solr.common.params.ModifiableSolrParams; 35 | import org.apache.solr.common.params.SolrParams; 36 | import org.apache.solr.common.util.ContentStream; 37 | import org.apache.solr.common.util.ContentStreamBase; 38 | import org.junit.Before; 39 | import org.junit.BeforeClass; 40 | import org.junit.Ignore; 41 | import org.junit.Test; 42 | 43 | import java.io.IOException; 44 | import java.util.Collection; 45 | import java.util.Collections; 46 | import java.util.concurrent.atomic.AtomicReference; 47 | import java.util.function.BiFunction; 48 | 49 | /** 50 | * Tests that we can skip serialization of the documents when embedding 51 | * Solr. 
52 | * 53 | * @author David Smiley - dsmiley@apache.org 54 | */ 55 | public class EmbeddedSolrNoSerializeTest extends SolrTestCaseJ4 { 56 | 57 | static EmbeddedSolrServer solrServer; 58 | 59 | @BeforeClass 60 | public static void init() throws Exception { 61 | initCore("solrconfig.xml", "schema.xml"); 62 | solrServer = new EmbeddedSolrServer(h.getCoreContainer(), "collection1"); 63 | //we don't need to close the EmbeddedSolrServer because SolrTestCaseJ4 closes the core 64 | } 65 | 66 | @Before 67 | public void setUp() throws Exception { 68 | super.setUp(); 69 | clearIndex(); 70 | assertU(adoc("id", "9999", "name", "Boston")); 71 | assertU(commit()); 72 | } 73 | 74 | @Test 75 | public void testTag() throws SolrServerException, IOException { 76 | ModifiableSolrParams params = params(); 77 | String input = "foo boston bar";//just one tag; 78 | QueryRequest req = new SolrTaggerRequest(params, input); 79 | req.setPath("/tag"); 80 | 81 | QueryResponse rsp = req.process(solrServer); 82 | SolrDocumentList results= (SolrDocumentList) rsp.getResponse().get("response"); 83 | assertNotNull(rsp.getResponse().get("tags")); 84 | assertNotNull(results.get(0)); 85 | } 86 | 87 | @SuppressWarnings("serial") 88 | public static class SolrTaggerRequest extends QueryRequest { 89 | 90 | private final String input; 91 | 92 | public SolrTaggerRequest(SolrParams p, String input) { 93 | super(p, METHOD.POST); 94 | this.input = input; 95 | } 96 | 97 | // Deprecated in 7.2 but should live on until 8.x 98 | @SuppressWarnings("deprecation") 99 | @Override 100 | public Collection getContentStreams() { 101 | return Collections.singleton(new ContentStreamBase.StringStream(input)); 102 | } 103 | 104 | // As of 7.2. 
But won't work until: https://issues.apache.org/jira/browse/SOLR-12142 105 | // @Override 106 | // public RequestWriter.ContentWriter getContentWriter(String expectedType) { 107 | // return new RequestWriter.StringPayloadContentWriter(input, "text/plain; charset=UTF8"); 108 | // } 109 | } 110 | 111 | @Test 112 | public void testSearch() throws Exception { 113 | QueryResponse rsp = solrServer.query(params("q", "name:Boston")); 114 | assertNotNull(rsp.getResults().get(0)); 115 | } 116 | 117 | @Test 118 | public void testAssertTagStreamingWithSolrTaggerRequest() throws Exception { 119 | doTestAssertTagStreaming(SolrTaggerRequest::new); 120 | } 121 | 122 | @Test @Ignore("As of Solr 7, stream.body is disabled by default for security ") // DWS: dubious, IMO 123 | // and it can't be enabled with EmbeddedSolrServer until SOLR-12126 124 | public void testAssertTagStreamingWithStreamBodyParam() throws Exception { 125 | doTestAssertTagStreaming((params, input) -> { 126 | params.set("stream.body", input); 127 | return new QueryRequest(params); 128 | }); 129 | } 130 | 131 | public void doTestAssertTagStreaming(BiFunction newQueryRequest) throws IOException, SolrServerException { 132 | ModifiableSolrParams params = params(); 133 | String input = "foo boston bar";//just one tag; 134 | QueryRequest req = newQueryRequest.apply(params, input); 135 | req.setPath("/tag"); 136 | 137 | final AtomicReference refDoc = new AtomicReference<>(); 138 | req.setStreamingResponseCallback(new StreamingResponseCallback() { 139 | @Override 140 | public void streamSolrDocument(SolrDocument doc) { 141 | refDoc.set(doc); 142 | } 143 | 144 | @Override 145 | public void streamDocListInfo(long numFound, long start, Float maxScore) { 146 | 147 | } 148 | }); 149 | QueryResponse rsp = req.process(solrServer); 150 | assertNotNull(rsp.getResponse().get("tags")); 151 | assertNotNull(refDoc.get()); 152 | assertEquals("Boston", ((Field)refDoc.get().getFieldValue("name")).stringValue()); 153 | } 154 | } 155 | 
-------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/HtmlInterpolationTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.junit.Ignore; 26 | import org.junit.Test; 27 | 28 | public class HtmlInterpolationTest extends XmlInterpolationTest { 29 | @Override 30 | public void setUp() throws Exception { 31 | super.setUp(); 32 | baseParams.set("htmlOffsetAdjust", "true"); 33 | baseParams.set("matchText", "true"); 34 | } 35 | 36 | @Override 37 | @Test @Ignore //because in html mode, seemingly everything is valid 38 | public void testValidatingXml() throws Exception { 39 | } 40 | 41 | @Override 42 | @Test @Ignore //because in html mode, seemingly everything is valid 43 | public void testInvalidXml() throws Exception { 44 | } 45 | 46 | @Override 47 | protected void validateXml(String xml) throws Exception { 48 | //cause this test to *not* try to parse as actual html 49 | } 50 | 51 | @Test 52 | public void testHtml() throws Exception { 53 | buildNames("start end"); 54 | 55 | assertXmlTag("before start
end after
", true);//br is assumed empty 56 | 57 | //no wrapping tags: 58 | assertXmlTag("start end", true); 59 | assertXmlTag("start end other text", true); 60 | assertXmlTag("start end other text", true); 61 | assertXmlTag("other text start end", true); 62 | assertXmlTag("start end", true); 63 | } 64 | 65 | @Test 66 | public void testHtmlNonTaggable() throws Exception { 67 | baseParams.set("nonTaggableTags","a" + (random().nextBoolean() ? ",sub" : "")); 68 | buildNames("start end"); 69 | 70 | assertXmlTag("start end", true); 71 | assertXmlTag("start end", false); 72 | assertXmlTag("start end", false); 73 | assertXmlTag("before start
end after
", true);//adjacent 74 | assertXmlTag("before inner start
end after
", true); 75 | 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/RandomizedTaggerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import com.carrotsearch.randomizedtesting.annotations.Repeat; 26 | import com.carrotsearch.randomizedtesting.generators.RandomNumbers; 27 | import com.carrotsearch.randomizedtesting.generators.RandomPicks; 28 | import com.carrotsearch.randomizedtesting.generators.RandomStrings; 29 | import org.junit.BeforeClass; 30 | import org.junit.Test; 31 | 32 | import java.util.ArrayList; 33 | import java.util.HashSet; 34 | import java.util.List; 35 | import java.util.Random; 36 | import java.util.Set; 37 | 38 | /** 39 | * Randomly generate taggable text and verify via simple tag algorithm. 
40 | */ 41 | @Repeat(iterations = 10) 42 | public class RandomizedTaggerTest extends AbstractTaggerTest { 43 | 44 | @BeforeClass 45 | public static void beforeClass() throws Exception { 46 | initCore("solrconfig.xml", "schema.xml"); 47 | } 48 | 49 | @Test 50 | public void test() throws Exception { 51 | final Random R = random(); 52 | 53 | Set names = new HashSet<>(); 54 | //random list of single-word names 55 | final int NUM_SINGLES = 4;//RandomInts.randomIntBetween(R, 1, 5); 56 | for (int i = 0; i < NUM_SINGLES; i++) { 57 | if (i == 0)//first is a big string (perhaps triggers bugs related to growing buffers) 58 | names.add(randomStringOfLength(16, 32)); 59 | else 60 | names.add(randomString()); 61 | } 62 | 63 | //add random list of multi-word names, partially including existing names 64 | final int NUM_MULTI = 10; 65 | for (int i = 0; i < NUM_MULTI; i++) { 66 | final int numWords = RandomNumbers.randomIntBetween(R, 2, 4); 67 | StringBuilder buf = new StringBuilder(); 68 | for (int j = 0; j < numWords; j++) { 69 | if (j != 0) 70 | buf.append(' '); 71 | if (R.nextBoolean()) {//new likely non-existent word 72 | buf.append(randomString()); 73 | } else {//existing word (possible multi-word from prev iteration) 74 | buf.append(RandomPicks.randomFrom(R, names)); 75 | } 76 | } 77 | names.add(buf.toString()); 78 | } 79 | 80 | // BUILD NAMES 81 | buildNames(names.toArray(new String[names.size()])); 82 | 83 | // QUERY LOOP 84 | for (int tTries = 0; tTries < 10 * RANDOM_MULTIPLIER; tTries++) { 85 | // Build up random input, similar to multi-word random names above 86 | StringBuilder input = new StringBuilder(); 87 | final int INPUT_WORD_LEN = 20; 88 | input.append(' ');//must start with space based on assertBruteForce logic 89 | for (int i = 0; i < INPUT_WORD_LEN; i++) { 90 | if (R.nextBoolean()) {//new likely non-existent word 91 | input.append(randomString()); 92 | } else {//existing word (possible multi-word from prev iteration) 93 | input.append(RandomPicks.randomFrom(R, 
NAMES)); 94 | } 95 | input.append(' ');//must end with a space 96 | } 97 | 98 | boolean madeIt = false; 99 | try { 100 | assertBruteForce(input.toString()); 101 | madeIt = true; 102 | } finally { 103 | if (!madeIt) { 104 | System.out.println("Reproduce with:"); 105 | System.out.print(" buildNames("); 106 | for (int i = 0; i < NAMES.size(); i++) { 107 | if (i != 0) 108 | System.out.print(','); 109 | System.out.print('"'); 110 | System.out.print(NAMES.get(i)); 111 | System.out.print('"'); 112 | } 113 | System.out.println(");"); 114 | System.out.println(" assertBruteForce(\"" + input+"\");"); 115 | } 116 | } 117 | } 118 | 119 | } 120 | 121 | private void assertBruteForce(String input) throws Exception { 122 | assert input.matches(" .* "); 123 | baseParams.set("overlaps", "ALL"); 124 | 125 | //loop through NAMES and find all tag offsets 126 | List testTags = new ArrayList<>(); 127 | for (String name : NAMES) { 128 | String spaceName = " "+name+" "; 129 | int off = 0; 130 | while (true) { 131 | int idx = input.indexOf(spaceName, off); 132 | if (idx < 0) 133 | break; 134 | testTags.add(new TestTag(idx + 1, idx + 1 + name.length(), name, name)); 135 | off = idx + 1; 136 | } 137 | } 138 | 139 | //assert 140 | assertTags(reqDoc(input), testTags.toArray(new TestTag[testTags.size()])); 141 | } 142 | 143 | private String randomString() { return randomStringOfLength(1, 1); } 144 | 145 | private String randomStringOfLength(int min, int max) { 146 | return RandomStrings.randomAsciiLettersOfLengthBetween(random(), min, max).toLowerCase(); 147 | } 148 | 149 | } 150 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/Tagger2Test.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. 
W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.junit.BeforeClass; 26 | import org.junit.Ignore; 27 | import org.junit.Test; 28 | 29 | import java.nio.charset.StandardCharsets; 30 | 31 | /** 32 | * Test the {@link org.opensextant.solrtexttagger.TaggerRequestHandler}. 
33 | */ 34 | public class Tagger2Test extends AbstractTaggerTest { 35 | 36 | @BeforeClass 37 | public static void beforeClass() throws Exception { 38 | initCore("solrconfig.xml", "schema.xml"); 39 | } 40 | 41 | @Override 42 | public void setUp() throws Exception { 43 | super.setUp(); 44 | baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); 45 | } 46 | 47 | /** whole matching, no sub-tags */ 48 | @Test 49 | public void testLongestDominantRight() throws Exception { 50 | buildNames("in", "San", "in San", "Francisco", "San Francisco", 51 | "San Francisco State College", "College of California", 52 | "Clayton", "Clayton North", "North Carolina"); 53 | 54 | assertTags("He lived in San Francisco.", 55 | "in", "San Francisco"); 56 | 57 | assertTags("He enrolled in San Francisco State College of California", 58 | "in", "San Francisco State College"); 59 | 60 | assertTags("He lived in Clayton North Carolina", 61 | "in", "Clayton", "North Carolina"); 62 | 63 | } 64 | 65 | // As of Lucene/Solr 4.9, StandardTokenizer never does this anymore (reported to Lucene dev-list, 66 | // Jan 26th 2015. Honestly it's not particularly important to us but it renders this test 67 | // pointless. 
68 | /** Orig issue https://github.com/OpenSextant/SolrTextTagger/issues/2 related: #13 */ 69 | @Test 70 | @Ignore 71 | public void testVeryLongWord() throws Exception { 72 | String SANFRAN = "San Francisco"; 73 | buildNames(SANFRAN); 74 | 75 | // exceeds default 255 max token length which means it in-effect becomes a stop-word 76 | StringBuilder STOP = new StringBuilder(260);//>255 77 | for (int i = 0; i < STOP.capacity(); i++) { 78 | STOP.append((char) ('0' + (i % 10))); 79 | } 80 | 81 | String doc = "San " + STOP + " Francisco"; 82 | assertTags(doc);//no match due to default stop word handling 83 | //and we find it when we ignore stop words 84 | assertTags(reqDoc(doc, "ignoreStopwords", "true"), new TestTag(0, doc.length(), doc, lookupByName(SANFRAN))); 85 | } 86 | 87 | /** Support for stopwords (posInc > 1); 88 | * discussion: https://github.com/OpenSextant/SolrTextTagger/issues/13 */ 89 | @Test 90 | public void testStopWords() throws Exception { 91 | baseParams.set("qt", "/tagStop");//stop filter (pos inc enabled) index & query 92 | 93 | String SOUTHOFWALES = "South of Wales";//'of' is stop word index time & query 94 | String ACITYA = "A City A"; 95 | 96 | buildNames(SOUTHOFWALES, ACITYA); 97 | 98 | //round-trip works 99 | assertTags(reqDoc(SOUTHOFWALES), new TestTag(0, SOUTHOFWALES.length(), SOUTHOFWALES, 100 | lookupByName(SOUTHOFWALES))); 101 | // but offsets doesn't include stopword when leading or trailing... 
102 | assertTags(reqDoc(ACITYA), new TestTag(2, 6, "City", 103 | lookupByName(ACITYA))); 104 | //break on stop words 105 | assertTags(reqDoc(SOUTHOFWALES, "ignoreStopwords", "false"));//match nothing 106 | } 107 | 108 | /** Ensure character offsets work for multi-byte characters */ 109 | @Test 110 | public void testMultibyteChar() throws Exception { 111 | // https://unicode-table.com/en/2019/ 112 | // 0 1 2 3 4 113 | // 01234567890123456789012345678901234567890 114 | String TEXT = "He mentionned ’Obama’ in the White House"; 115 | assertEquals(40, TEXT.length()); // char length (in Java, UTF16) 116 | 117 | String QUOTE = TEXT.substring(14, 15); 118 | assertEquals(8217, QUOTE.codePointAt(0)); 119 | 120 | //UTF8 121 | assertEquals(3, QUOTE.getBytes(StandardCharsets.UTF_8).length); 122 | assertEquals(1, "a".getBytes(StandardCharsets.UTF_8).length); 123 | assertEquals(40 + 2*2, TEXT.getBytes(StandardCharsets.UTF_8).length); 124 | 125 | //UTF16 big endian (by specifying big/little endian, there is no "byte order mark") 126 | assertEquals(2, QUOTE.getBytes(StandardCharsets.UTF_16BE).length); 127 | assertEquals(2, "a".getBytes(StandardCharsets.UTF_16BE).length); 128 | assertEquals(40 * 2, TEXT.getBytes(StandardCharsets.UTF_16BE).length); 129 | 130 | 131 | buildNames("Obama"); 132 | 133 | assertTags(TEXT, "Obama"); 134 | 135 | // TODO test surrogate pairs (i.e. code points not in the BMP) 136 | } 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/TaggerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 
9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.solr.common.params.CommonParams; 26 | import org.apache.solr.common.params.ModifiableSolrParams; 27 | import org.apache.solr.request.SolrQueryRequest; 28 | import org.junit.BeforeClass; 29 | import org.junit.Test; 30 | 31 | import java.util.Arrays; 32 | import java.util.stream.Collectors; 33 | 34 | /** 35 | * The original test for {@link org.opensextant.solrtexttagger.TaggerRequestHandler}. 
36 | */ 37 | public class TaggerTest extends AbstractTaggerTest { 38 | 39 | @BeforeClass 40 | public static void beforeClass() throws Exception { 41 | initCore("solrconfig.xml", "schema.xml"); 42 | } 43 | 44 | private void indexAndBuild() throws Exception { 45 | N[] names = N.values(); 46 | String[] namesStrs = new String[names.length]; 47 | for (int i = 0; i < names.length; i++) { 48 | namesStrs[i] = names[i].getName(); 49 | } 50 | buildNames(namesStrs); 51 | } 52 | 53 | /** Name corpus */ 54 | enum N { 55 | //keep order to retain ord() 56 | London, London_Business_School, Boston, City_of_London, 57 | of, the//filtered out of the corpus by a custom query 58 | ; 59 | 60 | String getName() { return name().replace('_',' '); } 61 | static N lookupByName(String name) { return N.valueOf(name.replace(' ', '_')); } 62 | int getId() { return ordinal(); } 63 | } 64 | 65 | @Test 66 | public void testFormat() throws Exception { 67 | baseParams.set("qt", "/tagPartial"); 68 | baseParams.set("overlaps", "NO_SUB"); 69 | indexAndBuild(); 70 | 71 | String rspStr = _testFormatRequest(false); 72 | String expected = "\n" + 73 | "\n" + 74 | "1" + 75 | "" + 76 | "0" + 77 | "6" + 78 | "1" + 79 | "" + 80 | "" + 81 | "1London Business School" + 82 | "\n" + 83 | "\n"; 84 | assertEquals(expected, rspStr); 85 | } 86 | 87 | @Test 88 | public void testFormatMatchText() throws Exception { 89 | baseParams.set("qt", "/tagPartial"); 90 | baseParams.set("overlaps", "NO_SUB"); 91 | indexAndBuild(); 92 | 93 | String rspStr = _testFormatRequest(true); 94 | String expected = "\n" + 95 | "\n" + 96 | "1" + 97 | "" + 98 | "0" + 99 | "6<" + 100 | "str name=\"matchText\">school" + 101 | "1" + 102 | "" + 103 | "" + 104 | "1London Business School" + 105 | "\n" + 106 | "\n"; 107 | assertEquals(expected, rspStr); 108 | } 109 | 110 | private String _testFormatRequest(boolean matchText) throws Exception { 111 | String doc = "school";//just one tag 112 | SolrQueryRequest req = reqDoc(doc, "indent", "off", 
"omitHeader", "on", "matchText", ""+matchText); 113 | String rspStr = h.query(req); 114 | req.close(); 115 | return rspStr; 116 | } 117 | 118 | @Test 119 | /** Partial matching, no sub-tags */ 120 | public void testPartialMatching() throws Exception { 121 | baseParams.set("qt", "/tagPartial"); 122 | baseParams.set("overlaps", "NO_SUB"); 123 | indexAndBuild(); 124 | 125 | //these match nothing 126 | assertTags(reqDoc("") ); 127 | assertTags(reqDoc(" ") ); 128 | assertTags(reqDoc("the") ); 129 | 130 | String doc; 131 | 132 | //just London Business School via "school" substring 133 | doc = "school"; 134 | assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School)); 135 | 136 | doc = "a school"; 137 | assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School)); 138 | 139 | doc = "school a"; 140 | assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School)); 141 | 142 | //More interesting 143 | 144 | doc = "school City"; 145 | assertTags(reqDoc(doc), 146 | tt(doc, "school", 0, N.London_Business_School), 147 | tt(doc, "City", 0, N.City_of_London) ); 148 | 149 | doc = "City of London Business School"; 150 | assertTags(reqDoc(doc), //no plain London (sub-tag) 151 | tt(doc, "City of London", 0, N.City_of_London), 152 | tt(doc, "London Business School", 0, N.London_Business_School)); 153 | } 154 | 155 | @Test 156 | /** whole matching, no sub-tags */ 157 | public void testWholeMatching() throws Exception { 158 | baseParams.set("qt", "/tag"); 159 | baseParams.set("overlaps", "NO_SUB"); 160 | indexAndBuild(); 161 | 162 | //these match nothing 163 | assertTags(reqDoc("")); 164 | assertTags(reqDoc(" ") ); 165 | assertTags(reqDoc("the") ); 166 | 167 | //partial on N.London_Business_School matches nothing 168 | assertTags(reqDoc("school") ); 169 | assertTags(reqDoc("a school") ); 170 | assertTags(reqDoc("school a") ); 171 | assertTags(reqDoc("school City") ); 172 | 173 | String doc; 174 | 175 | doc = "school business london";//backwards 176 | 
assertTags(reqDoc(doc), tt(doc,"london", 0, N.London)); 177 | 178 | doc = "of London Business School"; 179 | assertTags(reqDoc(doc), //no plain London (sub-tag) 180 | tt(doc, "London Business School", 0, N.London_Business_School)); 181 | 182 | //More interesting 183 | doc = "City of London Business School"; 184 | assertTags(reqDoc(doc), //no plain London (sub-tag) 185 | tt(doc, "City of London", 0, N.City_of_London), 186 | tt(doc, "London Business School", 0, N.London_Business_School)); 187 | 188 | doc = "City of London Business"; 189 | assertTags(reqDoc(doc), //no plain London (sub-tag) no Business (partial-match) 190 | tt(doc, "City of London", 0, N.City_of_London)); 191 | 192 | doc = "London Business magazine"; 193 | assertTags(reqDoc(doc), //Just London; L.B.S. fails 194 | tt(doc, "London", 0, N.London)); 195 | } 196 | 197 | @Test 198 | /** whole matching, with sub-tags */ 199 | public void testSubTags() throws Exception { 200 | baseParams.set("qt", "/tag"); 201 | baseParams.set("overlaps", "ALL"); 202 | indexAndBuild(); 203 | 204 | //these match nothing 205 | assertTags(reqDoc("")); 206 | assertTags(reqDoc(" ") ); 207 | assertTags(reqDoc("the") ); 208 | 209 | //partial on N.London_Business_School matches nothing 210 | assertTags(reqDoc("school") ); 211 | assertTags(reqDoc("a school") ); 212 | assertTags(reqDoc("school a") ); 213 | assertTags(reqDoc("school City") ); 214 | 215 | String doc; 216 | 217 | doc = "school business london";//backwards 218 | assertTags(reqDoc(doc), tt(doc,"london", 0, N.London)); 219 | 220 | //More interesting 221 | doc = "City of London Business School"; 222 | assertTags(reqDoc(doc), 223 | tt(doc, "City of London", 0, N.City_of_London), 224 | tt(doc, "London", 0, N.London), 225 | tt(doc, "London Business School", 0, N.London_Business_School)); 226 | 227 | doc = "City of London Business"; 228 | assertTags(reqDoc(doc), 229 | tt(doc, "City of London", 0, N.City_of_London), 230 | tt(doc, "London", 0, N.London)); 231 | } 232 | 233 | @Test 
234 | public void testMultipleFilterQueries() throws Exception { 235 | baseParams.set("qt", "/tag"); 236 | baseParams.set("overlaps", "ALL"); 237 | 238 | // build up the corpus with some additional fields for filtering purposes 239 | deleteByQueryAndGetVersion("*:*", null); 240 | 241 | int i = 0; 242 | assertU(adoc("id", ""+i++, "name", N.London.getName(), "type", "city", "country", "UK")); 243 | assertU(adoc("id", ""+i++, "name", N.London_Business_School.getName(), "type", "school", "country", "UK")); 244 | assertU(adoc("id", ""+i++, "name", N.Boston.getName(), "type", "city", "country", "US")); 245 | assertU(adoc("id", ""+i++, "name", N.City_of_London.getName(), "type", "org", "country", "UK")); 246 | assertU(commit()); 247 | 248 | // not calling buildNames so that we can bring along extra attributes for filtering 249 | NAMES = Arrays.stream(N.values()).map(N::getName).collect(Collectors.toList()); 250 | 251 | // phrase that matches everything 252 | String doc = "City of London Business School in Boston"; 253 | 254 | // first do no filtering 255 | ModifiableSolrParams p = new ModifiableSolrParams(); 256 | p.add(CommonParams.Q, "*:*"); 257 | assertTags(reqDoc(doc, p), 258 | tt(doc, "City of London", 0, N.City_of_London), 259 | tt(doc, "London", 0, N.London), 260 | tt(doc, "London Business School", 0, N.London_Business_School), 261 | tt(doc, "Boston", 0, N.Boston)); 262 | 263 | // add a single fq 264 | p.add(CommonParams.FQ, "type:city"); 265 | assertTags(reqDoc(doc, p), 266 | tt(doc, "London", 0, N.London), 267 | tt(doc, "Boston", 0, N.Boston)); 268 | 269 | // add another fq 270 | p.add(CommonParams.FQ, "country:US"); 271 | assertTags(reqDoc(doc, p), 272 | tt(doc, "Boston", 0, N.Boston)); 273 | } 274 | 275 | private TestTag tt(String doc, String substring, int substringIndex, N name) { 276 | assert substringIndex == 0; 277 | 278 | //little bit of copy-paste code from super.tt() 279 | int startOffset = -1, endOffset; 280 | int substringIndex1 = 0; 281 | for(int i = 
0; i <= substringIndex1; i++) { 282 | startOffset = doc.indexOf(substring, ++startOffset); 283 | assert startOffset >= 0 : "The test itself is broken"; 284 | } 285 | endOffset = startOffset+ substring.length();//1 greater (exclusive) 286 | return new TestTag(startOffset, endOffset, substring, lookupByName(name.getName())); 287 | } 288 | 289 | } 290 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/TaggingAttributeTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.junit.BeforeClass; 26 | import org.junit.Test; 27 | 28 | /** 29 | * Test the {@link org.opensextant.solrtexttagger.TaggerRequestHandler} with 30 | * a Analyzer chain that does use the {@link TaggingAttribute}. See the test 31 | * configuration under 'taggingattribute'. 
32 | */ 33 | public class TaggingAttributeTest extends AbstractTaggerTest { 34 | 35 | @BeforeClass 36 | public static void beforeClass() throws Exception { 37 | //NOTE: We use the TaggingAttribute specific configuration 38 | // Reference solr-home in target/test-classes since that's where it's copied and any config 39 | // persisting (e.g. from rest managed stuff) will happen there. 40 | initCore("solrconfig.xml", "schema.xml", "target/test-classes/taggingattribute"); 41 | } 42 | 43 | @Test 44 | /** 45 | * Whole matching, no sub-tags. Links only words with > 3 letters. 46 | * Because of that "San" is not used to start tags 47 | * 48 | */ 49 | public void testTaggingAttribute() throws Exception { 50 | // this test is based on the longest dominant right test, so we use the 51 | // the same TagClusterReducer setting 52 | baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); 53 | 54 | buildNames("in", "San", "in San", "Francisco", "San Francisco", 55 | "San Francisco State College", "College of California", 56 | "Clayton", "Clayton North", "North Carolina"); 57 | 58 | assertTags("He lived in San Francisco.", 59 | //"in", "San Francisco"); //whis would be expected without taggable 60 | "Francisco");// this are the expected results with taggable 61 | 62 | assertTags("He enrolled in San Francisco State College of California", 63 | //"in", "San Francisco State College"); //without taggable enabled 64 | "Francisco", "College of California");// With taggable 65 | //NOTE this also tests that started tags are advanced for non-taggable 66 | // tokens, as otherwise 'College of California' would not be 67 | // suggested. 
68 | 69 | assertTags("He lived in Clayton North Carolina", 70 | //"in", "Clayton", "North Carolina"); 71 | "Clayton", "North Carolina"); 72 | 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/WordLengthTaggingFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.analysis.TokenFilter; 26 | import org.apache.lucene.analysis.TokenStream; 27 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 28 | 29 | import java.io.IOException; 30 | 31 | /** 32 | * Simple TokenFilter that lookup only Tokens with more as the parsed number 33 | * of chars.

34 | * NOTE:This implementation is only intended to be used as an example 35 | * and for unit testing the {@link TaggingAttribute} feature. Typically 36 | * implementations will be based on NLP results (e.g. using POS tags or 37 | * detected Named Entities). 38 | *

39 | * Example Usage:

40 | * Currently the usage requires to modify the Analyzer as defined by the 41 | * indexedField. An alternative would be to allow the configuration 42 | * of a special FieldType in the schema.xml and use this Analyzer for processing 43 | * the text sent to the request.

44 | * While the current solution is fine for direct API usage, defining the 45 | * Analyzer in the schema.xml would be better suitable for using this feature 46 | * with the {@link TaggerRequestHandler}. 47 | * 48 | *

 49 |  *     Analyzer analyzer = req.getSchema().getField(indexedField).getType().getAnalyzer();
 50 |  *     //get the TokenStream from the Analyzer
 51 |  *     TokenStream baseStream = analyzer.tokenStream("", reader);
 52 |  *     //add a FilterStream that sets the LookupAttribute to the end
 53 |  *     TokenStream filterStream = new WordLengthLookupFilter(baseStream);
 54 |  *     //create the Tagger using the modified analyzer chain.
 55 |  *     new Tagger(corpus, filterStream, tagClusterReducer) {
 56 |  *
 57 |  *         protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
 58 |  *             //implement the callback
 59 |  *         }
 60 |  *
 61 |  *     }.process();
 62 |  * 
63 | * 64 | * @author Rupert Westenthaler 65 | */ 66 | public class WordLengthTaggingFilter extends TokenFilter { 67 | 68 | /** 69 | * The default minimum length is 3 70 | */ 71 | public static final int DEFAULT_MIN_LENGTH = 3; 72 | private final TaggingAttribute lookupAtt = addAttribute(TaggingAttribute.class); 73 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 74 | private int minLength; 75 | 76 | /** 77 | * TokenFilter only marks tokens to be looked up with equals or more as 78 | * {@link #DEFAULT_MIN_LENGTH} characters 79 | * 80 | * @param input 81 | */ 82 | public WordLengthTaggingFilter(TokenStream input) { 83 | this(input, null); 84 | } 85 | 86 | /** 87 | * TokenFilter only marks tokens to be looked up with equals or more characters 88 | * as the parsed minimum. 89 | * 90 | * @param input the TokenStream to consume tokens from 91 | * @param minLength The minimum length to lookup a Token. null 92 | * or <= 0 to use the #DEFAULT_MIN_LENGTH 93 | */ 94 | public WordLengthTaggingFilter(TokenStream input, Integer minLength) { 95 | super(input); 96 | if (minLength == null || minLength <= 0) { 97 | this.minLength = DEFAULT_MIN_LENGTH; 98 | } else { 99 | this.minLength = minLength; 100 | } 101 | } 102 | 103 | @Override 104 | public final boolean incrementToken() throws IOException { 105 | if (input.incrementToken()) { 106 | int size = offsetAtt.endOffset() - offsetAtt.startOffset(); 107 | lookupAtt.setTaggable(size >= minLength); 108 | return true; 109 | } else { 110 | return false; 111 | } 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/WordLengthTaggingFilterFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. 
W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.analysis.TokenStream; 26 | import org.apache.lucene.analysis.util.TokenFilterFactory; 27 | import org.slf4j.Logger; 28 | import org.slf4j.LoggerFactory; 29 | 30 | import java.util.Map; 31 | 32 | public class WordLengthTaggingFilterFactory extends TokenFilterFactory { 33 | 34 | private final Logger log = LoggerFactory.getLogger(WordLengthTaggingFilterFactory.class); 35 | 36 | public static final String MIN_LENGTH = "minLength"; 37 | 38 | private final Integer minLength; 39 | 40 | public WordLengthTaggingFilterFactory(Map args) { 41 | super(args); 42 | int minLength = -1; 43 | Object value = args.get(MIN_LENGTH); 44 | if (value != null) { 45 | try { 46 | minLength = Integer.parseInt(value.toString()); 47 | } catch (NumberFormatException e) { 48 | log.warn("Unable to parse minLength from value 'minLength=\"{}\"'", value); 49 | 50 | } 51 | } 52 | if (minLength <= 0) { 53 | log.info("use default minLength={}", WordLengthTaggingFilter.DEFAULT_MIN_LENGTH); 54 | this.minLength = null; 55 | } else { 56 | log.info("set minLength={}", minLength); 57 | this.minLength = minLength; 58 
| } 59 | } 60 | 61 | @Override 62 | public TokenStream create(TokenStream input) { 63 | return new WordLengthTaggingFilter(input, minLength); 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/XmlInterpolationTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.commons.io.IOUtils; 26 | import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; 27 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 28 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 29 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 30 | import org.apache.solr.common.SolrException; 31 | import org.apache.solr.request.SolrQueryRequest; 32 | import org.apache.solr.response.SolrQueryResponse; 33 | import org.junit.BeforeClass; 34 | import org.junit.Test; 35 | import org.xml.sax.InputSource; 36 | 37 | import javax.xml.parsers.DocumentBuilder; 38 | import javax.xml.parsers.DocumentBuilderFactory; 39 | import java.io.IOException; 40 | import java.io.Reader; 41 | import java.io.StringReader; 42 | import java.util.ArrayList; 43 | import java.util.Collections; 44 | import java.util.List; 45 | 46 | public class XmlInterpolationTest extends AbstractTaggerTest { 47 | 48 | private static DocumentBuilder xmlDocBuilder; 49 | 50 | 51 | @BeforeClass 52 | public static void beforeClass() throws Exception { 53 | DocumentBuilderFactory xmlDocBuilderFactory = DocumentBuilderFactory.newInstance(); 54 | xmlDocBuilderFactory.setValidating(true); 55 | xmlDocBuilderFactory.setNamespaceAware(true); 56 | xmlDocBuilder = xmlDocBuilderFactory.newDocumentBuilder(); 57 | 58 | initCore("solrconfig.xml", "schema.xml"); 59 | } 60 | 61 | @Override 62 | public void setUp() throws Exception { 63 | super.setUp(); 64 | baseParams.set("qt", "/tagXml"); 65 | baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); 66 | baseParams.set("xmlOffsetAdjust", "true"); 67 | } 68 | 69 | @Test 70 | public void test() throws Exception { 71 | buildNames("start end"); 72 | 73 | assertXmlTag("before start end after", true); 74 | assertXmlTag("before start
end after
", true); 75 | assertXmlTag("before start end after", true); 76 | assertXmlTag("before start end after", true); 77 | assertXmlTag("before start end after", true); 78 | assertXmlTag("before start end after", true);//adjacent tags 79 | assertXmlTag("before start end after", true); 80 | assertXmlTag("before start end after", true); 81 | 82 | assertXmlTag("

before start

end after
", false); 83 | assertXmlTag("before start

end after

", false); 84 | 85 | assertXmlTag("before start end after", true); 86 | } 87 | 88 | @Test(expected = SolrException.class) 89 | public void testInvalidXml() throws Exception { 90 | assertXmlTag("notXml", false); 91 | } 92 | 93 | @Test(expected = Exception.class) 94 | public void testValidatingXml() throws Exception { 95 | validateXml("foo"); 96 | } 97 | 98 | protected void assertXmlTag(String docText, boolean expected) throws Exception { 99 | final SolrQueryRequest req = reqDoc(docText); 100 | try { // 5.4 and beyond we can use try-with-resources 101 | final SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get("qt"), req); 102 | final TestTag[] testTags = pullTagsFromResponse(req, rsp); 103 | if (!expected) { 104 | assertEquals(0, testTags.length); 105 | } else { 106 | assertEquals(1, testTags.length); 107 | final TestTag tag = testTags[0]; 108 | validateXml(insertAnchorAtOffsets(docText, tag.startOffset, tag.endOffset, tag.docName)); 109 | } 110 | } finally { 111 | req.close(); 112 | } 113 | } 114 | 115 | protected void validateXml(String xml) throws Exception { 116 | // the "parse" method also validates XML, will throw an exception if mis-formatted 117 | xmlDocBuilder.parse(new InputSource(new StringReader(xml))); 118 | } 119 | 120 | 121 | @Test 122 | public void testLuceneHtmlFilterBehavior() { 123 | String docText; 124 | 125 | //Close tag adjacent to start & end results in end offset including the close tag. 
LUCENE-5734 126 | docText = "start end"; 127 | assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end")); 128 | 129 | //Space after "end" means offset doesn't include 130 | docText = "start end "; 131 | assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end")); 132 | 133 | //Matches entity at end 134 | final String endStr = String.format("en&#x%02x;", (int) 'd'); 135 | docText = "start " + endStr + ""; 136 | assertArrayEquals(tagExpect(docText, "start", endStr), analyzeTagOne(docText, "start", "end")); 137 | //... and at start 138 | final String startStr = String.format("&#x%02x;tart", (int) 's'); 139 | docText = "" + startStr + " end"; 140 | assertArrayEquals(tagExpect(docText, startStr, "end"), analyzeTagOne(docText, "start", "end")); 141 | 142 | //Test ignoring proc instructions & comments. Note: doesn't expand the entity to "start". 143 | docText = "" 145 | + "]>&start;"; 146 | assertArrayEquals(new int[]{-1, -1}, analyzeTagOne(docText, "start", "start")); 147 | 148 | //Test entity behavior 149 | docText = " — – & &foo;   a b"; 150 | assertArrayEquals(new String[]{"—", "–", "&", "&foo;", "\u00A0", "a", "b"}, 151 | analyzeReturnTokens(docText)); 152 | 153 | //Observe offset adjustment of trailing entity to end tag 154 | docText = "foo bar"; 155 | assertArrayEquals(tagExpect(docText, "foo", "foo"), analyzeTagOne(docText, "foo", "foo")); 156 | } 157 | 158 | private String insertAnchorAtOffsets(String docText, int startOffset, int endOffset, String id) { 159 | String insertStart = "";// (normally we'd escape id) 160 | String insertEnd = ""; 161 | return docText.substring(0, startOffset) 162 | + insertStart 163 | + docText.substring(startOffset, endOffset) 164 | + insertEnd 165 | + docText.substring(endOffset); 166 | } 167 | 168 | private int[] tagExpect(String docText, String start, String end) { 169 | return new int[]{docText.indexOf(start), docText.indexOf(end) + end.length()}; 170 | } 171 | 
172 | private int[] analyzeTagOne(String docText, String start, String end) { 173 | int[] result = {-1, -1}; 174 | 175 | Reader filter = new HTMLStripCharFilter(new StringReader(docText)); 176 | 177 | WhitespaceTokenizer ts = new WhitespaceTokenizer(); 178 | final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class); 179 | final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class); 180 | try { 181 | ts.setReader(filter); 182 | ts.reset(); 183 | while (ts.incrementToken()) { 184 | final String termString = termAttribute.toString(); 185 | if (termString.equals(start)) 186 | result[0] = offsetAttribute.startOffset(); 187 | if (termString.equals(end)) { 188 | result[1] = offsetAttribute.endOffset(); 189 | return result; 190 | } 191 | } 192 | ts.end(); 193 | } catch (IOException e) { 194 | throw new RuntimeException(e); 195 | } finally { 196 | IOUtils.closeQuietly(ts); 197 | } 198 | return result; 199 | } 200 | 201 | private String[] analyzeReturnTokens(String docText) { 202 | List result = new ArrayList<>(); 203 | 204 | Reader filter = new HTMLStripCharFilter(new StringReader(docText), 205 | Collections.singleton("unescaped")); 206 | WhitespaceTokenizer ts = new WhitespaceTokenizer(); 207 | final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class); 208 | try { 209 | ts.setReader(filter); 210 | ts.reset(); 211 | while (ts.incrementToken()) { 212 | result.add(termAttribute.toString()); 213 | } 214 | ts.end(); 215 | } catch (IOException e) { 216 | throw new RuntimeException(e); 217 | } finally { 218 | IOUtils.closeQuietly(ts); 219 | } 220 | return result.toArray(new String[result.size()]); 221 | } 222 | 223 | } 224 | -------------------------------------------------------------------------------- /src/test/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory: -------------------------------------------------------------------------------- 1 | 
org.opensextant.solrtexttagger.WordLengthTaggingFilterFactory -------------------------------------------------------------------------------- /src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 23 | 24 | 25 | 26 | 27 | 28 | 30 | 31 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /src/test/resources/solr/collection1/conf/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 23 | 24 | 25 | 26 | 27 | 28 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | id 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 87 | 88 | 89 | 90 | 91 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /src/test/resources/solr/collection1/conf/solrconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 24 | 25 | 27 | 28 | ${tests.luceneMatchVersion:LUCENE_CURRENT} 29 | ${solr.data.dir:} 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | name_tag:[* TO *] 43 | 44 | 45 | 46 | 47 | name_tag:[* TO *] 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | name_tag 57 | NOT name:(of the) 58 | 59 | 60 | 61 | 62 | 63 | name_tagStop 64 | 65 | 66 | 67 | 68 | name_tagPartial 69 | NOT name:(of the) 70 | 71 | 72 | 73 | 74 | name_tagXml 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- 
/src/test/resources/taggingattribute/collection1/conf/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 23 | 24 | 25 | 26 | 27 | 28 | 31 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | id 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 59 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /src/test/resources/taggingattribute/collection1/conf/solrconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 24 | 25 | 27 | 28 | ${tests.luceneMatchVersion:LUCENE_CURRENT} 29 | ${solr.data.dir:} 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | name_tag 43 | NOT name:(of the) 44 | 45 | 46 | 47 | --------------------------------------------------------------------------------