├── .gitignore ├── .travis.yml ├── CHANGES.md ├── DevNotes.txt ├── LICENSE.txt ├── NOTICE.txt ├── QUICK_START.md ├── README.md ├── checkstyle-suppressions.xml ├── checkstyle.xml ├── pom.xml └── src ├── main └── java │ └── org │ └── opensextant │ └── solrtexttagger │ ├── ConcatenateFilter.java │ ├── ConcatenateFilterFactory.java │ ├── HtmlOffsetCorrector.java │ ├── OffsetCorrector.java │ ├── TagClusterReducer.java │ ├── TagLL.java │ ├── Tagger.java │ ├── TaggerRequestHandler.java │ ├── TaggingAttribute.java │ ├── TaggingAttributeImpl.java │ ├── TermPrefixCursor.java │ ├── XmlOffsetCorrector.java │ └── package-info.java └── test ├── java └── org │ └── opensextant │ └── solrtexttagger │ ├── AbstractTaggerTest.java │ ├── ConcatenateFilterTest.java │ ├── EmbeddedSolrNoSerializeTest.java │ ├── HtmlInterpolationTest.java │ ├── RandomizedTaggerTest.java │ ├── Tagger2Test.java │ ├── TaggerTest.java │ ├── TaggingAttributeTest.java │ ├── WordLengthTaggingFilter.java │ ├── WordLengthTaggingFilterFactory.java │ └── XmlInterpolationTest.java └── resources ├── META-INF └── services │ └── org.apache.lucene.analysis.util.TokenFilterFactory ├── logback.xml ├── solr └── collection1 │ └── conf │ ├── schema.xml │ └── solrconfig.xml └── taggingattribute └── collection1 └── conf ├── schema.xml └── solrconfig.xml /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | 3 | sudo: false 4 | 5 | script: mvn -Drandomized.multiplier=10 -Dsolr.version=$SOLR_VERSION -Dlog.level=WARN clean verify 6 | 7 | jdk: 8 | - oraclejdk8 9 | - oraclejdk9 10 | 11 | env: 12 | # see pom.xml for notes on previous versions 13 | - SOLR_VERSION=7.0.1 14 | - SOLR_VERSION=7.1.0 15 | - SOLR_VERSION=7.2.1 16 | 17 | notifications: 18 | email: 19 | - dsmiley@apache.org 20 | 
-------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | This file records changes to the SolrTextTagger. It has Solr & Java version compatibility info too. 2 | 3 | NOTE: There are three independent versions of the tagger: the one in Apache Solr 7.4.0, the GitHub latest 2.x and GitHub latest 1.x. 4 | 2.x does not support synonyms (posInc=0) analysis but the others do. Only 2.x supports htmlOffsetAdjust. 5 | 6 | The [.travis.yml file](.travis.yml) shows the current testing version matrix 7 | on master. Older releases will show older tested releases working at 8 | those times. 9 | 10 | The TaggerHandler in Apache Solr 7.4.0 is based on 2.6-SNAPSHOT, and has other changes. 11 | 12 | ## Version 2.6-SNAPSHOT (unreleased) 13 | 14 | * Performance: Avoid calling terms.iterator() when not needed 15 | * Notice: Lucene's postingsFormat="Memory" option will be removed imminently. 16 | So use "FST50" which is nearly as good. 17 | 18 | ## Version 2.5, March 27th, 2018 19 | 20 | Compatible with Solr 7.0, 7.1, 7.2, 7.3, ... 21 | 22 | ## Version 2.4, February 11th, 2017 23 | 24 | Compatible with Solr 6.3, 6.4.1, ... ? 25 | 26 | Compiled for Java 1.8. 27 | 28 | * #61 'fq' is now multi-valued 29 | 30 | ## Version 2.3, July 20th, 2016 31 | 32 | Compatible with Solr 5.3 thru 6.2.1 33 | 34 | Compiled for Java 1.7. 35 | 36 | ## Version 2.2, December 16th, 2015 37 | 38 | Compatible with Solr 5.2 39 | 40 | Compiled for Java 1.7. 41 | 42 | ## Version 2.1, August 12th, 2015 43 | 44 | Compatible with Solr 5.0 thru 5.1. 45 | 46 | Compiled for Java 1.7. 47 | 48 | ## Version 2.0, January 26th, 2015 49 | 50 | Compatible with Solr 4.3 thru 4.10. 51 | 52 | Compiled for Java 1.6. 53 | 54 | This is a major release that fundamentally changes the underlying engine from working directly off 55 | of an FST to one working off a Lucene TermsEnum configured to be backed by an FST. 
The 56 | schema and configuration has changed some accordingly, but the tagger request API hasn't changed. 57 | The tagger's codebase shrunk too as Lucene manages more of the complexity. 58 | The internal name entries are now encoded as a char delimited phrase _instead of_ a word dictionary 59 | with word ID phrases. This approach reduced the memory and disk requirements substantially 60 | from 1.x. 40% less? 61 | 62 | IMPORTANT: One feature *not* yet ported from 1.x is support for index-time expanding synonyms 63 | and the catenate options of WordDelimiterFilter (or other analysis resulting in tokens at the 64 | same position). Consequently, don't do those things in your index analysis chain :-/ 65 | 66 | * 'xmlOffsetAdjust' option. See README.md 67 | 68 | * 'htmlOffsetAdjust' option. See README.md 69 | 70 | * 'nonTaggableTags' option. See README.md 71 | 72 | * Removed deprecated NoSerializeEmbeddedSolrServer & EmbeddedSolrUpdater (\#21) 73 | 74 | ## Version 1.2 (and prior), October 2nd 2013 75 | 76 | Compatible with Solr 4.2 thru 4.4; later 4.x releases may or may not work. 77 | 78 | Compiled for Java 1.6. 79 | 80 | * Supports index-time expanding synonyms and the catenate options of WordDelimiterFilter, or most 81 | other analysis at index time wherein tokens are generated at the same position. 82 | Multi-word synonyms are not supported unless you normalize at index & query to a single-word 83 | variant (i.e. "domain name system" -> "dns"). 84 | Internally, this is done by PhraseBuilder and is tested in PosIncPosLenTaggerTest. 85 | Thanks to Rupert Westenthaler! 
(\#10) 86 | -------------------------------------------------------------------------------- /DevNotes.txt: -------------------------------------------------------------------------------- 1 | ### Running EmbeddedSolrUpdater 2 | 3 | export JAVA_OPTS="-Dsolr.solr.home=../Gazetteer/SolrHome -Dsolr.data.dir=/Volumes/Speedy/data" 4 | ./updateSolr.sh '/update?update.contentType=text/csv&optimize=true&separator=%09&trim=on&f.SOURCE_FEATURE_ID.map=1.0:1&f.SOURCE_NAME_ID.map=1.0:1' '/tag?build=true' < /Volumes/Speedy/Merged.txt 5 | 6 | ### Run tagger (not embedded) 7 | curl -XPOST 'http://localhost:8983/solr/tag?overlaps=ALL&tagsLimit=5000&fl=*&wt=json&indent=2' -H 'Content-Type:text/plain' -d 'We drove to Byrds Creek. Then we' 8 | or -d '@myfile.txt' 9 | 10 | curl -XPOST 'http://localhost:8983/solr/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name&wt=json&indent=2' -H 'Content-Type:text/plain' -d 'We drove to Byrds Creek. Then we' 11 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | OpenSextant's Solr Text Tagger 2 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 3 | 4 | This software uses the Apache License 2.0. See LICENSE.txt. 5 | 6 | This product includes software developed by 7 | The MITRE Corporation (http://www.mitre.org/). 8 | 9 | This software was produced for the U. S. Government 10 | under Contract No. W15P7T-11-C-F600, and is 11 | subject to the Rights in Noncommercial Computer Software 12 | and Noncommercial Computer Software Documentation 13 | Clause 252.227-7014 (JUN 1995) -------------------------------------------------------------------------------- /QUICK_START.md: -------------------------------------------------------------------------------- 1 | First, understand you must use a version of this "SolrTextTagger" that is compatible with Solr. 
2 | Unfortunately, Solr (more often actually Lucene) makes small changes that necessitate an adjustment 3 | in the tagger thus requiring more tagger releases that often have no additional features. 4 | View the [CHANGES.md](CHANGES.md) file for information on what versions are compatible with what Solr versions. 5 | 6 | # Get Java 7 | 8 | Get Java, preferably the JDK, AKA the Java SE Development Kit which includes a compiler and other 9 | useful tools. I'll assume v1.8, the latest version. If you already have v1.7, that's fine but be 10 | aware Solr 6 requires Java v1.8. There are multiple ways to get Java, including multiple vendors. 11 | Try [Oracle's download page](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html). 12 | If you just have the Java "JRE" (no compiler) then that's probably fine. 13 | 14 | # Get Apache Solr 15 | 16 | Go to [Solr's download page](http://www.apache.org/dyn/closer.lua/lucene/solr/) and download either the 17 | ".zip" or the ".tgz" depending on which you prefer, then expand it. We'll call the expanded directory 18 | SOLR_DIST_DIR. As of this writing, the latest version is v5.4.1. 19 | 20 | # Get the SolrTextTagger 21 | 22 | The OpenSextant SolrTextTagger is a plug-in to Apache Solr. A Plug-in is a '.jar' file (possibly 23 | requiring other dependent '.jar' files) that is placed somewhere that Solr will see it. To get the 24 | text tagger's Jar, you can either download a 25 | [pre-built one](http://search.maven.org/#search%7Cga%7C1%7Ca%3A%22solr-text-tagger%22) from Maven 26 | central if it's an official release, or build it yourself if you have a Java compiler and Maven. 27 | There's also a "SNAPSHOT" (unreleased) 'jar' on Sonatype's maven repository. 28 | You can find that [here](https://oss.sonatype.org/content/repositories/snapshots/org/opensextant/solr-text-tagger/2.3-SNAPSHOT/). 29 | Remember to consult [CHANGES.md](CHANGES.md) on which version to use based on which Solr version you chose. 
(Hint: you'll need 2.3 if you are running Solr 5.4 or 5.3). 30 | 31 | Optional: If you intend to use the `htmlOffsetAdjust` option then you'll need to get the Jericho 32 | HTML parser too, such as from Maven central. 33 | 34 | ## Install the Tagger 35 | 36 | The easiest method is simply to put the '.jar' file into SOLR_DIST_DIR/server/solr/lib/. The 37 | lib dir won't exist initially so create it. 38 | If you need Jericho too then put it here as well. 39 | 40 | # Run Solr 41 | 42 | Start Solr on port 8983 (Solr's default port): 43 | 44 | bin/solr start 45 | 46 | # Create and Configure a Solr Collection 47 | 48 | Note that there are 2 ways we could go about this. Solr's classic approach involves editing some 49 | config files (schema.xml, solrconfig.xml), which I might have pre-created for these quick-start instructions. 50 | The newer approach is to use Solr's API to modify the configuration. We'll choose the latter, even 51 | though I'm most fond of the former. 52 | 53 | Create a Solr collection named "geonames". Since we don't specify a configuration template (-d) we 54 | get a so-called "data-driven" configuration. It's good for experimentation and getting going fast 55 | but not for production or being optimal. 56 | 57 | bin/solr create -c geonames 58 | 59 | ## Configuring 60 | 61 | We need to configure the schema first. The "data driven" mode we're using allows us to keep this step fairly 62 | minimal -- we just need to declare a field type, 2 fields, and a copy-field. 63 | The critical part up-front is to define the "tag" field type. There are many many ways to configure 64 | text analysis; and we're not going to get into those choices here. But an important bit is the 65 | ConcatenateFilterFactory at the end of the index analyzer chain. Another important bit for 66 | performance is postingsFormat=FST50 (resulting in compact FST based in-memory data structures vs. 67 | going to disk every time). 
68 | 69 | Schema configuration: 70 | 71 | ```` 72 | curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/geonames/schema -d '{ 73 | "add-field-type":{ 74 | "name":"tag", 75 | "class":"solr.TextField", 76 | "postingsFormat":"FST50", 77 | "omitNorms":true, 78 | "indexAnalyzer":{ 79 | "tokenizer":{ 80 | "class":"solr.StandardTokenizerFactory" }, 81 | "filters":[ 82 | {"class":"solr.EnglishPossessiveFilterFactory"}, 83 | {"class":"solr.ASCIIFoldingFilterFactory"}, 84 | {"class":"solr.LowerCaseFilterFactory"}, 85 | {"class":"org.opensextant.solrtexttagger.ConcatenateFilterFactory"} 86 | ]}, 87 | "queryAnalyzer":{ 88 | "tokenizer":{ 89 | "class":"solr.StandardTokenizerFactory" }, 90 | "filters":[ 91 | {"class":"solr.EnglishPossessiveFilterFactory"}, 92 | {"class":"solr.ASCIIFoldingFilterFactory"}, 93 | {"class":"solr.LowerCaseFilterFactory"} 94 | ]} 95 | }, 96 | 97 | "add-field":{ "name":"name", "type":"text_general"}, 98 | 99 | "add-field":{ "name":"name_tag", "type":"tag", "stored":false }, 100 | 101 | "add-copy-field":{ "source":"name", "dest":[ "name_tag" ]} 102 | }' 103 | ```` 104 | 105 | Configure a custom Solr Request Handler: 106 | 107 | ```` 108 | curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/geonames/config -d '{ 109 | "add-requesthandler" : { 110 | "name": "/tag", 111 | "class":"org.opensextant.solrtexttagger.TaggerRequestHandler", 112 | "defaults":{ "field":"name_tag" } 113 | } 114 | }' 115 | ```` 116 | 117 | # Load Some Sample Data 118 | 119 | We'll go with some Geonames.org data in CSV format. Solr is quite flexible in loading data in a 120 | variety of formats. This [cities1000.zip](http://download.geonames.org/export/dump/cities1000.zip) 121 | should be almost 7MB file expanding to a cities1000.txt file around 22.2MB containing 145k lines, 122 | each a city in the world of at least 1000 population. 
122 | 123 | ```` 124 | curl -X POST --data-binary @/path/to/cities1000.txt -H 'Content-type:application/csv' \ 125 | 'http://localhost:8983/solr/geonames/update?commit=true&optimize=true&separator=%09&encapsulator=%00&fieldnames=id,name,,alternative_names,latitude,longitude,,,countrycode,,,,,,population,elevation,,timezone,lastupdate' 126 | ```` 127 | 128 | That might take around 35 seconds; it depends. It can be a lot faster if the schema were tuned 129 | to only have what we truly need (no text search if not needed). 130 | 131 | In that command we said optimize=true to put the index in a state that will make tagging faster. 132 | The encapsulator=%00 is a bit of a hack to disable the default double-quote. 133 | 134 | # Tag Time! 135 | 136 | This is a trivial example tagging a small piece of text. For more options, see the Usage section 137 | in the readme. 138 | 139 | ```` 140 | curl -X POST \ 141 | 'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name,countrycode&wt=json&indent=on' \ 142 | -H 'Content-Type:text/plain' -d 'Hello New York City' 143 | ```` 144 | 145 | The response should be this (the QTime may vary): 146 | ```` 147 | { 148 | "responseHeader":{ 149 | "status":0, 150 | "QTime":1}, 151 | "tagsCount":1, 152 | "tags":[[ 153 | "startOffset",6, 154 | "endOffset",19, 155 | "ids",["5128581"]]], 156 | "response":{"numFound":1,"start":0,"docs":[ 157 | { 158 | "id":"5128581", 159 | "name":["New York City"], 160 | "countrycode":["US"]}] 161 | }} 162 | ```` 163 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Solr Text Tagger 2 | 3 | This project implements a "naive" text tagger based on Apache Lucene / Solr, using 4 | Lucene FST (Finite State Transducer) technology under the hood for remarkable low-memory properties. 
It is "naive" because it does simple text word based substring tagging without consideration 5 | of any natural language context. It operates on the results of how you 6 | configure text analysis in Lucene and so it's quite flexible to match things 7 | like phonetics for sounds-like tagging if you wanted to. For more information, see the presentation 8 | video/slides referenced below. 9 | 10 | The tagger can be used for finding entities/concepts in large text, or for doing likewise in queries 11 | to enhance query-understanding. 12 | 13 | For a list of changes with version of this tagger, to include Solr & Java version compatibility, 14 | see [CHANGES.md](CHANGES.md) 15 | 16 | ### Note: the STT is included in Apache Solr 7.4.0 !!! 17 | 18 | Solr 7.4.0 now includes the Solr Text Tagger. It's [documented in the Solr Reference Guide](https://builds.apache.org/job/Solr-reference-guide-master/javadoc/the-tagger-handler.html). As-such, you likely should just use the one in Solr and not the one here. That said, `htmlOffsetAdjust` is not implemented there. Issues #82 and #81 document some information about the differences and contain further links. 19 | 20 | ## Resources / References 21 | 22 | * [SoDA](https://github.com/elsevierlabs-os/soda) "Solr Dictionary Annotator" is an open-source system that uses this tagger extensively. You might want to use that instead of the tagger directly. In addition to more features added on top of the tagger, it has extensive cloud scaling documentation. 
24 | * [How-To blog post by Mikołaj Kania](http://mikolajkania.com/2017/03/30/extract-entities-with-solr-text-tagger/) 25 | * [Dictionary Based Annotation at scale with Spark, SolrTextTagger, and OpenNLP (video)](https://www.youtube.com/watch?v=gOe0aYAS8Do) 26 | ([slides](http://www.slideshare.net/sujitpal/sseu-2015soda)) 27 | -- a presentation by Sujit Pal at Spark Summit Europe 2015 28 | * [Text Tagging with Finite State Transducers (video)](http://www.youtube.com/watch?v=3kQyYbTyXfc) 29 | ([slides](http://lucenerevolution.org/wp-content/uploads/2014/08/Text-Tagging-with-Finite-State-Transducers.pdf)) -- a presentation at Lucene Revolution 2013 by David Smiley (first release about the tagger) 30 | * [Fuzzy String Matching with SolrTextTagger](http://sujitpal.blogspot.com/2014/02/fuzzy-string-matching-with.html) -- a blog post by Sujit Pal 31 | * [Tulip](http://dl.acm.org/citation.cfm?id=2634351) -- a winner of the [ERD'14 challenge](https://pdfs.semanticscholar.org/91cf/c37d4853bb7214d18ca091f9bfede8b301a0.pdf) uses the Text Tagger. 32 | 33 | Pertaining to Lucene's Finite State Transducers: 34 | 35 | * https://docs.google.com/presentation/d/1Z7OYvKc5dHAXiVdMpk69uulpIT6A7FGfohjHx8fmHBU/edit#slide=id.p 36 | * http://blog.mikemccandless.com/2010/12/using-finite-state-transducers-in.html 37 | * http://blog.mikemccandless.com/2011/01/finite-state-transducers-part-2.html 38 | 39 | ## Contributors: 40 | 41 | * David Smiley 42 | * Rupert Westenthaler (notably the PhraseBuilder in the 1.1 branch) 43 | 44 | ## Quick Start 45 | 46 | See the [QUICK_START.md](QUICK_START.md) file for a set of instructions to get you going ASAP. 47 | 48 | ## Build Instructions 49 | 50 | The build requires Java (v8 or v9) and Maven. 
51 | 52 | To compile and run tests, use: 53 | 54 | %> mvn test 55 | 56 | To compile, test, and build the jar (placed in target/), use 57 | 58 | %> mvn package 59 | 60 | ## Configuration 61 | 62 | A Solr schema.xml needs 2 things 63 | 64 | * A unique key field (see ``). Setting docValues=true on this field is recommended. 65 | * A name/lookup field indexed with Shingling or more likely ConcatenateFilter. 66 | 67 | If you want to support typical keyword search on the names, not just tagging, then index 68 | the names in an additional field with a typical analysis configuration to your preference. 69 | 70 | For tagging, the name field's index analyzer needs to end in either shingling for "partial" 71 | (i.e. sub name phrase) matching of a name, or more likely using ConcatenateFilter for 72 | complete name matching. ConcatenateFilter acts similar to shingling but it 73 | concatenates all tokens into one final token with a space separator. 74 | The query time analysis should _not_ have Shingling or ConcatenateFilter. 75 | 76 | Prior to shingling or the ConcatenateFilter, preceding text analysis should result in 77 | consecutive positions (i.e. the position increment of each term must always be 78 | 1). As-such, Synonyms and some configurations of WordDelimiterFilter are not supported. 79 | On the other hand, if the input text 80 | has a position increment greater than one (e.g. stop word) then it is handled properly as if an 81 | unknown word was there. Support for synonyms or any other filters producing posInc=0 is a feature 82 | that has largely been overcome in the 1.1 version but it has yet to be ported to 2.x; see 83 | [Issue #20, RE the PhraseBuilder](https://github.com/OpenSextant/SolrTextTagger/issues/20) 84 | 85 | To make the tagger work as fast as possible, configure the name field with 86 | postingsFormat="FST50";. In doing so, all the terms/postings are placed into an efficient FST 87 | data structure. 
88 | 89 | Here is a sample field type config that should work quite well: 90 | 91 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | A Solr solrconfig.xml needs a special request handler, configured like this. 110 | 111 | 112 | 113 | name_tag 114 | PUT SOME SOLR QUERY HERE; OPTIONAL 115 | 116 | 117 | 118 | * `field`: The field that represents the corpus to match on, as described above. 119 | * `fq`: (optional) A query that matches a subset of documents for name matching. 120 | 121 | Also, to enable custom so-called postings formats, ensure that your solrconfig.xml has a 122 | codecFactory defined like this: 123 | 124 | 125 | 126 | ## Usage 127 | 128 | For tagging, you HTTP POST data to Solr similar to how the ExtractingRequestHandler 129 | (Tika) is invoked. A request invoked via the "curl" program could look like this: 130 | 131 | curl -XPOST \ 132 | 'http://localhost:8983/solr/collection1/tag?overlaps=NO_SUB&tagsLimit=5000&fl=*' \ 133 | -H 'Content-Type:text/plain' -d @/mypath/myfile.txt 134 | 135 | ### The tagger request-time parameters are 136 | 137 | * `overlaps`: choose the algorithm to determine which overlapping tags should be 138 | retained, versus being pruned away. Options are: 139 | * `ALL`: Emit all tags. 140 | * `NO_SUB`: Don't emit a tag that is completely within another tag (i.e. no subtag). 141 | * `LONGEST_DOMINANT_RIGHT`: Given a cluster of overlapping tags, emit the longest 142 | one (by character length). If there is a tie, pick the right-most. Remove 143 | any tags overlapping with this tag then repeat the algorithm to potentially 144 | find other tags that can be emitted in the cluster. 145 | * `matchText`: A boolean indicating whether to return the matched text in the tag 146 | response. This will trigger the tagger to fully buffer the input before tagging. 147 | * `tagsLimit`: The maximum number of tags to return in the response. Tagging 148 | effectively stops after this point. 
By default this is 1000. 149 | * `rows`: Solr's standard param to say the maximum number of documents to return, 150 | but defaulting to 10000 for a tag request. 151 | * `skipAltTokens`: A boolean flag used to suppress errors that can occur if, for 152 | example, you enable synonym expansion at query time in the analyzer, which you 153 | normally shouldn't do. Let this default to false unless you know that such 154 | tokens can't be avoided. 155 | * `ignoreStopwords`: A boolean flag that causes stopwords (or any condition causing positions to 156 | skip like >255 char words) to be ignored as if they weren't there. Otherwise, the behavior is to treat 157 | them as breaks in tagging on the presumption your indexed text-analysis configuration doesn't have 158 | a StopWordFilter. By default the indexed analysis chain is checked for the presence of a 159 | StopWordFilter and if found then ignoreStopwords is true if unspecified. You probably shouldn't 160 | have a StopWordFilter configured and probably won't need to set this param either. 161 | * `xmlOffsetAdjust`: A boolean indicating that the input is XML and furthermore that the offsets of 162 | returned tags should be adjusted as necessary to allow for the client to insert an open and closing 163 | element at the positions. If it isn't possible to do so then the tag will be omitted. You are 164 | expected to configure HTMLStripCharFilter in the schema when using this option. 165 | This will trigger the tagger to fully buffer the input before tagging. 166 | * `htmlOffsetAdjust`: Similar to xmlOffsetAdjust except for HTML content that may have various issues 167 | that would never work with an XML parser. There needn't be a top level element, and some tags 168 | are known to self-close (e.g. BR). The tagger uses the Jericho HTML Parser for this feature 169 | (ASL & LGPL & EPL licensed). 170 | * `nonTaggableTags`: (only with htmlOffsetAdjust) Omits tags that would enclose one of these HTML 171 | elements.
Comma delimited, lower-case. For example 'a' (anchor) would be a likely choice so that 172 | links the application inserts don't overlap other links. 173 | * `fl`: Solr's standard param for listing the fields to return. 174 | * Most other standard parameters for working with Solr response formatting: 175 | `echoParams`, `wt`, `indent`, etc. 176 | 177 | ### Output 178 | 179 | The output is broken down into two parts, first an array of tags, and then 180 | Solr documents referenced by those tags. Each tag has the starting character 181 | offset, an ending character (+1) offset, and the Solr unique key field value. 182 | The Solr documents part of the response is Solr's standard search results 183 | format. 184 | 185 | ## Advanced Tips 186 | 187 | * For reducing tagging latency even further, consider embedding Solr with 188 | EmbeddedSolrServer. See EmbeddedSolrNoSerializeTest. 189 | -------------------------------------------------------------------------------- /checkstyle-suppressions.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 24 | 25 | 28 | 29 | 30 | 31 | 34 | -------------------------------------------------------------------------------- /checkstyle.xml: -------------------------------------------------------------------------------- 1 | 2 | 23 | 24 | 27 | 28 | 55 | 56 | 57 | 58 | 59 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 
| 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 22 | 23 | 24 | 4.0.0 25 | 26 | org.opensextant 27 | solr-text-tagger 28 | 2.6-SNAPSHOT 29 | jar 30 | 31 | Solr Text Tagger 32 | A text tagger based on Lucene / Solr 33 | https://github.com/OpenSextant/SolrTextTagger/ 34 | 2012 35 | 36 | 37 | MITRE 38 | 39 | 40 | 41 | scm:git:https://github.com/OpenSextant/SolrTextTagger.git 42 | scm:git:https://github.com/OpenSextant/SolrTextTagger.git 43 | https://github.com/OpenSextant/SolrTextTagger.git 44 | HEAD 45 | 46 | 47 | 48 | 49 | Apache 2 50 | http://www.apache.org/licenses/LICENSE-2.0.txt 51 | repo 52 | 53 | 54 | 55 | 56 | 57 | David Smiley 58 | dsmiley@apache.org 59 | 60 | 61 | 62 | 63 | 64 | UTF-8 65 | 66 | 7.2.1 67 | 68 | 69 | 70 | 71 | 72 | org.apache.solr 73 | solr-test-framework 74 | ${solr.version} 75 | test 76 | 77 | 78 | org.apache.lucene 79 | lucene-test-framework 80 | ${solr.version} 81 | test 82 | 83 | 84 | 85 | org.apache.solr 86 | solr-core 87 | ${solr.version} 88 | 89 | 90 | org.slf4j 91 | slf4j-jdk14 92 | 93 | 94 | org.slf4j 95 | slf4j-log4j12 96 | 97 | 98 | log4j 99 | log4j 100 | 101 | 102 | 103 | 104 | 105 | org.apache.lucene 106 | lucene-core 107 | ${solr.version} 108 | 109 | 110 | 112 | 113 | org.codehaus.woodstox 114 | woodstox-core-asl 115 | 4.4.1 116 | true 117 | 118 | 119 | 120 | net.htmlparser.jericho 121 | jericho-html 122 | 3.4 123 | true 124 
| 125 | 126 | 127 | 128 | org.slf4j 129 | slf4j-api 130 | 1.7.7 131 | 132 | 133 | ch.qos.logback 134 | logback-classic 135 | 1.1.7 136 | runtime 137 | true 138 | 139 | 140 | 141 | 142 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | org.apache.maven.plugins 153 | maven-compiler-plugin 154 | 3.1 155 | 156 | 1.8 157 | 1.8 158 | 159 | 160 | 161 | 164 | 165 | org.apache.maven.plugins 166 | maven-surefire-plugin 167 | 2.19.1 168 | 169 | 170 | NativePRNG 171 | 172 | 173 | 174 | 175 | 176 | org.apache.maven.plugins 177 | maven-jar-plugin 178 | 2.4 179 | 180 | 181 | 182 | true 183 | true 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | org.apache.maven.plugins 194 | maven-checkstyle-plugin 195 | 2.12.1 196 | 197 | checkstyle.xml 198 | true 199 | true 200 | 201 | 202 | 203 | compile 204 | 205 | check 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 229 | 230 | org.codehaus.mojo 231 | findbugs-maven-plugin 232 | 3.0.3 233 | 234 | true 235 | 236 | 237 | 238 | 239 | 240 | org.apache.maven.plugins 241 | maven-site-plugin 242 | 3.3 243 | 244 | 245 | 249 | 250 | 251 | org.apache.maven.plugins 252 | maven-release-plugin 253 | 2.5 254 | 255 | true 256 | false 257 | release 258 | deploy 259 | 260 | 261 | 262 | 263 | org.sonatype.plugins 264 | nexus-staging-maven-plugin 265 | 1.6.6 266 | true 267 | 268 | ossrh 269 | https://oss.sonatype.org/ 270 | true 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | org.apache.maven.plugins 283 | maven-project-info-reports-plugin 284 | 2.7 285 | 286 | 287 | false 288 | 289 | 290 | 291 | 292 | org.apache.maven.plugins 293 | maven-javadoc-plugin 294 | 2.9.1 295 | 296 | 297 | 298 | javadoc 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | release 311 | 312 | 313 | 314 | 315 | org.apache.maven.plugins 316 | maven-source-plugin 317 | 2.4 318 | 319 | 320 | attach-sources 321 | 322 | jar-no-fork 323 | 324 | 325 | 326 | 327 | 
328 | 329 | org.apache.maven.plugins 330 | maven-javadoc-plugin 331 | 2.9.1 332 | 333 | 334 | attach-javadocs 335 | 336 | jar 337 | 338 | 339 | 340 | 341 | 342 | 343 | org.apache.maven.plugins 344 | maven-gpg-plugin 345 | 1.6 346 | 347 | 348 | sign-artifacts 349 | verify 350 | 351 | sign 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | ossrh 364 | https://oss.sonatype.org/content/repositories/snapshots 365 | 366 | 367 | ossrh 368 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 369 | 370 | 371 | 372 | 373 | 374 | apache.snapshots 375 | Apache Snapshot Repository 376 | https://repository.apache.org/snapshots 377 | 378 | false 379 | 380 | 381 | 382 | 383 | 384 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/ConcatenateFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.analysis.TokenFilter; 26 | import org.apache.lucene.analysis.TokenStream; 27 | import org.apache.lucene.analysis.shingle.ShingleFilter; 28 | import org.apache.lucene.analysis.tokenattributes.*; 29 | 30 | import java.io.IOException; 31 | 32 | /** 33 | * Concatenate all tokens, separated by a provided character, 34 | * defaulting to a single space. It always produces exactly one token, and it's designed to be the 35 | * last token filter in an analysis chain. 36 | */ 37 | public class ConcatenateFilter extends TokenFilter { 38 | 39 | /* 40 | For a very different approach that could accept synonyms or anything except position gaps (e.g. 41 | not stopwords), 42 | consider using o.a.l.analysis.TokenStreamToAutomaton 43 | with o.a.l.util.automaton.SpecialOperations.getFiniteStrings(). 44 | For gaps (stopwords), we could perhaps index a special token at those gaps and then have the 45 | tagger deal with them -- also doable. 46 | */ 47 | 48 | private char separator = ' '; 49 | 50 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 51 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 52 | private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); 53 | private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); 54 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); 55 | 56 | private boolean done; 57 | private StringBuilder buf = new StringBuilder(128); 58 | 59 | /** 60 | * Construct a token stream filtering the given input. 
61 | */ 62 | protected ConcatenateFilter(TokenStream input) { 63 | super(input); 64 | } 65 | 66 | public void setTokenSeparator(char separator) { 67 | this.separator = separator; 68 | } 69 | 70 | @Override 71 | public void reset() throws IOException { 72 | input.reset(); 73 | done = false; 74 | } 75 | 76 | @Override 77 | public final boolean incrementToken() throws IOException { 78 | if (done) 79 | return false; 80 | done = true; 81 | 82 | buf.setLength(0); 83 | boolean firstTerm = true; 84 | while (input.incrementToken()) { 85 | if (!firstTerm) { 86 | buf.append(separator); 87 | } 88 | //TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now. #13 89 | buf.append(termAtt); 90 | firstTerm = false; 91 | } 92 | input.end();//call here so we can see end of stream offsets 93 | 94 | termAtt.setEmpty().append(buf); 95 | //Setting the other attributes ultimately won't have much effect but lets be thorough 96 | offsetAtt.setOffset(0, offsetAtt.endOffset()); 97 | posIncrAtt.setPositionIncrement(1); 98 | posLenAtt.setPositionLength(1);//or do we add up the positions? Probably not used any way. 99 | typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE);//"shingle" 100 | 101 | return true; 102 | } 103 | 104 | @Override 105 | public void end() throws IOException { 106 | //we already called input.end() in incrementToken 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/ConcatenateFilterFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 
9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.analysis.TokenStream; 26 | import org.apache.lucene.analysis.util.TokenFilterFactory; 27 | 28 | import java.util.Map; 29 | 30 | /** 31 | * @see ConcatenateFilter 32 | */ 33 | public class ConcatenateFilterFactory extends TokenFilterFactory { 34 | 35 | private final String tokenSeparator; 36 | 37 | /** 38 | * Initialize this factory via a set of key-value pairs. 39 | */ 40 | public ConcatenateFilterFactory(Map args) { 41 | super(args); 42 | tokenSeparator = get(args, "tokenSeparator", " "); 43 | if (tokenSeparator.length() != 1) 44 | throw new IllegalArgumentException("tokenSeparator should be 1 char: "+tokenSeparator); 45 | if (!args.isEmpty()) { 46 | throw new IllegalArgumentException("Unknown parameters: " + args); 47 | } 48 | } 49 | 50 | @Override 51 | public TokenStream create(TokenStream input) { 52 | ConcatenateFilter filter = new ConcatenateFilter(input); 53 | filter.setTokenSeparator(tokenSeparator.charAt(0)); 54 | return filter; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/HtmlOffsetCorrector.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. 
W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import net.htmlparser.jericho.EndTagType; 26 | import net.htmlparser.jericho.Segment; 27 | import net.htmlparser.jericho.StartTag; 28 | import net.htmlparser.jericho.StartTagType; 29 | import net.htmlparser.jericho.StreamedSource; 30 | import net.htmlparser.jericho.Tag; 31 | 32 | import java.util.Collections; 33 | import java.util.Set; 34 | 35 | /** 36 | * Corrects offsets to adjust for HTML formatted data. The goal is such that the caller should be 37 | * able to insert a start HTML tag at the start offset and a corresponding end HTML tag at the end 38 | * offset of the tagger, and have it be valid HTML (assuming it was "valid" in the first place). 39 | * See {@link #correctPair(int, int)}. 40 | * 41 | * This will work on HTML that has numerous problems that browsers deal with, as well as XML. 42 | * 43 | * Not thread-safe. 44 | */ 45 | public class HtmlOffsetCorrector extends OffsetCorrector { 46 | 47 | /** 48 | * Initialize based on the document text. 49 | * 50 | * @param docText non-null structured content. 
51 | * @param nonTaggableTags HTML element names that should not be "taggable" (be a part of any 52 | * tag). These must be lower-case. 53 | */ 54 | protected HtmlOffsetCorrector(String docText, Set nonTaggableTags) { 55 | super(docText, nonTaggableTags != null); 56 | if (nonTaggableTags == null) 57 | nonTaggableTags = Collections.emptySet(); 58 | 59 | int tagCounter = 1;//document implicit tag, and counting 60 | int thisTag = 0;//document implicit tag 61 | 62 | tagInfo.add(-1);//parent 63 | tagInfo.add(-1, 0);//StartTag 64 | tagInfo.add(docText.length(), docText.length()+1);//EndTag 65 | parentChangeOffsets.add(-1); 66 | parentChangeIds.add(thisTag); 67 | 68 | StreamedSource source = new StreamedSource(docText); 69 | source.setCoalescing(false); 70 | 71 | int nonTaggablesInProgress = 0; 72 | 73 | for (Segment segment : source) { 74 | if (segment instanceof Tag) { 75 | Tag tag = (Tag) segment; 76 | if (tag.getTagType() == StartTagType.NORMAL) { 77 | final StartTag startTag = (StartTag) tag; 78 | 79 | //TODO Consider "implicitly terminating tags", which is dependent on the current tag. 80 | 81 | if (!startTag.isEmptyElementTag() && !startTag.isEndTagForbidden() && !startTag.isSyntacticalEmptyElementTag()) {//e.g. not "
" 82 | tagInfo.ensureCapacity(tagInfo.size() + 5); 83 | final int parentTag = thisTag; 84 | tagInfo.add(parentTag); 85 | tagInfo.add(tag.getBegin(), tag.getEnd()); 86 | tagInfo.add(-1, -1);//these 2 will be populated when we get to the close tag 87 | thisTag = tagCounter++; 88 | 89 | parentChangeOffsets.add(tag.getBegin()); 90 | parentChangeIds.add(thisTag); 91 | 92 | //non-taggable tracking: 93 | if (nonTaggableTags.contains(tag.getName())) {//always lower-case 94 | if (nonTaggablesInProgress++ == 0) 95 | nonTaggableOffsets.add(tag.getBegin()); 96 | } 97 | } 98 | } else if (tag.getTagType() == EndTagType.NORMAL) { 99 | //TODO validate we're closing the tag we think we're closing. 100 | tagInfo.set(5 * thisTag + 3, tag.getBegin()); 101 | tagInfo.set(5 * thisTag + 4, tag.getEnd()); 102 | thisTag = getParentTag(thisTag); 103 | 104 | parentChangeOffsets.add(tag.getEnd()); 105 | parentChangeIds.add(thisTag); 106 | 107 | //non-taggable tracking: 108 | if (nonTaggableTags.contains(tag.getName())) { 109 | if (nonTaggablesInProgress-- == 1) 110 | nonTaggableOffsets.add(tag.getEnd() - 1); 111 | } 112 | } 113 | } 114 | //else we don't care 115 | }//for segment 116 | 117 | parentChangeOffsets.add(docText.length()+1); 118 | parentChangeIds.add(-1); 119 | 120 | assert nonTaggableTags.isEmpty() || nonTaggableOffsets.size() % 2 == 0;//null or even 121 | } 122 | 123 | } 124 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/OffsetCorrector.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 
9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import com.carrotsearch.hppc.IntArrayList; 26 | 27 | import java.util.Arrays; 28 | 29 | public abstract class OffsetCorrector { 30 | 31 | //TODO support a streaming style of consuming input text so that we need not take a 32 | // String. Trickier because we need to keep more information as we parse to know when tags 33 | // are adjacent with/without whitespace 34 | 35 | //Data structure requirements: 36 | // Given a character offset: 37 | // * determine what tagId is it's parent. 38 | // * determine if it is adjacent to the parent open tag, ignoring whitespace 39 | // * determine if it is adjacent to the parent close tag, ignoring whitespace 40 | // Given a tagId: 41 | // * What is it's parent tagId 42 | // * What's the char offset of the start and end of the open tag 43 | // * What's the char offset of the start and end of the close tag 44 | 45 | /** Document text. */ 46 | protected final String docText; 47 | 48 | /** Array of tag info comprised of 5 int fields: 49 | * [int parentTag, int openStartOff, int openEndOff, int closeStartOff, int closeEndOff]. 50 | * It's size indicates how many tags there are. Tag's are ID'ed sequentially from 0. 
*/ 51 | protected final IntArrayList tagInfo; 52 | 53 | /** offsets of parent tag id change (ascending order) */ 54 | protected final IntArrayList parentChangeOffsets; 55 | /** tag id; parallel array to parentChangeOffsets */ 56 | protected final IntArrayList parentChangeIds; 57 | 58 | protected final int[] offsetPair = new int[] { -1, -1};//non-thread-safe state 59 | 60 | /** Disjoint start and end span offsets (inclusive) of non-taggable sections. Null if none. */ 61 | protected final IntArrayList nonTaggableOffsets; 62 | 63 | /** 64 | * Initialize based on the document text. 65 | * @param docText non-null structured content. 66 | * @param hasNonTaggable if there may be "non-taggable" tags to track 67 | */ 68 | protected OffsetCorrector(String docText, boolean hasNonTaggable) { 69 | this.docText = docText; 70 | final int guessNumElements = Math.max(docText.length() / 20, 4); 71 | 72 | tagInfo = new IntArrayList(guessNumElements * 5); 73 | parentChangeOffsets = new IntArrayList(guessNumElements * 2); 74 | parentChangeIds = new IntArrayList(guessNumElements * 2); 75 | nonTaggableOffsets = hasNonTaggable ? new IntArrayList(guessNumElements / 5) : null; 76 | } 77 | 78 | /** Corrects the start and end offset pair. It will return null if it can't 79 | * due to a failure to keep the offsets balance-able, or if it spans "non-taggable" tags. 80 | * The start (left) offset is pulled left as needed over whitespace and opening tags. The end 81 | * (right) offset is pulled right as needed over whitespace and closing tags. It's returned as 82 | * a 2-element array. 83 | *

Note that the returned array is internally reused; just use it to examine the response. 84 | */ 85 | public int[] correctPair(int leftOffset, int rightOffset) { 86 | rightOffset = correctEndOffsetForCloseElement(rightOffset); 87 | if (spansNonTaggable(leftOffset, rightOffset)) 88 | return null; 89 | 90 | int startTag = lookupTag(leftOffset); 91 | //offsetPair[0] = Math.max(offsetPair[0], getOpenStartOff(startTag)); 92 | int endTag = lookupTag(rightOffset-1); 93 | //offsetPair[1] = Math.min(offsetPair[1], getCloseStartOff(endTag)); 94 | 95 | // Find the ancestor tag enclosing offsetPair. And bump out left offset along the way. 96 | int iTag = startTag; 97 | for (; !tagEnclosesOffset(iTag, rightOffset); iTag = getParentTag(iTag)) { 98 | //Ensure there is nothing except whitespace thru OpenEndOff 99 | int tagOpenEndOff = getOpenEndOff(iTag); 100 | if (hasNonWhitespace(tagOpenEndOff, leftOffset)) 101 | return null; 102 | leftOffset = getOpenStartOff(iTag); 103 | } 104 | final int ancestorTag = iTag; 105 | // Bump out rightOffset until we get to ancestorTag. 106 | for (iTag = endTag; iTag != ancestorTag; iTag = getParentTag(iTag)) { 107 | //Ensure there is nothing except whitespace thru CloseStartOff 108 | int tagCloseStartOff = getCloseStartOff(iTag); 109 | if (hasNonWhitespace(rightOffset, tagCloseStartOff)) 110 | return null; 111 | rightOffset = getCloseEndOff(iTag); 112 | } 113 | 114 | offsetPair[0] = leftOffset; 115 | offsetPair[1] = rightOffset; 116 | return offsetPair; 117 | } 118 | 119 | /** Correct endOffset for adjacent element at the right side. E.g. offsetPair might point to: 120 | *

121 |    *   foo</tag>
122 |    * 
123 | * and this method pulls the end offset left to the '<'. This is necessary for use with 124 | * {@link org.apache.lucene.analysis.charfilter.HTMLStripCharFilter}. 125 | * 126 | * See https://issues.apache.org/jira/browse/LUCENE-5734 */ 127 | protected int correctEndOffsetForCloseElement(int endOffset) { 128 | if (docText.charAt(endOffset-1) == '>') { 129 | final int newEndOffset = docText.lastIndexOf('<', endOffset - 2); 130 | if (newEndOffset > offsetPair[0])//just to be sure 131 | return newEndOffset; 132 | } 133 | return endOffset; 134 | } 135 | 136 | protected boolean hasNonWhitespace(int start, int end) { 137 | for (int i = start; i < end; i++) { 138 | if (!Character.isWhitespace(docText.charAt(i))) 139 | return true; 140 | } 141 | return false; 142 | } 143 | 144 | protected boolean tagEnclosesOffset(int tag, int off) { 145 | return off >= getOpenStartOff(tag) && off < getCloseEndOff(tag); 146 | } 147 | 148 | protected int getParentTag(int tag) { return tagInfo.get(tag * 5 + 0); } 149 | protected int getOpenStartOff(int tag) { return tagInfo.get(tag * 5 + 1); } 150 | protected int getOpenEndOff(int tag) { return tagInfo.get(tag * 5 + 2); } 151 | protected int getCloseStartOff(int tag) { return tagInfo.get(tag * 5 + 3); } 152 | protected int getCloseEndOff(int tag) { return tagInfo.get(tag * 5 + 4); } 153 | 154 | protected int lookupTag(int off) { 155 | int idx = Arrays.binarySearch(parentChangeOffsets.buffer, 0, parentChangeOffsets.size(), off); 156 | if (idx < 0) 157 | idx = (-idx - 1) - 1;//round down 158 | return parentChangeIds.get(idx); 159 | } 160 | 161 | protected boolean spansNonTaggable(int startOff, int endOff) { 162 | if (nonTaggableOffsets == null) 163 | return false; 164 | int idx = Arrays.binarySearch(nonTaggableOffsets.buffer, 0, nonTaggableOffsets.size(), startOff); 165 | //if tag start coincides with first or last char of non-taggable span then result is true. 
166 | // (probably never happens since those characters are actual element markup) 167 | if (idx >= 0) 168 | return true; 169 | idx = -idx - 1;//modify for where we would insert 170 | //if idx is odd then our span intersects a non-taggable span; return true 171 | if ((idx & 1) == 1) 172 | return true; 173 | //it's non-taggable if the next non-taggable start span is before our endOff 174 | if (idx == nonTaggableOffsets.size()) 175 | return false; 176 | return nonTaggableOffsets.get(idx) < endOff; 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/TagClusterReducer.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | public interface TagClusterReducer { 26 | /** 27 | * Reduces the linked-list to only those tags that should be emitted 28 | * @param head not null; 1-element array to head which isn't null either 29 | */ 30 | void reduce(TagLL[] head); 31 | 32 | static final TagClusterReducer ALL = new TagClusterReducer() { 33 | @Override 34 | public void reduce(TagLL[] head) { 35 | } 36 | }; 37 | 38 | static final TagClusterReducer NO_SUB = new TagClusterReducer() { 39 | @Override 40 | public void reduce(TagLL[] head) { 41 | //loop forward over all tags 42 | for (TagLL tag = head[0].nextTag; tag != null; tag = tag.nextTag) { 43 | //loop backwards over prev tags from this tag 44 | for (TagLL tPrev = tag.prevTag; tPrev != null; tPrev = tPrev.prevTag) { 45 | assert tPrev.startOffset <= tag.startOffset; 46 | //if a previous tag's endOffset is <= this one's, tForward can be removed 47 | if (tPrev.endOffset >= tag.endOffset) { 48 | tag.removeLL(); 49 | break; 50 | } else if (tPrev.startOffset == tag.startOffset) { 51 | tPrev.removeLL(); 52 | //continue; 'tag' is still valid 53 | } 54 | } 55 | } 56 | } 57 | }; 58 | 59 | static final TagClusterReducer LONGEST_DOMINANT_RIGHT = new TagClusterReducer() { 60 | @Override 61 | public void reduce(TagLL[] head) { 62 | 63 | //--Optimize for common single-tag case 64 | if (head[0].nextTag == null) 65 | return; 66 | 67 | while (true) { 68 | //--Find longest not already marked 69 | TagLL longest = null; 70 | for (TagLL t = head[0]; t != null; t = t.nextTag) { 71 | if (!t.mark && (longest == null || t.charLen() >= longest.charLen())) 72 | longest = t; 73 | } 74 | if (longest == null) 75 | break; 76 | //--Mark longest (so we return it eventually) 77 | longest.mark = true; 78 | //--Remove tags overlapping this longest 79 | for (TagLL t = head[0]; t != null; t = t.nextTag) { 80 | if (t.mark) 81 | continue; 82 | 83 | if (t.overlaps(longest)) { 84 | t.removeLL(); 85 | } else if 
(t.startOffset >= longest.endOffset) { 86 | break;//no subsequent can possibly overlap 87 | } 88 | } 89 | }//loop 90 | 91 | //all-remaining should be marked 92 | // for (TagLL t = head; t != null; t = t.nextTag) { 93 | // assert t.mark; 94 | //// if (!t.mark) { 95 | //// t.removeLL(); 96 | //// if (head == t) 97 | //// head = t.nextTag; 98 | //// } 99 | // } 100 | assert head[0].mark; 101 | } 102 | }; 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/TagLL.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.util.BytesRef; 26 | 27 | import java.io.IOException; 28 | 29 | /** 30 | * This is a Tag -- a startOffset, endOffset and value. 31 | *

32 | * A Tag starts without a value in an 33 | * "advancing" state. {@link #advance(org.apache.lucene.util.BytesRef, int)} 34 | * is called with subsequent words and then eventually it won't advance any 35 | * more, and value is set (could be null). 36 | *

37 | * A Tag is also a doubly-linked-list (hence the LL in the name). All tags share 38 | * a reference to the head via a 1-element array, which is potentially modified 39 | * if any of the linked-list methods are called. Tags in the list should have 40 | * equal or increasing start offsets. 41 | */ 42 | public class TagLL{ 43 | 44 | private final TagLL[] head;//a shared pointer to the head; 1 element 45 | TagLL prevTag, nextTag; // linked list 46 | 47 | private TermPrefixCursor cursor; 48 | 49 | final int startOffset;//inclusive 50 | int endOffset;//exclusive 51 | Object value;//null means unset 52 | 53 | /** optional boolean used by some TagClusterReducer's */ 54 | boolean mark = false; 55 | 56 | TagLL(TagLL[] head, TermPrefixCursor cursor, int startOffset, int endOffset, Object value) { 57 | this.head = head; 58 | this.cursor = cursor; 59 | this.startOffset = startOffset; 60 | this.endOffset = endOffset; 61 | this.value = value; 62 | } 63 | 64 | /** 65 | * Advances this tag with "word" at offset "offset". If this tag is not in 66 | * an advancing state then it does nothing. If it is advancing and prior to 67 | * advancing further it sees a value, then a non-advancing tag may be inserted 68 | * into the LL as side-effect. If this returns false (it didn't advance) and 69 | * if there is no value, then it will also be removed. 70 | * 71 | * 72 | * @param word The next word or null if at an end 73 | * @param offset The last character in word's offset in the underlying 74 | * stream. If word is null then it's meaningless. 75 | * 76 | * @return Whether it advanced or not. 
77 | * 78 | * @throws java.io.IOException 79 | */ 80 | boolean advance(BytesRef word, int offset) throws IOException { 81 | if (!isAdvancing()) 82 | return false; 83 | 84 | Object iVal = cursor.getDocIds(); 85 | 86 | if (word != null && cursor.advance(word)) { 87 | 88 | if (iVal != null) { 89 | addBeforeLL(new TagLL(head, null, startOffset, endOffset, iVal)); 90 | } 91 | 92 | assert offset >= endOffset; 93 | endOffset = offset; 94 | return true; 95 | } else { 96 | this.value = iVal; 97 | this.cursor = null; 98 | if (iVal == null) 99 | removeLL(); 100 | return false; 101 | } 102 | } 103 | 104 | /** Removes this tag from the chain, connecting prevTag and nextTag. Does not 105 | * modify "this" object's pointers, so the caller can refer to nextTag after 106 | * removing it. */ 107 | public void removeLL() { 108 | if (head[0] == this) 109 | head[0] = nextTag; 110 | if (prevTag != null) { 111 | prevTag.nextTag = nextTag; 112 | } 113 | if (nextTag != null) { 114 | nextTag.prevTag = prevTag; 115 | } 116 | } 117 | 118 | void addBeforeLL(TagLL tag) { 119 | assert tag.startOffset <= startOffset; 120 | if (prevTag != null) { 121 | assert prevTag.startOffset <= tag.startOffset; 122 | prevTag.nextTag = tag; 123 | tag.prevTag = prevTag; 124 | } else { 125 | assert head[0] == this; 126 | head[0] = tag; 127 | } 128 | prevTag = tag; 129 | tag.nextTag = this; 130 | } 131 | 132 | void addAfterLL(TagLL tag) { 133 | assert tag.startOffset >= startOffset; 134 | if (nextTag != null) { 135 | assert nextTag.startOffset >= tag.startOffset; 136 | nextTag.prevTag = tag; 137 | tag.nextTag = nextTag; 138 | } 139 | nextTag = tag; 140 | tag.prevTag = this; 141 | } 142 | 143 | public int charLen() { 144 | return endOffset - startOffset; 145 | } 146 | 147 | public TagLL getNextTag() { 148 | return nextTag; 149 | } 150 | 151 | public TagLL getPrevTag() { 152 | return prevTag; 153 | } 154 | 155 | public int getStartOffset() { 156 | return startOffset; 157 | } 158 | public int getEndOffset() { 159 | 
return endOffset; 160 | } 161 | public boolean overlaps(TagLL other) { 162 | //don't use >= or <= because startOffset is inclusive while endOffset is exclusive 163 | if (startOffset < other.startOffset) 164 | return endOffset > other.startOffset; 165 | else 166 | return startOffset < other.endOffset; 167 | } 168 | 169 | boolean isAdvancing() { 170 | return cursor != null; 171 | } 172 | 173 | @Override 174 | public String toString() { 175 | return (prevTag != null ? '*' : '-') + "|" + (nextTag != null ? '*' : '-') + 176 | " " + startOffset + " to " + endOffset + (isAdvancing() ? '+' : " #" + value); 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/Tagger.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
package org.opensextant.solrtexttagger;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Tags maximum string of words in a corpus. This is a callback-style API
 * in which you implement {@link #tagCallback(int, int, Object)}.
 *
 * This class should be independently usable outside Solr.
 */
public abstract class Tagger {
  private final Logger log = LoggerFactory.getLogger(Tagger.class);

  private final TokenStream tokenStream;
  private final TermToBytesRefAttribute byteRefAtt;
  private final PositionIncrementAttribute posIncAtt;
  private final OffsetAttribute offsetAtt;
  private final TaggingAttribute taggingAtt;

  private final TagClusterReducer tagClusterReducer;
  private final Terms terms;
  private final Bits liveDocs;
  private final boolean skipAltTokens;
  private final boolean ignoreStopWords;

  // Optional cache shared with each TermPrefixCursor (see process()); null unless
  // enableDocIdsCache() was called.
  // NOTE(review): generic type parameters appear to have been stripped from this
  // dump (raw Map); presumably Map<BytesRef, IntsRef> — confirm against upstream.
  private Map docIdsCache;

  /** Whether the WARNING about skipped tokens was already logged. */
  private boolean loggedSkippedAltTokenWarning = false;

  /**
   * Note: resets the tokenStream as a side effect; the caller retains ownership
   * of the stream and is responsible for closing it.
   */
  public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
                TagClusterReducer tagClusterReducer, boolean skipAltTokens,
                boolean ignoreStopWords) throws IOException {
    this.terms = terms;
    this.liveDocs = liveDocs;
    this.tokenStream = tokenStream;
    this.skipAltTokens = skipAltTokens;
    this.ignoreStopWords = ignoreStopWords;
    byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
    tokenStream.reset();

    this.tagClusterReducer = tagClusterReducer;
  }

  /** Enables the docIds cache handed to each TermPrefixCursor; no-op if initSize is not positive. */
  public void enableDocIdsCache(int initSize) {
    if (initSize > 0)
      docIdsCache = new HashMap<>(initSize);
  }

  /**
   * Consumes the token stream, maintaining a cluster of candidate tags, and invokes
   * {@link #tagCallback(int, int, Object)} for each emitted tag. Calls
   * {@link TokenStream#end()} when done, but deliberately not close() (caller owns it).
   */
  public void process() throws IOException {
    if (terms == null)
      return;

    //a shared pointer to the head used by this method and each Tag instance.
    final TagLL[] head = new TagLL[1];

    TermPrefixCursor cursor = null;//re-used

    //boolean switch used to log warnings in case tokens where skipped during tagging.
    boolean skippedTokens = false;

    while (tokenStream.incrementToken()) {
      if (log.isTraceEnabled()) {
        log.trace("Token: {}, posInc: {}, offset: [{},{}]",
            byteRefAtt, posIncAtt.getPositionIncrement(),
            offsetAtt.startOffset(), offsetAtt.endOffset());
      }
      //check for posInc < 1 (alternate Tokens, such as expanded Synonyms)
      if (posIncAtt.getPositionIncrement() < 1) {
        //(a) Deal with this as a configuration issue and throw an exception
        if (!skipAltTokens) {
          //TODO throw UnsupportedTokenException when PhraseBuilder is ported
          throw new IllegalStateException("Query Analyzer generates alternate "
              + "Tokens (posInc == 0). Please adapt your Analyzer configuration or "
              + "enable '" + TaggerRequestHandler.SKIP_ALT_TOKENS + "' to skip such "
              + "tokens. NOTE: enabling '" + TaggerRequestHandler.SKIP_ALT_TOKENS
              + "' might result in wrong tagging results if the index time analyzer "
              + "is not configured accordingly. For detailed information see "
              + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
        } else {
          //(b) In case the index time analyser had indexed all variants (users
          // need to ensure that) processing of alternate tokens can be skipped
          // as anyways all alternatives will be contained in the FST.
          skippedTokens = true;
          log.trace(" ... ignored token");
          continue;
        }
      }
      //-- If PositionIncrement > 1 (stopwords): flush the current cluster, since a
      //   gap means no tag can span it.
      if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) {
        log.trace(" - posInc > 1 ... mark cluster as done");
        advanceTagsAndProcessClusterIfDone(head, null);
      }

      final BytesRef term;
      //NOTE: we need to lookup tokens if
      // * the LookupAtt is true OR
      // * there are still advancing tags (to find the longest possible match)
      if(taggingAtt.isTaggable() || head[0] != null){
        //-- Lookup the term id from the next token
        term = byteRefAtt.getBytesRef();
        if (term.length == 0) {
          throw new IllegalArgumentException("term: " + term.utf8ToString() + " analyzed to a zero-length token");
        }
      } else { //no current cluster AND lookup == false ...
        term = null; //skip this token
      }

      //-- Process tag: advance all existing tags with this term (null flushes)
      advanceTagsAndProcessClusterIfDone(head, term);

      //-- only create new Tags for Tokens we need to lookup
      if (taggingAtt.isTaggable() && term != null) {

        //determine if the terms index has a term starting with the provided term
        // TODO create a pool of these cursors to reuse them more? could be trivial impl
        if (cursor == null)// (else the existing cursor will be re-used)
          cursor = new TermPrefixCursor(terms.iterator(), liveDocs, docIdsCache);
        if (cursor.advance(term)) {
          TagLL newTail = new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null);
          cursor = null;//because the new tag now "owns" this instance
          //and add it to the end
          if (head[0] == null) {
            head[0] = newTail;
          } else {
            for (TagLL t = head[0]; true; t = t.nextTag) {
              if (t.nextTag == null) {
                t.addAfterLL(newTail);
                break;
              }
            }
          }
        }
      }//if termId >= 0
    }//end while(incrementToken())

    //-- Finish all tags (null term forces the remaining cluster to complete)
    advanceTagsAndProcessClusterIfDone(head, null);
    assert head[0] == null;

    if(!loggedSkippedAltTokenWarning && skippedTokens){
      loggedSkippedAltTokenWarning = true; //only log once
      log.warn("The Tagger skipped some alternate tokens (tokens with posInc == 0) "
          + "while processing text. This may cause problems with some Analyzer "
          + "configurations (e.g. query time synonym expansion). For details see "
          + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
    }

    tokenStream.end();
    //tokenStream.close(); caller closes because caller acquired it
  }

  /**
   * Advances every tag in the cluster with "term" (null means no more input for the
   * cluster). If no tag advanced and the cluster is non-empty, the cluster is done:
   * it is reduced and each surviving tag is reported via tagCallback, then cleared.
   */
  private void advanceTagsAndProcessClusterIfDone(TagLL[] head, BytesRef term) throws IOException {
    //-- Advance tags
    final int endOffset = term != null ? offsetAtt.endOffset() : -1;
    boolean anyAdvance = false;
    for (TagLL t = head[0]; t != null; t = t.nextTag) {
      anyAdvance |= t.advance(term, endOffset);
    }

    //-- Process cluster if done
    if (!anyAdvance && head[0] != null) {
      tagClusterReducer.reduce(head);
      for (TagLL t = head[0]; t != null; t = t.nextTag) {
        assert t.value != null;
        tagCallback(t.startOffset, t.endOffset, t.value);
      }
      head[0] = null;
    }
  }

  /**
   * Invoked by {@link #process()} for each tag found. endOffset is always &gt;= the endOffset
   * given in the previous call.
   *
   * @param startOffset The character offset of the original stream where the tag starts.
   * @param endOffset One more than the character offset of the original stream where the tag ends.
   * @param docIdsKey A reference to the matching docIds that can be resolved via {@link #lookupDocIds(Object)}.
   */
  protected abstract void tagCallback(int startOffset, int endOffset, Object docIdsKey);

  /**
   * Returns a sorted array of integer docIds given the corresponding key.
   *
   * @param docIdsKey The lookup key.
   * @return Not null
   */
  protected IntsRef lookupDocIds(Object docIdsKey) {
    return (IntsRef) docIdsKey;
  }
}
9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import com.google.common.io.CharStreams; 26 | import org.apache.lucene.analysis.Analyzer; 27 | import org.apache.lucene.analysis.TokenStream; 28 | import org.apache.lucene.analysis.core.StopFilterFactory; 29 | import org.apache.lucene.analysis.util.TokenFilterFactory; 30 | import org.apache.lucene.index.LeafReaderContext; 31 | import org.apache.lucene.index.ReaderUtil; 32 | import org.apache.lucene.index.Terms; 33 | import org.apache.lucene.queries.function.FunctionValues; 34 | import org.apache.lucene.queries.function.ValueSource; 35 | import org.apache.lucene.search.DocIdSetIterator; 36 | import org.apache.lucene.search.IndexSearcher; 37 | import org.apache.lucene.search.Query; 38 | import org.apache.lucene.util.BitSetIterator; 39 | import org.apache.lucene.util.Bits; 40 | import org.apache.lucene.util.FixedBitSet; 41 | import org.apache.lucene.util.IntsRef; 42 | import org.apache.solr.analysis.TokenizerChain; 43 | import org.apache.solr.common.SolrException; 44 | import org.apache.solr.common.params.CommonParams; 45 | import org.apache.solr.common.params.MapSolrParams; 46 | import org.apache.solr.common.params.SolrParams; 47 | import org.apache.solr.common.util.ContentStream; 48 | import org.apache.solr.common.util.NamedList; 49 | import org.apache.solr.handler.RequestHandlerBase; 50 | import 
org.apache.solr.request.SolrQueryRequest; 51 | import org.apache.solr.response.SolrQueryResponse; 52 | import org.apache.solr.schema.FieldType; 53 | import org.apache.solr.schema.SchemaField; 54 | import org.apache.solr.search.BitDocSet; 55 | import org.apache.solr.search.DocList; 56 | import org.apache.solr.search.DocSet; 57 | import org.apache.solr.search.DocSlice; 58 | import org.apache.solr.search.QParser; 59 | import org.apache.solr.search.SolrIndexSearcher; 60 | import org.apache.solr.search.SolrReturnFields; 61 | import org.apache.solr.search.SyntaxError; 62 | import org.slf4j.Logger; 63 | import org.slf4j.LoggerFactory; 64 | 65 | import javax.xml.stream.XMLStreamException; 66 | import java.io.IOException; 67 | import java.io.Reader; 68 | import java.io.StringReader; 69 | import java.util.ArrayList; 70 | import java.util.Collections; 71 | import java.util.HashMap; 72 | import java.util.HashSet; 73 | import java.util.Iterator; 74 | import java.util.List; 75 | import java.util.Locale; 76 | import java.util.Map; 77 | import java.util.Set; 78 | 79 | /** 80 | * Scans posted text, looking for matching strings in the Solr index. 81 | * The public static final String members are request parameters. 82 | */ 83 | public class TaggerRequestHandler extends RequestHandlerBase { 84 | 85 | /** Request parameter. */ 86 | public static final String OVERLAPS = "overlaps"; 87 | /** Request parameter. */ 88 | public static final String TAGS_LIMIT = "tagsLimit"; 89 | /** Request parameter. */ 90 | public static final String MATCH_TEXT = "matchText"; 91 | /** Request parameter. */ 92 | public static final String SKIP_ALT_TOKENS = "skipAltTokens"; 93 | /** Request parameter. */ 94 | public static final String IGNORE_STOPWORDS = "ignoreStopwords"; 95 | /** Request parameter. */ 96 | public static final String XML_OFFSET_ADJUST = "xmlOffsetAdjust"; 97 | /** Request parameter. */ 98 | public static final String HTML_OFFSET_ADJUST = "htmlOffsetAdjust"; 99 | /** Request parameter. 
*/ 100 | public static final String NON_TAGGABLE_TAGS = "nonTaggableTags"; 101 | 102 | private final Logger log = LoggerFactory.getLogger(getClass()); 103 | 104 | @Override 105 | public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { 106 | setTopInitArgsAsInvariants(req); 107 | 108 | //--Read params 109 | final String indexedField = req.getParams().get("field"); 110 | if (indexedField == null) 111 | throw new RuntimeException("required param 'field'"); 112 | 113 | final TagClusterReducer tagClusterReducer = 114 | chooseTagClusterReducer(req.getParams().get(OVERLAPS)); 115 | final int rows = req.getParams().getInt(CommonParams.ROWS, 10000); 116 | final int tagsLimit = req.getParams().getInt(TAGS_LIMIT, 1000); 117 | final boolean addMatchText = req.getParams().getBool(MATCH_TEXT, false); 118 | final SchemaField idSchemaField = req.getSchema().getUniqueKeyField(); 119 | if (idSchemaField == null) { 120 | throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The tagger requires a" + 121 | "uniqueKey in the schema.");//TODO this could be relaxes 122 | } 123 | final boolean skipAltTokens = req.getParams().getBool(SKIP_ALT_TOKENS, false); 124 | final boolean ignoreStopWords = req.getParams().getBool(IGNORE_STOPWORDS, 125 | fieldHasIndexedStopFilter(indexedField, req)); 126 | final boolean htmlOffsetAdjust = req.getParams().getBool(HTML_OFFSET_ADJUST, false); 127 | final boolean xmlOffsetAdjust = req.getParams().getBool(XML_OFFSET_ADJUST, false); 128 | final String nonTaggableTags = req.getParams().get(NON_TAGGABLE_TAGS); 129 | 130 | //--Get posted data 131 | Reader inputReader = null; 132 | Iterable streams = req.getContentStreams(); 133 | if (streams != null) { 134 | Iterator iter = streams.iterator(); 135 | if (iter.hasNext()) { 136 | inputReader = iter.next().getReader(); 137 | } 138 | if (iter.hasNext()) { 139 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 140 | getClass().getSimpleName()+" does not 
support multiple ContentStreams"); 141 | } 142 | } 143 | if (inputReader == null) { 144 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 145 | getClass().getSimpleName()+" requires text to be POSTed to it"); 146 | } 147 | final String inputString;//only populated if needed 148 | if (addMatchText || xmlOffsetAdjust || htmlOffsetAdjust) { 149 | //Read the input fully into a String buffer that we'll need later, 150 | // then replace the input with a reader wrapping the buffer. 151 | inputString = CharStreams.toString(inputReader); 152 | inputReader.close(); 153 | inputReader = new StringReader(inputString); 154 | } else { 155 | inputString = null;//not used 156 | } 157 | 158 | final OffsetCorrector offsetCorrector = 159 | initOffsetCorrector(htmlOffsetAdjust, xmlOffsetAdjust, inputString, nonTaggableTags); 160 | final SolrIndexSearcher searcher = req.getSearcher(); 161 | final FixedBitSet matchDocIdsBS = new FixedBitSet(searcher.maxDoc()); 162 | final List tags = new ArrayList(2000); 163 | 164 | try { 165 | Analyzer analyzer = req.getSchema().getField(indexedField).getType().getQueryAnalyzer(); 166 | try (TokenStream tokenStream = analyzer.tokenStream("", inputReader)) { 167 | Terms terms = searcher.getSlowAtomicReader().terms(indexedField); 168 | if (terms == null) 169 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 170 | "field " + indexedField + " has no indexed data"); 171 | Tagger tagger = new Tagger(terms, computeDocCorpus(req), tokenStream, tagClusterReducer, 172 | skipAltTokens, ignoreStopWords) { 173 | @SuppressWarnings("unchecked") 174 | @Override 175 | protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) { 176 | if (tags.size() >= tagsLimit) 177 | return; 178 | if (offsetCorrector != null) { 179 | int[] offsetPair = offsetCorrector.correctPair(startOffset, endOffset); 180 | if (offsetPair == null) { 181 | log.debug("Discarded offsets [{}, {}] because couldn't balance XML.", 182 | startOffset, endOffset); 
183 | return; 184 | } 185 | startOffset = offsetPair[0]; 186 | endOffset = offsetPair[1]; 187 | } 188 | 189 | NamedList tag = new NamedList(); 190 | tag.add("startOffset", startOffset); 191 | tag.add("endOffset", endOffset); 192 | if (addMatchText) 193 | tag.add("matchText", inputString.substring(startOffset, endOffset)); 194 | //below caches, and also flags matchDocIdsBS 195 | tag.add("ids", lookupSchemaDocIds(docIdsKey)); 196 | tags.add(tag); 197 | } 198 | 199 | Map docIdsListCache = new HashMap<>(2000); 200 | 201 | ValueSourceAccessor uniqueKeyCache = new ValueSourceAccessor(searcher, 202 | idSchemaField.getType().getValueSource(idSchemaField, null)); 203 | 204 | @SuppressWarnings("unchecked") 205 | private List lookupSchemaDocIds(Object docIdsKey) { 206 | List schemaDocIds = docIdsListCache.get(docIdsKey); 207 | if (schemaDocIds != null) 208 | return schemaDocIds; 209 | IntsRef docIds = lookupDocIds(docIdsKey); 210 | //translate lucene docIds to schema ids 211 | schemaDocIds = new ArrayList(docIds.length); 212 | for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) { 213 | int docId = docIds.ints[i]; 214 | assert i == docIds.offset || docIds.ints[i - 1] < docId : "not sorted?"; 215 | matchDocIdsBS.set(docId);//also, flip docid in bitset 216 | try { 217 | schemaDocIds.add(uniqueKeyCache.objectVal(docId));//translates here 218 | } catch (IOException e) { 219 | throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); 220 | } 221 | } 222 | assert !schemaDocIds.isEmpty(); 223 | 224 | docIdsListCache.put(docIds, schemaDocIds); 225 | return schemaDocIds; 226 | } 227 | 228 | }; 229 | tagger.enableDocIdsCache(2000);//TODO configurable 230 | tagger.process(); 231 | } 232 | } finally { 233 | inputReader.close(); 234 | } 235 | rsp.add("tagsCount",tags.size()); 236 | rsp.add("tags", tags); 237 | 238 | rsp.setReturnFields(new SolrReturnFields( req )); 239 | 240 | //Solr's standard name for matching docs in response 241 | rsp.add("response", 
getDocList(rows, matchDocIdsBS)); 242 | } 243 | 244 | private OffsetCorrector initOffsetCorrector(boolean htmlOffsetAdjust, boolean xmlOffsetAdjust, 245 | String inputString, String nonTaggableTags) { 246 | OffsetCorrector offsetCorrector; 247 | if (htmlOffsetAdjust) { 248 | Set nonTaggableTagSet = null; 249 | if (nonTaggableTags != null) { 250 | //comma delimited list 251 | nonTaggableTags = nonTaggableTags.toLowerCase(Locale.ROOT); 252 | final String[] strings = nonTaggableTags.split(","); 253 | nonTaggableTagSet = new HashSet<>(strings.length); 254 | Collections.addAll(nonTaggableTagSet, strings); 255 | } 256 | try { 257 | offsetCorrector = new HtmlOffsetCorrector(inputString, nonTaggableTagSet); 258 | } catch (Exception e) { 259 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 260 | "Expecting HTML but wasn't: " + e, e); 261 | } 262 | } else if (xmlOffsetAdjust) { 263 | if (nonTaggableTags != null) 264 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 265 | NON_TAGGABLE_TAGS+" not supported for xml"); 266 | try { 267 | offsetCorrector = new XmlOffsetCorrector(inputString); 268 | } catch (XMLStreamException e) { 269 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 270 | "Expecting XML but wasn't: " + e, e); 271 | } 272 | } else { 273 | offsetCorrector = null; 274 | } 275 | return offsetCorrector; 276 | } 277 | 278 | private DocList getDocList(int rows, FixedBitSet matchDocIdsBS) throws IOException { 279 | //Now we must supply a Solr DocList and add it to the response. 280 | // Typically this is gotten via a SolrIndexSearcher.search(), but in this case we 281 | // know exactly what documents to return, the order doesn't matter nor does 282 | // scoring. 283 | // Ideally an implementation of DocList could be directly implemented off 284 | // of a BitSet, but there are way too many methods to implement for a minor 285 | // payoff. 
286 | int matchDocs = matchDocIdsBS.cardinality(); 287 | int[] docIds = new int[ Math.min(rows, matchDocs) ]; 288 | DocIdSetIterator docIdIter = new BitSetIterator(matchDocIdsBS, 1); 289 | for (int i = 0; i < docIds.length; i++) { 290 | docIds[i] = docIdIter.nextDoc(); 291 | } 292 | return new DocSlice(0, docIds.length, docIds, null, matchDocs, 1f); 293 | } 294 | 295 | private TagClusterReducer chooseTagClusterReducer(String overlaps) { 296 | TagClusterReducer tagClusterReducer; 297 | if (overlaps == null || overlaps.equals("NO_SUB")) { 298 | tagClusterReducer = TagClusterReducer.NO_SUB; 299 | } else if (overlaps.equals("ALL")) { 300 | tagClusterReducer = TagClusterReducer.ALL; 301 | } else if (overlaps.equals("LONGEST_DOMINANT_RIGHT")) { 302 | tagClusterReducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT; 303 | } else { 304 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, 305 | "unknown tag overlap mode: "+overlaps); 306 | } 307 | return tagClusterReducer; 308 | } 309 | 310 | /** 311 | * The set of documents matching the provided 'fq' (filter query). Don't include deleted docs 312 | * either. If null is returned, then all docs are available. 
  /**
   * The set of documents matching the provided 'fq' (filter query). Don't include deleted docs
   * either. If null is returned, then all docs are available.
   */
  private Bits computeDocCorpus(SolrQueryRequest req) throws SyntaxError, IOException {
    final String[] corpusFilterQueries = req.getParams().getParams("fq");
    final SolrIndexSearcher searcher = req.getSearcher();
    final Bits docBits;
    if (corpusFilterQueries != null && corpusFilterQueries.length > 0) {
      // NOTE(review): raw type as found; generics (presumably List<Query>) appear
      // stripped by extraction — confirm against upstream source.
      List filterQueries = new ArrayList(corpusFilterQueries.length);
      for (String corpusFilterQuery : corpusFilterQueries) {
        QParser qParser = QParser.getParser(corpusFilterQuery, null, req);
        try {
          filterQueries.add(qParser.parse());
        } catch (SyntaxError e) {
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
        }
      }

      final DocSet docSet = searcher.getDocSet(filterQueries);//hopefully in the cache
      //note: before Solr 4.7 we could call docSet.getBits() but no longer.
      if (docSet instanceof BitDocSet) {
        docBits = ((BitDocSet)docSet).getBits();
      } else {
        //not bit-backed; adapt the DocSet to the Bits interface via exists()
        docBits = new Bits() {

          @Override
          public boolean get(int index) {
            return docSet.exists(index);
          }

          @Override
          public int length() {
            return searcher.maxDoc();
          }
        };
      }
    } else {
      //no filters: restrict only to live (non-deleted) docs; may be null (= all live)
      docBits = searcher.getSlowAtomicReader().getLiveDocs();
    }
    return docBits;
  }

  /**
   * True if the field's index-time analyzer chain contains a StopFilterFactory.
   * Used as the default for the ignoreStopwords request parameter.
   */
  private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) {
    FieldType fieldType = req.getSchema().getFieldType(field);
    Analyzer analyzer = fieldType.getIndexAnalyzer();//index analyzer
    if (analyzer instanceof TokenizerChain) {
      TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
      TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories();
      for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
        if (tokenFilterFactory instanceof StopFilterFactory)
          return true;
      }
    }
    return false;
  }
This request handler supports configuration options defined at the top level as well as 369 | * those in typical Solr 'defaults', 'appends', and 'invariants'. The top level ones are treated 370 | * as invariants. 371 | */ 372 | private void setTopInitArgsAsInvariants(SolrQueryRequest req) { 373 | // First convert top level initArgs to SolrParams 374 | HashMap map = new HashMap<>(initArgs.size()); 375 | for (int i=0; i readerContexts; 395 | private final ValueSource valueSource; 396 | private final Map fContext; 397 | private final FunctionValues[] functionValuesPerSeg; 398 | private final int[] functionValuesDocIdPerSeg; 399 | 400 | ValueSourceAccessor(IndexSearcher searcher, ValueSource valueSource) { 401 | readerContexts = searcher.getIndexReader().leaves(); 402 | this.valueSource = valueSource; 403 | fContext = ValueSource.newContext(searcher); 404 | functionValuesPerSeg = new FunctionValues[readerContexts.size()]; 405 | functionValuesDocIdPerSeg = new int[readerContexts.size()]; 406 | } 407 | 408 | Object objectVal(int topDocId) throws IOException { 409 | // lookup segment level stuff: 410 | int segIdx = ReaderUtil.subIndex(topDocId, readerContexts); 411 | LeafReaderContext rcontext = readerContexts.get(segIdx); 412 | int segDocId = topDocId - rcontext.docBase; 413 | // unfortunately Lucene 7.0 requires forward only traversal (with no reset method). 414 | // So we need to track our last docId (per segment) and re-fetch the FunctionValues. 
:-( 415 | FunctionValues functionValues = functionValuesPerSeg[segIdx]; 416 | if (functionValues == null || segDocId < functionValuesDocIdPerSeg[segIdx]) { 417 | functionValues = functionValuesPerSeg[segIdx] = valueSource.getValues(fContext, rcontext); 418 | } 419 | functionValuesDocIdPerSeg[segIdx] = segDocId; 420 | 421 | // get value: 422 | return functionValues.objectVal(segDocId); 423 | } 424 | } 425 | 426 | } 427 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/TaggingAttribute.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.analysis.TokenFilter; 26 | import org.apache.lucene.analysis.TokenStream; 27 | import org.apache.lucene.util.Attribute; 28 | 29 | /** 30 | * Attribute used by the {@link Tagger} to decide if a token can start a 31 | * new {@link TagLL tag}. 32 | *

33 | * By default this Attribute will return true, but it might be 34 | * reset by some {@link TokenFilter} added to the {@link TokenStream} used 35 | * to analyze the parsed text. Typically this will be done based on NLP 36 | * processing results (e.g. to only lookup Named Entities). 37 | *

38 | * NOTE: that all Tokens are used to advance existing {@link TagLL tags}. 39 | * 40 | * @author Rupert Westenthaler 41 | */ 42 | public interface TaggingAttribute extends Attribute { 43 | 44 | /** 45 | * By default this Attribute will be initialised with true. 46 | * This ensures that all tokens are taggable by default (especially if 47 | * the {@link TaggingAttribute} is not set by any component in the configured 48 | * {@link TokenStream} 49 | */ 50 | public static final boolean DEFAULT_TAGGABLE = true; 51 | 52 | /** 53 | * Getter for the taggable state of the current Token 54 | * 55 | * @return the state 56 | */ 57 | public boolean isTaggable(); 58 | 59 | /** 60 | * Setter for the taggable state. Typically called by code within 61 | * {@link TokenFilter#incrementToken()}. 62 | * 63 | * @param lookup the state 64 | */ 65 | public void setTaggable(boolean lookup); 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/TaggingAttributeImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.util.AttributeImpl; 26 | import org.apache.lucene.util.AttributeReflector; 27 | 28 | /** 29 | * Implementation of the {@link TaggingAttribute} 30 | * 31 | * @author Rupert Westenthaler 32 | */ 33 | public class TaggingAttributeImpl extends AttributeImpl implements TaggingAttribute { 34 | 35 | /** 36 | * the private field initialised with {@link TaggingAttribute#DEFAULT_TAGGABLE} 37 | */ 38 | private boolean taggable = TaggingAttribute.DEFAULT_TAGGABLE; 39 | 40 | /* 41 | * (non-Javadoc) 42 | * @see org.opensextant.solrtexttagger.LookupAttribute#isLookup() 43 | */ 44 | @Override 45 | public boolean isTaggable() { 46 | return taggable; 47 | } 48 | 49 | /* 50 | * (non-Javadoc) 51 | * @see org.opensextant.solrtexttagger.LookupAttribute#setLookup(boolean) 52 | */ 53 | @Override 54 | public void setTaggable(boolean lookup) { 55 | this.taggable = lookup; 56 | } 57 | 58 | /* 59 | * (non-Javadoc) 60 | * @see org.apache.lucene.util.AttributeImpl#clear() 61 | */ 62 | @Override 63 | public void clear() { 64 | taggable = DEFAULT_TAGGABLE; 65 | } 66 | 67 | /* 68 | * (non-Javadoc) 69 | * @see org.apache.lucene.util.AttributeImpl#copyTo(org.apache.lucene.util.AttributeImpl) 70 | */ 71 | @Override 72 | public void copyTo(AttributeImpl target) { 73 | ((TaggingAttribute) target).setTaggable(taggable); 74 | } 75 | 76 | @Override 77 | public void reflectWith(AttributeReflector reflector) { 78 | reflector.reflect(TaggingAttribute.class, "taggable", isTaggable()); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/TermPrefixCursor.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. 
Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.index.PostingsEnum; 26 | import org.apache.lucene.index.TermsEnum; 27 | import org.apache.lucene.util.Bits; 28 | import org.apache.lucene.util.BytesRef; 29 | import org.apache.lucene.util.BytesRefBuilder; 30 | import org.apache.lucene.util.IntsRef; 31 | 32 | import java.io.IOException; 33 | import java.util.Map; 34 | 35 | /** 36 | * Cursor into the terms that advances by prefix. 37 | */ 38 | class TermPrefixCursor { 39 | 40 | //Note: this could be a lot more efficient if MemoryPostingsFormat supported ordinal lookup. 41 | // Maybe that could be added to Lucene. 42 | 43 | // TODO add bloom filter of hashcode of first ~ 6 bytes to avoid lookup into terms dict? 
44 | 45 | private static final byte SEPARATOR_CHAR = ' '; 46 | private static final IntsRef EMPTY_INTSREF = new IntsRef(); 47 | 48 | private final TermsEnum termsEnum; 49 | private final Bits liveDocs; 50 | private final Map docIdsCache; 51 | 52 | private BytesRef prefixBuf;//we append to this 53 | private BytesRefBuilder prefixBufBuilder = new BytesRefBuilder(); 54 | private boolean prefixBufOnLoan;//if true, PB is loaned; needs to be copied 55 | private PostingsEnum postingsEnum; 56 | private IntsRef docIds; 57 | 58 | TermPrefixCursor(TermsEnum termsEnum, Bits liveDocs, Map docIdsCache) { 59 | this.termsEnum = termsEnum; 60 | this.liveDocs = liveDocs; 61 | this.docIdsCache = docIdsCache; 62 | } 63 | 64 | /** Appends the separator char (if not the first) plus the given word to the prefix buffer, 65 | * then seeks to it. If the seek fails, false is returned and this cursor 66 | * can be re-used as if in a new state. The {@code word} BytesRef is considered temporary, 67 | * and is not saved within this class. */ 68 | boolean advance(BytesRef word) throws IOException { 69 | if (prefixBuf == null) { // first advance 70 | //set prefixBuf to word temporary. When advance() completes, we either null out or copy. 71 | prefixBuf = word; 72 | prefixBufOnLoan = true; 73 | if (seekPrefix()) {//... 
and we have to 74 | ensureBufIsACopy(); 75 | return true; 76 | } else { 77 | prefixBuf = null;//just to be darned sure 'word' isn't referenced here 78 | return false; 79 | } 80 | 81 | } else { // subsequent advance 82 | //append to existing 83 | assert !prefixBufOnLoan; 84 | 85 | prefixBufBuilder.append(SEPARATOR_CHAR); 86 | prefixBufBuilder.append(word); 87 | prefixBuf = prefixBufBuilder.get(); 88 | if (seekPrefix()) { 89 | return true; 90 | } else { 91 | prefixBuf = null; 92 | return false; 93 | } 94 | } 95 | } 96 | 97 | private void ensureBufIsACopy() { 98 | if (!prefixBufOnLoan) 99 | return; 100 | 101 | prefixBufBuilder.clear(); 102 | prefixBufBuilder.copyBytes(prefixBuf); 103 | prefixBuf = prefixBufBuilder.get(); 104 | prefixBufOnLoan = false; 105 | } 106 | 107 | /** Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char. 108 | * Sets docIds. **/ 109 | private boolean seekPrefix() throws IOException { 110 | TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf); 111 | 112 | docIds = null;//invalidate 113 | switch (seekStatus) { 114 | case END: 115 | return false; 116 | 117 | case FOUND: 118 | postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); 119 | docIds = postingsEnumToIntsRef(postingsEnum, liveDocs); 120 | if (docIds.length > 0) { 121 | return true; 122 | } 123 | 124 | //Pretend we didn't find it; go to next term 125 | docIds = null; 126 | if (termsEnum.next() == null) { // case END 127 | return false; 128 | } 129 | //fall through to NOT_FOUND 130 | 131 | case NOT_FOUND: 132 | //termsEnum must start with prefixBuf to continue 133 | BytesRef teTerm = termsEnum.term(); 134 | 135 | if (teTerm.length > prefixBuf.length) { 136 | for (int i = 0; i < prefixBuf.length; i++) { 137 | if (prefixBuf.bytes[prefixBuf.offset + i] != teTerm.bytes[teTerm.offset + i]) 138 | return false; 139 | } 140 | if (teTerm.bytes[teTerm.offset + prefixBuf.length] != SEPARATOR_CHAR) 141 | return false; 142 | return true; 143 | 
} 144 | return false; 145 | } 146 | throw new IllegalStateException(seekStatus.toString()); 147 | } 148 | 149 | /** Returns an IntsRef either cached or reading postingsEnum. Not null. 150 | * @param postingsEnum*/ 151 | private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException { 152 | // (The cache can have empty IntsRefs) 153 | 154 | //lookup prefixBuf in a cache 155 | if (docIdsCache != null) { 156 | docIds = docIdsCache.get(prefixBuf); 157 | if (docIds != null) { 158 | return docIds; 159 | } 160 | } 161 | 162 | //read postingsEnum 163 | docIds = new IntsRef(termsEnum.docFreq()); 164 | int docId; 165 | while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) { 166 | if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) { 167 | continue; 168 | } 169 | docIds.ints[docIds.length++] = docId; 170 | } 171 | if (docIds.length == 0) 172 | docIds = EMPTY_INTSREF; 173 | 174 | //cache 175 | if (docIdsCache != null) { 176 | ensureBufIsACopy(); 177 | //clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to 178 | docIdsCache.put(prefixBuf.clone(), docIds); 179 | } 180 | return docIds; 181 | } 182 | 183 | /** The docIds of the last call to advance, if it returned true. It might be null, but 184 | * its length won't be 0. Treat as immutable. */ 185 | IntsRef getDocIds() { 186 | assert docIds == null || docIds.length != 0; 187 | return docIds; 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/XmlOffsetCorrector.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. 
W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import com.ctc.wstx.stax.WstxInputFactory; 26 | import org.apache.commons.io.input.ClosedInputStream; 27 | import org.codehaus.stax2.LocationInfo; 28 | import org.codehaus.stax2.XMLInputFactory2; 29 | import org.codehaus.stax2.XMLStreamReader2; 30 | 31 | import javax.xml.stream.XMLResolver; 32 | import javax.xml.stream.XMLStreamException; 33 | import javax.xml.stream.events.XMLEvent; 34 | import java.io.InputStream; 35 | import java.io.StringReader; 36 | 37 | /** 38 | * Corrects offsets to adjust for XML formatted data. The goal is such that the caller should be 39 | * able to insert a start XML tag at the start offset and a corresponding end XML tag at the end 40 | * offset of the tagger, and have it be valid XML. See {@link #correctPair(int, int)}. 41 | * 42 | * This will not work on invalid XML. 43 | * 44 | * Not thread-safe. 45 | */ 46 | public class XmlOffsetCorrector extends OffsetCorrector { 47 | 48 | //TODO use StAX without hard requirement on woodstox. 
xmlStreamReader.getLocation().getCharacterOffset() 49 | 50 | private static final XMLInputFactory2 XML_INPUT_FACTORY; 51 | static { 52 | // note: similar code in Solr's EmptyEntityResolver 53 | XML_INPUT_FACTORY = new WstxInputFactory(); 54 | XML_INPUT_FACTORY.setXMLResolver(new XMLResolver() { 55 | @Override 56 | public InputStream resolveEntity(String publicId, String systemId, String baseURI, String namespace) { 57 | return ClosedInputStream.CLOSED_INPUT_STREAM; 58 | } 59 | }); 60 | // TODO disable DTD? 61 | // XML_INPUT_FACTORY.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE) 62 | XML_INPUT_FACTORY.configureForSpeed(); 63 | } 64 | 65 | /** 66 | * Initialize based on the document text. 67 | * @param docText non-null XML content. 68 | * @throws XMLStreamException If there's a problem parsing the XML. 69 | */ 70 | public XmlOffsetCorrector(String docText) throws XMLStreamException { 71 | super(docText, false); 72 | 73 | int tagCounter = 0; 74 | int thisTag = -1; 75 | 76 | //note: we *could* add a virtual outer tag to guarantee all text is in the context of a tag, 77 | // but we shouldn't need to because there is no findable text outside the top element. 
78 | 79 | final XMLStreamReader2 xmlStreamReader = 80 | (XMLStreamReader2) XML_INPUT_FACTORY.createXMLStreamReader(new StringReader(docText)); 81 | 82 | while (xmlStreamReader.hasNext()) { 83 | int eventType = xmlStreamReader.next(); 84 | switch (eventType) { 85 | case XMLEvent.START_ELEMENT: { 86 | tagInfo.ensureCapacity(tagInfo.size() + 5); 87 | final int parentTag = thisTag; 88 | final LocationInfo info = xmlStreamReader.getLocationInfo(); 89 | tagInfo.add(parentTag); 90 | tagInfo.add((int) info.getStartingCharOffset(), (int) info.getEndingCharOffset()); 91 | tagInfo.add(-1, -1);//these 2 will be populated when we get to the close tag 92 | thisTag = tagCounter++; 93 | 94 | parentChangeOffsets.add((int) info.getStartingCharOffset()); 95 | parentChangeIds.add(thisTag); 96 | break; 97 | } 98 | case XMLEvent.END_ELEMENT: { 99 | final LocationInfo info = xmlStreamReader.getLocationInfo(); 100 | tagInfo.set(5 * thisTag + 3, (int) info.getStartingCharOffset()); 101 | tagInfo.set(5 * thisTag + 4, (int) info.getEndingCharOffset()); 102 | thisTag = getParentTag(thisTag); 103 | 104 | parentChangeOffsets.add((int) info.getEndingCharOffset()); 105 | parentChangeIds.add(thisTag); 106 | break; 107 | } 108 | default: //do nothing 109 | } 110 | } 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/org/opensextant/solrtexttagger/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 
12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | /** 24 | * The classes in this package implement OpenSextant's Solr-based tagger. 25 | */ 26 | package org.opensextant.solrtexttagger; -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/AbstractTaggerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.commons.lang.builder.CompareToBuilder; 26 | import org.apache.commons.lang.builder.EqualsBuilder; 27 | import org.apache.lucene.document.Document; 28 | import org.apache.solr.SolrTestCaseJ4; 29 | import org.apache.solr.common.params.CommonParams; 30 | import org.apache.solr.common.params.ModifiableSolrParams; 31 | import org.apache.solr.common.params.SolrParams; 32 | import org.apache.solr.common.util.ContentStream; 33 | import org.apache.solr.common.util.ContentStreamBase; 34 | import org.apache.solr.common.util.NamedList; 35 | import org.apache.solr.request.SolrQueryRequest; 36 | import org.apache.solr.request.SolrQueryRequestBase; 37 | import org.apache.solr.response.SolrQueryResponse; 38 | import org.apache.solr.search.DocIterator; 39 | import org.apache.solr.search.DocList; 40 | import org.apache.solr.search.SolrIndexSearcher; 41 | import org.junit.Rule; 42 | import org.junit.rules.TestWatcher; 43 | import org.junit.runner.Description; 44 | import org.slf4j.Logger; 45 | import org.slf4j.LoggerFactory; 46 | 47 | import java.io.IOException; 48 | import java.util.Arrays; 49 | import java.util.Collections; 50 | import java.util.HashMap; 51 | import java.util.List; 52 | import java.util.Map; 53 | import java.util.TreeSet; 54 | 55 | /** 56 | * @author David Smiley - dsmiley@apache.org 57 | */ 58 | public abstract class AbstractTaggerTest extends SolrTestCaseJ4 { 59 | 60 | protected final Logger log = LoggerFactory.getLogger(getClass()); 61 | 62 | @Rule 63 | public TestWatcher watchman = new TestWatcher() { 64 | @Override 65 | protected void starting(Description description) { 66 | log.info("{} being run...", description.getDisplayName()); 67 | } 68 | }; 69 | 70 | protected final ModifiableSolrParams baseParams = new ModifiableSolrParams(); 71 | 72 | //populated in buildNames; tested in assertTags 73 | protected static List NAMES; 74 | 75 | @Override 76 | public void setUp() throws 
Exception { 77 | super.setUp(); 78 | baseParams.clear(); 79 | baseParams.set(CommonParams.QT, "/tag"); 80 | baseParams.set(CommonParams.WT, "xml"); 81 | } 82 | 83 | protected void assertTags(String doc, String... tags) throws Exception { 84 | TestTag[] tts = new TestTag[tags.length]; 85 | for (int i = 0; i < tags.length; i++) { 86 | tts[i] = tt(doc, tags[i]); 87 | } 88 | assertTags(reqDoc(doc), tts); 89 | } 90 | 91 | protected static void buildNames(String... names) throws Exception { 92 | deleteByQueryAndGetVersion("*:*", null); 93 | NAMES = Arrays.asList(names); 94 | //Collections.sort(NAMES); 95 | int i = 0; 96 | for (String n : NAMES) { 97 | assertU(adoc("id", ""+(i++), "name", n)); 98 | } 99 | assertU(commit()); 100 | } 101 | 102 | protected String lookupByName(String name) { 103 | for (String n : NAMES) { 104 | if (n.equalsIgnoreCase(name)) 105 | return n; 106 | } 107 | return null; 108 | } 109 | 110 | protected TestTag tt(String doc, String substring) { 111 | int startOffset = -1, endOffset; 112 | int substringIndex = 0; 113 | for(int i = 0; i <= substringIndex; i++) { 114 | startOffset = doc.indexOf(substring,++startOffset); 115 | assert startOffset >= 0 : "The test itself is broken"; 116 | } 117 | endOffset = startOffset+substring.length();//1 greater (exclusive) 118 | return new TestTag(startOffset, endOffset, substring, lookupByName(substring)); 119 | } 120 | 121 | /** Asserts the tags. Will call req.close(). */ 122 | protected void assertTags(SolrQueryRequest req, TestTag... 
eTags) throws Exception { 123 | try { 124 | SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get(CommonParams.QT), req); 125 | TestTag[] aTags = pullTagsFromResponse(req, rsp); 126 | 127 | String message; 128 | if (aTags.length > 10) 129 | message = null; 130 | else 131 | message = Arrays.asList(aTags).toString(); 132 | Arrays.sort(eTags); 133 | assertSortedArrayEquals(message, eTags, aTags); 134 | 135 | } finally { 136 | req.close(); 137 | } 138 | } 139 | 140 | @SuppressWarnings("unchecked") 141 | protected TestTag[] pullTagsFromResponse(SolrQueryRequest req, SolrQueryResponse rsp ) throws IOException { 142 | NamedList rspValues = rsp.getValues(); 143 | Map matchingNames = new HashMap<>(); 144 | SolrIndexSearcher searcher = req.getSearcher(); 145 | DocList docList = (DocList) rspValues.get("response"); 146 | DocIterator iter = docList.iterator(); 147 | while (iter.hasNext()) { 148 | int docId = iter.next(); 149 | Document doc = searcher.doc(docId); 150 | String id = doc.getField("id").stringValue(); 151 | String name = lookupByName(doc.get("name")); 152 | assertEquals("looking for "+name, NAMES.indexOf(name)+"", id); 153 | matchingNames.put(id, name); 154 | } 155 | 156 | //build TestTag[] aTags from response ('a' is actual) 157 | List mTagsList = (List) rspValues.get("tags"); 158 | TestTag[] aTags = new TestTag[mTagsList.size()]; 159 | int mt_i = 0; 160 | for (NamedList map : mTagsList) { 161 | List foundIds = (List) map.get("ids"); 162 | for (String id : foundIds) { 163 | aTags[mt_i++] = new TestTag( 164 | ((Number)map.get("startOffset")).intValue(), 165 | ((Number)map.get("endOffset")).intValue(), 166 | null, 167 | matchingNames.get(id)); 168 | } 169 | } 170 | return aTags; 171 | } 172 | 173 | /** REMEMBER to close() the result req object. */ 174 | protected SolrQueryRequest reqDoc(String doc, String... moreParams) { 175 | return reqDoc(doc, params(moreParams)); 176 | } 177 | 178 | /** REMEMBER to close() the result req object. 
*/ 179 | protected SolrQueryRequest reqDoc(String doc, SolrParams moreParams) { 180 | log.debug("Test doc: "+doc); 181 | SolrParams params = SolrParams.wrapDefaults(moreParams, baseParams); 182 | SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), params) {}; 183 | Iterable stream = Collections.singleton((ContentStream)new ContentStreamBase.StringStream(doc)); 184 | req.setContentStreams(stream); 185 | return req; 186 | } 187 | 188 | /** Asserts the sorted arrays are equals, with a helpful error message when not. 189 | * @param message 190 | * @param expecteds 191 | * @param actuals 192 | */ 193 | public void assertSortedArrayEquals(String message, Object[] expecteds, Object[] actuals) { 194 | AssertionError error = null; 195 | try { 196 | assertArrayEquals(null, expecteds, actuals); 197 | } catch (AssertionError e) { 198 | error = e; 199 | } 200 | if (error == null) 201 | return; 202 | TreeSet expectedRemaining = new TreeSet<>(Arrays.asList(expecteds)); 203 | expectedRemaining.removeAll(Arrays.asList(actuals)); 204 | if (!expectedRemaining.isEmpty()) 205 | fail(message+": didn't find expected "+expectedRemaining.first()+" (of "+expectedRemaining.size()+"); "+ error); 206 | TreeSet actualsRemaining = new TreeSet<>(Arrays.asList(actuals)); 207 | actualsRemaining.removeAll(Arrays.asList(expecteds)); 208 | fail(message+": didn't expect "+actualsRemaining.first()+" (of "+actualsRemaining.size()+"); "+ error); 209 | } 210 | 211 | class TestTag implements Comparable { 212 | final int startOffset, endOffset; 213 | final String substring; 214 | final String docName; 215 | 216 | TestTag(int startOffset, int endOffset, String substring, String docName) { 217 | this.startOffset = startOffset; 218 | this.endOffset = endOffset; 219 | this.substring = substring; 220 | this.docName = docName; 221 | } 222 | 223 | @Override 224 | public String toString() { 225 | return "TestTag{" + 226 | "[" + startOffset + "-" + endOffset + "]" + 227 | " doc=" + NAMES.indexOf(docName) 
+ ":'" + docName + "'" + 228 | (docName.equals(substring) || substring == null ? "" : " substr="+substring)+ 229 | '}'; 230 | } 231 | 232 | @Override 233 | public boolean equals(Object obj) { 234 | TestTag that = (TestTag) obj; 235 | return new EqualsBuilder() 236 | .append(this.startOffset, that.startOffset) 237 | .append(this.endOffset, that.endOffset) 238 | .append(this.docName, that.docName) 239 | .isEquals(); 240 | } 241 | 242 | @Override 243 | public int hashCode() { 244 | return startOffset;//cheesy but acceptable 245 | } 246 | 247 | @Override 248 | public int compareTo(Object o) { 249 | TestTag that = (TestTag) o; 250 | return new CompareToBuilder() 251 | .append(this.startOffset, that.startOffset) 252 | .append(this.endOffset, that.endOffset) 253 | .append(this.docName,that.docName) 254 | .toComparison(); 255 | } 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/ConcatenateFilterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.analysis.BaseTokenStreamTestCase; 26 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 27 | 28 | import java.io.IOException; 29 | import java.io.StringReader; 30 | 31 | public class ConcatenateFilterTest extends BaseTokenStreamTestCase { 32 | 33 | public void testTypical() throws IOException { 34 | String NYC = "new york city"; 35 | WhitespaceTokenizer stream = new WhitespaceTokenizer(); 36 | stream.setReader(new StringReader(NYC)); 37 | ConcatenateFilter filter = new ConcatenateFilter(stream); 38 | try { 39 | assertTokenStreamContents(filter, new String[]{NYC}, 40 | new int[]{0}, new int[]{NYC.length()}, new String[]{"shingle"}, 41 | new int[]{1}, null, NYC.length(), true); 42 | } catch (AssertionError e) { 43 | //assertTokenStreamContents tries to test if tokenStream.end() was implemented correctly. 44 | // It's manner of checking this is imperfect and incompatible with 45 | // ConcatenateFilter. Specifically it modifies a special attribute *after* incrementToken(), 46 | // which is weird. To the best of my ability, end() appears to be implemented correctly. 47 | if (!e.getMessage().equals("super.end()/clearAttributes() was not called correctly in end()")) 48 | throw e; 49 | } 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/EmbeddedSolrNoSerializeTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 
9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.document.Field; 26 | import org.apache.solr.SolrTestCaseJ4; 27 | import org.apache.solr.client.solrj.SolrServerException; 28 | import org.apache.solr.client.solrj.StreamingResponseCallback; 29 | import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; 30 | import org.apache.solr.client.solrj.request.QueryRequest; 31 | import org.apache.solr.client.solrj.response.QueryResponse; 32 | import org.apache.solr.common.SolrDocument; 33 | import org.apache.solr.common.SolrDocumentList; 34 | import org.apache.solr.common.params.ModifiableSolrParams; 35 | import org.apache.solr.common.params.SolrParams; 36 | import org.apache.solr.common.util.ContentStream; 37 | import org.apache.solr.common.util.ContentStreamBase; 38 | import org.junit.Before; 39 | import org.junit.BeforeClass; 40 | import org.junit.Ignore; 41 | import org.junit.Test; 42 | 43 | import java.io.IOException; 44 | import java.util.Collection; 45 | import java.util.Collections; 46 | import java.util.concurrent.atomic.AtomicReference; 47 | import java.util.function.BiFunction; 48 | 49 | /** 50 | * Tests that we can skip serialization of the documents when embedding 51 | * Solr. 
52 | * 53 | * @author David Smiley - dsmiley@apache.org 54 | */ 55 | public class EmbeddedSolrNoSerializeTest extends SolrTestCaseJ4 { 56 | 57 | static EmbeddedSolrServer solrServer; 58 | 59 | @BeforeClass 60 | public static void init() throws Exception { 61 | initCore("solrconfig.xml", "schema.xml"); 62 | solrServer = new EmbeddedSolrServer(h.getCoreContainer(), "collection1"); 63 | //we don't need to close the EmbeddedSolrServer because SolrTestCaseJ4 closes the core 64 | } 65 | 66 | @Before 67 | public void setUp() throws Exception { 68 | super.setUp(); 69 | clearIndex(); 70 | assertU(adoc("id", "9999", "name", "Boston")); 71 | assertU(commit()); 72 | } 73 | 74 | @Test 75 | public void testTag() throws SolrServerException, IOException { 76 | ModifiableSolrParams params = params(); 77 | String input = "foo boston bar";//just one tag; 78 | QueryRequest req = new SolrTaggerRequest(params, input); 79 | req.setPath("/tag"); 80 | 81 | QueryResponse rsp = req.process(solrServer); 82 | SolrDocumentList results= (SolrDocumentList) rsp.getResponse().get("response"); 83 | assertNotNull(rsp.getResponse().get("tags")); 84 | assertNotNull(results.get(0)); 85 | } 86 | 87 | @SuppressWarnings("serial") 88 | public static class SolrTaggerRequest extends QueryRequest { 89 | 90 | private final String input; 91 | 92 | public SolrTaggerRequest(SolrParams p, String input) { 93 | super(p, METHOD.POST); 94 | this.input = input; 95 | } 96 | 97 | // Deprecated in 7.2 but should live on until 8.x 98 | @SuppressWarnings("deprecation") 99 | @Override 100 | public Collection getContentStreams() { 101 | return Collections.singleton(new ContentStreamBase.StringStream(input)); 102 | } 103 | 104 | // As of 7.2. 
But won't work until: https://issues.apache.org/jira/browse/SOLR-12142 105 | // @Override 106 | // public RequestWriter.ContentWriter getContentWriter(String expectedType) { 107 | // return new RequestWriter.StringPayloadContentWriter(input, "text/plain; charset=UTF8"); 108 | // } 109 | } 110 | 111 | @Test 112 | public void testSearch() throws Exception { 113 | QueryResponse rsp = solrServer.query(params("q", "name:Boston")); 114 | assertNotNull(rsp.getResults().get(0)); 115 | } 116 | 117 | @Test 118 | public void testAssertTagStreamingWithSolrTaggerRequest() throws Exception { 119 | doTestAssertTagStreaming(SolrTaggerRequest::new); 120 | } 121 | 122 | @Test @Ignore("As of Solr 7, stream.body is disabled by default for security ") // DWS: dubious, IMO 123 | // and it can't be enabled with EmbeddedSolrServer until SOLR-12126 124 | public void testAssertTagStreamingWithStreamBodyParam() throws Exception { 125 | doTestAssertTagStreaming((params, input) -> { 126 | params.set("stream.body", input); 127 | return new QueryRequest(params); 128 | }); 129 | } 130 | 131 | public void doTestAssertTagStreaming(BiFunction newQueryRequest) throws IOException, SolrServerException { 132 | ModifiableSolrParams params = params(); 133 | String input = "foo boston bar";//just one tag; 134 | QueryRequest req = newQueryRequest.apply(params, input); 135 | req.setPath("/tag"); 136 | 137 | final AtomicReference refDoc = new AtomicReference<>(); 138 | req.setStreamingResponseCallback(new StreamingResponseCallback() { 139 | @Override 140 | public void streamSolrDocument(SolrDocument doc) { 141 | refDoc.set(doc); 142 | } 143 | 144 | @Override 145 | public void streamDocListInfo(long numFound, long start, Float maxScore) { 146 | 147 | } 148 | }); 149 | QueryResponse rsp = req.process(solrServer); 150 | assertNotNull(rsp.getResponse().get("tags")); 151 | assertNotNull(refDoc.get()); 152 | assertEquals("Boston", ((Field)refDoc.get().getFieldValue("name")).stringValue()); 153 | } 154 | } 155 | 
-------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/HtmlInterpolationTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.junit.Ignore; 26 | import org.junit.Test; 27 | 28 | public class HtmlInterpolationTest extends XmlInterpolationTest { 29 | @Override 30 | public void setUp() throws Exception { 31 | super.setUp(); 32 | baseParams.set("htmlOffsetAdjust", "true"); 33 | baseParams.set("matchText", "true"); 34 | } 35 | 36 | @Override 37 | @Test @Ignore //because in html mode, seemingly everything is valid 38 | public void testValidatingXml() throws Exception { 39 | } 40 | 41 | @Override 42 | @Test @Ignore //because in html mode, seemingly everything is valid 43 | public void testInvalidXml() throws Exception { 44 | } 45 | 46 | @Override 47 | protected void validateXml(String xml) throws Exception { 48 | //cause this test to *not* try to parse as actual html 49 | } 50 | 51 | @Test 52 | public void testHtml() throws Exception { 53 | buildNames("start end"); 54 | 55 | assertXmlTag("before start
end after
", true);//br is assumed empty 56 | 57 | //no wrapping tags: 58 | assertXmlTag("start end", true); 59 | assertXmlTag("start end other text", true); 60 | assertXmlTag("start end other text", true); 61 | assertXmlTag("other text start end", true); 62 | assertXmlTag("start end", true); 63 | } 64 | 65 | @Test 66 | public void testHtmlNonTaggable() throws Exception { 67 | baseParams.set("nonTaggableTags","a" + (random().nextBoolean() ? ",sub" : "")); 68 | buildNames("start end"); 69 | 70 | assertXmlTag("start end", true); 71 | assertXmlTag("start end", false); 72 | assertXmlTag("start end", false); 73 | assertXmlTag("before start
end after
", true);//adjacent 74 | assertXmlTag("before inner start
end after
", true); 75 | 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/RandomizedTaggerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import com.carrotsearch.randomizedtesting.annotations.Repeat; 26 | import com.carrotsearch.randomizedtesting.generators.RandomNumbers; 27 | import com.carrotsearch.randomizedtesting.generators.RandomPicks; 28 | import com.carrotsearch.randomizedtesting.generators.RandomStrings; 29 | import org.junit.BeforeClass; 30 | import org.junit.Test; 31 | 32 | import java.util.ArrayList; 33 | import java.util.HashSet; 34 | import java.util.List; 35 | import java.util.Random; 36 | import java.util.Set; 37 | 38 | /** 39 | * Randomly generate taggable text and verify via simple tag algorithm. 
40 | */ 41 | @Repeat(iterations = 10) 42 | public class RandomizedTaggerTest extends AbstractTaggerTest { 43 | 44 | @BeforeClass 45 | public static void beforeClass() throws Exception { 46 | initCore("solrconfig.xml", "schema.xml"); 47 | } 48 | 49 | @Test 50 | public void test() throws Exception { 51 | final Random R = random(); 52 | 53 | Set names = new HashSet<>(); 54 | //random list of single-word names 55 | final int NUM_SINGLES = 4;//RandomInts.randomIntBetween(R, 1, 5); 56 | for (int i = 0; i < NUM_SINGLES; i++) { 57 | if (i == 0)//first is a big string (perhaps triggers bugs related to growing buffers) 58 | names.add(randomStringOfLength(16, 32)); 59 | else 60 | names.add(randomString()); 61 | } 62 | 63 | //add random list of multi-word names, partially including existing names 64 | final int NUM_MULTI = 10; 65 | for (int i = 0; i < NUM_MULTI; i++) { 66 | final int numWords = RandomNumbers.randomIntBetween(R, 2, 4); 67 | StringBuilder buf = new StringBuilder(); 68 | for (int j = 0; j < numWords; j++) { 69 | if (j != 0) 70 | buf.append(' '); 71 | if (R.nextBoolean()) {//new likely non-existent word 72 | buf.append(randomString()); 73 | } else {//existing word (possible multi-word from prev iteration) 74 | buf.append(RandomPicks.randomFrom(R, names)); 75 | } 76 | } 77 | names.add(buf.toString()); 78 | } 79 | 80 | // BUILD NAMES 81 | buildNames(names.toArray(new String[names.size()])); 82 | 83 | // QUERY LOOP 84 | for (int tTries = 0; tTries < 10 * RANDOM_MULTIPLIER; tTries++) { 85 | // Build up random input, similar to multi-word random names above 86 | StringBuilder input = new StringBuilder(); 87 | final int INPUT_WORD_LEN = 20; 88 | input.append(' ');//must start with space based on assertBruteForce logic 89 | for (int i = 0; i < INPUT_WORD_LEN; i++) { 90 | if (R.nextBoolean()) {//new likely non-existent word 91 | input.append(randomString()); 92 | } else {//existing word (possible multi-word from prev iteration) 93 | input.append(RandomPicks.randomFrom(R, 
NAMES)); 94 | } 95 | input.append(' ');//must end with a space 96 | } 97 | 98 | boolean madeIt = false; 99 | try { 100 | assertBruteForce(input.toString()); 101 | madeIt = true; 102 | } finally { 103 | if (!madeIt) { 104 | System.out.println("Reproduce with:"); 105 | System.out.print(" buildNames("); 106 | for (int i = 0; i < NAMES.size(); i++) { 107 | if (i != 0) 108 | System.out.print(','); 109 | System.out.print('"'); 110 | System.out.print(NAMES.get(i)); 111 | System.out.print('"'); 112 | } 113 | System.out.println(");"); 114 | System.out.println(" assertBruteForce(\"" + input+"\");"); 115 | } 116 | } 117 | } 118 | 119 | } 120 | 121 | private void assertBruteForce(String input) throws Exception { 122 | assert input.matches(" .* "); 123 | baseParams.set("overlaps", "ALL"); 124 | 125 | //loop through NAMES and find all tag offsets 126 | List testTags = new ArrayList<>(); 127 | for (String name : NAMES) { 128 | String spaceName = " "+name+" "; 129 | int off = 0; 130 | while (true) { 131 | int idx = input.indexOf(spaceName, off); 132 | if (idx < 0) 133 | break; 134 | testTags.add(new TestTag(idx + 1, idx + 1 + name.length(), name, name)); 135 | off = idx + 1; 136 | } 137 | } 138 | 139 | //assert 140 | assertTags(reqDoc(input), testTags.toArray(new TestTag[testTags.size()])); 141 | } 142 | 143 | private String randomString() { return randomStringOfLength(1, 1); } 144 | 145 | private String randomStringOfLength(int min, int max) { 146 | return RandomStrings.randomAsciiLettersOfLengthBetween(random(), min, max).toLowerCase(); 147 | } 148 | 149 | } 150 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/Tagger2Test.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. 
W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.junit.BeforeClass; 26 | import org.junit.Ignore; 27 | import org.junit.Test; 28 | 29 | import java.nio.charset.StandardCharsets; 30 | 31 | /** 32 | * Test the {@link org.opensextant.solrtexttagger.TaggerRequestHandler}. 
33 | */ 34 | public class Tagger2Test extends AbstractTaggerTest { 35 | 36 | @BeforeClass 37 | public static void beforeClass() throws Exception { 38 | initCore("solrconfig.xml", "schema.xml"); 39 | } 40 | 41 | @Override 42 | public void setUp() throws Exception { 43 | super.setUp(); 44 | baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); 45 | } 46 | 47 | /** whole matching, no sub-tags */ 48 | @Test 49 | public void testLongestDominantRight() throws Exception { 50 | buildNames("in", "San", "in San", "Francisco", "San Francisco", 51 | "San Francisco State College", "College of California", 52 | "Clayton", "Clayton North", "North Carolina"); 53 | 54 | assertTags("He lived in San Francisco.", 55 | "in", "San Francisco"); 56 | 57 | assertTags("He enrolled in San Francisco State College of California", 58 | "in", "San Francisco State College"); 59 | 60 | assertTags("He lived in Clayton North Carolina", 61 | "in", "Clayton", "North Carolina"); 62 | 63 | } 64 | 65 | // As of Lucene/Solr 4.9, StandardTokenizer never does this anymore (reported to Lucene dev-list, 66 | // Jan 26th 2015. Honestly it's not particularly important to us but it renders this test 67 | // pointless. 
68 | /** Orig issue https://github.com/OpenSextant/SolrTextTagger/issues/2 related: #13 */ 69 | @Test 70 | @Ignore 71 | public void testVeryLongWord() throws Exception { 72 | String SANFRAN = "San Francisco"; 73 | buildNames(SANFRAN); 74 | 75 | // exceeds default 255 max token length which means it in-effect becomes a stop-word 76 | StringBuilder STOP = new StringBuilder(260);//>255 77 | for (int i = 0; i < STOP.capacity(); i++) { 78 | STOP.append((char) ('0' + (i % 10))); 79 | } 80 | 81 | String doc = "San " + STOP + " Francisco"; 82 | assertTags(doc);//no match due to default stop word handling 83 | //and we find it when we ignore stop words 84 | assertTags(reqDoc(doc, "ignoreStopwords", "true"), new TestTag(0, doc.length(), doc, lookupByName(SANFRAN))); 85 | } 86 | 87 | /** Support for stopwords (posInc > 1); 88 | * discussion: https://github.com/OpenSextant/SolrTextTagger/issues/13 */ 89 | @Test 90 | public void testStopWords() throws Exception { 91 | baseParams.set("qt", "/tagStop");//stop filter (pos inc enabled) index & query 92 | 93 | String SOUTHOFWALES = "South of Wales";//'of' is stop word index time & query 94 | String ACITYA = "A City A"; 95 | 96 | buildNames(SOUTHOFWALES, ACITYA); 97 | 98 | //round-trip works 99 | assertTags(reqDoc(SOUTHOFWALES), new TestTag(0, SOUTHOFWALES.length(), SOUTHOFWALES, 100 | lookupByName(SOUTHOFWALES))); 101 | // but offsets doesn't include stopword when leading or trailing... 
102 | assertTags(reqDoc(ACITYA), new TestTag(2, 6, "City", 103 | lookupByName(ACITYA))); 104 | //break on stop words 105 | assertTags(reqDoc(SOUTHOFWALES, "ignoreStopwords", "false"));//match nothing 106 | } 107 | 108 | /** Ensure character offsets work for multi-byte characters */ 109 | @Test 110 | public void testMultibyteChar() throws Exception { 111 | // https://unicode-table.com/en/2019/ 112 | // 0 1 2 3 4 113 | // 01234567890123456789012345678901234567890 114 | String TEXT = "He mentionned ’Obama’ in the White House"; 115 | assertEquals(40, TEXT.length()); // char length (in Java, UTF16) 116 | 117 | String QUOTE = TEXT.substring(14, 15); 118 | assertEquals(8217, QUOTE.codePointAt(0)); 119 | 120 | //UTF8 121 | assertEquals(3, QUOTE.getBytes(StandardCharsets.UTF_8).length); 122 | assertEquals(1, "a".getBytes(StandardCharsets.UTF_8).length); 123 | assertEquals(40 + 2*2, TEXT.getBytes(StandardCharsets.UTF_8).length); 124 | 125 | //UTF16 big endian (by specifying big/little endian, there is no "byte order mark") 126 | assertEquals(2, QUOTE.getBytes(StandardCharsets.UTF_16BE).length); 127 | assertEquals(2, "a".getBytes(StandardCharsets.UTF_16BE).length); 128 | assertEquals(40 * 2, TEXT.getBytes(StandardCharsets.UTF_16BE).length); 129 | 130 | 131 | buildNames("Obama"); 132 | 133 | assertTags(TEXT, "Obama"); 134 | 135 | // TODO test surrogate pairs (i.e. code points not in the BMP) 136 | } 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/TaggerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 
9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.solr.common.params.CommonParams; 26 | import org.apache.solr.common.params.ModifiableSolrParams; 27 | import org.apache.solr.request.SolrQueryRequest; 28 | import org.junit.BeforeClass; 29 | import org.junit.Test; 30 | 31 | import java.util.Arrays; 32 | import java.util.stream.Collectors; 33 | 34 | /** 35 | * The original test for {@link org.opensextant.solrtexttagger.TaggerRequestHandler}. 
36 | */ 37 | public class TaggerTest extends AbstractTaggerTest { 38 | 39 | @BeforeClass 40 | public static void beforeClass() throws Exception { 41 | initCore("solrconfig.xml", "schema.xml"); 42 | } 43 | 44 | private void indexAndBuild() throws Exception { 45 | N[] names = N.values(); 46 | String[] namesStrs = new String[names.length]; 47 | for (int i = 0; i < names.length; i++) { 48 | namesStrs[i] = names[i].getName(); 49 | } 50 | buildNames(namesStrs); 51 | } 52 | 53 | /** Name corpus */ 54 | enum N { 55 | //keep order to retain ord() 56 | London, London_Business_School, Boston, City_of_London, 57 | of, the//filtered out of the corpus by a custom query 58 | ; 59 | 60 | String getName() { return name().replace('_',' '); } 61 | static N lookupByName(String name) { return N.valueOf(name.replace(' ', '_')); } 62 | int getId() { return ordinal(); } 63 | } 64 | 65 | @Test 66 | public void testFormat() throws Exception { 67 | baseParams.set("qt", "/tagPartial"); 68 | baseParams.set("overlaps", "NO_SUB"); 69 | indexAndBuild(); 70 | 71 | String rspStr = _testFormatRequest(false); 72 | String expected = "\n" + 73 | "\n" + 74 | "1" + 75 | "" + 76 | "0" + 77 | "6" + 78 | "1" + 79 | "" + 80 | "" + 81 | "1London Business School" + 82 | "\n" + 83 | "\n"; 84 | assertEquals(expected, rspStr); 85 | } 86 | 87 | @Test 88 | public void testFormatMatchText() throws Exception { 89 | baseParams.set("qt", "/tagPartial"); 90 | baseParams.set("overlaps", "NO_SUB"); 91 | indexAndBuild(); 92 | 93 | String rspStr = _testFormatRequest(true); 94 | String expected = "\n" + 95 | "\n" + 96 | "1" + 97 | "" + 98 | "0" + 99 | "6<" + 100 | "str name=\"matchText\">school" + 101 | "1" + 102 | "" + 103 | "" + 104 | "1London Business School" + 105 | "\n" + 106 | "\n"; 107 | assertEquals(expected, rspStr); 108 | } 109 | 110 | private String _testFormatRequest(boolean matchText) throws Exception { 111 | String doc = "school";//just one tag 112 | SolrQueryRequest req = reqDoc(doc, "indent", "off", 
"omitHeader", "on", "matchText", ""+matchText); 113 | String rspStr = h.query(req); 114 | req.close(); 115 | return rspStr; 116 | } 117 | 118 | @Test 119 | /** Partial matching, no sub-tags */ 120 | public void testPartialMatching() throws Exception { 121 | baseParams.set("qt", "/tagPartial"); 122 | baseParams.set("overlaps", "NO_SUB"); 123 | indexAndBuild(); 124 | 125 | //these match nothing 126 | assertTags(reqDoc("") ); 127 | assertTags(reqDoc(" ") ); 128 | assertTags(reqDoc("the") ); 129 | 130 | String doc; 131 | 132 | //just London Business School via "school" substring 133 | doc = "school"; 134 | assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School)); 135 | 136 | doc = "a school"; 137 | assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School)); 138 | 139 | doc = "school a"; 140 | assertTags(reqDoc(doc), tt(doc,"school", 0, N.London_Business_School)); 141 | 142 | //More interesting 143 | 144 | doc = "school City"; 145 | assertTags(reqDoc(doc), 146 | tt(doc, "school", 0, N.London_Business_School), 147 | tt(doc, "City", 0, N.City_of_London) ); 148 | 149 | doc = "City of London Business School"; 150 | assertTags(reqDoc(doc), //no plain London (sub-tag) 151 | tt(doc, "City of London", 0, N.City_of_London), 152 | tt(doc, "London Business School", 0, N.London_Business_School)); 153 | } 154 | 155 | @Test 156 | /** whole matching, no sub-tags */ 157 | public void testWholeMatching() throws Exception { 158 | baseParams.set("qt", "/tag"); 159 | baseParams.set("overlaps", "NO_SUB"); 160 | indexAndBuild(); 161 | 162 | //these match nothing 163 | assertTags(reqDoc("")); 164 | assertTags(reqDoc(" ") ); 165 | assertTags(reqDoc("the") ); 166 | 167 | //partial on N.London_Business_School matches nothing 168 | assertTags(reqDoc("school") ); 169 | assertTags(reqDoc("a school") ); 170 | assertTags(reqDoc("school a") ); 171 | assertTags(reqDoc("school City") ); 172 | 173 | String doc; 174 | 175 | doc = "school business london";//backwards 176 | 
assertTags(reqDoc(doc), tt(doc,"london", 0, N.London)); 177 | 178 | doc = "of London Business School"; 179 | assertTags(reqDoc(doc), //no plain London (sub-tag) 180 | tt(doc, "London Business School", 0, N.London_Business_School)); 181 | 182 | //More interesting 183 | doc = "City of London Business School"; 184 | assertTags(reqDoc(doc), //no plain London (sub-tag) 185 | tt(doc, "City of London", 0, N.City_of_London), 186 | tt(doc, "London Business School", 0, N.London_Business_School)); 187 | 188 | doc = "City of London Business"; 189 | assertTags(reqDoc(doc), //no plain London (sub-tag) no Business (partial-match) 190 | tt(doc, "City of London", 0, N.City_of_London)); 191 | 192 | doc = "London Business magazine"; 193 | assertTags(reqDoc(doc), //Just London; L.B.S. fails 194 | tt(doc, "London", 0, N.London)); 195 | } 196 | 197 | @Test 198 | /** whole matching, with sub-tags */ 199 | public void testSubTags() throws Exception { 200 | baseParams.set("qt", "/tag"); 201 | baseParams.set("overlaps", "ALL"); 202 | indexAndBuild(); 203 | 204 | //these match nothing 205 | assertTags(reqDoc("")); 206 | assertTags(reqDoc(" ") ); 207 | assertTags(reqDoc("the") ); 208 | 209 | //partial on N.London_Business_School matches nothing 210 | assertTags(reqDoc("school") ); 211 | assertTags(reqDoc("a school") ); 212 | assertTags(reqDoc("school a") ); 213 | assertTags(reqDoc("school City") ); 214 | 215 | String doc; 216 | 217 | doc = "school business london";//backwards 218 | assertTags(reqDoc(doc), tt(doc,"london", 0, N.London)); 219 | 220 | //More interesting 221 | doc = "City of London Business School"; 222 | assertTags(reqDoc(doc), 223 | tt(doc, "City of London", 0, N.City_of_London), 224 | tt(doc, "London", 0, N.London), 225 | tt(doc, "London Business School", 0, N.London_Business_School)); 226 | 227 | doc = "City of London Business"; 228 | assertTags(reqDoc(doc), 229 | tt(doc, "City of London", 0, N.City_of_London), 230 | tt(doc, "London", 0, N.London)); 231 | } 232 | 233 | @Test 
234 | public void testMultipleFilterQueries() throws Exception { 235 | baseParams.set("qt", "/tag"); 236 | baseParams.set("overlaps", "ALL"); 237 | 238 | // build up the corpus with some additional fields for filtering purposes 239 | deleteByQueryAndGetVersion("*:*", null); 240 | 241 | int i = 0; 242 | assertU(adoc("id", ""+i++, "name", N.London.getName(), "type", "city", "country", "UK")); 243 | assertU(adoc("id", ""+i++, "name", N.London_Business_School.getName(), "type", "school", "country", "UK")); 244 | assertU(adoc("id", ""+i++, "name", N.Boston.getName(), "type", "city", "country", "US")); 245 | assertU(adoc("id", ""+i++, "name", N.City_of_London.getName(), "type", "org", "country", "UK")); 246 | assertU(commit()); 247 | 248 | // not calling buildNames so that we can bring along extra attributes for filtering 249 | NAMES = Arrays.stream(N.values()).map(N::getName).collect(Collectors.toList()); 250 | 251 | // phrase that matches everything 252 | String doc = "City of London Business School in Boston"; 253 | 254 | // first do no filtering 255 | ModifiableSolrParams p = new ModifiableSolrParams(); 256 | p.add(CommonParams.Q, "*:*"); 257 | assertTags(reqDoc(doc, p), 258 | tt(doc, "City of London", 0, N.City_of_London), 259 | tt(doc, "London", 0, N.London), 260 | tt(doc, "London Business School", 0, N.London_Business_School), 261 | tt(doc, "Boston", 0, N.Boston)); 262 | 263 | // add a single fq 264 | p.add(CommonParams.FQ, "type:city"); 265 | assertTags(reqDoc(doc, p), 266 | tt(doc, "London", 0, N.London), 267 | tt(doc, "Boston", 0, N.Boston)); 268 | 269 | // add another fq 270 | p.add(CommonParams.FQ, "country:US"); 271 | assertTags(reqDoc(doc, p), 272 | tt(doc, "Boston", 0, N.Boston)); 273 | } 274 | 275 | private TestTag tt(String doc, String substring, int substringIndex, N name) { 276 | assert substringIndex == 0; 277 | 278 | //little bit of copy-paste code from super.tt() 279 | int startOffset = -1, endOffset; 280 | int substringIndex1 = 0; 281 | for(int i = 
0; i <= substringIndex1; i++) { 282 | startOffset = doc.indexOf(substring, ++startOffset); 283 | assert startOffset >= 0 : "The test itself is broken"; 284 | } 285 | endOffset = startOffset+ substring.length();//1 greater (exclusive) 286 | return new TestTag(startOffset, endOffset, substring, lookupByName(name.getName())); 287 | } 288 | 289 | } 290 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/TaggingAttributeTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.junit.BeforeClass; 26 | import org.junit.Test; 27 | 28 | /** 29 | * Test the {@link org.opensextant.solrtexttagger.TaggerRequestHandler} with 30 | * a Analyzer chain that does use the {@link TaggingAttribute}. See the test 31 | * configuration under 'taggingattribute'. 
32 | */ 33 | public class TaggingAttributeTest extends AbstractTaggerTest { 34 | 35 | @BeforeClass 36 | public static void beforeClass() throws Exception { 37 | //NOTE: We use the TaggingAttribute specific configuration 38 | // Reference solr-home in target/test-classes since that's where it's copied and any config 39 | // persisting (e.g. from rest managed stuff) will happen there. 40 | initCore("solrconfig.xml", "schema.xml", "target/test-classes/taggingattribute"); 41 | } 42 | 43 | @Test 44 | /** 45 | * Whole matching, no sub-tags. Links only words with > 3 letters. 46 | * Because of that "San" is not used to start tags 47 | * 48 | */ 49 | public void testTaggingAttribute() throws Exception { 50 | // this test is based on the longest dominant right test, so we use the 51 | // the same TagClusterReducer setting 52 | baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); 53 | 54 | buildNames("in", "San", "in San", "Francisco", "San Francisco", 55 | "San Francisco State College", "College of California", 56 | "Clayton", "Clayton North", "North Carolina"); 57 | 58 | assertTags("He lived in San Francisco.", 59 | //"in", "San Francisco"); //whis would be expected without taggable 60 | "Francisco");// this are the expected results with taggable 61 | 62 | assertTags("He enrolled in San Francisco State College of California", 63 | //"in", "San Francisco State College"); //without taggable enabled 64 | "Francisco", "College of California");// With taggable 65 | //NOTE this also tests that started tags are advanced for non-taggable 66 | // tokens, as otherwise 'College of California' would not be 67 | // suggested. 
68 | 69 | assertTags("He lived in Clayton North Carolina", 70 | //"in", "Clayton", "North Carolina"); 71 | "Clayton", "North Carolina"); 72 | 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/WordLengthTaggingFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.analysis.TokenFilter; 26 | import org.apache.lucene.analysis.TokenStream; 27 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 28 | 29 | import java.io.IOException; 30 | 31 | /** 32 | * Simple TokenFilter that lookup only Tokens with more as the parsed number 33 | * of chars.

34 | * NOTE:This implementation is only intended to be used as an example 35 | * and for unit testing the {@link TaggingAttribute} feature. Typically 36 | * implementations will be based on NLP results (e.g. using POS tags or 37 | * detected Named Entities). 38 | *

39 | * Example Usage:

40 | * Currently the usage requires to modify the Analyzer as defined by the 41 | * indexedField. An alternative would be to allow the configuration 42 | * of a special FieldType in the schema.xml and use this Analyzer for processing 43 | * the text sent to the request.

44 | * While the current solution is fine for direct API usage, defining the 45 | * Analyzer in the schema.xml would be better suitable for using this feature 46 | * with the {@link TaggerRequestHandler}. 47 | * 48 | *

 49 |  *     Analyzer analyzer = req.getSchema().getField(indexedField).getType().getAnalyzer();
 50 |  *     //get the TokenStream from the Analyzer
 51 |  *     TokenStream baseStream = analyzer.tokenStream("", reader);
 52 |  *     //add a FilterStream that sets the LookupAttribute to the end
 53 |  *     TokenStream filterStream = new WordLengthLookupFilter(baseStream);
 54 |  *     //create the Tagger using the modified analyzer chain.
 55 |  *     new Tagger(corpus, filterStream, tagClusterReducer) {
 56 |  *
 57 |  *         protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
 58 |  *             //implement the callback
 59 |  *         }
 60 |  *
 61 |  *     }.process();
 62 |  * 
63 | * 64 | * @author Rupert Westenthaler 65 | */ 66 | public class WordLengthTaggingFilter extends TokenFilter { 67 | 68 | /** 69 | * The default minimum length is 3 70 | */ 71 | public static final int DEFAULT_MIN_LENGTH = 3; 72 | private final TaggingAttribute lookupAtt = addAttribute(TaggingAttribute.class); 73 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 74 | private int minLength; 75 | 76 | /** 77 | * TokenFilter only marks tokens to be looked up with equals or more as 78 | * {@link #DEFAULT_MIN_LENGTH} characters 79 | * 80 | * @param input 81 | */ 82 | public WordLengthTaggingFilter(TokenStream input) { 83 | this(input, null); 84 | } 85 | 86 | /** 87 | * TokenFilter only marks tokens to be looked up with equals or more characters 88 | * as the parsed minimum. 89 | * 90 | * @param input the TokenStream to consume tokens from 91 | * @param minLength The minimum length to lookup a Token. null 92 | * or <= 0 to use the #DEFAULT_MIN_LENGTH 93 | */ 94 | public WordLengthTaggingFilter(TokenStream input, Integer minLength) { 95 | super(input); 96 | if (minLength == null || minLength <= 0) { 97 | this.minLength = DEFAULT_MIN_LENGTH; 98 | } else { 99 | this.minLength = minLength; 100 | } 101 | } 102 | 103 | @Override 104 | public final boolean incrementToken() throws IOException { 105 | if (input.incrementToken()) { 106 | int size = offsetAtt.endOffset() - offsetAtt.startOffset(); 107 | lookupAtt.setTaggable(size >= minLength); 108 | return true; 109 | } else { 110 | return false; 111 | } 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/WordLengthTaggingFilterFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. 
W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.lucene.analysis.TokenStream; 26 | import org.apache.lucene.analysis.util.TokenFilterFactory; 27 | import org.slf4j.Logger; 28 | import org.slf4j.LoggerFactory; 29 | 30 | import java.util.Map; 31 | 32 | public class WordLengthTaggingFilterFactory extends TokenFilterFactory { 33 | 34 | private final Logger log = LoggerFactory.getLogger(WordLengthTaggingFilterFactory.class); 35 | 36 | public static final String MIN_LENGTH = "minLength"; 37 | 38 | private final Integer minLength; 39 | 40 | public WordLengthTaggingFilterFactory(Map args) { 41 | super(args); 42 | int minLength = -1; 43 | Object value = args.get(MIN_LENGTH); 44 | if (value != null) { 45 | try { 46 | minLength = Integer.parseInt(value.toString()); 47 | } catch (NumberFormatException e) { 48 | log.warn("Unable to parse minLength from value 'minLength=\"{}\"'", value); 49 | 50 | } 51 | } 52 | if (minLength <= 0) { 53 | log.info("use default minLength={}", WordLengthTaggingFilter.DEFAULT_MIN_LENGTH); 54 | this.minLength = null; 55 | } else { 56 | log.info("set minLength={}", minLength); 57 | this.minLength = minLength; 58 
| } 59 | } 60 | 61 | @Override 62 | public TokenStream create(TokenStream input) { 63 | return new WordLengthTaggingFilter(input, minLength); 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/test/java/org/opensextant/solrtexttagger/XmlInterpolationTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | This software was produced for the U. S. Government 3 | under Contract No. W15P7T-11-C-F600, and is 4 | subject to the Rights in Noncommercial Computer Software 5 | and Noncommercial Computer Software Documentation 6 | Clause 252.227-7014 (JUN 1995) 7 | 8 | Copyright 2013 The MITRE Corporation. All Rights Reserved. 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | */ 22 | 23 | package org.opensextant.solrtexttagger; 24 | 25 | import org.apache.commons.io.IOUtils; 26 | import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; 27 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 28 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 29 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 30 | import org.apache.solr.common.SolrException; 31 | import org.apache.solr.request.SolrQueryRequest; 32 | import org.apache.solr.response.SolrQueryResponse; 33 | import org.junit.BeforeClass; 34 | import org.junit.Test; 35 | import org.xml.sax.InputSource; 36 | 37 | import javax.xml.parsers.DocumentBuilder; 38 | import javax.xml.parsers.DocumentBuilderFactory; 39 | import java.io.IOException; 40 | import java.io.Reader; 41 | import java.io.StringReader; 42 | import java.util.ArrayList; 43 | import java.util.Collections; 44 | import java.util.List; 45 | 46 | public class XmlInterpolationTest extends AbstractTaggerTest { 47 | 48 | private static DocumentBuilder xmlDocBuilder; 49 | 50 | 51 | @BeforeClass 52 | public static void beforeClass() throws Exception { 53 | DocumentBuilderFactory xmlDocBuilderFactory = DocumentBuilderFactory.newInstance(); 54 | xmlDocBuilderFactory.setValidating(true); 55 | xmlDocBuilderFactory.setNamespaceAware(true); 56 | xmlDocBuilder = xmlDocBuilderFactory.newDocumentBuilder(); 57 | 58 | initCore("solrconfig.xml", "schema.xml"); 59 | } 60 | 61 | @Override 62 | public void setUp() throws Exception { 63 | super.setUp(); 64 | baseParams.set("qt", "/tagXml"); 65 | baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); 66 | baseParams.set("xmlOffsetAdjust", "true"); 67 | } 68 | 69 | @Test 70 | public void test() throws Exception { 71 | buildNames("start end"); 72 | 73 | assertXmlTag("before start end after", true); 74 | assertXmlTag("before start
end after
", true); 75 | assertXmlTag("before start end after", true); 76 | assertXmlTag("before start end after", true); 77 | assertXmlTag("before start end after", true); 78 | assertXmlTag("before start end after", true);//adjacent tags 79 | assertXmlTag("before start end after", true); 80 | assertXmlTag("before start end after", true); 81 | 82 | assertXmlTag("

before start

end after
", false); 83 | assertXmlTag("before start

end after

", false); 84 | 85 | assertXmlTag("before start end after", true); 86 | } 87 | 88 | @Test(expected = SolrException.class) 89 | public void testInvalidXml() throws Exception { 90 | assertXmlTag("notXml", false); 91 | } 92 | 93 | @Test(expected = Exception.class) 94 | public void testValidatingXml() throws Exception { 95 | validateXml("foo"); 96 | } 97 | 98 | protected void assertXmlTag(String docText, boolean expected) throws Exception { 99 | final SolrQueryRequest req = reqDoc(docText); 100 | try { // 5.4 and beyond we can use try-with-resources 101 | final SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get("qt"), req); 102 | final TestTag[] testTags = pullTagsFromResponse(req, rsp); 103 | if (!expected) { 104 | assertEquals(0, testTags.length); 105 | } else { 106 | assertEquals(1, testTags.length); 107 | final TestTag tag = testTags[0]; 108 | validateXml(insertAnchorAtOffsets(docText, tag.startOffset, tag.endOffset, tag.docName)); 109 | } 110 | } finally { 111 | req.close(); 112 | } 113 | } 114 | 115 | protected void validateXml(String xml) throws Exception { 116 | // the "parse" method also validates XML, will throw an exception if mis-formatted 117 | xmlDocBuilder.parse(new InputSource(new StringReader(xml))); 118 | } 119 | 120 | 121 | @Test 122 | public void testLuceneHtmlFilterBehavior() { 123 | String docText; 124 | 125 | //Close tag adjacent to start & end results in end offset including the close tag. 
LUCENE-5734 126 | docText = "start end"; 127 | assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end")); 128 | 129 | //Space after "end" means offset doesn't include 130 | docText = "start end "; 131 | assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end")); 132 | 133 | //Matches entity at end 134 | final String endStr = String.format("en&#x%02x;", (int) 'd'); 135 | docText = "start " + endStr + ""; 136 | assertArrayEquals(tagExpect(docText, "start", endStr), analyzeTagOne(docText, "start", "end")); 137 | //... and at start 138 | final String startStr = String.format("&#x%02x;tart", (int) 's'); 139 | docText = "" + startStr + " end"; 140 | assertArrayEquals(tagExpect(docText, startStr, "end"), analyzeTagOne(docText, "start", "end")); 141 | 142 | //Test ignoring proc instructions & comments. Note: doesn't expand the entity to "start". 143 | docText = "" 145 | + "]>&start;"; 146 | assertArrayEquals(new int[]{-1, -1}, analyzeTagOne(docText, "start", "start")); 147 | 148 | //Test entity behavior 149 | docText = " — – & &foo;   a b"; 150 | assertArrayEquals(new String[]{"—", "–", "&", "&foo;", "\u00A0", "a", "b"}, 151 | analyzeReturnTokens(docText)); 152 | 153 | //Observe offset adjustment of trailing entity to end tag 154 | docText = "foo bar"; 155 | assertArrayEquals(tagExpect(docText, "foo", "foo"), analyzeTagOne(docText, "foo", "foo")); 156 | } 157 | 158 | private String insertAnchorAtOffsets(String docText, int startOffset, int endOffset, String id) { 159 | String insertStart = "";// (normally we'd escape id) 160 | String insertEnd = ""; 161 | return docText.substring(0, startOffset) 162 | + insertStart 163 | + docText.substring(startOffset, endOffset) 164 | + insertEnd 165 | + docText.substring(endOffset); 166 | } 167 | 168 | private int[] tagExpect(String docText, String start, String end) { 169 | return new int[]{docText.indexOf(start), docText.indexOf(end) + end.length()}; 170 | } 171 | 
172 | private int[] analyzeTagOne(String docText, String start, String end) { 173 | int[] result = {-1, -1}; 174 | 175 | Reader filter = new HTMLStripCharFilter(new StringReader(docText)); 176 | 177 | WhitespaceTokenizer ts = new WhitespaceTokenizer(); 178 | final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class); 179 | final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class); 180 | try { 181 | ts.setReader(filter); 182 | ts.reset(); 183 | while (ts.incrementToken()) { 184 | final String termString = termAttribute.toString(); 185 | if (termString.equals(start)) 186 | result[0] = offsetAttribute.startOffset(); 187 | if (termString.equals(end)) { 188 | result[1] = offsetAttribute.endOffset(); 189 | return result; 190 | } 191 | } 192 | ts.end(); 193 | } catch (IOException e) { 194 | throw new RuntimeException(e); 195 | } finally { 196 | IOUtils.closeQuietly(ts); 197 | } 198 | return result; 199 | } 200 | 201 | private String[] analyzeReturnTokens(String docText) { 202 | List result = new ArrayList<>(); 203 | 204 | Reader filter = new HTMLStripCharFilter(new StringReader(docText), 205 | Collections.singleton("unescaped")); 206 | WhitespaceTokenizer ts = new WhitespaceTokenizer(); 207 | final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class); 208 | try { 209 | ts.setReader(filter); 210 | ts.reset(); 211 | while (ts.incrementToken()) { 212 | result.add(termAttribute.toString()); 213 | } 214 | ts.end(); 215 | } catch (IOException e) { 216 | throw new RuntimeException(e); 217 | } finally { 218 | IOUtils.closeQuietly(ts); 219 | } 220 | return result.toArray(new String[result.size()]); 221 | } 222 | 223 | } 224 | -------------------------------------------------------------------------------- /src/test/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory: -------------------------------------------------------------------------------- 1 | 
org.opensextant.solrtexttagger.WordLengthTaggingFilterFactory -------------------------------------------------------------------------------- /src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 23 | 24 | 25 | 26 | 27 | 28 | 30 | 31 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /src/test/resources/solr/collection1/conf/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 23 | 24 | 25 | 26 | 27 | 28 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | id 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 87 | 88 | 89 | 90 | 91 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /src/test/resources/solr/collection1/conf/solrconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 24 | 25 | 27 | 28 | ${tests.luceneMatchVersion:LUCENE_CURRENT} 29 | ${solr.data.dir:} 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | name_tag:[* TO *] 43 | 44 | 45 | 46 | 47 | name_tag:[* TO *] 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | name_tag 57 | NOT name:(of the) 58 | 59 | 60 | 61 | 62 | 63 | name_tagStop 64 | 65 | 66 | 67 | 68 | name_tagPartial 69 | NOT name:(of the) 70 | 71 | 72 | 73 | 74 | name_tagXml 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- 
/src/test/resources/taggingattribute/collection1/conf/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 23 | 24 | 25 | 26 | 27 | 28 | 31 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | id 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 59 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /src/test/resources/taggingattribute/collection1/conf/solrconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 24 | 25 | 27 | 28 | ${tests.luceneMatchVersion:LUCENE_CURRENT} 29 | ${solr.data.dir:} 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | name_tag 43 | NOT name:(of the) 44 | 45 | 46 | 47 | --------------------------------------------------------------------------------