├── .gitignore
├── DiceRelevancyFeedback.iml
├── LICENSE
├── README.md
├── pom.xml
├── src
│   └── main
│       └── java
│           └── org
│               └── dice
│                   └── solrenhancements
│                       ├── JarVersion.java
│                       ├── relevancyfeedback
│                       │   ├── InterestingTerm.java
│                       │   ├── RFHelper.java
│                       │   ├── RFParams.java
│                       │   ├── RFQuery.java
│                       │   ├── RFResult.java
│                       │   ├── RFTerm.java
│                       │   ├── RelevancyFeedback.java
│                       │   └── RelevancyFeedbackHandler.java
│                       ├── tokenfilters
│                       │   ├── ConcatenateTokenFilter.java
│                       │   ├── ConcatenateTokenFilterFactory.java
│                       │   ├── ConstantTokenFilter.java
│                       │   ├── ConstantTokenFilterFactory.java
│                       │   ├── MeanPayloadTokenFilter.java
│                       │   ├── MeanPayloadTokenFilterFactory.java
│                       │   ├── PayloadQueryBoostTokenFilter.java
│                       │   ├── PayloadQueryBoostTokenFilterFactory.java
│                       │   ├── TypeEraseFilter.java
│                       │   └── TypeEraseFilterFactory.java
│                       └── unsupervisedfeedback
│                           ├── UnsupervisedFeedbackHandler.java
│                           ├── UnsupervisedFeedbackHelper.java
│                           └── UnsupervisedFeedbackParams.java
└── target
    └── DiceRelevancyFeedback-1.0.jar
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled class file
2 | *.class
3 |
4 | # Log file
5 | *.log
6 |
7 | # BlueJ files
8 | *.ctxt
9 |
10 | # Mobile Tools for Java (J2ME)
11 | .mtj.tmp/
12 |
13 | # Package Files #
14 | #*.jar
15 | *.war
16 | *.ear
17 | *.zip
18 | *.tar.gz
19 | *.rar
20 |
21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
22 | hs_err_pid*
23 |
24 | *.class
25 |
26 | #idea files
27 | **/.idea/workspace.xml
28 | **/.idea/tasks.xml
29 | .idea
30 |
31 | #target/*
32 | target/classes/
33 | target/maven-archiver/*
34 | target/maven-status/*
35 | target/generated-sources/
36 | target/generated-test-sources/
37 | target/surefire/
38 | target/test-classes/
39 |
--------------------------------------------------------------------------------
/DiceRelevancyFeedback.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Dice Relevancy Feedback
2 | ========================
3 |
4 | Dice.com's Solr plugins for performing personalized search and recommendations (via the relevancy feedback plugin), and conceptual / semantic search (via the unsupervised feedback plugin).
5 |
6 | ## Links
7 | * [Slides from the talk](https://www.slideshare.net/lucidworks/personalized-search-and-job-recommendations-simon-hughes-dicecom)
8 | * [Video of the Talk](https://www.youtube.com/watch?v=-uiQY2Zatjo&index=31&list=PLU6n9Voqu_1FMt0C-tVNFK0PBqWhTb2Nv)
9 |
10 | ## Building the Plugin
11 | A pre-built jar file can be found in the ```./target``` folder. The project also contains a Maven pom.xml file, which can be used to build it from source (e.g. by running ```mvn package``` from the project root).
12 |
13 | ## Supported Solr versions
14 | - Solr 5.4 (see branch)
15 | - Solr 6.3 (see branch) **also master**
16 | - Solr 7.0 (see branch) - also works in 7.1
17 |
18 | If there is a particular version of Solr you need this for, please create a GitHub issue and I'll see what I can do.
19 | To compile it manually for a specific version, update the versions of the Solr and Lucene libraries in the pom.xml file, use Maven to pull in those dependencies and build the plugins, then fix any compilation errors that arise.
20 |
21 | ## Importing into SOLR
22 | Please see the official Solr guidelines for registering plugins with Solr. This involves dropping the jar file into one of the folders that Solr checks for class and jar files on core reload.
23 |
24 | - [Solr Plugins](https://wiki.apache.org/solr/SolrPlugins)
25 | - [Adding custom plugins in Solr cloud](https://lucene.apache.org/solr/guide/6_6/adding-custom-plugins-in-solrcloud-mode.html)
26 |
27 | # Relevancy Feedback Plugin
28 | An **example request handler configuration** for the solrconfig.xml is shown below, with comments outlining the main parameters:
29 | ```xml
30 | <requestHandler name="/rf" class="org.dice.solrenhancements.relevancyfeedback.RelevancyFeedbackHandler">
31 |   <lst name="defaults">
32 |     <bool name="omitHeader">true</bool>
33 |     <str name="wt">json</str>
34 |     <bool name="indent">true</bool>
35 |
36 |     <!-- query parser to use for the optional user query passed in rf.q -->
37 |     <str name="rf.defType">lucene</str>
38 |
39 |     <!-- fields to return in the response -->
40 |     <str name="fl">jobTitle,skill,company</str>
41 |     <!-- fields to extract the top terms from when building the feedback query -->
42 |     <str name="rf.fl">skillFromSkill,extractTitles</str>
43 |
44 |     <!-- relative per-field boosts applied to the terms in the feedback query -->
45 |     <str name="rf.qf">skillFromSkill^3 extractTitles^4.5</str>
46 |     <!-- maximum number of terms to extract per field -->
47 |     <int name="rf.maxflqt">10</int>
48 |
49 |     <!-- number of documents to return -->
50 |     <int name="rows">10</int>
51 |     <!-- boost the extracted terms by their tf.idf scores -->
52 |     <bool name="rf.boost">true</bool>
53 |
54 |     <!-- give all fields equal weight in the generated query before the rf.qf boosts are applied -->
55 |     <bool name="rf.normflboosts">true</bool>
56 |
57 |     <!-- use log(tf) rather than the raw term frequency when weighting terms -->
58 |     <bool name="rf.logtf">true</bool>
59 |
60 |     <!-- minimum % of the extracted terms that a returned document must match -->
61 |     <str name="rf.mm">25%</str>
62 |
63 |     <!-- return the extracted terms and their boosts in the response -->
64 |     <str name="rf.interestingTerms">details</str>
65 |
66 |     <!-- settings applied to the user query (q) when combining relevancy
67 |          feedback with a regular search (personalized search) -->
68 |     <str name="df">title</str>
69 |     <str name="qf">company_text^0.01 title^12 skill^4 description^0.3</str>
70 |     <str name="pf">company_text^0.01 title^12 skill^4 description^0.6</str>
71 |
72 |     <!-- fields used to analyze the stream.head and stream.body content streams -->
73 |     <str name="stream.head.fl">title,title_syn</str>
74 |     <str name="stream.body.fl">extractSkills,extractTitles</str>
75 |
76 |     <!-- relative per-field boosts for terms extracted from the content streams -->
77 |     <str name="stream.qf">extractSkills^4.5 extractTitles^2.25 title^3.0 title_syn^3.0</str>
78 |   </lst>
79 | </requestHandler>
80 | ```
118 | #### Example Request
119 | [http://localhost:8983/solr/Jobs/rf?q=id:11f407d319d6cc707437fad874a097c0+id:a2fd2f2e34667d61fadcdcabfd359cf4&rows=10&df=title&fl=title,skills,geoCode,city,state&wt=json](http://localhost:8983/solr/Jobs/rf?q=id:11f407d319d6cc707437fad874a097c0+id:a2fd2f2e34667d61fadcdcabfd359cf4&rows=10&df=title&fl=title,skills,geoCode,city,state&wt=json)
120 |
121 | #### Example Response
122 | ```json
123 | {
124 | "match":{
125 | "numFound":2,
126 | "start":0,
127 | "docs":[
128 | {
129 | "id":"a2fd2f2e34667d61fadcdcabfd359cf4",
130 | "title":"Console AAA Sports Video Game Programmer.",
131 | "skills":["Sports Game Experience a plus.",
132 | "2-10 years plus Console AAA Video Game Programming Experience"],
133 | "geocode":"38.124447,-122.55051",
134 | "city":"Novato",
135 | "state":"CA"
136 | },
137 | {
138 | "id":"11f407d319d6cc707437fad874a097c0",
139 | "title":"Game Engineer - Creative and Flexible Work Environment!",
140 | "skills":["3D Math",
141 | "Unity3d",
142 | "C#",
143 | "3D Math - game programming",
144 | "game programming",
145 | "C++",
146 | "Java"],
147 | "geocode":"33.97331,-118.243614",
148 | "city":"Los Angeles",
149 | "state":"CA"
150 | }
151 | ]
152 | },
153 | "response":{
154 | "numFound":5333,
155 | "start":0,
156 | "docs":[
157 | {
158 | "title":"Software Design Engineer 3 (Game Developer)",
159 | "skills":["C#",
160 | "C++",
161 | "Unity"],
162 | "geocode":"47.683647,-122.12183",
163 | "city":"Redmond",
164 | "state":"WA"
165 | },
166 | {
167 | "title":"Game Server Engineer - MMO Mobile Gaming Start-Up!",
168 | "skills":["AWS",
169 | "Node.JS",
170 | "pubnub",
171 | "Websockets",
172 | "pubnub - Node.JS",
173 | "Vagrant",
174 | "Linux",
175 | "Git",
176 | "MongoDB",
177 | "Jenkins",
178 | "Docker"],
179 | "geocode":"37.777115,-122.41733",
180 | "city":"San Francisco",
181 | "state":"CA"
182 | },...
183 | ]
184 | }
185 | }
186 | ```
187 |
188 | # Unsupervised Feedback (Blind Feedback) Plugin
189 | An example request handler configuration for the solrconfig.xml is shown below, with comments outlining the main parameters:
190 | ```xml
191 | <requestHandler name="/ufselect" class="org.dice.solrenhancements.unsupervisedfeedback.UnsupervisedFeedbackHandler">
192 |   <lst name="defaults">
193 |     <bool name="omitHeader">true</bool>
194 |     <str name="wt">json</str>
195 |     <bool name="indent">true</bool>
196 |
197 |     <!-- settings for the initial user query -->
198 |     <str name="defType">edismax</str>
199 |     <str name="df">title</str>
200 |     <str name="qf">title^1.5 skills^1.25 description^1.1</str>
201 |     <str name="pf">title^3.0 skills^2.5 description^1.5</str>
202 |     <str name="mm">1</str>
203 |     <str name="q.op">OR</str>
204 |
205 |     <str name="fl">jobTitle,skills,company</str>
206 |     <int name="rows">30</int>
207 |
208 |     <!-- fields to extract the top terms from when building the feedback query -->
209 |     <str name="uf.fl">skillsFromskills,titleFromJobTitle</str>
210 |     <!-- maximum number of terms to extract per field -->
211 |     <int name="uf.maxflqt">50</int>
212 |     <!-- number of top ranked documents to mine for terms -->
213 |     <int name="uf.count">10</int>
214 |     <bool name="uf.boost">true</bool>
215 |
216 |     <!-- relative per-field boosts applied to the terms in the generated feedback query -->
217 |     <str name="uf.qf">skillsFromskills^4.5 titleFromJobTitle^6.0</str>
218 |
219 |     <!-- return the extracted terms and their boosts in the response -->
220 |     <str name="uf.interestingTerms">details</str>
221 |
222 |     <!-- give all fields equal weight before the uf.qf boosts are applied -->
223 |     <bool name="uf.normflboosts">true</bool>
224 |     <!-- use the raw term frequency rather than log(tf) -->
225 |     <bool name="uf.logtf">false</bool>
226 |   </lst>
227 | </requestHandler>
228 | ```
231 | #### Example Request
232 | [http://localhost:8983/solr/DiceJobsCP/ufselect?q=Machine+Learning+Engineer&start=0&rows=10&uf.logtf=false&fl=title,skills,geoCode,city,state&fq={!geofilt+sfield=jobEndecaGeoCode+d=48+pt=39.6955,-105.0841}&wt=json](http://localhost:8983/solr/DiceJobsCP/ufselect?q=Machine+Learning+Engineer&start=0&rows=10&uf.logtf=false&fl=title,skills,geoCode,city,state&fq={!geofilt+sfield=jobEndecaGeoCode+d=48+pt=39.6955,-105.0841}&wt=json)
233 |
234 | #### Example Response
235 | ```json
236 | {
237 | "match":
238 | {
239 | "numFound":7729,
240 | "start":0,
241 | "docs":[
242 | {
243 | "title":"NLP/Machine Learning Engineer",
244 | "skills":["Linux",
245 | "NLP (Natural Language Processing)",
246 | "SQL",
247 | "Bash",
248 | "Python",
249 | "ML (Machine Learning)",
250 | "JavaScript",
251 | "Java"],
252 | "geocode":"42.35819,-71.050674",
253 | "city":"Boston",
254 | "state":"MA"
255 | },
256 | {
257 | "title":"Machine Learning Engineer",
258 | "skills":["machine learning",
259 | "java",
260 | "scala"],
261 | "geocode":"47.60473,-122.32594",
262 | "city":"Seattle",
263 | "state":"WA"
264 | },
265 | {
266 | "title":"Machine Learning Engineer - REMOTE!",
267 | "skills":["Neo4j",
268 | "Hadoop",
269 | "gensim",
270 | "gensim - C++",
271 | "Java",
272 | "R",
273 | "MongoDB",
274 | "elastic search",
275 | "sci-kit learn",
276 | "Python",
277 | "C++"],
278 | "geocode":"37.777115,-122.41733",
279 | "city":"San Francisco",
280 | "state":"CA"
281 | },...
282 | ]
283 | }
284 | }
285 | ```
285 |
286 | ### Isn't this just the MLT Handler?
287 | While it is loosely based on the Solr MLT handler code and algorithm (essentially the Rocchio algorithm), there are some key differences in the algorithm design. The MLT handler takes the top k terms across all configured fields when constructing the MLT query. If you have a field with a broader vocabulary than the other fields, the average document frequency of a term in that field will be lower than in fields with smaller vocabularies. This means those terms will have high relative idf scores and will tend to dominate the top terms selected by the Solr MLT handler. Our request handler instead takes the top k terms per field. It also ensures that, no matter how many terms are matched per field (up to the configured limit), each field carries the same weight in the resulting query as all other fields, before the field-specific weights specified in the rf.qf parameter are applied (see the sketch below); this is the second problem with the Solr MLT handler that we address. We also provide a lot of extra functionality: passing in content streams, matching against multiple documents (more like 'THESE' as opposed to more like 'this'), and applying the boost query parser to the resulting RF query so that any arbitrary Solr boost can be applied (multiplicatively). Finally, we support the mm parameter, so you can require that returned documents match a minimum percentage of the top terms.
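
To make the per-field normalization concrete, here is a minimal, self-contained sketch of the idea (illustrative code, not the plugin's internals; the field names and tf.idf weights are made up): each field keeps only its own top k terms, and each field's term-weight vector is scaled to unit length, so a broad-vocabulary field cannot dominate the query before the rf.qf boosts are applied.

```java
import java.util.*;
import java.util.stream.Collectors;

public class PerFieldNormalizationSketch {

    // Select the top k terms for ONE field, then L2-normalize their tf.idf
    // weights so that every field contributes the same total weight to the query.
    static Map<String, Double> topKNormalized(Map<String, Double> tfIdfByTerm, int k) {
        Map<String, Double> topK = tfIdfByTerm.entrySet().stream()
                .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
                .limit(k)
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

        // length of this field's weight vector
        double norm = Math.sqrt(topK.values().stream().mapToDouble(w -> w * w).sum());

        Map<String, Double> normalized = new LinkedHashMap<>();
        topK.forEach((term, w) -> normalized.put(term, w / norm));
        return normalized;
    }

    public static void main(String[] args) {
        // two fields with very different vocabularies and raw tf.idf scales
        Map<String, Double> title = Map.of("java", 4.0, "engineer", 3.0);
        Map<String, Double> skills = Map.of("spring", 9.0, "hibernate", 7.0, "maven", 2.0);

        // after normalization, both fields' vectors have unit length, so the
        // rf.qf boosts alone decide each field's relative influence
        System.out.println(topKNormalized(title, 10));
        System.out.println(topKNormalized(skills, 10));
    }
}
```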
288 |
289 | ### Important Considerations When using for Personalized Search
290 | If you wish to use this to perform search personalization, as demonstrated in my Lucene Revolution 2017 talk, you need to pass in the user's current search query using the regular q parameter, while the information used to generate the Rocchio query is passed via the rf.q parameter (when using documents to generate the Rocchio query) or via the content stream parameters (rf.stream.head and rf.stream.body, which take strings of content). Note, however, that the boosts applied to the terms in the Rocchio query are not comparable in weight to those in your user query, due to the normalization the algorithm applies. So you will need to experiment with different rf.qf values until you find the right level of influence on your query, based on your search configuration. Also, given that the Rocchio query generated for each user is likely the same across the user's search session (depending on your use case, of course), a more efficient way of doing personalization is simply to use the RF handler to generate the Rocchio query once when the user logs in, cache this query, and then use it as a boost query (within your regular search request handler) for personalizing subsequent user searches. The handler returns the Rocchio query in the rf.query parameter of the response. If you want to use the handler just to get the query (and not execute the search), you can set the rows parameter to 0. You can also iterate over the set of 'interesting terms' returned by the algorithm, along with their weights, if you set rf.interestingTerms=details, and use this to build your boost query. A sketch of this caching pattern follows.
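
As a rough illustration of that pattern using SolrJ (the document ID and collection name are hypothetical; the rf.query response key follows the description above, but check the exact response structure and your regular handler's boost-query support against your own configuration):

```java
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;

public class PersonalizationSketch {
    public static void main(String[] args) throws Exception {
        HttpSolrClient client =
                new HttpSolrClient.Builder("http://localhost:8983/solr/Jobs").build();

        // 1. At login, ask the /rf handler for the Rocchio query only:
        //    rows=0 skips the search itself, and rf.interestingTerms=details
        //    also returns the extracted terms with their weights.
        SolrQuery rf = new SolrQuery("id:11f407d319d6cc707437fad874a097c0");
        rf.setRequestHandler("/rf");
        rf.setRows(0);
        rf.set("rf.interestingTerms", "details");
        QueryResponse rfResponse = client.query(rf);

        // the generated Rocchio query comes back under rf.query; cache it per user
        String rocchioQuery = (String) rfResponse.getResponse().get("rf.query");

        // 2. On each subsequent search, apply the cached query as a boost query
        //    (bq) within your regular (e.g. edismax) search request handler.
        SolrQuery search = new SolrQuery("machine learning engineer");
        search.set("bq", rocchioQuery);
        System.out.println(client.query(search).getResults().getNumFound());

        client.close();
    }
}
```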
291 |
292 | ### Potential Enhancements
293 | Aside from ensuring this works with more versions of Solr (please leave feedback as to which versions you want), there are a number of possible enhancements:
294 |
295 | - **Relevancy Feedback Handler** Allow the learning of negative terms from negative examples (if supplied - this needs a separate query parameter), implemented using negative boosting. Another enhancement would be to allow the max terms per field (rf.maxflqt) to be specified on a per-field basis, so that you can vary the maximum number of terms extracted by field.
296 | - **Unsupervised Feedback (Blind Feedback)** Use the *positional relevance model* detailed in this paper: http://dl.acm.org/citation.cfm?id=1835546. This uses only terms found near the query's terms in the document, as these are generally more relevant than using the whole document. The highlighter component can presumably be used as a reference to determine how to get this information from the postings list, or maybe even used directly to get this information.
297 |
298 | ### Contact Details
299 | If you have a feature request, please submit it to the issues list. That is also a good place to post questions, but you can also reach out to me at simon.hughes@dice.com if you don't hear back.
300 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>org.dice.relevancyfeedback</groupId>
8 |     <artifactId>DiceRelevancyFeedback</artifactId>
9 |     <version>1.0</version>
10 |     <packaging>jar</packaging>
11 |
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>com.google.guava</groupId>
15 |             <artifactId>guava</artifactId>
16 |             <version>12.0</version>
17 |         </dependency>
18 |
19 |         <!-- solr dependencies -->
20 |         <dependency>
21 |             <groupId>org.apache.solr</groupId>
22 |             <artifactId>solr-core</artifactId>
23 |             <version>6.3.0</version>
24 |         </dependency>
25 |
26 |         <dependency>
27 |             <groupId>org.apache.solr</groupId>
28 |             <artifactId>solr-solrj</artifactId>
29 |             <version>6.3.0</version>
30 |         </dependency>
31 |
32 |         <!-- lucene dependencies -->
33 |         <dependency>
34 |             <groupId>org.apache.lucene</groupId>
35 |             <artifactId>lucene-analyzers-common</artifactId>
36 |             <version>6.3.0</version>
37 |         </dependency>
38 |         <dependency>
39 |             <groupId>org.apache.lucene</groupId>
40 |             <artifactId>lucene-queryparser</artifactId>
41 |             <version>6.3.0</version>
42 |         </dependency>
43 |         <dependency>
44 |             <groupId>org.apache.lucene</groupId>
45 |             <artifactId>lucene-queries</artifactId>
46 |             <version>6.3.0</version>
47 |         </dependency>
48 |         <dependency>
49 |             <groupId>org.apache.lucene</groupId>
50 |             <artifactId>lucene-core</artifactId>
51 |             <version>6.3.0</version>
52 |         </dependency>
53 |         <dependency>
54 |             <groupId>org.json</groupId>
55 |             <artifactId>json</artifactId>
56 |             <version>20131018</version>
57 |         </dependency>
58 |
59 |         <dependency>
60 |             <groupId>junit</groupId>
61 |             <artifactId>junit</artifactId>
62 |             <version>4.11</version>
63 |             <scope>test</scope>
64 |         </dependency>
65 |     </dependencies>
66 | </project>
67 |
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/JarVersion.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements;
2 |
3 | import org.slf4j.Logger;
4 |
5 | import java.io.InputStream;
6 | import java.net.URL;
7 | import java.util.Enumeration;
8 |
9 | /**
10 | * Created by simon.hughes on 7/7/16.
11 | */
12 | public class JarVersion {
13 |
14 | private class stub{
15 |
16 | }
17 |
18 | public static String getVersion(Logger log){
19 |
20 | Enumeration<URL> resources;
21 | StringBuilder stringBuilder = new StringBuilder();
22 |
23 | try {
24 | resources = stub.class.getClassLoader().getResources("META-INF/MANIFEST.MF");
25 | while (resources.hasMoreElements()) {
26 | URL url = resources.nextElement();
27 | /* let's not read other jar's manifests */
28 | if (!url.toString().contains("DiceRelevancyFeedback")) {
29 | continue;
30 | }
31 | InputStream reader = url.openStream();
32 | while(reader.available() > 0) {
33 | char c = (char) reader.read();
34 | stringBuilder.append(c);
35 | /* skip lines that don't contain the built-date */
36 | if (stringBuilder.toString().contains(System.getProperty("line.separator")) &&
37 | !stringBuilder.toString().contains("Build-Time")) {
38 | stringBuilder.setLength(0);
39 | }
40 | }
41 | }
42 | } catch (Exception e) {
43 | log.warn("Failed to read manifest during request for version!");
44 | return "Error reading manifest!";
45 | }
46 | return stringBuilder.toString();
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/InterestingTerm.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | import org.apache.lucene.index.Term;
4 |
5 | import java.util.Comparator;
6 |
7 | /**
8 | * Created by simon.hughes on 9/2/14.
9 | */
10 | public class InterestingTerm
11 | {
12 | public Term term;
13 | public float boost;
14 |
15 | public static Comparator<InterestingTerm> BOOST_ORDER = new Comparator<InterestingTerm>() {
16 | @Override
17 | public int compare(InterestingTerm t1, InterestingTerm t2) {
18 | float d = t1.boost - t2.boost;
19 | if( d == 0 ) {
20 | return 0;
21 | }
22 | return (d>0)?-1:1;
23 | }
24 | };
25 | }
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RFHelper.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | /**
4 | * Created by simon.hughes on 9/2/14.
5 | */
6 |
7 | import org.apache.lucene.document.Document;
8 | import org.apache.lucene.index.IndexReader;
9 | import org.apache.lucene.index.Term;
10 | import org.apache.lucene.queries.function.BoostedQuery;
11 | import org.apache.lucene.queries.function.FunctionQuery;
12 | import org.apache.lucene.queries.function.ValueSource;
13 | import org.apache.lucene.queries.function.valuesource.QueryValueSource;
14 | import org.apache.lucene.search.*;
15 | import org.apache.solr.common.SolrException;
16 | import org.apache.solr.common.params.FacetParams;
17 | import org.apache.solr.common.params.SolrParams;
18 | import org.apache.solr.schema.SchemaField;
19 | import org.apache.solr.search.*;
20 | import org.apache.solr.util.SolrPluginUtils;
21 |
22 | import java.io.IOException;
23 | import java.io.Reader;
24 | import java.util.ArrayList;
25 | import java.util.List;
26 | import java.util.regex.Pattern;
27 |
28 | /**
29 | * Helper class for RelevancyFeedback that can be called from other request handlers
30 | */
31 | public class RFHelper
32 | {
33 | // Pattern is thread safe -- TODO? share this with general 'fl' param
34 | private static final Pattern splitList = Pattern.compile(",| ");
35 |
36 | final SolrIndexSearcher searcher;
37 | final QParser qParser;
38 | final RelevancyFeedback relevancyFeedback;
39 | final IndexReader reader;
40 | final SchemaField uniqueKeyField;
41 | final boolean needDocSet;
42 |
43 |
44 | public RFHelper(SolrParams params, SolrIndexSearcher searcher, SchemaField uniqueKeyField, QParser qParser )
45 | {
46 | this.searcher = searcher;
47 | this.qParser = qParser;
48 | this.reader = searcher.getIndexReader();
49 | this.uniqueKeyField = uniqueKeyField;
50 | this.needDocSet = params.getBool(FacetParams.FACET, false);
51 |
52 | SolrParams required = params.required();
53 | String[] fields = splitList.split(required.get(RFParams.SIMILARITY_FIELDS));
54 | if( fields.length < 1 ) {
55 | throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
56 | "RelevancyFeedback requires at least one similarity field: "+ RFParams.SIMILARITY_FIELDS );
57 | }
58 |
59 | this.relevancyFeedback = new RelevancyFeedback( reader );
60 | relevancyFeedback.setFieldNames(fields);
61 |
62 | final String flMustMatch = params.get(RFParams.FL_MUST_MATCH);
63 | if( flMustMatch != null && flMustMatch.trim().length() > 0 ) {
64 | String[] mustMatchFields = splitList.split(flMustMatch.trim());
65 | relevancyFeedback.setMatchFieldNames(mustMatchFields);
66 | }
67 |
68 | final String flMustNOTMatch = params.get(RFParams.FL_MUST_NOT_MATCH);
69 | if( flMustNOTMatch != null && flMustNOTMatch.trim().length() > 0 ) {
70 | String[] differntMatchFields = splitList.split(flMustNOTMatch.trim());
71 | relevancyFeedback.setDifferentFieldNames(differntMatchFields);
72 | }
73 |
74 | String[] payloadFields = getFieldList(RFParams.PAYLOAD_FIELDS, params);
75 | if(payloadFields != null){
76 | throw new RuntimeException("Payload fields are not currently supported");
77 | //relevancyFeedback.setPayloadFields(payloadFields);
78 | }
79 | relevancyFeedback.setAnalyzer( searcher.getSchema().getIndexAnalyzer() );
80 |
81 | // configurable params
82 |
83 | relevancyFeedback.setMm( params.get(RFParams.MM, RelevancyFeedback.DEFAULT_MM));
84 | relevancyFeedback.setMinTermFreq( params.getInt(RFParams.MIN_TERM_FREQ, RelevancyFeedback.DEFAULT_MIN_TERM_FREQ));
85 | relevancyFeedback.setMinDocFreq( params.getInt(RFParams.MIN_DOC_FREQ, RelevancyFeedback.DEFAULT_MIN_DOC_FREQ));
86 | relevancyFeedback.setMaxDocFreq( params.getInt(RFParams.MAX_DOC_FREQ, RelevancyFeedback.DEFAULT_MAX_DOC_FREQ));
87 | relevancyFeedback.setMinWordLen( params.getInt(RFParams.MIN_WORD_LEN, RelevancyFeedback.DEFAULT_MIN_WORD_LENGTH));
88 | relevancyFeedback.setMaxWordLen( params.getInt(RFParams.MAX_WORD_LEN, RelevancyFeedback.DEFAULT_MAX_WORD_LENGTH));
89 |
90 | relevancyFeedback.setBoost( params.getBool(RFParams.BOOST, true ) );
91 |
92 | // new parameters
93 | relevancyFeedback.setBoostFn(params.get(RFParams.BOOST_FN));
94 | relevancyFeedback.setNormalizeFieldBoosts(params.getBool(RFParams.NORMALIZE_FIELD_BOOSTS, RelevancyFeedback.DEFAULT_NORMALIZE_FIELD_BOOSTS));
95 | // new versions of previous parameters moved to the field level
96 | relevancyFeedback.setMaxQueryTermsPerField(params.getInt(RFParams.MAX_QUERY_TERMS_PER_FIELD, RelevancyFeedback.DEFAULT_MAX_QUERY_TERMS_PER_FIELD));
97 | relevancyFeedback.setMaxNumTokensParsedPerField(params.getInt(RFParams.MAX_NUM_TOKENS_PARSED_PER_FIELD, RelevancyFeedback.DEFAULT_MAX_NUM_TOKENS_PARSED_PER_FIELD));
98 | relevancyFeedback.setLogTf(params.getBool(RFParams.IS_LOG_TF, RelevancyFeedback.DEFAULT_IS_LOG_TF));
99 |
100 | relevancyFeedback.setBoostFields(SolrPluginUtils.parseFieldBoosts(params.getParams(RFParams.QF)));
101 | relevancyFeedback.setStreamBoostFields(SolrPluginUtils.parseFieldBoosts(params.getParams(RFParams.STREAM_QF)));
102 |
103 | String streamHead = params.get(RFParams.STREAM_HEAD);
104 | if(streamHead != null) {
105 | relevancyFeedback.setStreamHead(streamHead);
106 | }
107 |
108 | // Set stream fields
109 | String[] streamHeadFields = getFieldList(RFParams.STREAM_HEAD_FL, params);
110 | if(streamHeadFields != null){
111 | relevancyFeedback.setStreamHeadfieldNames(streamHeadFields);
112 | }
113 |
114 | String[] streamBodyFields = getFieldList(RFParams.STREAM_BODY_FL, params);
115 | if(streamBodyFields != null){
116 | relevancyFeedback.setStreamBodyfieldNames(streamBodyFields);
117 | }
118 | }
119 |
120 | private String[] getFieldList(String key, SolrParams params) {
121 | final String fieldList = params.get(key);
122 | if(fieldList != null && fieldList.trim().length() > 0) {
123 | String[] fields = splitList.split(fieldList);
124 | if(fields != null){
125 | return fields;
126 | }
127 | }
128 | return null;
129 | }
130 |
131 | private Query getBoostedFunctionQuery(Query q) throws SyntaxError{
132 |
133 | if (relevancyFeedback.getBoostFn() == null || relevancyFeedback.getBoostFn().trim().length() == 0) {
134 | return q;
135 | }
136 |
137 | Query boost = this.qParser.subQuery(relevancyFeedback.getBoostFn(), FunctionQParserPlugin.NAME).getQuery();
138 | ValueSource vs;
139 | if (boost instanceof FunctionQuery) {
140 | vs = ((FunctionQuery) boost).getValueSource();
141 | } else {
142 | vs = new QueryValueSource(boost, 1.0f);
143 | }
144 | return new BoostedQuery(q, vs);
145 | }
146 |
147 | public RFResult getMatchesFromDocs(DocIterator iterator, int start, int rows, List<Query> filters, int flags, Sort lsort, Query userQuery) throws IOException, SyntaxError
148 | {
149 | BooleanQuery.Builder qryBuilder = new BooleanQuery.Builder();
150 | List<Integer> ids = new ArrayList<Integer>();
151 |
152 | while(iterator.hasNext()) {
153 | int id = iterator.nextDoc();
154 | Document doc = reader.document(id);
155 | ids.add(id);
156 |
157 | // add exclusion filters to prevent matching seed documents
158 | TermQuery tq = new TermQuery(new Term(uniqueKeyField.getName(), uniqueKeyField.getType().storedToIndexed(doc.getField(uniqueKeyField.getName()))));
159 | qryBuilder.add(tq, BooleanClause.Occur.MUST_NOT);
160 | }
161 |
162 | RFQuery rfQuery = relevancyFeedback.like(ids);
163 |
164 | Query rawrfQuery = rfQuery.getOrQuery();
165 |
166 | if(rfQuery.getMustMatchQuery() != null){
167 |     filters.add(rfQuery.getMustMatchQuery());
168 | }
169 | if(rfQuery.getMustNOTMatchQuery() != null){
170 |     filters.add(rfQuery.getMustNOTMatchQuery());
171 | }
172 |
173 | Query boostedrfQuery = getBoostedFunctionQuery(rawrfQuery);
174 | qryBuilder.add(boostedrfQuery, BooleanClause.Occur.MUST);
175 |
176 | Query finalQuery = null;
177 |
178 | if(userQuery != null){
179 | // set user query as a MUST clause, and tack on RF query as a boosted OR (should)
180 | Query rfQuery = qryBuilder.build();
181 |
182 | BooleanQuery.Builder personalizedQryBuilder = new BooleanQuery.Builder();
183 | personalizedQryBuilder.add(userQuery, BooleanClause.Occur.MUST);
184 | personalizedQryBuilder.add(rfQuery, BooleanClause.Occur.SHOULD);
185 |
186 | finalQuery = personalizedQryBuilder.build();
187 | }
188 | else{
189 | finalQuery = qryBuilder.build();
190 | }
191 |
192 | DocListAndSet results = new DocListAndSet();
193 | if (this.needDocSet) {
194 | results = searcher.getDocListAndSet(finalQuery, filters, lsort, start, rows, flags);
195 | } else {
196 | results.docList = searcher.getDocList(finalQuery, filters, lsort, start, rows, flags);
197 | }
198 |
199 | return new RFResult(rfQuery.getRFTerms(), finalQuery, results);
200 | }
201 |
202 |
203 | public RFResult getMatchesFromContentSteam(Reader reader, int start, int rows, List<Query> filters, int flags, Sort lsort, Query userQuery) throws IOException, SyntaxError
204 | {
205 | RFQuery rfQuery = relevancyFeedback.like(reader);
206 | Query rawRFQuery = rfQuery.getOrQuery();
207 |
208 | if(rfQuery.getMustMatchQuery() != null || rfQuery.getMustNOTMatchQuery() != null){
209 |     throw new RuntimeException(
210 |         String.format("The %s and the %s parameters are not supported for content stream queries",
211 |             RFParams.FL_MUST_MATCH, RFParams.FL_MUST_NOT_MATCH));
212 | }
213 |
214 | Query boostedRFQuery = getBoostedFunctionQuery(rawRFQuery);
215 | Query finalQuery = boostedRFQuery;
216 | if(userQuery != null){
217 | // set user query as a MUST clause, and tack on RF query as a boosted OR (should)
218 | BooleanQuery.Builder personalizedQryBuilder = new BooleanQuery.Builder();
219 | personalizedQryBuilder.add(userQuery, BooleanClause.Occur.MUST);
220 | personalizedQryBuilder.add(boostedRFQuery, BooleanClause.Occur.SHOULD);
221 |
222 | finalQuery = personalizedQryBuilder.build();
223 | }
224 |
225 | DocListAndSet results = new DocListAndSet();
226 | if (this.needDocSet) {
227 | results = searcher.getDocListAndSet( finalQuery, filters, lsort, start, rows, flags);
228 | } else {
229 | results.docList = searcher.getDocList( finalQuery, filters, lsort, start, rows, flags);
230 | }
231 | return new RFResult(rfQuery.getRFTerms(), finalQuery, results);
232 | }
233 |
234 | public RelevancyFeedback getRelevancyFeedback()
235 | {
236 | return relevancyFeedback;
237 | }
238 | }
239 |
240 |
241 |
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RFParams.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | import org.apache.solr.search.QueryParsing;
4 |
5 | import java.util.Locale;
6 |
7 | /**
8 | * Created by simon.hughes on 9/4/14.
9 | */
10 | public interface RFParams {
11 | java.lang.String RF = "rf";
12 | java.lang.String PREFIX = "rf.";
13 | java.lang.String SIMILARITY_FIELDS = PREFIX + "fl";
14 | java.lang.String MIN_TERM_FREQ = PREFIX + "mintf";
15 | java.lang.String MAX_DOC_FREQ = PREFIX + "maxdf";
16 | java.lang.String MIN_DOC_FREQ = PREFIX + "mindf";
17 | java.lang.String MIN_WORD_LEN = PREFIX + "minwl";
18 | java.lang.String MAX_WORD_LEN = PREFIX + "maxwl";
19 | // don't clash with regular mm
20 | java.lang.String MM = PREFIX + "mm";
21 | //Changed from maxqt
22 | java.lang.String MAX_QUERY_TERMS_PER_FIELD = PREFIX + "maxflqt";
23 | //Changed from maxntp
24 | java.lang.String MAX_NUM_TOKENS_PARSED_PER_FIELD = PREFIX + "maxflntp";
25 | java.lang.String BOOST = PREFIX + "boost";
26 | java.lang.String FQ = PREFIX + "fq";
27 |
28 | java.lang.String QF = PREFIX + "qf";
29 |
30 | // allows user to specify a query, and we use the RF terms to boost that query
31 | java.lang.String RF_QUERY = PREFIX + "q";
32 | java.lang.String RF_DEFTYPE = PREFIX + QueryParsing.DEFTYPE;
33 |
34 | // new to this plugin
35 | java.lang.String FL_MUST_MATCH = PREFIX + "fl.match"; // list of fields that must match the target document
36 | java.lang.String FL_MUST_NOT_MATCH = PREFIX + "fl.different"; // list of fields that must NOT match the target document
37 |
38 | java.lang.String BOOST_FN = PREFIX + "boostfn";
39 | java.lang.String PAYLOAD_FIELDS = PREFIX + "payloadfl";
40 |
41 | // normalize field boosts
42 | java.lang.String NORMALIZE_FIELD_BOOSTS = PREFIX + "normflboosts";
43 | java.lang.String IS_LOG_TF = PREFIX + "logtf";
44 |
45 | java.lang.String STREAM_HEAD = "stream.head";
46 | java.lang.String STREAM_HEAD_FL = "stream.head.fl";
47 | java.lang.String STREAM_BODY_FL = "stream.body.fl";
48 |
49 | java.lang.String STREAM_QF = "stream.qf";
50 | // end new to this plugin
51 |
52 | // the /rf request handler uses 'rows'
53 | public final static String DOC_COUNT = PREFIX + "count";
54 |
55 | // Do you want to include the original document in the results or not
56 | public final static String MATCH_INCLUDE = PREFIX + "match.include";
57 |
58 | // If multiple docs are matched in the query, what offset do you want?
59 | public final static String MATCH_OFFSET = PREFIX + "match.offset";
60 |
61 | // Whether (and in what format) to return the interesting terms used to build the query
62 | public final static String INTERESTING_TERMS = PREFIX + "interestingTerms"; // false,details,(list or true)
63 |
64 | public enum TermStyle {
65 | NONE,
66 | LIST,
67 | DETAILS;
68 |
69 | public static TermStyle get( String p )
70 | {
71 | if( p != null ) {
72 | p = p.toUpperCase(Locale.ROOT);
73 | if( p.equals( "DETAILS" ) ) {
74 | return DETAILS;
75 | }
76 | else if( p.equals( "LIST" ) ) {
77 | return LIST;
78 | }
79 | }
80 | return NONE;
81 | }
82 | }
83 | }
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RFQuery.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | import org.apache.lucene.queries.payloads.AveragePayloadFunction;
4 | import org.apache.lucene.queries.payloads.PayloadScoreQuery;
5 | import org.apache.lucene.search.*;
6 | import org.apache.lucene.search.spans.SpanTermQuery;
7 | import org.apache.solr.util.SolrPluginUtils;
8 |
9 | import java.util.ArrayList;
10 | import java.util.List;
11 |
12 | /**
13 | * Created by simon.hughes on 11/25/14.
14 | */
15 | public class RFQuery {
16 |
17 | private final List<RFTerm> RFTerms;
18 | private final String mm;
19 | private BooleanQuery mustMatchQuery = null;
20 | private BooleanQuery mustNOTMatchQuery = null;
21 |
22 | public RFQuery(List<RFTerm> RFTerms, String mm){
23 |     this.RFTerms = RFTerms == null? new ArrayList<RFTerm>() : RFTerms;
24 | this.mm = mm;
25 | }
26 | public BooleanQuery getMustMatchQuery(){
27 | return this.mustMatchQuery;
28 | }
29 |
30 | public void setMustMatchQuery(BooleanQuery query){
31 | this.mustMatchQuery = query;
32 | }
33 |
34 | public Query getMustNOTMatchQuery(){
35 | return this.mustNOTMatchQuery;
36 | }
37 |
38 | public void setMustNOTMatchQuery(BooleanQuery query){
39 | this.mustNOTMatchQuery = query;
40 | }
41 |
42 | public List<RFTerm> getRFTerms(){
43 | return RFTerms;
44 | }
45 |
46 | public Query getOrQuery(){
47 | BooleanQuery.Builder qryBuilder = new BooleanQuery.Builder();
48 | for(RFTerm rfTerm : this.RFTerms){
49 |     qryBuilder.add(toBoostedQuery(rfTerm), BooleanClause.Occur.SHOULD);
50 | }
51 | SolrPluginUtils.setMinShouldMatch(qryBuilder, mm);
52 | return qryBuilder.build();
53 | }
54 |
55 | private Query toBoostedQuery(RFTerm rfTerm){
56 |     Query tq = toTermQuery(rfTerm);
57 |     return new BoostQuery(tq, rfTerm.getFinalScore());
58 | }
59 |
60 | private Query toTermQuery(RFTerm rfTerm) {
61 |     if(rfTerm.hasPayload()) {
62 |         return new PayloadScoreQuery(new SpanTermQuery(rfTerm.getTerm()), new AveragePayloadFunction(), false);
63 |     }
64 |     else{
65 |         return new TermQuery(rfTerm.getTerm());
66 | }
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RFResult.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | import org.apache.lucene.search.Query;
4 | import org.apache.solr.search.DocListAndSet;
5 |
6 | import java.util.List;
7 |
8 | /**
9 | * Created by simon.hughes on 1/6/17.
10 | */
11 | public class RFResult {
12 | private final List<RFTerm> RFTerms;
13 | private final Query finalRfQuery;
14 | private DocListAndSet results;
15 |
16 | public RFResult(List<RFTerm> RFTerms, Query finalRfQuery, DocListAndSet results){
17 | this.RFTerms = RFTerms;
18 | this.finalRfQuery = finalRfQuery;
19 | this.results = results;
20 | }
21 |
22 | public DocListAndSet getResults() {
23 | return results;
24 | }
25 |
26 | public List<RFTerm> getRFTerms(){
27 | return RFTerms;
28 | }
29 |
30 | public Query getQuery() {
31 | return finalRfQuery;
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RFTerm.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | import com.google.common.base.Strings;
4 | import org.apache.lucene.index.Term;
5 |
6 | import java.text.DecimalFormat;
7 | import java.util.Comparator;
8 |
9 | /**
10 | * Created by simon.hughes on 9/4/14.
11 | */
12 | public class RFTerm implements Comparable<RFTerm> {
13 |
14 | private final String word;
15 | private final String fieldName;
16 | private final float idf;
17 | private final int docFreq;
18 | private final float tf;
19 | private final float fieldBoost;
20 | private final float payload;
21 | private final static DecimalFormat format = new DecimalFormat("#0.00");
22 |
23 | private final static DecimalFormat intFormat = new DecimalFormat("##.##");
24 | private final boolean logTf;
25 | private final boolean hasPayload;
26 | private final boolean useBoost;
27 |
28 | private float vectorLength = 1.0f;
29 |
30 | // non-payload
31 | public RFTerm(String word, String fieldName, float tf, float idf, int docFreq, boolean logTf, float fieldBoost, boolean useBoost){
32 | this(word, fieldName, tf, idf, docFreq, logTf, fieldBoost, 1.0f, useBoost, false);
33 | }
34 |
35 | // with payload
36 | public RFTerm(String word, String fieldName, float tf, float idf, int docFreq, boolean logTf, float fieldBoost, float payload, boolean useBoost, boolean hasPayload){
37 |
38 | this.word = word;
39 | this.fieldName = fieldName;
40 | this.idf = idf;
41 | this.docFreq = docFreq;
42 | this.tf = tf;
43 | this.fieldBoost = fieldBoost;
44 | this.payload = payload;
45 | this.logTf = logTf;
46 | this.useBoost = useBoost;
47 | this.hasPayload = hasPayload;
48 | }
49 |
50 | public String getWord() {
51 | return word;
52 | }
53 |
54 | public String getFieldName() {
55 | return fieldName;
56 | }
57 |
58 | public float getIdf() {
59 | return idf;
60 | }
61 |
62 | public int getDocFreq() {
63 | return docFreq;
64 | }
65 |
66 | public float getTf() {
67 | return tf;
68 | }
69 |
70 | public float getPayload() {
71 | return payload;
72 | }
73 |
74 | public float getFieldBoost() { return fieldBoost; }
75 |
76 | private String padFloat(float f){
77 | String formatted = format.format(f);
78 | return Strings.padStart(formatted, 7, ' ');
79 | }
80 |
81 | private String padInt(float f){
82 | String formatted = intFormat.format(f);
83 | return Strings.padStart(formatted, 5, ' ');
84 | }
85 |
86 | public float getTermWeight(){
87 | if(this.hasPayload()){
88 | // for the payload, typically we want to include the TF but not the IDF. This is what is passed to the payload value
89 | return this.getPayload();
90 | }
91 | else {
92 | if(false == this.useBoost){
93 | return 1.0f;
94 | }
95 | float tfVal = this.tf;
96 | if (this.logTf) {
97 | tfVal = getLogTf();
98 | }
99 | return tfVal * this.idf;
100 | }
101 | }
102 |
103 | public float getNormalizedTermWeight(){
104 | return this.getTermWeight() / this.vectorLength;
105 | }
106 |
107 | private float getLogTf() {
108 | return (float) Math.log(this.tf + 1.0d);
109 | }
110 |
111 | public float getFinalScore(){
112 | return this.getFieldBoost() * this.getNormalizedTermWeight();
113 | }
114 |
115 | public String valuesToString(){
116 | StringBuilder sb = new StringBuilder();
117 | sb.append("score: ").append(padFloat(this.getFinalScore()));
118 | sb.append(" term wt: ").append(padFloat(this.getTermWeight()));
119 |
120 | if(this.useBoost) {
121 | if (this.logTf) {
122 | sb.append(" log(tf): ").append(padFloat(this.getLogTf()));
123 | } else {
124 | sb.append(" tf: ").append(padInt(this.getTf()));
125 | }
126 | sb.append(" df: ").append(padInt((this.getDocFreq())));
127 | sb.append(" idf: ").append(padFloat((this.getIdf())));
128 | }
129 | if(this.hasPayload())
130 | {
131 | sb.append(" pyld: ").append(padFloat((this.getPayload())));
132 | }
133 | sb.append(" fldBst: ").append(padFloat((this.getFieldBoost())));
134 | sb.append(" veclen: ").append(padFloat((this.vectorLength)));
135 | return sb.toString();
136 | }
137 |
138 | public static Comparator<RFTerm> FLD_BOOST_X_SCORE_ORDER = new Comparator<RFTerm>() {
139 | @Override
140 | public int compare(RFTerm t1, RFTerm t2) {
141 | float d = t2.getFinalScore() - t1.getFinalScore();
142 | if( d == 0 ) {
143 | return 0;
144 | }
145 | return (d>0)?1:-1;
146 | }
147 | };
148 |
149 | public int compareTo(RFTerm o) {
150 | return ((Float)o.getFinalScore()).compareTo(this.getFinalScore());
151 | }
152 |
153 | // used in debug info (relevancyFeedback.interestingTerms = details)
154 | public Term getTerm() {
155 | return new Term(this.getFieldName(), this.getWord());
156 | }
157 |
158 | public boolean hasPayload() {
159 | return hasPayload;
160 | }
161 |
162 | public void setVectorLength(float vectorLength) {
163 | this.vectorLength = vectorLength;
164 | }
165 | }
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RelevancyFeedback.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | /**
4 | * Created by simon.hughes on 9/2/14.
5 | */
6 | /**
7 | * Copyright 2004-2005 The Apache Software Foundation.
8 | *
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | */
21 |
22 | import org.apache.lucene.analysis.Analyzer;
23 | import org.apache.lucene.analysis.TokenStream;
24 | import org.apache.lucene.analysis.payloads.PayloadHelper;
25 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26 | import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
27 | import org.apache.lucene.document.Document;
28 | import org.apache.lucene.index.*;
29 | import org.apache.lucene.search.BooleanClause;
30 | import org.apache.lucene.search.BooleanQuery;
31 | import org.apache.lucene.search.TermQuery;
32 | import org.apache.lucene.search.similarities.ClassicSimilarity;
33 | import org.apache.lucene.search.similarities.TFIDFSimilarity;
34 | import org.apache.lucene.util.*;
35 | import org.apache.lucene.util.PriorityQueue;
36 |
37 | import java.io.IOException;
38 | import java.io.Reader;
39 | import java.io.StringReader;
40 | import java.util.*;
41 |
42 |
43 | /**
44 | * Generate "more like this" similarity queries.
45 | * Based on this mail:
46 | *
47 | * Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
48 | * Term frequencies can be computed by re-tokenizing the text, which, for a single document,
49 | * is usually fast enough. But looking up the docFreq() of every term in the document is
50 | * probably too slow.
51 | *
52 | * You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
53 | * or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
54 | * in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
55 | * reduce the number of terms under consideration. Another heuristic is that terms with a
56 | * high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the
57 | * number of characters, not selecting anything less than, e.g., six or seven characters.
58 | * With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
59 | * that do a pretty good job of characterizing a document.
60 | *
61 | * It all depends on what you're trying to do. If you're trying to eke out that last percent
62 | * of precision and recall regardless of computational difficulty so that you can win a TREC
63 | * competition, then the techniques I mention above are useless. But if you're trying to
64 | * provide a "more like this" button on a search results page that does a decent job and has
65 | * good performance, such techniques might be useful.
66 | *
67 | * An efficient, effective "more-like-this" query generator would be a great contribution, if
68 | * anyone's interested. I'd imagine that it would take a Reader or a String (the document's
69 | * text), an Analyzer, and return a set of representative terms using heuristics like those
70 | * above. The frequency and length thresholds could be parameters, etc.
71 | *
72 | * Doug
73 | *
74 | *
75 | * <h3>Initial Usage</h3>
76 | *
77 | * This class has lots of options to try to make it efficient and flexible.
78 | * The simplest possible usage is as follows. The bold
79 | * fragment is specific to this class.
80 | *
81 | * <pre>
82 | * IndexReader ir = ...
83 | * IndexSearcher is = ...
84 | *
85 | * RelevancyFeedback relevancyFeedback = new RelevancyFeedback(ir);
86 | * Reader target = ... // orig source of doc you want to find similarities to
87 | * Query query = relevancyFeedback.queryFromDocuments( target);
88 | *
89 | * Hits hits = is.search(query);
90 | * // now the usual iteration thru 'hits' - the only thing to watch for is to make sure
91 | * // you ignore the doc if it matches your 'target' document, as it should be similar to itself
92 | * </pre>
93 | *
94 | * Thus you:
95 | * <ol>
96 | * <li> do your normal, Lucene setup for searching,
97 | * <li> create a RelevancyFeedback,
98 | * <li> get the text of the doc you want to find similarities to
99 | * <li> then call one of the queryFromDocuments() calls to generate a similarity query
100 | * <li> call the searcher to find the similar docs
101 | * </ol>
102 | *
103 | * <h3>More Advanced Usage</h3>
104 | *
105 | * You may want to use {@link #setFieldNames setFieldNames(...)} so you can examine
106 | * multiple fields (e.g. body and title) for similarity.
107 | *
108 | * Depending on the size of your index and the size and makeup of your documents you
109 | * may want to call the other set methods to control how the similarity queries are
110 | * generated:
130 | * Changes: Mark Harwood 29/02/04
131 | * Some bugfixing, some refactoring, some optimisation.
132 | * - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
133 | * - bugfix: No significant terms being created for fields with a termvector - because
134 | * was only counting one occurrence per term/field pair in calculations(ie not including frequency info from TermVector)
135 | * - refactor: moved common code into isNoiseWord()
136 | * - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
137 | *
138 | */
139 | public final class RelevancyFeedback {
140 |
141 | /**
142 | * Default mm (minimum should match) value applied to the generated feedback query.
143 | *
144 | * @see #setMm(String)
145 | */
146 | public static final String DEFAULT_MM = "1";
147 |
148 | /**
149 | * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
150 | *
151 | * @see #getMaxNumTokensParsedPerField
152 | */
153 | public static final int DEFAULT_MAX_NUM_TOKENS_PARSED_PER_FIELD = 5000;
154 |
155 | /**
156 | * Ignore terms with less than this frequency in the source doc.
157 | *
158 | * @see #getMinTermFreq
159 | * @see #setMinTermFreq
160 | */
161 | public static final int DEFAULT_MIN_TERM_FREQ = 1;
162 |
163 | /**
164 | * Ignore words which do not occur in at least this many docs.
165 | *
166 | * @see #getMinDocFreq
167 | * @see #setMinDocFreq
168 | */
169 | public static final int DEFAULT_MIN_DOC_FREQ = 5;
170 |
171 | /**
172 | * Ignore words which occur in more than this many docs.
173 | *
174 | * @see #getMaxDocFreq
175 | * @see #setMaxDocFreq
176 | * @see #setMaxDocFreqPct
177 | */
178 | public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
179 |
180 | /**
181 | * Boost terms in query based on score.
182 | *
183 | * @see #isBoost
184 | * @see #setBoost
185 | */
186 | public static final boolean DEFAULT_BOOST = true;
187 |
188 | /**
189 | * Normalize field boosts
190 | *
191 | * @see #isNormalizeFieldBoosts
192 | * @see #setNormalizeFieldBoosts
193 | */
194 | public static final boolean DEFAULT_NORMALIZE_FIELD_BOOSTS = true;
195 |
196 | /**
197 | * Log the term frequency or use the raw frequency?
198 | *
199 | * @see #isLogTf
200 | * @see #setLogTf
201 | */
202 | public static final boolean DEFAULT_IS_LOG_TF = false;
203 |
204 | /**
205 | * Default field names ("contents"). If the field names are set to null, they are looked
206 | * up at runtime from the provided reader.
207 | */
208 | public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"};
209 |
210 | /**
211 | * Ignore words less than this length or if 0 then this has no effect.
212 | *
213 | * @see #getMinWordLen
214 | * @see #setMinWordLen
215 | */
216 | public static final int DEFAULT_MIN_WORD_LENGTH = 0;
217 |
218 | /**
219 | * Ignore words greater than this length or if 0 then this has no effect.
220 | *
221 | * @see #getMaxWordLen
222 | * @see #setMaxWordLen
223 | */
224 | public static final int DEFAULT_MAX_WORD_LENGTH = 0;
225 |
226 | /**
227 | * Default set of stopwords.
228 | * If null means to allow stop words.
229 | *
230 | * @see #setStopWords
231 | * @see #getStopWords
232 | */
233 | public static final Set<?> DEFAULT_STOP_WORDS = null;
234 |
235 | /**
236 | * Current set of stop words.
237 | */
238 | private Set<?> stopWords = DEFAULT_STOP_WORDS;
239 |
240 | /**
241 | * Return a Query with no more than this many terms.
242 | *
243 | * @see org.apache.lucene.search.BooleanQuery#getMaxClauseCount
244 | * @see #getMaxQueryTermsPerField
245 | * @see #setMaxQueryTermsPerField
246 | */
247 | public static final int DEFAULT_MAX_QUERY_TERMS_PER_FIELD = 100;
248 |
249 |
250 | /**
251 | * mm setting for RF query
252 | */
253 | private String mm = null;
254 |
255 | /**
256 | * Analyzer that will be used to parse the doc.
257 | */
258 | private Analyzer analyzer = null;
259 |
260 | /**
261 | * Ignore words less frequent than this.
262 | */
263 | private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
264 |
265 | /**
266 | * Ignore words which do not occur in at least this many docs.
267 | */
268 | private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
269 |
270 | /**
271 | * Ignore words which occur in more than this many docs.
272 | */
273 | private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
274 |
275 | /**
276 | * Should we apply a boost to the Query based on the scores?
277 | */
278 | private boolean boost = DEFAULT_BOOST;
279 |
280 | /**
281 | * Should we normalize the field boosts per field?
282 | */
283 | private boolean normalizeFieldBoosts = DEFAULT_NORMALIZE_FIELD_BOOSTS;
284 |
285 | /**
286 | * Should we log the term frequency or use the raw frequency?
287 | */
288 | private boolean isLogTf = DEFAULT_IS_LOG_TF;
289 |
290 | /**
291 | * Field names we'll analyze.
292 | */
293 | private String[] fieldNames = DEFAULT_FIELD_NAMES;
294 | private String[] matchFieldNames = new String[]{};
295 | private String[] differentFieldNames = new String[]{};
296 |
297 | private String streamHead = null;
298 |
299 | private String[] streamBodyfieldNames = new String[0];
300 | private String[] streamHeadfieldNames = new String[0];
301 |
302 | private HashSet<String> payloadFields = new HashSet<String>();
303 |
304 | private Map<String, Float> boostFields;
305 | private Map<String, Float> streamBoostFields;
306 |
307 |
308 | /**
309 | * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
310 | */
311 | private int maxNumTokensParsedPerField = DEFAULT_MAX_NUM_TOKENS_PARSED_PER_FIELD;
312 |
313 | /**
314 | * Ignore words if less than this len.
315 | */
316 | private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
317 |
318 | /**
319 | * Ignore words if greater than this len.
320 | */
321 | private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
322 |
323 | /**
324 | * Don't return a query longer than this.
325 | */
326 | private int maxQueryTermsPerField = DEFAULT_MAX_QUERY_TERMS_PER_FIELD;
327 |
328 | /**
329 | * For idf() calculations.
330 | */
331 | private TFIDFSimilarity similarity;
332 |
333 | /**
334 | * IndexReader to use
335 | */
336 | private final IndexReader ir;
337 |
338 |
339 |
340 | /**
341 | * Gets the value of the relevancyFeedback.mm parameter (mm for the RF query)
342 | *
343 | * @return - the minimum should match parameter string - follows the normal mm syntax
344 | * @see #setMm(String)
345 | **/
346 | public String getMm() {
347 | return this.mm;
348 | }
349 |
350 | /**
351 | * Sets the text for the relevancyFeedback.mm parameter (mm for the RF query)
352 | *
353 | * @param mm - minimum should match parameter string - follows the normal mm syntax
354 | * @see #getMm()
355 | **/
356 | public void setMm(String mm) {
357 | this.mm = mm;
358 | }
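// Illustrative values only (rf is a hypothetical RelevancyFeedback instance); mm follows
// Solr's standard minimum-should-match syntax:
//   rf.setMm("2");     // at least 2 of the generated query terms must match
//   rf.setMm("75%");   // at least 75% of the generated query terms must match
//   rf.setMm("3<90%"); // all terms required up to 3 terms, 90% of them beyond that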
359 |
360 |
361 | /**
362 | * Multiplicative boost function applied to the RF query
363 | **/
364 | private String boostFn = "";
365 |
366 | /**
367 | * Gets the text for the Multiplicative Boost Function
368 | *
369 | * @return the multiplicative boostFunction used in the RF query
370 | * @see #setBoostFn(String)
371 | **/
372 | public String getBoostFn() {
373 | return boostFn;
374 | }
375 |
376 | /**
377 | * Sets the text for the Multiplicative Boost Function
378 | * @param boostFn the multiplicative boost function used in the RF query
379 | * @see #getBoostFn()
380 | **/
381 | public void setBoostFn(String boostFn) {
382 | this.boostFn = boostFn;
383 | }
384 |
385 | /**
386 | * Constructor requiring an IndexReader.
387 | */
388 | public RelevancyFeedback(IndexReader ir) {
389 | this(ir, new ClassicSimilarity());
390 | }
391 |
392 | public RelevancyFeedback(IndexReader ir, TFIDFSimilarity sim) {
393 | this.ir = ir;
394 | this.similarity = sim;
395 |
396 | }
397 |
398 |
399 | public TFIDFSimilarity getSimilarity() {
400 | return similarity;
401 | }
402 |
403 | public void setSimilarity(TFIDFSimilarity similarity) {
404 | this.similarity = similarity;
405 | }
406 |
407 | /**
408 | * Returns the analyzer that will be used to parse the source doc. No analyzer
409 | * is set by default.
410 | *
411 | * @return the analyzer that will be used to parse the source doc.
412 | */
413 | public Analyzer getAnalyzer() {
414 | return analyzer;
415 | }
416 |
417 | /**
418 | * Sets the analyzer to use. An analyzer is not required for generating a query with the
419 | * {@link #like(List)} method; all other 'like' methods require an analyzer.
420 | *
421 | * @param analyzer the analyzer to use to tokenize text.
422 | */
423 | public void setAnalyzer(Analyzer analyzer) {
424 | this.analyzer = analyzer;
425 | }
426 |
427 | /**
428 | * Returns the frequency below which terms will be ignored in the source doc. The default
429 | * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
430 | *
431 | * @return the frequency below which terms will be ignored in the source doc.
432 | */
433 | public int getMinTermFreq() {
434 | return minTermFreq;
435 | }
436 |
437 | /**
438 | * Sets the frequency below which terms will be ignored in the source doc.
439 | *
440 | * @param minTermFreq the frequency below which terms will be ignored in the source doc.
441 | */
442 | public void setMinTermFreq(int minTermFreq) {
443 | this.minTermFreq = minTermFreq;
444 | }
445 |
446 | /**
447 | * Returns the frequency at which words will be ignored which do not occur in at least this
448 | * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}.
449 | *
450 | * @return the frequency at which words will be ignored which do not occur in at least this
451 | * many docs.
452 | */
453 | public int getMinDocFreq() {
454 | return minDocFreq;
455 | }
456 |
457 | /**
458 | * Sets the frequency at which words will be ignored which do not occur in at least this
459 | * many docs.
460 | *
461 | * @param minDocFreq the frequency at which words will be ignored which do not occur in at
462 | * least this many docs.
463 | */
464 | public void setMinDocFreq(int minDocFreq) {
465 | this.minDocFreq = minDocFreq;
466 | }
467 |
468 | /**
469 | * Returns the maximum document frequency at which words may still appear.
470 | * Words that appear in more than this many docs will be ignored. The default frequency is
471 | * {@link #DEFAULT_MAX_DOC_FREQ}.
472 | *
473 | * @return get the maximum frequency at which words are still allowed,
474 | * words which occur in more docs than this are ignored.
475 | */
476 | public int getMaxDocFreq() {
477 | return maxDocFreq;
478 | }
479 |
480 | /**
481 | * Set the maximum document frequency at which words may still appear. Words that appear
482 | * in more than this many docs will be ignored.
483 | *
484 | * @param maxFreq the maximum count of documents that a term may appear
485 | * in to be still considered relevant
486 | */
487 | public void setMaxDocFreq(int maxFreq) {
488 | this.maxDocFreq = maxFreq;
489 | }
490 |
491 | /**
492 | * Set the maximum percentage of documents in which words may still appear. Words that appear
493 | * in more than this many percent of all docs will be ignored.
494 | *
495 | * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear
496 | * in to be still considered relevant
497 | */
498 | public void setMaxDocFreqPct(int maxPercentage) {
499 | this.maxDocFreq = (int) (maxPercentage * (long) ir.numDocs() / 100); // long math avoids int overflow on very large indexes
500 | }
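// Worked example (hypothetical index size): with ir.numDocs() == 1,000,000,
// setMaxDocFreqPct(5) sets maxDocFreq to 50,000, so any term appearing in more than
// 50,000 documents is ignored as too common to discriminate between documents.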
501 |
502 | /**
503 | * Returns whether to boost terms in query based on "score" or not. The default is
504 | * {@link #DEFAULT_BOOST}.
505 | *
506 | * @return whether to boost terms in query based on "score" or not.
507 | * @see #setBoost
508 | */
509 | public boolean isBoost() {
510 | return boost;
511 | }
512 |
513 | /**
514 | * Sets whether to boost terms in query based on "score" or not.
515 | *
516 | * @param boost true to boost terms in query based on "score", false otherwise.
517 | * @see #isBoost
518 | */
519 | public void setBoost(boolean boost) {
520 | this.boost = boost;
521 | }
522 |
523 | /**
524 | * Returns whether to normalize the size of field level boosts across all field terms.
525 | * The default is {@link #DEFAULT_NORMALIZE_FIELD_BOOSTS}.
526 | *
527 | * @return whether to normalize field boosts to unit length, or not
528 | * @see #setNormalizeFieldBoosts(boolean)
529 | */
530 | public boolean isNormalizeFieldBoosts() {
531 | return normalizeFieldBoosts;
532 | }
533 |
534 | /**
535 | * Sets whether to normalize the size of field level boosts across all field terms or not
536 | *
537 | * @param normalizeFieldBoosts true to normalize field boosts to unit length, false otherwise.
538 | * @see #isNormalizeFieldBoosts
539 | */
540 | public void setNormalizeFieldBoosts(boolean normalizeFieldBoosts) {
541 | this.normalizeFieldBoosts = normalizeFieldBoosts;
542 | }
543 |
544 | /**
545 | * Returns whether to take the logarithm of the term frequency of the fields.
546 | * The default is {@link #DEFAULT_IS_LOG_TF}.
547 | *
548 | * @return whether to take the logarithm of the term frequency or not
549 | * @see #setLogTf(boolean)
550 | */
551 | public boolean isLogTf() {
552 | return isLogTf;
553 | }
554 |
555 | /**
556 | * Sets whether to log the term frequency of the fields
557 | *
558 | * @param isLogTf true to take the logarithm of the term frequency, false to use the raw frequency
559 | * @see #isLogTf
560 | */
561 | public void setLogTf(boolean isLogTf) {
562 | this.isLogTf = isLogTf;
563 | }
564 |
565 | /**
566 | * Returns the field names that will be used when generating the 'More Like This' query.
567 | * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}.
568 | *
569 | * @return the field names that will be used when generating the 'More Like This' query.
570 | */
571 | public String[] getFieldNames() {
572 | if (fieldNames == null) {
573 | // gather list of all valid fields from lucene, if none specified
574 | Collection<String> fields = MultiFields.getIndexedFields(ir);
575 | fieldNames = fields.toArray(new String[fields.size()]);
576 | }
577 |
578 | return fieldNames;
579 | }
580 |
581 | /**
582 | * Returns the field names that must be matched in the target document
583 | *
584 | * @return the field names that must be matched in the target document
585 | */
586 | public String[] getMatchFieldNames() {
587 | return matchFieldNames;
588 | }
589 |
590 | /**
591 | * Returns the field names that must NOT be matched in the target document
592 | *
593 | * @return the field names that must NOT be matched in the target document
594 | */
595 | public String[] getDifferentFieldNames() {
596 | return differentFieldNames;
597 | }
598 |
599 | /**
600 | * Sets the field names that will be used when generating the 'More Like This' query.
601 | * Set this to null for the field names to be determined at runtime from the IndexReader
602 | * provided in the constructor.
603 | *
604 | * @param fieldNames the field names that will be used when generating the 'More Like This'
605 | * query.
606 | */
607 | public void setFieldNames(String[] fieldNames) {
608 | this.fieldNames = fieldNames;
609 | }
610 |
611 | /**
612 | * Sets the field names that must match the target document in the RF query
613 | *
614 | * @param fieldNames the field names that will be used
615 | */
616 | public void setMatchFieldNames(String[] fieldNames) {
617 | this.matchFieldNames = fieldNames;
618 | }
619 |
620 | /**
621 | * Sets the field names that must NOT match the target document in the RF query
622 | *
623 | * @param fieldNames the field names that will be used
624 | */
625 | public void setDifferentFieldNames(String[] fieldNames) {
626 | this.differentFieldNames = fieldNames;
627 | }
628 |
629 | /**
630 | * Returns the field names for processing the stream body.
631 | *
632 | * @return the field names used when parsing terms from the stream.body parameter
633 | */
634 | public String[] getStreamBodyfieldNames() {
635 | if(streamBodyfieldNames.length == 0){
636 | // don't potentially return every field by calling the getter
637 | return fieldNames;
638 | }
639 | return streamBodyfieldNames;
640 | }
641 |
642 | /**
643 | * Sets the field names used for processing the stream body.
644 | *
645 | * @param streamBodyfieldNames the field names used when parsing terms from the stream.body parameter
646 | */
647 | public void setStreamBodyfieldNames(String[] streamBodyfieldNames) {
648 | this.streamBodyfieldNames = streamBodyfieldNames;
649 | }
650 |
651 | /**
652 | * Gets the field names used for processing the stream head.
653 | *
654 | * @return the field names used when parsing terms from the stream.head parameter
655 | */
656 | public String[] getStreamHeadfieldNames() {
657 | if(streamHeadfieldNames.length == 0){
658 | return fieldNames;
659 | }
660 | return streamHeadfieldNames;
661 | }
662 |
663 | /**
664 | * Sets the field names used for processing the stream.head parameter.
665 | *
666 | * @param streamHeadfieldNames the field names used when parsing terms from the stream.head parameter
667 | */
668 | public void setStreamHeadfieldNames(String[] streamHeadfieldNames) {
669 | this.streamHeadfieldNames = streamHeadfieldNames;
670 | }
671 |
672 | /**
673 | * Gets the stream.head value, if specified. This is a string to be parsed if the q parameter is null
674 | * (assumes a document stream as input from stream.body and optionally from stream.head)
675 | *
676 | * @return stream.head value
677 | */
678 | public String getStreamHead() {
679 | return streamHead;
680 | }
681 |
682 | /**
683 | * Sets the stream.head value, if specified. This is a string to be parsed if the q parameter is null
684 | * (assumes a document stream as input from stream.body and optionally from stream.head)
685 | *
686 | * @param streamHead stream.head value
687 | */
688 | public void setStreamHead(String streamHead) {
689 | this.streamHead = streamHead;
690 | }
691 |
692 |
693 | /**
694 | * Returns the minimum word length below which words will be ignored. Set this to 0 for no
695 | * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
696 | *
697 | * @return the minimum word length below which words will be ignored.
698 | */
699 | public int getMinWordLen() {
700 | return minWordLen;
701 | }
702 |
703 | /**
704 | * Sets the minimum word length below which words will be ignored.
705 | *
706 | * @param minWordLen the minimum word length below which words will be ignored.
707 | */
708 | public void setMinWordLen(int minWordLen) {
709 | this.minWordLen = minWordLen;
710 | }
711 |
712 | /**
713 | * Returns the maximum word length above which words will be ignored. Set this to 0 for no
714 | * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.
715 | *
716 | * @return the maximum word length above which words will be ignored.
717 | */
718 | public int getMaxWordLen() {
719 | return maxWordLen;
720 | }
721 |
722 | /**
723 | * Sets the maximum word length above which words will be ignored.
724 | *
725 | * @param maxWordLen the maximum word length above which words will be ignored.
726 | */
727 | public void setMaxWordLen(int maxWordLen) {
728 | this.maxWordLen = maxWordLen;
729 | }
730 |
731 | /**
732 | * Set the set of stopwords.
733 | * Any word in this set is considered "uninteresting" and ignored.
734 | * Even if your Analyzer allows stopwords, you might want to tell the RelevancyFeedback code to ignore them, as
735 | * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
736 | *
737 | * @param stopWords set of stopwords, if null it means to allow stop words
738 | * @see #getStopWords
739 | */
740 | public void setStopWords(Set<?> stopWords) {
741 | this.stopWords = stopWords;
742 | }
743 |
744 | /**
745 | * Get the current stop words being used.
746 | *
747 | * @see #setStopWords
748 | */
749 | public Set<?> getStopWords() {
750 | return stopWords;
751 | }
752 |
753 |
754 | /**
755 | * Returns the maximum number of query terms that will be included in any generated query.
756 | * The default is {@link #DEFAULT_MAX_QUERY_TERMS_PER_FIELD}.
757 | *
758 | * @return the maximum number of query terms that will be included in any generated query.
759 | */
760 | public int getMaxQueryTermsPerField() {
761 | return maxQueryTermsPerField;
762 | }
763 |
764 | /**
765 | * Sets the maximum number of query terms that will be included in any generated query.
766 | *
767 | * @param maxQueryTermsPerField the maximum number of query terms that will be included in any
768 | * generated query.
769 | */
770 | public void setMaxQueryTermsPerField(int maxQueryTermsPerField) {
771 | this.maxQueryTermsPerField = maxQueryTermsPerField;
772 | }
773 |
774 | /**
775 | * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
776 | * @see #DEFAULT_MAX_NUM_TOKENS_PARSED_PER_FIELD
777 | */
778 | public int getMaxNumTokensParsedPerField() {
779 | return maxNumTokensParsedPerField;
780 | }
781 |
782 | /**
783 | * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
784 | */
785 | public void setMaxNumTokensParsedPerField(int i) {
786 | maxNumTokensParsedPerField = i;
787 | }
788 |
789 | /**
790 | * Gets the field level boosts specified in the request
791 | *
792 | * @return The field level boosts specified in the request
793 | */
794 | public Map<String, Float> getBoostFields() {
795 | return this.boostFields;
796 | }
797 |
798 | private float getFieldBoost(String fieldName) {
799 | Float boost = this.boostFields.get(fieldName);
800 | return boost == null? 1.0f: boost;
801 | }
802 |
803 | private float getStreamFieldBoost(String fieldName) {
804 | Float streamBodyBoost = this.streamBoostFields.get(fieldName);
805 | if(streamBodyBoost == null)
806 | {
807 | streamBodyBoost = this.boostFields.get(fieldName);
808 | }
809 | return streamBodyBoost == null? 1.0f: streamBodyBoost;
810 | }
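// Illustrative fallback behaviour (field names and boost values are hypothetical):
//   streamBoostFields = {title=2.0}, boostFields = {title=1.5, skills=3.0}
//   getStreamFieldBoost("title")  -> 2.0f (stream-specific boost takes precedence)
//   getStreamFieldBoost("skills") -> 3.0f (falls back to the regular field boost)
//   getStreamFieldBoost("body")   -> 1.0f (no boost configured anywhere)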
811 |
812 | /**
813 | * Sets the field level boosts
814 | *
815 | * @param boostFields The field level boosts specified in the request
816 | */
817 | public void setBoostFields(Map<String, Float> boostFields) {
818 | this.boostFields = boostFields;
819 | }
820 |
821 | /**
822 | * Sets the field level boosts
823 | *
824 | * @param boostFields The field level boosts specified in the request
825 | */
826 | public void setStreamBoostFields(Map<String, Float> boostFields) {
827 | this.streamBoostFields = boostFields;
828 | }
829 |
830 | /**
831 | * Gets the payload fields, if specified
832 | *
833 | * @return array of payload fields
834 | */
835 | public String[] getPayloadFields() {
836 | String[] arr = new String[this.payloadFields.size()];
837 | return this.payloadFields.toArray(arr);
838 | }
839 |
840 | /**
841 | * Sets the payload fields. These fields use the stored payload value to apply a multiplicative boost to the term values
842 | *
843 | * @param payloadFields the array of payload field names
844 | */
845 | public void setPayloadFields(String[] payloadFields) {
846 | if(payloadFields == null) {
847 | return;
848 | }
849 | for(String fieldname: payloadFields){
850 | this.payloadFields.add(fieldname.trim().toLowerCase());
851 | }
852 | }
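// Note: names are trimmed and lower-cased here, and isPayloadField(String) applies the
// same normalisation, so the lookup is effectively case-insensitive. Hypothetical example:
// setPayloadFields(new String[]{" Skills "}) marks the field "skills" as payload-boosted.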
853 |
854 | /**
855 | * Return a query that will return docs like the passed lucene document IDs.
856 | *
857 | * @param docNums the document IDs of the lucene docs to generate the 'More Like This' query for.
858 | * @return a query that will return docs like the passed lucene document IDs.
859 | */
860 | public RFQuery like(List<Integer> docNums) throws IOException {
861 |
862 | Map<String, Map<String, Flt>> fieldTermFreq = new HashMap<String, Map<String, Flt>>();
863 | Map<String, Map<String, Flt>> mustMatchTerms = new HashMap<String, Map<String, Flt>>();
864 | Map<String, Map<String, Flt>> mustNOTMatchTerms = new HashMap<String, Map<String, Flt>>();
865 | // note: callers are expected to pass distinct document ids; duplicates are not filtered here
866 | for(Integer docNum: docNums){
867 | retrieveTerms(docNum, getFieldNames(), fieldTermFreq);
868 | retrieveTerms(docNum, getMatchFieldNames(), mustMatchTerms);
869 | retrieveTerms(docNum, getDifferentFieldNames(), mustNOTMatchTerms);
870 | }
871 |
872 | RFQuery rfResult = buildQueryFromFieldTermFrequencies(fieldTermFreq, false);
873 | if(mustMatchTerms.size() > 0){
874 | rfResult.setMustMatchQuery(buildMustMatchQuery(mustMatchTerms, true));
875 | }
876 | if(mustNOTMatchTerms.size() > 0){
877 | rfResult.setMustNOTMatchQuery(buildMustMatchQuery(mustNOTMatchTerms, false));
878 | }
879 | return rfResult;
880 | }
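// Minimal usage sketch (variable names and field names are hypothetical):
//   RelevancyFeedback rf = new RelevancyFeedback(indexReader);
//   rf.setAnalyzer(analyzer); // needed only for fields stored without term vectors
//   rf.setFieldNames(new String[]{"title", "skills"});
//   RFQuery rfQuery = rf.like(Arrays.asList(42, 99)); // seed from two lucene doc ids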
881 |
882 | /**
883 | * Return a query that will return docs like the passed Reader.
884 | *
885 | * @param reader a stream reader for the document stream (from the stream.body parameter)
886 | * @return a query that will return docs like the passed Reader.
887 | */
888 | public RFQuery like(Reader reader) throws IOException {
889 |
890 | return like(getStreamHeadfieldNames(), getStreamBodyfieldNames(), reader);
891 | }
892 |
893 | private RFQuery like(String[] streamHeadfields, String[] streamBodyfields, Reader reader) throws IOException {
894 |
895 | if(streamBodyfields == null){
896 | throw new UnsupportedOperationException(
897 | String.format("To use RelevancyFeedback to process a document stream, a field list must be specified "
898 | + "using either the %s parameter or the %s parameter",
899 | RFParams.SIMILARITY_FIELDS, RFParams.STREAM_BODY_FL));
900 | }
901 |
902 | Map<String, Map<String, Flt>> fieldTermFreq = new HashMap<String, Map<String, Flt>>();
903 | String streamBody = org.apache.commons.io.IOUtils.toString(reader);
904 | for(String fieldName: streamBodyfields){
905 | Map<String, Flt> words = new HashMap<String, Flt>();
906 | fieldTermFreq.put(fieldName, words);
907 | addTermWeights(new StringReader(streamBody), words, fieldName);
908 | }
909 | if(getStreamHead() != null){
910 | if(streamHeadfields == null){
911 | throw new UnsupportedOperationException(
912 | String.format("To use RelevancyFeedback to process a document stream using the stream.head as input,"
913 | +"a field list must be specified using either the %s parameter or the %s parameter",
914 | RFParams.SIMILARITY_FIELDS, RFParams.STREAM_HEAD_FL));
915 | }
916 | for(String fieldName: streamHeadfields){
917 | Map<String, Flt> words = null;
918 | if(fieldTermFreq.containsKey(fieldName)) {
919 | words = fieldTermFreq.get(fieldName);
920 | }
921 | else{
922 | words = new HashMap<String, Flt>();
923 | fieldTermFreq.put(fieldName, words);
924 | }
925 | addTermWeights(new StringReader(getStreamHead()), words, fieldName);
926 | }
927 | }
928 | return buildQueryFromFieldTermFrequencies(fieldTermFreq, true);
929 | }
930 |
931 | private RFQuery buildQueryFromFieldTermFrequencies(Map<String, Map<String, Flt>> fieldTermFreq, boolean contentStreamQuery) throws IOException {
932 |
933 | List<RFTerm> interestingTerms = new ArrayList<RFTerm>();
934 | for(String fieldName: fieldTermFreq.keySet()){
935 | Map<String, Flt> words = fieldTermFreq.get(fieldName);
936 | PriorityQueue<RFTerm> queue = createQueue(fieldName, words, contentStreamQuery);
937 | interestingTerms.addAll(getMostInterestingTerms(queue));
938 | }
939 |
940 | RFQuery rfResult = new RFQuery(interestingTerms, getMm());
941 | return rfResult;
942 | }
943 |
944 | /**
945 | * Compute the top most interesting terms from the priority queue of all RF Terms
946 | */
947 | private List<RFTerm> getMostInterestingTerms(PriorityQueue<RFTerm> q) {
948 |
949 | int maxTerms = (maxQueryTermsPerField <= 0) ? Integer.MAX_VALUE : maxQueryTermsPerField;
950 | double sumSquaredBoost = 0.0f;
951 |
952 | List<RFTerm> interestingTerms = new ArrayList<RFTerm>();
953 | RFTerm currentTerm = null;
954 | while ((currentTerm = q.pop()) != null
955 | && interestingTerms.size() < maxTerms) {
956 | // if not boost, then set score to 1.0 not tf.idf
957 | // now implemented inside RFTerm
958 |
959 | // if not boost, boostValue == 1.0, so this just adds 1 as desired
960 | sumSquaredBoost += Math.pow(currentTerm.getTermWeight(), 2);
961 | interestingTerms.add(currentTerm);
962 | }
963 |
964 | float vectorLength = (float) Math.sqrt(sumSquaredBoost);
965 | if(vectorLength <= 0.0){
966 | return new ArrayList<RFTerm>();
967 | }
968 |
969 | if(this.isNormalizeFieldBoosts()){
970 | for(RFTerm term: interestingTerms){
971 | term.setVectorLength(vectorLength);
972 | }
973 | }
974 | return interestingTerms;
975 | }
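// Normalisation sketch: if a field's terms carry weights {3.0, 4.0}, the vector length is
// sqrt(3^2 + 4^2) = 5.0; with normalizeFieldBoosts enabled each RFTerm is handed that
// length so its weight can be scaled to the unit vector ({0.6, 0.8} here), preventing
// fields with many terms from dominating fields with few.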
976 |
977 | /**
978 | * Create a PriorityQueue from a word->tf map.
979 | *
980 | * @param words a map of words keyed on the word (String) with Flt objects as the values.
981 | */
982 | private PriorityQueue<RFTerm> createQueue(String fieldName, Map<String, Flt> words, boolean contentStreamQuery) throws IOException {
983 | // have collected all words in doc and their freqs
984 | int numDocs = ir.numDocs();
985 | FreqQ res = new FreqQ(words.size()); // will order words by score
986 |
987 | for (String word : words.keySet()) { // for every word
988 | if(word.trim().length() == 0)
989 | {
990 | continue;
991 | }
992 |
993 | float tf = words.get(word).x; // term freq in the source doc
994 |
995 | if (minTermFreq > 0 && tf < minTermFreq) {
996 | continue; // filter out words that don't occur enough times in the source
997 | }
998 |
999 | int docFreq = ir.docFreq(new Term(fieldName, word));
1000 | if (minDocFreq > 0 && docFreq < minDocFreq) {
1001 | continue; // filter out words that don't occur in enough docs
1002 | }
1003 |
1005 | if (docFreq > maxDocFreq) {
1006 | continue; // filter out words that occur in too many docs
1007 | }
1008 |
1009 | float idf = similarity.idf(docFreq, numDocs);
1010 | final float fieldBoost = contentStreamQuery? this.getStreamFieldBoost(fieldName): this.getFieldBoost(fieldName);
1011 | final RFTerm rfTerm;
1012 | if(isPayloadField(fieldName)){
1013 | rfTerm = new RFTerm(
1014 | word, // the word
1015 | fieldName, // the field name
1016 | tf, // tf
1017 | idf, // idf
1018 | docFreq, // freq in all docs
1019 | isLogTf(),
1020 | fieldBoost,
1021 | tf, // this is the payload score if a payload field. Code could better reflect this admittedly
1022 | this.boost,
1023 | true
1024 | );
1025 | }
1026 | else{
1027 | rfTerm = new RFTerm(
1028 | word, // the word
1029 | fieldName, // the field name
1030 | tf, // tf
1031 | idf, // idf
1032 | docFreq, // freq in all docs
1033 | this.isLogTf(),
1034 | fieldBoost,
1035 | this.boost
1036 | );
1037 | }
1038 | res.insertWithOverflow(rfTerm);
1039 | }
1040 | return res;
1041 | }
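// Filter walk-through (hypothetical numbers): for the word "hadoop" with tf=2 in the
// source doc, docFreq=40 in a 1,000-doc index, and the defaults minTermFreq=1,
// minDocFreq=5, maxDocFreq=Integer.MAX_VALUE, every filter passes and the term is
// queued with a weight derived from tf and idf = similarity.idf(40, 1000) (see RFTerm).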
1042 |
1043 | private BooleanQuery buildMustMatchQuery(Map<String, Map<String, Flt>> fieldValues, boolean mustMatch){
1044 | BooleanQuery.Builder qryBuilder = new BooleanQuery.Builder();
1045 | for(Map.Entry<String, Map<String, Flt>> entry: fieldValues.entrySet()){
1046 | String fieldName = entry.getKey();
1047 | for(Map.Entry<String, Flt> fieldValue: entry.getValue().entrySet()){
1048 | String value = fieldValue.getKey();
1049 | TermQuery tq = new TermQuery(new Term(fieldName, value));
1050 | if(mustMatch) {
1051 | qryBuilder.add(tq, BooleanClause.Occur.MUST);
1052 | }
1053 | else{
1054 | qryBuilder.add(tq, BooleanClause.Occur.MUST_NOT);
1055 | }
1056 | }
1057 | }
1058 | return qryBuilder.build();
1059 | }
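// Example result (hypothetical field/value): for fieldValues = {country={US}},
// mustMatch=true builds the BooleanQuery "+country:US" while mustMatch=false builds
// "-country:US", forcing candidates to share (or differ in) that field value.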
1060 |
1061 | /**
1062 | * Describe the parameters that control how the "more like this" query is formed.
1063 | */
1064 | public String describeParams() {
1065 | StringBuilder sb = new StringBuilder();
1066 | sb.append("\t").append("maxQueryTermsPerField : ").append(maxQueryTermsPerField).append("\n");
1067 | sb.append("\t").append("minWordLen : ").append(minWordLen).append("\n");
1068 | sb.append("\t").append("maxWordLen : ").append(maxWordLen).append("\n");
1069 | sb.append("\t").append("fieldNames : ");
1070 | String delim = "";
1071 | for (String fieldName : getFieldNames()) {
1072 | sb.append(delim).append(fieldName);
1073 | delim = ", ";
1074 | }
1075 | sb.append("\n");
1076 | sb.append("\t").append("boost : ").append(boost).append("\n");
1077 | sb.append("\t").append("minTermFreq : ").append(minTermFreq).append("\n");
1078 | sb.append("\t").append("minDocFreq : ").append(minDocFreq).append("\n");
1079 | return sb.toString();
1080 | }
1081 |
1082 | /**
1083 | * Find words for a more-like-this query former.
1084 | *
1085 | * @param docNum the id of the lucene document from which to find terms
1086 | * @param fields the list of field of the lucene document from which to extract terms
1087 | * @param fieldToTermFreqMap data structure to populate with term frequencies
1088 | */
1089 | public Map<String, Map<String, Flt>> retrieveTerms(int docNum, String[] fields, Map<String, Map<String, Flt>> fieldToTermFreqMap) throws IOException {
1090 |
1091 | if(fieldToTermFreqMap == null) {
1092 | fieldToTermFreqMap = new HashMap<String, Map<String, Flt>>();
1093 | }
1094 |
1095 | if(fields == null || fields.length == 0){
1096 | return fieldToTermFreqMap;
1097 | }
1098 |
1099 | final Fields vectors = ir.getTermVectors(docNum);
1100 | final Document document = ir.document(docNum);
1101 |
1102 | for (String fieldName : fields) {
1103 |
1104 | Map<String, Flt> termFreqMap = null;
1105 | if(fieldToTermFreqMap.containsKey(fieldName)){
1106 | termFreqMap = fieldToTermFreqMap.get(fieldName);
1107 | }
1108 | else{
1109 | termFreqMap = new HashMap<String, Flt>();
1110 | fieldToTermFreqMap.put(fieldName, termFreqMap);
1111 | }
1112 |
1113 | Terms vector = null;
1114 | if (vectors != null) {
1115 | vector = vectors.terms(fieldName);
1116 | }
1117 |
1118 | // field does not store term vector info
1119 | // even if term vectors enabled, need to extract payload from regular field reader
1120 | if (vector == null || isPayloadField(fieldName)) {
1121 | IndexableField docFields[] = document.getFields(fieldName);
1122 | for (IndexableField field : docFields) {
1123 | final String stringValue = field.stringValue();
1124 | if (stringValue != null) {
1125 | addTermWeights(new StringReader(stringValue), termFreqMap, fieldName);
1126 | }
1127 | }
1128 | } else {
1129 | addTermWeights(termFreqMap, vector);
1130 | }
1131 | }
1132 |
1133 | return fieldToTermFreqMap;
1134 | }
1135 |
1136 | /**
1137 | * Adds terms and frequencies found in vector into the Map termWeightMap
1138 | *
1139 | * @param termWeightMap a Map of terms and their weights
1140 | * @param vector List of terms and their weights for a doc/field
1141 | */
1142 | private void addTermWeights(Map<String, Flt> termWeightMap, Terms vector) throws IOException {
1143 | final TermsEnum termsEnum = vector.iterator();
1144 | CharsRefBuilder spare = new CharsRefBuilder();
1145 | BytesRef text;
1146 | while((text = termsEnum.next()) != null) {
1147 | spare.copyUTF8Bytes(text);
1148 | final String term = spare.toString();
1149 | if (isNoiseWord(term)) {
1150 | continue;
1151 | }
1152 | final int freq = (int) termsEnum.totalTermFreq();
1153 |
1154 | //TODO try this
1155 | //termsEnum.docsAndPositions(.....).getPayload()
1156 |
1157 | // increment frequency
1158 | Flt cnt = termWeightMap.get(term);
1159 | if (cnt == null) {
1160 | termWeightMap.put(term, new Flt(freq));
1161 | } else {
1162 | cnt.x += freq;
1163 | }
1164 | }
1165 | }
1166 |
1167 | /**
1168 | * Adds term weights found by tokenizing text from reader into the Map words
1169 | *
1170 | * @param reader a source of text to be tokenized
1171 | * @param termWeightMap a Map of terms and their weights
1172 | * @param fieldName Used by analyzer for any special per-field analysis
1173 | */
1174 | private void addTermWeights(Reader reader, Map<String, Flt> termWeightMap, String fieldName)
1175 | throws IOException {
1176 | if (analyzer == null) {
1177 | throw new UnsupportedOperationException("To use RelevancyFeedback without " +
1178 | "term vectors, you must provide an Analyzer");
1179 | }
1180 |
1181 | TokenStream ts = analyzer.tokenStream(fieldName, reader);
1182 | try {
1183 | int tokenCount = 0;
1184 | // for every token
1185 | CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
1186 | PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
1187 |
1188 | ts.reset();
1189 | while (ts.incrementToken()) {
1190 | String word = termAtt.toString();
1191 | tokenCount++;
1192 | if (tokenCount > maxNumTokensParsedPerField) {
1193 | break;
1194 | }
1195 | if(word.trim().length() == 0){
1196 | continue;
1197 | }
1198 | if (isNoiseWord(word)) {
1199 | continue;
1200 | }
1201 |
1202 | BytesRef payload = payloadAttr.getPayload();
1203 | float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
1204 | if(isPayloadField(fieldName) && payload != null){
1205 | tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
1206 | }
1207 | // increment frequency
1208 | Flt termWeight = termWeightMap.get(word);
1209 | if (termWeight == null) {
1210 | termWeightMap.put(word, new Flt(tokenWeight));
1211 | } else {
1212 | termWeight.x += tokenWeight;
1213 | }
1214 | }
1215 | ts.end();
1216 | } finally {
1217 | IOUtils.closeWhileHandlingException(ts);
1218 | }
1219 | }
1220 |
1221 | /**
1222 | * determines if the passed term is likely to be of interest in "more like this" comparisons
1223 | *
1224 | * @param term The word being considered
1225 | * @return true if should be ignored, false if should be used in further analysis
1226 | */
1227 | private boolean isNoiseWord(String term) {
1228 | int len = term.length();
1229 | if (minWordLen > 0 && len < minWordLen) {
1230 | return true;
1231 | }
1232 | if (maxWordLen > 0 && len > maxWordLen) {
1233 | return true;
1234 | }
1235 | return stopWords != null && stopWords.contains(term);
1236 | }
1237 |
1238 | private boolean isPayloadField(String fieldName){
1239 | return this.payloadFields.contains(fieldName.trim().toLowerCase());
1240 | }
1241 |
1242 | /**
1243 | * PriorityQueue that orders words by score.
1244 | */
1245 | private static class FreqQ extends PriorityQueue<RFTerm> {
1246 | FreqQ(int s) {
1247 | super(s);
1248 | }
1249 |
1250 | @Override
1251 | protected boolean lessThan(RFTerm aa, RFTerm bb) {
1252 | return aa.getFinalScore() > bb.getFinalScore();
1253 | }
1254 | }
1255 |
1256 | /**
1257 | * Used for term frequencies/weights; avoids repeatedly allocating boxed values.
1258 | */
1259 |
1260 | private static class Flt {
1261 | float x;
1262 |
1263 | Flt(float x) {
1264 | this.x = x;
1265 | }
1266 | }
1267 | }
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RelevancyFeedbackHandler.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.dice.solrenhancements.relevancyfeedback;
19 |
20 | import com.google.common.base.Strings;
21 | import org.apache.lucene.search.Query;
22 | import org.apache.solr.common.SolrException;
23 | import org.apache.solr.common.params.*;
24 | import org.apache.solr.common.util.ContentStream;
25 | import org.apache.solr.common.util.NamedList;
26 | import org.apache.solr.handler.RequestHandlerBase;
27 | import org.apache.solr.handler.component.FacetComponent;
28 | import org.apache.solr.request.SimpleFacets;
29 | import org.apache.solr.request.SolrQueryRequest;
30 | import org.apache.solr.response.SolrQueryResponse;
31 | import org.apache.solr.schema.SchemaField;
32 | import org.apache.solr.search.*;
33 | import org.apache.solr.util.SolrPluginUtils;
34 | import org.dice.solrenhancements.JarVersion;
35 |
36 | import org.slf4j.Logger;
37 | import org.slf4j.LoggerFactory;
38 |
39 | import java.io.IOException;
40 | import java.io.Reader;
41 | import java.net.MalformedURLException;
42 | import java.net.URL;
43 | import java.util.*;
44 |
45 | /**
46 | * Solr RelevancyFeedback --
47 | *
48 | * Return similar documents either based on a single document or based on posted text.
49 | *
50 | * @since solr 1.3
51 | */
52 | public class RelevancyFeedbackHandler extends RequestHandlerBase
53 | {
54 | private final static String EDISMAX = ExtendedDismaxQParserPlugin.NAME;
55 | private String version = null;
56 |
57 | private static final Logger log = LoggerFactory.getLogger( RelevancyFeedbackHandler.class );
58 |
59 |
60 | @Override
61 | public void init(NamedList args) {
62 | super.init(args);
63 | }
64 |
65 | @Override
66 | public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception
67 | {
68 | // set and override parameters
69 | SolrIndexSearcher searcher = req.getSearcher();
70 | SchemaField uniqueKeyField = searcher.getSchema().getUniqueKeyField();
71 | ModifiableSolrParams params = new ModifiableSolrParams(req.getParams());
72 | configureSolrParameters(req, params, uniqueKeyField.getName());
73 |
74 | // Set field flags
75 | ReturnFields returnFields = new SolrReturnFields( req );
76 | rsp.setReturnFields( returnFields );
77 | int flags = 0;
78 | if (returnFields.wantsScore()) {
79 | flags |= SolrIndexSearcher.GET_SCORES;
80 | }
81 | // note: set in configureSolrParameters
82 | String userQdefType = params.get(QueryParsing.DEFTYPE, EDISMAX);
83 | String rfDefType = params.get(RFParams.RF_DEFTYPE, EDISMAX);
84 |
85 | String userQ = params.get( CommonParams.Q );
86 | String rfQ = params.get(RFParams.RF_QUERY);
87 |
88 | Query rfQuery = null;
89 | Query userQuery = null;
90 |
91 | SortSpec sortSpec = null;
92 | QParser rfQueryParser = null;
93 | QParser userQueryParser = null;
94 |
95 | List<Query> targetFqFilters = null;
96 | List<Query> rfFqFilters = null;
97 |
98 | try {
99 | if (rfQ != null) {
100 | rfQueryParser = QParser.getParser(rfQ, rfDefType, req);
101 | rfQuery = rfQueryParser.getQuery();
102 | sortSpec = rfQueryParser.getSort(true);
103 | }
104 | else{
105 | rfQueryParser = QParser.getParser(null, rfDefType, req);
106 | sortSpec = rfQueryParser.getSort(true);
107 | }
108 |
109 | targetFqFilters = getFilters(req, CommonParams.FQ);
110 | rfFqFilters = getFilters(req, RFParams.FQ);
111 | } catch (SyntaxError e) {
112 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
113 | }
114 |
115 | try {
116 | if (userQ != null) {
117 | userQueryParser = QParser.getParser(userQ, userQdefType, req);
118 | userQuery = userQueryParser.getQuery();
119 | }
120 |
121 | } catch (SyntaxError e) {
122 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
123 | }
124 |
125 | RFHelper rfhelper = new RFHelper( params, searcher, uniqueKeyField, rfQueryParser );
126 |
127 | // Hold on to the interesting terms if relevant
128 | RFParams.TermStyle termStyle = RFParams.TermStyle.get(params.get(RFParams.INTERESTING_TERMS));
129 |
130 | RFResult RFResult = null;
131 | DocListAndSet rfDocs = null;
132 |
133 | // Parse Required Params
134 | // This will either have a single Reader or valid query
135 | Reader reader = null;
136 | try {
137 | int start = params.getInt(CommonParams.START, 0);
138 | int rows = params.getInt(CommonParams.ROWS, 10);
139 |
140 | // for use when passed a content stream
141 | if (rfQ == null || rfQ.trim().length() < 1) {
142 | reader = getContentStreamReader(req, reader);
143 | }
144 | // Find documents RelevancyFeedback - either with a reader or a query
145 | // --------------------------------------------------------------------------------
146 | if (reader != null) {
147 | // this will only be initialized if used with a content stream (see above)
148 | rfQ = "NULL - from content stream";
149 | RFResult = rfhelper.getMatchesFromContentSteam(reader, start, rows, rfFqFilters, flags, sortSpec.getSort(), userQuery);
150 | } else if (rfQ != null) {
151 | // Matching options
152 | RFResult = getMatchesFromQuery(rsp, params, flags, rfQ, rfQuery, userQuery, sortSpec,
153 | targetFqFilters, rfFqFilters, searcher, rfhelper, start, rows);
154 | } else {
155 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
156 | "RelevancyFeedback requires either a query (?rf.q=) or text (using stream.head and stream.body fields in a POST) to find similar documents.");
157 | }
158 | if(RFResult != null)
159 | {
160 | rfDocs = RFResult.getResults();
161 | }
162 |
163 | } finally {
164 | if (reader != null) {
165 | reader.close();
166 | }
167 | }
168 |
169 | if( rfDocs == null ) {
170 | rfDocs = new DocListAndSet(); // avoid NPE
171 | }
172 | rsp.add( "response", rfDocs.docList );
173 | if(RFResult != null && RFResult.getQuery() != null) {
174 | rsp.add(RFParams.PREFIX + "query:", RFResult.getQuery().toString());
175 | }
176 |
177 | if( RFResult != null && termStyle != RFParams.TermStyle.NONE) {
178 | addInterestingTerms(rsp, termStyle, RFResult);
179 | }
180 |
181 | // maybe facet the results
182 | if (params.getBool(FacetParams.FACET,false)) {
183 | addFacet(req, rsp, params, rfDocs);
184 | }
185 |
186 | addDebugInfo(req, rsp, rfQ, rfFqFilters, rfhelper, RFResult, rfDocs);
187 | }
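// Illustrative requests (the handler path is an assumption; rf.q, stream.head and
// stream.body are the parameter names referenced by this handler's error messages):
//   GET  /solr/<core>/rf?rf.q=id:12345&rows=10         -> seed from an indexed document
//   POST /solr/<core>/rf?rows=10 with a content stream -> seed from posted text
//        (terms are read from stream.body, and optionally stream.head)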
188 |
189 | private void configureSolrParameters(SolrQueryRequest req, ModifiableSolrParams params, String uniqueKeyField){
190 |
191 | // default to the edismax parser
192 | String defType = params.get(QueryParsing.DEFTYPE, EDISMAX);
193 | // allow usage of custom edismax implementations, such as our own
194 | if(defType.toLowerCase().contains(EDISMAX.toLowerCase())){
195 | params.set(DisMaxParams.MM, 0);
196 | // edismax blows up without df field, even if you specify the field to match on in the query
197 | params.set(CommonParams.DF, uniqueKeyField);
198 | }
199 | params.set(QueryParsing.DEFTYPE, defType);
200 | req.setParams(params);
201 | }
202 |
203 | private Reader getContentStreamReader(SolrQueryRequest req, Reader reader) throws IOException {
204 | Iterable<ContentStream> streams = req.getContentStreams();
205 | if (streams != null) {
206 | Iterator<ContentStream> iter = streams.iterator();
207 | if (iter.hasNext()) {
208 | reader = iter.next().getReader();
209 | }
210 | if (iter.hasNext()) {
211 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
212 | "RelevancyFeedback does not support multiple ContentStreams");
213 | }
214 | }
215 | return reader;
216 | }
217 |
218 | private RFResult getMatchesFromQuery(SolrQueryResponse rsp, SolrParams params, int flags, String q, Query query, Query userQuery, SortSpec sortSpec, List<Query> targetFqFilters, List<Query> rfFqFilters, SolrIndexSearcher searcher, RFHelper rfHelper, int start, int rows) throws IOException, SyntaxError {
219 |
220 | boolean includeMatch = params.getBool(RFParams.MATCH_INCLUDE, true);
221 | int matchOffset = params.getInt(RFParams.MATCH_OFFSET, 0);
222 | // Find the base match
223 | DocList match = searcher.getDocList(query, targetFqFilters, null, matchOffset, 10000, flags); // only get the first one...
224 | if(match.matches() == 0 && userQuery == null){
225 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
226 | String.format("RelevancyFeedback was unable to find any documents matching the query: '%s'.", q));
227 | }
228 |
229 | if (includeMatch) {
230 | rsp.add("match", match);
231 | }
232 |
233 | // This is an iterator, but we only handle the first match
234 | DocIterator iterator = match.iterator();
235 | if (iterator.hasNext() || userQuery != null) {
236 | // do a RelevancyFeedback query for each document in results
237 | return rfHelper.getMatchesFromDocs(iterator, start, rows, rfFqFilters, flags, sortSpec.getSort(), userQuery);
238 | }
239 | return null;
240 | }
241 |
242 | private List<InterestingTerm> extractInterestingTerms(List<RFTerm> RFTerms){
243 | List<InterestingTerm> terms = new ArrayList<InterestingTerm>();
244 | for( RFTerm term : RFTerms) {
245 | InterestingTerm it = new InterestingTerm();
246 | it.term = term.getTerm();
247 | it.boost = term.getFinalScore();
248 | terms.add(it);
249 | }
250 | Collections.sort(terms, InterestingTerm.BOOST_ORDER);
251 | return terms;
252 | }
253 |
254 | private void addInterestingTerms(SolrQueryResponse rsp, RFParams.TermStyle termStyle, RFResult RFResult) {
255 |
256 | List<RFTerm> RFTerms = RFResult.getRFTerms();
257 | Collections.sort(RFTerms, RFTerm.FLD_BOOST_X_SCORE_ORDER);
258 |
259 | if( termStyle == RFParams.TermStyle.DETAILS ) {
260 | List<InterestingTerm> interesting = extractInterestingTerms(RFResult.getRFTerms());
261 |
262 | int longest = 0;
263 | for( InterestingTerm t : interesting ) {
264 | longest = Math.max(t.term.toString().length(), longest);
265 | }
266 |
267 | NamedList<Float> it = new NamedList<Float>();
268 | for( InterestingTerm t : interesting ) {
269 | it.add( Strings.padEnd(t.term.toString(), longest, ' '), t.boost );
270 | }
271 | rsp.add( "interestingTerms", it );
272 | }
273 | else {
274 | List<String> it = new ArrayList<String>( RFTerms.size() );
275 | for( RFTerm RFTerm : RFTerms) {
276 | it.add(RFTerm.getWord());
277 | }
278 | rsp.add( "interestingTerms", it );
279 | }
280 | }
281 |
282 | private void addFacet(SolrQueryRequest req, SolrQueryResponse rsp, SolrParams params, DocListAndSet rfDocs) {
283 | if( rfDocs.docSet == null ) {
284 | rsp.add( "facet_counts", null );
285 | }
286 | else {
287 | FacetComponent fct = new FacetComponent();
288 | rsp.add( "facet_counts", fct.getFacetCounts(new SimpleFacets(req, rfDocs.docSet, params )) );
289 | }
290 | }
291 |
292 | private void addDebugInfo(SolrQueryRequest req, SolrQueryResponse rsp, String q, List<Query> rfFqFilters, RFHelper rfHelper, RFResult RFResult, DocListAndSet rfDocs) {
293 |
294 | boolean dbg = req.getParams().getBool(CommonParams.DEBUG_QUERY, false);
295 | boolean dbgQuery = false, dbgResults = false;
296 | if (dbg == false) { // if it's true, we are doing everything anyway.
297 | String[] dbgParams = req.getParams().getParams(CommonParams.DEBUG);
298 | if (dbgParams != null) {
299 | for (int i = 0; i < dbgParams.length; i++) {
300 | if (dbgParams[i].equals(CommonParams.QUERY)){
301 | dbgQuery = true;
302 | } else if (dbgParams[i].equals(CommonParams.RESULTS)){
303 | dbgResults = true;
304 | }
305 | }
306 | }
307 | } else {
308 | dbgQuery = true;
309 | dbgResults = true;
310 | }
311 | // Copied from StandardRequestHandler... perhaps it should be added to doStandardDebug?
312 | if (dbg == true && RFResult != null) {
313 | try {
314 |
315 | NamedList it = getRFTermsForDebug(RFResult);
316 |
317 | NamedList