├── .gitignore
├── DiceRelevancyFeedback.iml
├── LICENSE
├── README.md
├── pom.xml
├── src
│   └── main
│       └── java
│           └── org
│               └── dice
│                   └── solrenhancements
│                       ├── JarVersion.java
│                       ├── relevancyfeedback
│                       │   ├── InterestingTerm.java
│                       │   ├── RFHelper.java
│                       │   ├── RFParams.java
│                       │   ├── RFQuery.java
│                       │   ├── RFResult.java
│                       │   ├── RFTerm.java
│                       │   ├── RelevancyFeedback.java
│                       │   └── RelevancyFeedbackHandler.java
│                       ├── tokenfilters
│                       │   ├── ConcatenateTokenFilter.java
│                       │   ├── ConcatenateTokenFilterFactory.java
│                       │   ├── ConstantTokenFilter.java
│                       │   ├── ConstantTokenFilterFactory.java
│                       │   ├── MeanPayloadTokenFilter.java
│                       │   ├── MeanPayloadTokenFilterFactory.java
│                       │   ├── PayloadQueryBoostTokenFilter.java
│                       │   ├── PayloadQueryBoostTokenFilterFactory.java
│                       │   ├── TypeEraseFilter.java
│                       │   └── TypeEraseFilterFactory.java
│                       └── unsupervisedfeedback
│                           ├── UnsupervisedFeedbackHandler.java
│                           ├── UnsupervisedFeedbackHelper.java
│                           └── UnsupervisedFeedbackParams.java
└── target
    └── DiceRelevancyFeedback-1.0.jar
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled class file
2 | *.class
3 |
4 | # Log file
5 | *.log
6 |
7 | # BlueJ files
8 | *.ctxt
9 |
10 | # Mobile Tools for Java (J2ME)
11 | .mtj.tmp/
12 |
13 | # Package Files #
14 | #*.jar
15 | *.war
16 | *.ear
17 | *.zip
18 | *.tar.gz
19 | *.rar
20 |
21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
22 | hs_err_pid*
23 |
24 | *.class
25 |
26 | #idea files
27 | **/.idea/workspace.xml
28 | **/.idea/tasks.xml
29 | .idea
30 |
31 | #target/*
32 | target/classes/
33 | target/maven-archiver/*
34 | target/maven-status/*
35 | target/generated-sources/
36 | target/generated-test-sources/
37 | target/surefire/
38 | target/test-classes/
39 |
--------------------------------------------------------------------------------
/DiceRelevancyFeedback.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Dice Relevancy Feedback
2 | ========================
3 |
4 | Dice.com's Solr plugins for performing personalized search and recommendations (via the relevancy feedback plugin), and conceptual / semantic search (via the unsupervised feedback plugin).
5 |
6 | ## Links
7 | * [Slides from the talk](https://www.slideshare.net/lucidworks/personalized-search-and-job-recommendations-simon-hughes-dicecom)
8 | * [Video of the Talk](https://www.youtube.com/watch?v=-uiQY2Zatjo&index=31&list=PLU6n9Voqu_1FMt0C-tVNFK0PBqWhTb2Nv)
9 |
10 | ## Building the Plugin
11 | A pre-built jar file can be found in the ```./target``` folder. The project also contains a Maven pom.xml file, which can be used to build it from source (e.g. by running ```mvn package``` from the project root).
12 |
13 | ## Supported Solr versions
14 | - Solr 5.4 (see branch)
15 | - Solr 6.3 (see branch) **also master**
16 | - Solr 7.0 (see branch) - also works in 7.1
17 |
18 | If there is a particular version of Solr you need this for, please create a GitHub issue and I'll see what I can do.
19 | To compile it manually for a specific version, update the versions of the Solr and Lucene libraries in the pom.xml file, use Maven to pull in those dependencies and build the plugins, then fix any compilation errors that arise.
20 |
21 | ## Importing into SOLR
22 | Please see the official Solr guidelines for registering plugins with Solr. This involves dropping the jar file into one of the folders that Solr checks for class and jar files on core reload.
23 |
24 | - [Solr Plugins](https://wiki.apache.org/solr/SolrPlugins)
25 | - [Adding custom plugins in Solr cloud](https://lucene.apache.org/solr/guide/6_6/adding-custom-plugins-in-solrcloud-mode.html)
26 |
27 | # Relevancy Feedback Plugin
28 | An **example request handler configuration** for the solrconfig.xml is shown below, with comments outlining the main parameters:
29 | ```xml
30 | <requestHandler name="/rf" class="org.dice.solrenhancements.relevancyfeedback.RelevancyFeedbackHandler">
31 |   <lst name="defaults">
32 |     <bool name="omitHeader">true</bool>
33 |     <str name="wt">json</str>
34 |     <bool name="indent">true</bool>
35 |
36 |     <!-- query parser to use for the optional user query passed in rf.q -->
37 |     <str name="rf.defType">lucene</str>
38 |
39 |     <!-- fields to return in the response -->
40 |     <str name="fl">jobTitle,skill,company</str>
41 |     <!-- fields to extract the top terms from when building the feedback query -->
42 |     <str name="rf.fl">skillFromSkill,extractTitles</str>
43 |
44 |     <!-- relative per-field boosts applied to the terms in the feedback query -->
45 |     <str name="rf.qf">skillFromSkill^3 extractTitles^4.5</str>
46 |     <!-- maximum number of terms to extract per field -->
47 |     <int name="rf.maxflqt">10</int>
48 |
49 |     <!-- number of documents to return -->
50 |     <int name="rows">10</int>
51 |     <!-- boost the extracted terms by their tf.idf scores -->
52 |     <bool name="rf.boost">true</bool>
53 |
54 |     <!-- give all fields equal weight in the generated query before the rf.qf boosts are applied -->
55 |     <bool name="rf.normflboosts">true</bool>
56 |
57 |     <!-- use log(tf) rather than the raw term frequency when weighting terms -->
58 |     <bool name="rf.logtf">true</bool>
59 |
60 |     <!-- minimum % of the extracted terms that a returned document must match -->
61 |     <str name="rf.mm">25%</str>
62 |
63 |     <!-- return the extracted terms and their boosts in the response -->
64 |     <str name="rf.interestingTerms">details</str>
65 |
66 |     <!-- settings applied to the user query (q) when combining relevancy
67 |          feedback with a regular search (personalized search) -->
68 |     <str name="df">title</str>
69 |     <str name="qf">company_text^0.01 title^12 skill^4 description^0.3</str>
70 |     <str name="pf">company_text^0.01 title^12 skill^4 description^0.6</str>
71 |
72 |     <!-- fields used to analyze the stream.head and stream.body content streams -->
73 |     <str name="stream.head.fl">title,title_syn</str>
74 |     <str name="stream.body.fl">extractSkills,extractTitles</str>
75 |
76 |     <!-- relative per-field boosts for terms extracted from the content streams -->
77 |     <str name="stream.qf">extractSkills^4.5 extractTitles^2.25 title^3.0 title_syn^3.0</str>
78 |   </lst>
79 | </requestHandler>
80 | ```
118 | #### Example Request
119 | [http://localhost:8983/solr/Jobs/rf?q=id:11f407d319d6cc707437fad874a097c0+id:a2fd2f2e34667d61fadcdcabfd359cf4&rows=10&df=title&fl=title,skills,geoCode,city,state&wt=json](http://localhost:8983/solr/Jobs/rf?q=id:11f407d319d6cc707437fad874a097c0+id:a2fd2f2e34667d61fadcdcabfd359cf4&rows=10&df=title&fl=title,skills,geoCode,city,state&wt=json)
120 |
121 | #### Example Response
122 | ```json
123 | {
124 | "match":{
125 | "numFound":2,
126 | "start":0,
127 | "docs":[
128 | {
129 | "id":"a2fd2f2e34667d61fadcdcabfd359cf4",
130 | "title":"Console AAA Sports Video Game Programmer.",
131 | "skills":["Sports Game Experience a plus.",
132 | "2-10 years plus Console AAA Video Game Programming Experience"],
133 | "geocode":"38.124447,-122.55051",
134 | "city":"Novato",
135 | "state":"CA"
136 | },
137 | {
138 | "id":"11f407d319d6cc707437fad874a097c0",
139 | "title":"Game Engineer - Creative and Flexible Work Environment!",
140 | "skills":["3D Math",
141 | "Unity3d",
142 | "C#",
143 | "3D Math - game programming",
144 | "game programming",
145 | "C++",
146 | "Java"],
147 | "geocode":"33.97331,-118.243614",
148 | "city":"Los Angeles",
149 | "state":"CA"
150 | }
151 | ]
152 | },
153 | "response":{
154 | "numFound":5333,
155 | "start":0,
156 | "docs":[
157 | {
158 | "title":"Software Design Engineer 3 (Game Developer)",
159 | "skills":["C#",
160 | "C++",
161 | "Unity"],
162 | "geocode":"47.683647,-122.12183",
163 | "city":"Redmond",
164 | "state":"WA"
165 | },
166 | {
167 | "title":"Game Server Engineer - MMO Mobile Gaming Start-Up!",
168 | "skills":["AWS",
169 | "Node.JS",
170 | "pubnub",
171 | "Websockets",
172 | "pubnub - Node.JS",
173 | "Vagrant",
174 | "Linux",
175 | "Git",
176 | "MongoDB",
177 | "Jenkins",
178 | "Docker"],
179 | "geocode":"37.777115,-122.41733",
180 | "city":"San Francisco",
181 | "state":"CA"
182 | },...
183 | ]
184 | }
185 | }
186 | ```
187 |
188 | # Unsupervised Feedback (Blind Feedback) Plugin
189 | An example request handler configuration for the solrconfig.xml is shown below, with comments outlining the main parameters:
190 | ```xml
191 | <requestHandler name="/ufselect" class="org.dice.solrenhancements.unsupervisedfeedback.UnsupervisedFeedbackHandler">
192 |   <lst name="defaults">
193 |     <bool name="omitHeader">true</bool>
194 |     <str name="wt">json</str>
195 |     <bool name="indent">true</bool>
196 |
197 |     <!-- settings for the initial user query -->
198 |     <str name="defType">edismax</str>
199 |     <str name="df">title</str>
200 |     <str name="qf">title^1.5 skills^1.25 description^1.1</str>
201 |     <str name="pf">title^3.0 skills^2.5 description^1.5</str>
202 |     <str name="mm">1</str>
203 |     <str name="q.op">OR</str>
204 |
205 |     <str name="fl">jobTitle,skills,company</str>
206 |     <int name="rows">30</int>
207 |
208 |     <!-- fields to extract the top terms from when building the feedback query -->
209 |     <str name="uf.fl">skillsFromskills,titleFromJobTitle</str>
210 |     <!-- maximum number of terms to extract per field -->
211 |     <int name="uf.maxflqt">50</int>
212 |     <!-- number of top ranked documents to mine for terms -->
213 |     <int name="uf.count">10</int>
214 |     <bool name="uf.boost">true</bool>
215 |
216 |     <!-- relative per-field boosts applied to the terms in the generated feedback query -->
217 |     <str name="uf.qf">skillsFromskills^4.5 titleFromJobTitle^6.0</str>
218 |
219 |     <!-- return the extracted terms and their boosts in the response -->
220 |     <str name="uf.interestingTerms">details</str>
221 |
222 |     <!-- give all fields equal weight before the uf.qf boosts are applied -->
223 |     <bool name="uf.normflboosts">true</bool>
224 |     <!-- use the raw term frequency rather than log(tf) -->
225 |     <bool name="uf.logtf">false</bool>
226 |   </lst>
227 | </requestHandler>
228 | ```
231 | #### Example Request
232 | [http://localhost:8983/solr/DiceJobsCP/ufselect?q=Machine+Learning+Engineer&start=0&rows=10&uf.logtf=false&fl=title,skills,geoCode,city,state&fq={!geofilt+sfield=jobEndecaGeoCode+d=48+pt=39.6955,-105.0841}&wt=json](http://localhost:8983/solr/DiceJobsCP/ufselect?q=Machine+Learning+Engineer&start=0&rows=10&uf.logtf=false&fl=title,skills,geoCode,city,state&fq={!geofilt+sfield=jobEndecaGeoCode+d=48+pt=39.6955,-105.0841}&wt=json)
233 |
234 | #### Example Response
235 | ```json
236 | {
237 | "match":
238 | {
239 | "numFound":7729,
240 | "start":0,
241 | "docs":[
242 | {
243 | "title":"NLP/Machine Learning Engineer",
244 | "skills":["Linux",
245 | "NLP (Natural Language Processing)",
246 | "SQL",
247 | "Bash",
248 | "Python",
249 | "ML (Machine Learning)",
250 | "JavaScript",
251 | "Java"],
252 | "geocode":"42.35819,-71.050674",
253 | "city":"Boston",
254 | "state":"MA"
255 | },
256 | {
257 | "title":"Machine Learning Engineer",
258 | "skills":["machine learning",
259 | "java",
260 | "scala"],
261 | "geocode":"47.60473,-122.32594",
262 | "city":"Seattle",
263 | "state":"WA"
264 | },
265 | {
266 | "title":"Machine Learning Engineer - REMOTE!",
267 | "skills":["Neo4j",
268 | "Hadoop",
269 | "gensim",
270 | "gensim - C++",
271 | "Java",
272 | "R",
273 | "MongoDB",
274 | "elastic search",
275 | "sci-kit learn",
276 | "Python",
277 | "C++"],
278 | "geocode":"37.777115,-122.41733",
279 | "city":"San Francisco",
280 | "state":"CA"
281 | },...
282 | ]
283 | }
284 | }
285 | ```
285 |
286 | ### Isn't this just the MLT Handler?
287 | While it is loosely based on the Solr MLT handler code and algorithm (essentially the Rocchio algorithm), there are some key differences in the algorithm design. The MLT handler takes the top k terms across all configured fields when constructing the MLT query. If you have a field with a broader vocabulary than the other fields, the average document frequency of a term in that field will be lower than in fields with smaller vocabularies. This means those terms will have high relative idf scores and will tend to dominate the top terms selected by the Solr MLT handler. Our request handler instead takes the top k terms per field. It also ensures that, no matter how many terms are matched per field (up to the configured limit), each field carries the same weight in the resulting query as all other fields, before the field-specific weights specified in the rf.qf parameter are applied (see the sketch below); this is the second problem with the Solr MLT handler that we address. We also provide a lot of extra functionality: passing in content streams, matching against multiple documents (more like 'THESE' as opposed to more like 'this'), and applying the boost query parser to the resulting RF query so that any arbitrary Solr boost can be applied (multiplicatively). Finally, we support the mm parameter, so you can require that returned documents match a minimum percentage of the top terms.
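
To make the per-field normalization concrete, here is a minimal, self-contained sketch of the idea (illustrative code, not the plugin's internals; the field names and tf.idf weights are made up): each field keeps only its own top k terms, and each field's term-weight vector is scaled to unit length, so a broad-vocabulary field cannot dominate the query before the rf.qf boosts are applied.

```java
import java.util.*;
import java.util.stream.Collectors;

public class PerFieldNormalizationSketch {

    // Select the top k terms for ONE field, then L2-normalize their tf.idf
    // weights so that every field contributes the same total weight to the query.
    static Map<String, Double> topKNormalized(Map<String, Double> tfIdfByTerm, int k) {
        Map<String, Double> topK = tfIdfByTerm.entrySet().stream()
                .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
                .limit(k)
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

        // length of this field's weight vector
        double norm = Math.sqrt(topK.values().stream().mapToDouble(w -> w * w).sum());

        Map<String, Double> normalized = new LinkedHashMap<>();
        topK.forEach((term, w) -> normalized.put(term, w / norm));
        return normalized;
    }

    public static void main(String[] args) {
        // two fields with very different vocabularies and raw tf.idf scales
        Map<String, Double> title = Map.of("java", 4.0, "engineer", 3.0);
        Map<String, Double> skills = Map.of("spring", 9.0, "hibernate", 7.0, "maven", 2.0);

        // after normalization, both fields' vectors have unit length, so the
        // rf.qf boosts alone decide each field's relative influence
        System.out.println(topKNormalized(title, 10));
        System.out.println(topKNormalized(skills, 10));
    }
}
```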
288 |
289 | ### Important Considerations When using for Personalized Search
290 | If you wish to use this to perform search personalization, as demonstrated in my Lucene Revolution 2017 talk, you need to pass in the user's current search query using the regular q parameter, while the information used to generate the Rocchio query is passed via the rf.q parameter (when using documents to generate the Rocchio query) or via the content stream parameters (rf.stream.head and rf.stream.body, which take strings of content). Note, however, that the boosts applied to the terms in the Rocchio query are not comparable in weight to those in your user query, due to the normalization the algorithm applies. So you will need to experiment with different rf.qf values until you find the right level of influence on your query, based on your search configuration. Also, given that the Rocchio query generated for each user is likely the same across the user's search session (depending on your use case, of course), a more efficient way of doing personalization is simply to use the RF handler to generate the Rocchio query once when the user logs in, cache this query, and then use it as a boost query (within your regular search request handler) for personalizing subsequent user searches. The handler returns the Rocchio query in the rf.query parameter of the response. If you want to use the handler just to get the query (and not execute the search), you can set the rows parameter to 0. You can also iterate over the set of 'interesting terms' returned by the algorithm, along with their weights, if you set rf.interestingTerms=details, and use this to build your boost query. A sketch of this caching pattern follows.
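
As a rough illustration of that pattern using SolrJ (the document ID and collection name are hypothetical; the rf.query response key follows the description above, but check the exact response structure and your regular handler's boost-query support against your own configuration):

```java
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;

public class PersonalizationSketch {
    public static void main(String[] args) throws Exception {
        HttpSolrClient client =
                new HttpSolrClient.Builder("http://localhost:8983/solr/Jobs").build();

        // 1. At login, ask the /rf handler for the Rocchio query only:
        //    rows=0 skips the search itself, and rf.interestingTerms=details
        //    also returns the extracted terms with their weights.
        SolrQuery rf = new SolrQuery("id:11f407d319d6cc707437fad874a097c0");
        rf.setRequestHandler("/rf");
        rf.setRows(0);
        rf.set("rf.interestingTerms", "details");
        QueryResponse rfResponse = client.query(rf);

        // the generated Rocchio query comes back under rf.query; cache it per user
        String rocchioQuery = (String) rfResponse.getResponse().get("rf.query");

        // 2. On each subsequent search, apply the cached query as a boost query
        //    (bq) within your regular (e.g. edismax) search request handler.
        SolrQuery search = new SolrQuery("machine learning engineer");
        search.set("bq", rocchioQuery);
        System.out.println(client.query(search).getResults().getNumFound());

        client.close();
    }
}
```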
291 |
292 | ### Potential Enhancements
293 | Aside from ensuring this works with more versions of Solr (please leave feedback as to which versions you want), there are a number of possible enhancements:
294 |
295 | - **Relevancy Feedback Handler** Allow the learning of negative terms from negative examples (if supplied - this needs a separate query parameter), implemented using negative boosting. Another enhancement would be to allow the max terms per field (rf.maxflqt) to be specified on a per-field basis, so that you can vary the maximum number of terms extracted by field.
296 | - **Unsupervised Feedback (Blind Feedback)** Use the *positional relevance model* detailed in this paper: http://dl.acm.org/citation.cfm?id=1835546. This uses only terms found near the query's terms in the document, as these are generally more relevant than using the whole document. The highlighter component can presumably be used as a reference to determine how to get this information from the postings list, or maybe even used directly to get this information.
297 |
298 | ### Contact Details
299 | If you have a feature request, please submit it to the issues list. That is also a good place to post questions, but you can also reach out to me at simon.hughes@dice.com if you don't hear back.
300 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>org.dice.relevancyfeedback</groupId>
8 |     <artifactId>DiceRelevancyFeedback</artifactId>
9 |     <version>1.0</version>
10 |     <packaging>jar</packaging>
11 |
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>com.google.guava</groupId>
15 |             <artifactId>guava</artifactId>
16 |             <version>12.0</version>
17 |         </dependency>
18 |
19 |         <!-- solr dependencies -->
20 |         <dependency>
21 |             <groupId>org.apache.solr</groupId>
22 |             <artifactId>solr-core</artifactId>
23 |             <version>6.3.0</version>
24 |         </dependency>
25 |
26 |         <dependency>
27 |             <groupId>org.apache.solr</groupId>
28 |             <artifactId>solr-solrj</artifactId>
29 |             <version>6.3.0</version>
30 |         </dependency>
31 |
32 |         <!-- lucene dependencies -->
33 |         <dependency>
34 |             <groupId>org.apache.lucene</groupId>
35 |             <artifactId>lucene-analyzers-common</artifactId>
36 |             <version>6.3.0</version>
37 |         </dependency>
38 |         <dependency>
39 |             <groupId>org.apache.lucene</groupId>
40 |             <artifactId>lucene-queryparser</artifactId>
41 |             <version>6.3.0</version>
42 |         </dependency>
43 |         <dependency>
44 |             <groupId>org.apache.lucene</groupId>
45 |             <artifactId>lucene-queries</artifactId>
46 |             <version>6.3.0</version>
47 |         </dependency>
48 |         <dependency>
49 |             <groupId>org.apache.lucene</groupId>
50 |             <artifactId>lucene-core</artifactId>
51 |             <version>6.3.0</version>
52 |         </dependency>
53 |         <dependency>
54 |             <groupId>org.json</groupId>
55 |             <artifactId>json</artifactId>
56 |             <version>20131018</version>
57 |         </dependency>
58 |
59 |         <dependency>
60 |             <groupId>junit</groupId>
61 |             <artifactId>junit</artifactId>
62 |             <version>4.11</version>
63 |             <scope>test</scope>
64 |         </dependency>
65 |     </dependencies>
66 | </project>
67 |
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/JarVersion.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements;
2 |
3 | import org.slf4j.Logger;
4 |
5 | import java.io.InputStream;
6 | import java.net.URL;
7 | import java.util.Enumeration;
8 |
9 | /**
10 | * Created by simon.hughes on 7/7/16.
11 | */
12 | public class JarVersion {
13 |
14 | private class stub{
15 |
16 | }
17 |
18 | public static String getVersion(Logger log){
19 |
20 | Enumeration<URL> resources;
21 | StringBuilder stringBuilder = new StringBuilder();
22 |
23 | try {
24 | resources = stub.class.getClassLoader().getResources("META-INF/MANIFEST.MF");
25 | while (resources.hasMoreElements()) {
26 | URL url = resources.nextElement();
27 | /* let's not read other jar's manifests */
28 | if (!url.toString().contains("DiceRelevancyFeedback")) {
29 | continue;
30 | }
31 | InputStream reader = url.openStream();
32 | while(reader.available() > 0) {
33 | char c = (char) reader.read();
34 | stringBuilder.append(c);
35 | /* skip lines that don't contain the built-date */
36 | if (stringBuilder.toString().contains(System.getProperty("line.separator")) &&
37 | !stringBuilder.toString().contains("Build-Time")) {
38 | stringBuilder.setLength(0);
39 | }
40 | }
41 | }
42 | } catch (Exception e) {
43 | log.warn("Failed to read manifest during request for version!");
44 | return "Error reading manifest!";
45 | }
46 | return stringBuilder.toString();
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/InterestingTerm.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | import org.apache.lucene.index.Term;
4 |
5 | import java.util.Comparator;
6 |
7 | /**
8 | * Created by simon.hughes on 9/2/14.
9 | */
10 | public class InterestingTerm
11 | {
12 | public Term term;
13 | public float boost;
14 |
15 | public static Comparator<InterestingTerm> BOOST_ORDER = new Comparator<InterestingTerm>() {
16 | @Override
17 | public int compare(InterestingTerm t1, InterestingTerm t2) {
18 | float d = t1.boost - t2.boost;
19 | if( d == 0 ) {
20 | return 0;
21 | }
22 | return (d>0)?-1:1;
23 | }
24 | };
25 | }
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RFHelper.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | /**
4 | * Created by simon.hughes on 9/2/14.
5 | */
6 |
7 | import org.apache.lucene.document.Document;
8 | import org.apache.lucene.index.IndexReader;
9 | import org.apache.lucene.index.Term;
10 | import org.apache.lucene.queries.function.BoostedQuery;
11 | import org.apache.lucene.queries.function.FunctionQuery;
12 | import org.apache.lucene.queries.function.ValueSource;
13 | import org.apache.lucene.queries.function.valuesource.QueryValueSource;
14 | import org.apache.lucene.search.*;
15 | import org.apache.solr.common.SolrException;
16 | import org.apache.solr.common.params.FacetParams;
17 | import org.apache.solr.common.params.SolrParams;
18 | import org.apache.solr.schema.SchemaField;
19 | import org.apache.solr.search.*;
20 | import org.apache.solr.util.SolrPluginUtils;
21 |
22 | import java.io.IOException;
23 | import java.io.Reader;
24 | import java.util.ArrayList;
25 | import java.util.List;
26 | import java.util.regex.Pattern;
27 |
28 | /**
29 | * Helper class for RelevancyFeedback that can be called from other request handlers
30 | */
31 | public class RFHelper
32 | {
33 | // Pattern is thread safe -- TODO? share this with general 'fl' param
34 | private static final Pattern splitList = Pattern.compile(",| ");
35 |
36 | final SolrIndexSearcher searcher;
37 | final QParser qParser;
38 | final RelevancyFeedback relevancyFeedback;
39 | final IndexReader reader;
40 | final SchemaField uniqueKeyField;
41 | final boolean needDocSet;
42 |
43 |
44 | public RFHelper(SolrParams params, SolrIndexSearcher searcher, SchemaField uniqueKeyField, QParser qParser )
45 | {
46 | this.searcher = searcher;
47 | this.qParser = qParser;
48 | this.reader = searcher.getIndexReader();
49 | this.uniqueKeyField = uniqueKeyField;
50 | this.needDocSet = params.getBool(FacetParams.FACET, false);
51 |
52 | SolrParams required = params.required();
53 | String[] fields = splitList.split(required.get(RFParams.SIMILARITY_FIELDS));
54 | if( fields.length < 1 ) {
55 | throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
56 | "RelevancyFeedback requires at least one similarity field: "+ RFParams.SIMILARITY_FIELDS );
57 | }
58 |
59 | this.relevancyFeedback = new RelevancyFeedback( reader );
60 | relevancyFeedback.setFieldNames(fields);
61 |
62 | final String flMustMatch = params.get(RFParams.FL_MUST_MATCH);
63 | if( flMustMatch != null && flMustMatch.trim().length() > 0 ) {
64 | String[] mustMatchFields = splitList.split(flMustMatch.trim());
65 | relevancyFeedback.setMatchFieldNames(mustMatchFields);
66 | }
67 |
68 | final String flMustNOTMatch = params.get(RFParams.FL_MUST_NOT_MATCH);
69 | if( flMustNOTMatch != null && flMustNOTMatch.trim().length() > 0 ) {
70 | String[] differntMatchFields = splitList.split(flMustNOTMatch.trim());
71 | relevancyFeedback.setDifferentFieldNames(differntMatchFields);
72 | }
73 |
74 | String[] payloadFields = getFieldList(RFParams.PAYLOAD_FIELDS, params);
75 | if(payloadFields != null){
76 | throw new RuntimeException("Payload fields are not currently supported");
77 | //relevancyFeedback.setPayloadFields(payloadFields);
78 | }
79 | relevancyFeedback.setAnalyzer( searcher.getSchema().getIndexAnalyzer() );
80 |
81 | // configurable params
82 |
83 | relevancyFeedback.setMm( params.get(RFParams.MM, RelevancyFeedback.DEFAULT_MM));
84 | relevancyFeedback.setMinTermFreq( params.getInt(RFParams.MIN_TERM_FREQ, RelevancyFeedback.DEFAULT_MIN_TERM_FREQ));
85 | relevancyFeedback.setMinDocFreq( params.getInt(RFParams.MIN_DOC_FREQ, RelevancyFeedback.DEFAULT_MIN_DOC_FREQ));
86 | relevancyFeedback.setMaxDocFreq( params.getInt(RFParams.MAX_DOC_FREQ, RelevancyFeedback.DEFAULT_MAX_DOC_FREQ));
87 | relevancyFeedback.setMinWordLen( params.getInt(RFParams.MIN_WORD_LEN, RelevancyFeedback.DEFAULT_MIN_WORD_LENGTH));
88 | relevancyFeedback.setMaxWordLen( params.getInt(RFParams.MAX_WORD_LEN, RelevancyFeedback.DEFAULT_MAX_WORD_LENGTH));
89 |
90 | relevancyFeedback.setBoost( params.getBool(RFParams.BOOST, true ) );
91 |
92 | // new parameters
93 | relevancyFeedback.setBoostFn(params.get(RFParams.BOOST_FN));
94 | relevancyFeedback.setNormalizeFieldBoosts(params.getBool(RFParams.NORMALIZE_FIELD_BOOSTS, RelevancyFeedback.DEFAULT_NORMALIZE_FIELD_BOOSTS));
95 | // new versions of previous parameters moved to the field level
96 | relevancyFeedback.setMaxQueryTermsPerField(params.getInt(RFParams.MAX_QUERY_TERMS_PER_FIELD, RelevancyFeedback.DEFAULT_MAX_QUERY_TERMS_PER_FIELD));
97 | relevancyFeedback.setMaxNumTokensParsedPerField(params.getInt(RFParams.MAX_NUM_TOKENS_PARSED_PER_FIELD, RelevancyFeedback.DEFAULT_MAX_NUM_TOKENS_PARSED_PER_FIELD));
98 | relevancyFeedback.setLogTf(params.getBool(RFParams.IS_LOG_TF, RelevancyFeedback.DEFAULT_IS_LOG_TF));
99 |
100 | relevancyFeedback.setBoostFields(SolrPluginUtils.parseFieldBoosts(params.getParams(RFParams.QF)));
101 | relevancyFeedback.setStreamBoostFields(SolrPluginUtils.parseFieldBoosts(params.getParams(RFParams.STREAM_QF)));
102 |
103 | String streamHead = params.get(RFParams.STREAM_HEAD);
104 | if(streamHead != null) {
105 | relevancyFeedback.setStreamHead(streamHead);
106 | }
107 |
108 | // Set stream fields
109 | String[] streamHeadFields = getFieldList(RFParams.STREAM_HEAD_FL, params);
110 | if(streamHeadFields != null){
111 | relevancyFeedback.setStreamHeadfieldNames(streamHeadFields);
112 | }
113 |
114 | String[] streamBodyFields = getFieldList(RFParams.STREAM_BODY_FL, params);
115 | if(streamBodyFields != null){
116 | relevancyFeedback.setStreamBodyfieldNames(streamBodyFields);
117 | }
118 | }
119 |
120 | private String[] getFieldList(String key, SolrParams params) {
121 | final String fieldList = params.get(key);
122 | if(fieldList != null && fieldList.trim().length() > 0) {
123 | String[] fields = splitList.split(fieldList);
124 | if(fields != null){
125 | return fields;
126 | }
127 | }
128 | return null;
129 | }
130 |
131 | private Query getBoostedFunctionQuery(Query q) throws SyntaxError{
132 |
133 | if (relevancyFeedback.getBoostFn() == null || relevancyFeedback.getBoostFn().trim().length() == 0) {
134 | return q;
135 | }
136 |
137 | Query boost = this.qParser.subQuery(relevancyFeedback.getBoostFn(), FunctionQParserPlugin.NAME).getQuery();
138 | ValueSource vs;
139 | if (boost instanceof FunctionQuery) {
140 | vs = ((FunctionQuery) boost).getValueSource();
141 | } else {
142 | vs = new QueryValueSource(boost, 1.0f);
143 | }
144 | return new BoostedQuery(q, vs);
145 | }
146 |
147 | public RFResult getMatchesFromDocs(DocIterator iterator, int start, int rows, List<Query> filters, int flags, Sort lsort, Query userQuery) throws IOException, SyntaxError
148 | {
149 | BooleanQuery.Builder qryBuilder = new BooleanQuery.Builder();
150 | List<Integer> ids = new ArrayList<Integer>();
151 |
152 | while(iterator.hasNext()) {
153 | int id = iterator.nextDoc();
154 | Document doc = reader.document(id);
155 | ids.add(id);
156 |
157 | // add exclusion filters to prevent matching seed documents
158 | TermQuery tq = new TermQuery(new Term(uniqueKeyField.getName(), uniqueKeyField.getType().storedToIndexed(doc.getField(uniqueKeyField.getName()))));
159 | qryBuilder.add(tq, BooleanClause.Occur.MUST_NOT);
160 | }
161 |
162 | RFQuery rfQuery = relevancyFeedback.like(ids);
163 |
164 | Query rawrfQuery = rfQuery.getOrQuery();
165 |
166 | if(rfQuery.getMustMatchQuery() != null){
167 |     filters.add(rfQuery.getMustMatchQuery());
168 | }
169 | if(rfQuery.getMustNOTMatchQuery() != null){
170 |     filters.add(rfQuery.getMustNOTMatchQuery());
171 | }
172 |
173 | Query boostedrfQuery = getBoostedFunctionQuery(rawrfQuery);
174 | qryBuilder.add(boostedrfQuery, BooleanClause.Occur.MUST);
175 |
176 | Query finalQuery = null;
177 |
178 | if(userQuery != null){
179 | // set user query as a MUST clause, and tack on RF query as a boosted OR (should)
180 | Query rfQuery = qryBuilder.build();
181 |
182 | BooleanQuery.Builder personalizedQryBuilder = new BooleanQuery.Builder();
183 | personalizedQryBuilder.add(userQuery, BooleanClause.Occur.MUST);
184 | personalizedQryBuilder.add(rfQuery, BooleanClause.Occur.SHOULD);
185 |
186 | finalQuery = personalizedQryBuilder.build();
187 | }
188 | else{
189 | finalQuery = qryBuilder.build();
190 | }
191 |
192 | DocListAndSet results = new DocListAndSet();
193 | if (this.needDocSet) {
194 | results = searcher.getDocListAndSet(finalQuery, filters, lsort, start, rows, flags);
195 | } else {
196 | results.docList = searcher.getDocList(finalQuery, filters, lsort, start, rows, flags);
197 | }
198 |
199 | return new RFResult(rfQuery.getRFTerms(), finalQuery, results);
200 | }
201 |
202 |
203 | public RFResult getMatchesFromContentSteam(Reader reader, int start, int rows, List<Query> filters, int flags, Sort lsort, Query userQuery) throws IOException, SyntaxError
204 | {
205 | RFQuery rfQuery = relevancyFeedback.like(reader);
206 | Query rawRFQuery = rfQuery.getOrQuery();
207 |
208 | if(rfQuery.getMustMatchQuery() != null || rfQuery.getMustNOTMatchQuery() != null){
209 |     throw new RuntimeException(
210 |         String.format("The %s and the %s parameters are not supported for content stream queries",
211 |             RFParams.FL_MUST_MATCH, RFParams.FL_MUST_NOT_MATCH));
212 | }
213 |
214 | Query boostedRFQuery = getBoostedFunctionQuery(rawRFQuery);
215 | Query finalQuery = boostedRFQuery;
216 | if(userQuery != null){
217 | // set user query as a MUST clause, and tack on RF query as a boosted OR (should)
218 | BooleanQuery.Builder personalizedQryBuilder = new BooleanQuery.Builder();
219 | personalizedQryBuilder.add(userQuery, BooleanClause.Occur.MUST);
220 | personalizedQryBuilder.add(boostedRFQuery, BooleanClause.Occur.SHOULD);
221 |
222 | finalQuery = personalizedQryBuilder.build();
223 | }
224 |
225 | DocListAndSet results = new DocListAndSet();
226 | if (this.needDocSet) {
227 | results = searcher.getDocListAndSet( finalQuery, filters, lsort, start, rows, flags);
228 | } else {
229 | results.docList = searcher.getDocList( finalQuery, filters, lsort, start, rows, flags);
230 | }
231 | return new RFResult(rfQuery.getRFTerms(), finalQuery, results);
232 | }
233 |
234 | public RelevancyFeedback getRelevancyFeedback()
235 | {
236 | return relevancyFeedback;
237 | }
238 | }
239 |
240 |
241 |
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RFParams.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | import org.apache.solr.search.QueryParsing;
4 |
5 | import java.util.Locale;
6 |
7 | /**
8 | * Created by simon.hughes on 9/4/14.
9 | */
10 | public interface RFParams {
11 | java.lang.String RF = "rf";
12 | java.lang.String PREFIX = "rf.";
13 | java.lang.String SIMILARITY_FIELDS = PREFIX + "fl";
14 | java.lang.String MIN_TERM_FREQ = PREFIX + "mintf";
15 | java.lang.String MAX_DOC_FREQ = PREFIX + "maxdf";
16 | java.lang.String MIN_DOC_FREQ = PREFIX + "mindf";
17 | java.lang.String MIN_WORD_LEN = PREFIX + "minwl";
18 | java.lang.String MAX_WORD_LEN = PREFIX + "maxwl";
19 | // don't clash with regular mm
20 | java.lang.String MM = PREFIX + "mm";
21 | //Changed from maxqt
22 | java.lang.String MAX_QUERY_TERMS_PER_FIELD = PREFIX + "maxflqt";
23 | //Changed from maxntp
24 | java.lang.String MAX_NUM_TOKENS_PARSED_PER_FIELD = PREFIX + "maxflntp";
25 | java.lang.String BOOST = PREFIX + "boost";
26 | java.lang.String FQ = PREFIX + "fq";
27 |
28 | java.lang.String QF = PREFIX + "qf";
29 |
30 | // allows user to specify a query, and we use the RF terms to boost that query
31 | java.lang.String RF_QUERY = PREFIX + "q";
32 | java.lang.String RF_DEFTYPE = PREFIX + QueryParsing.DEFTYPE;
33 |
34 | // new to this plugin
35 | java.lang.String FL_MUST_MATCH = PREFIX + "fl.match"; // list of fields that must match the target document
36 | java.lang.String FL_MUST_NOT_MATCH = PREFIX + "fl.different"; // list of fields that must NOT match the target document
37 |
38 | java.lang.String BOOST_FN = PREFIX + "boostfn";
39 | java.lang.String PAYLOAD_FIELDS = PREFIX + "payloadfl";
40 |
41 | // normalize field boosts
42 | java.lang.String NORMALIZE_FIELD_BOOSTS = PREFIX + "normflboosts";
43 | java.lang.String IS_LOG_TF = PREFIX + "logtf";
44 |
45 | java.lang.String STREAM_HEAD = "stream.head";
46 | java.lang.String STREAM_HEAD_FL = "stream.head.fl";
47 | java.lang.String STREAM_BODY_FL = "stream.body.fl";
48 |
49 | java.lang.String STREAM_QF = "stream.qf";
50 | // end new to this plugin
51 |
52 | // the /rf request handler uses 'rows'
53 | public final static String DOC_COUNT = PREFIX + "count";
54 |
55 | // Do you want to include the original document in the results or not
56 | public final static String MATCH_INCLUDE = PREFIX + "match.include";
57 |
58 | // If multiple docs are matched in the query, what offset do you want?
59 | public final static String MATCH_OFFSET = PREFIX + "match.offset";
60 |
61 | // Whether (and in what format) to return the interesting terms used to build the query
62 | public final static String INTERESTING_TERMS = PREFIX + "interestingTerms"; // false,details,(list or true)
63 |
64 | public enum TermStyle {
65 | NONE,
66 | LIST,
67 | DETAILS;
68 |
69 | public static TermStyle get( String p )
70 | {
71 | if( p != null ) {
72 | p = p.toUpperCase(Locale.ROOT);
73 | if( p.equals( "DETAILS" ) ) {
74 | return DETAILS;
75 | }
76 | else if( p.equals( "LIST" ) ) {
77 | return LIST;
78 | }
79 | }
80 | return NONE;
81 | }
82 | }
83 | }
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RFQuery.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | import org.apache.lucene.queries.payloads.AveragePayloadFunction;
4 | import org.apache.lucene.queries.payloads.PayloadScoreQuery;
5 | import org.apache.lucene.search.*;
6 | import org.apache.lucene.search.spans.SpanTermQuery;
7 | import org.apache.solr.util.SolrPluginUtils;
8 |
9 | import java.util.ArrayList;
10 | import java.util.List;
11 |
12 | /**
13 | * Created by simon.hughes on 11/25/14.
14 | */
15 | public class RFQuery {
16 |
17 | private final List<RFTerm> RFTerms;
18 | private final String mm;
19 | private BooleanQuery mustMatchQuery = null;
20 | private BooleanQuery mustNOTMatchQuery = null;
21 |
22 | public RFQuery(List<RFTerm> RFTerms, String mm){
23 |     this.RFTerms = RFTerms == null? new ArrayList<RFTerm>() : RFTerms;
24 | this.mm = mm;
25 | }
26 | public BooleanQuery getMustMatchQuery(){
27 | return this.mustMatchQuery;
28 | }
29 |
30 | public void setMustMatchQuery(BooleanQuery query){
31 | this.mustMatchQuery = query;
32 | }
33 |
34 | public Query getMustNOTMatchQuery(){
35 | return this.mustNOTMatchQuery;
36 | }
37 |
38 | public void setMustNOTMatchQuery(BooleanQuery query){
39 | this.mustNOTMatchQuery = query;
40 | }
41 |
42 | public List<RFTerm> getRFTerms(){
43 | return RFTerms;
44 | }
45 |
46 | public Query getOrQuery(){
47 | BooleanQuery.Builder qryBuilder = new BooleanQuery.Builder();
48 | for(RFTerm rfTerm : this.RFTerms){
49 |     qryBuilder.add(toBoostedQuery(rfTerm), BooleanClause.Occur.SHOULD);
50 | }
51 | SolrPluginUtils.setMinShouldMatch(qryBuilder, mm);
52 | return qryBuilder.build();
53 | }
54 |
55 | private Query toBoostedQuery(RFTerm rfTerm){
56 |     Query tq = toTermQuery(rfTerm);
57 |     return new BoostQuery(tq, rfTerm.getFinalScore());
58 | }
59 |
60 | private Query toTermQuery(RFTerm rfTerm) {
61 |     if(rfTerm.hasPayload()) {
62 |         return new PayloadScoreQuery(new SpanTermQuery(rfTerm.getTerm()), new AveragePayloadFunction(), false);
63 |     }
64 |     else{
65 |         return new TermQuery(rfTerm.getTerm());
66 | }
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RFResult.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | import org.apache.lucene.search.Query;
4 | import org.apache.solr.search.DocListAndSet;
5 |
6 | import java.util.List;
7 |
8 | /**
9 | * Created by simon.hughes on 1/6/17.
10 | */
11 | public class RFResult {
12 | private final List<RFTerm> RFTerms;
13 | private final Query finalRfQuery;
14 | private DocListAndSet results;
15 |
16 | public RFResult(List<RFTerm> RFTerms, Query finalRfQuery, DocListAndSet results){
17 | this.RFTerms = RFTerms;
18 | this.finalRfQuery = finalRfQuery;
19 | this.results = results;
20 | }
21 |
22 | public DocListAndSet getResults() {
23 | return results;
24 | }
25 |
26 | public List<RFTerm> getRFTerms(){
27 | return RFTerms;
28 | }
29 |
30 | public Query getQuery() {
31 | return finalRfQuery;
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RFTerm.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | import com.google.common.base.Strings;
4 | import org.apache.lucene.index.Term;
5 |
6 | import java.text.DecimalFormat;
7 | import java.util.Comparator;
8 |
9 | /**
10 | * Created by simon.hughes on 9/4/14.
11 | */
12 | public class RFTerm implements Comparable<RFTerm> {
13 |
14 | private final String word;
15 | private final String fieldName;
16 | private final float idf;
17 | private final int docFreq;
18 | private final float tf;
19 | private final float fieldBoost;
20 | private final float payload;
21 | private final static DecimalFormat format = new DecimalFormat("#0.00");
22 |
23 | private final static DecimalFormat intFormat = new DecimalFormat("##.##");
24 | private final boolean logTf;
25 | private final boolean hasPayload;
26 | private final boolean useBoost;
27 |
28 | private float vectorLength = 1.0f;
29 |
30 | // non-payload
31 | public RFTerm(String word, String fieldName, float tf, float idf, int docFreq, boolean logTf, float fieldBoost, boolean useBoost){
32 | this(word, fieldName, tf, idf, docFreq, logTf, fieldBoost, 1.0f, useBoost, false);
33 | }
34 |
35 | // with payload
36 | public RFTerm(String word, String fieldName, float tf, float idf, int docFreq, boolean logTf, float fieldBoost, float payload, boolean useBoost, boolean hasPayload){
37 |
38 | this.word = word;
39 | this.fieldName = fieldName;
40 | this.idf = idf;
41 | this.docFreq = docFreq;
42 | this.tf = tf;
43 | this.fieldBoost = fieldBoost;
44 | this.payload = payload;
45 | this.logTf = logTf;
46 | this.useBoost = useBoost;
47 | this.hasPayload = hasPayload;
48 | }
49 |
50 | public String getWord() {
51 | return word;
52 | }
53 |
54 | public String getFieldName() {
55 | return fieldName;
56 | }
57 |
58 | public float getIdf() {
59 | return idf;
60 | }
61 |
62 | public int getDocFreq() {
63 | return docFreq;
64 | }
65 |
66 | public float getTf() {
67 | return tf;
68 | }
69 |
70 | public float getPayload() {
71 | return payload;
72 | }
73 |
74 | public float getFieldBoost() { return fieldBoost; }
75 |
76 | private String padFloat(float f){
77 | String formatted = format.format(f);
78 | return Strings.padStart(formatted, 7, ' ');
79 | }
80 |
81 | private String padInt(float f){
82 | String formatted = intFormat.format(f);
83 | return Strings.padStart(formatted, 5, ' ');
84 | }
85 |
86 | public float getTermWeight(){
87 | if(this.hasPayload()){
88 | // for the payload, typically we want to include the TF but not the IDF. This is what is passed to the payload value
89 | return this.getPayload();
90 | }
91 | else {
92 | if(false == this.useBoost){
93 | return 1.0f;
94 | }
95 | float tfVal = this.tf;
96 | if (this.logTf) {
97 | tfVal = getLogTf();
98 | }
99 | return tfVal * this.idf;
100 | }
101 | }
102 |
103 | public float getNormalizedTermWeight(){
104 | return this.getTermWeight() / this.vectorLength;
105 | }
106 |
107 | private float getLogTf() {
108 | return (float) Math.log(this.tf + 1.0d);
109 | }
110 |
111 | public float getFinalScore(){
112 | return this.getFieldBoost() * this.getNormalizedTermWeight();
113 | }
114 |
115 | public String valuesToString(){
116 | StringBuilder sb = new StringBuilder();
117 | sb.append("score: ").append(padFloat(this.getFinalScore()));
118 | sb.append(" term wt: ").append(padFloat(this.getTermWeight()));
119 |
120 | if(this.useBoost) {
121 | if (this.logTf) {
122 | sb.append(" log(tf): ").append(padFloat(this.getLogTf()));
123 | } else {
124 | sb.append(" tf: ").append(padInt(this.getTf()));
125 | }
126 | sb.append(" df: ").append(padInt((this.getDocFreq())));
127 | sb.append(" idf: ").append(padFloat((this.getIdf())));
128 | }
129 | if(this.hasPayload())
130 | {
131 | sb.append(" pyld: ").append(padFloat((this.getPayload())));
132 | }
133 | sb.append(" fldBst: ").append(padFloat((this.getFieldBoost())));
134 | sb.append(" veclen: ").append(padFloat((this.vectorLength)));
135 | return sb.toString();
136 | }
137 |
138 | public static Comparator<RFTerm> FLD_BOOST_X_SCORE_ORDER = new Comparator<RFTerm>() {
139 | @Override
140 | public int compare(RFTerm t1, RFTerm t2) {
141 | float d = t2.getFinalScore() - t1.getFinalScore();
142 | if( d == 0 ) {
143 | return 0;
144 | }
145 | return (d>0)?1:-1;
146 | }
147 | };
148 |
149 | public int compareTo(RFTerm o) {
150 | return ((Float)o.getFinalScore()).compareTo(this.getFinalScore());
151 | }
152 |
153 | // used in debug info (relevancyFeedback.interestingTerms = details)
154 | public Term getTerm() {
155 | return new Term(this.getFieldName(), this.getWord());
156 | }
157 |
158 | public boolean hasPayload() {
159 | return hasPayload;
160 | }
161 |
162 | public void setVectorLength(float vectorLength) {
163 | this.vectorLength = vectorLength;
164 | }
165 | }
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RelevancyFeedback.java:
--------------------------------------------------------------------------------
1 | package org.dice.solrenhancements.relevancyfeedback;
2 |
3 | /**
4 | * Created by simon.hughes on 9/2/14.
5 | */
6 | /**
7 | * Copyright 2004-2005 The Apache Software Foundation.
8 | *
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | */
21 |
22 | import org.apache.lucene.analysis.Analyzer;
23 | import org.apache.lucene.analysis.TokenStream;
24 | import org.apache.lucene.analysis.payloads.PayloadHelper;
25 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26 | import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
27 | import org.apache.lucene.document.Document;
28 | import org.apache.lucene.index.*;
29 | import org.apache.lucene.search.BooleanClause;
30 | import org.apache.lucene.search.BooleanQuery;
31 | import org.apache.lucene.search.TermQuery;
32 | import org.apache.lucene.search.similarities.ClassicSimilarity;
33 | import org.apache.lucene.search.similarities.TFIDFSimilarity;
34 | import org.apache.lucene.util.*;
35 | import org.apache.lucene.util.PriorityQueue;
36 |
37 | import java.io.IOException;
38 | import java.io.Reader;
39 | import java.io.StringReader;
40 | import java.util.*;
41 |
42 |
43 | /**
44 | * Generate "more like this" similarity queries.
45 | * Based on this mail:
46 | *
47 | * Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
48 | * Term frequencies can be computed by re-tokenizing the text, which, for a single document,
49 | * is usually fast enough. But looking up the docFreq() of every term in the document is
50 | * probably too slow.
51 | *
52 | * You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
53 | * or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
54 | * in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
55 | * reduce the number of terms under consideration. Another heuristic is that terms with a
56 | * high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the
57 | * number of characters, not selecting anything less than, e.g., six or seven characters.
58 | * With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
59 | * that do a pretty good job of characterizing a document.
60 | *
61 | * It all depends on what you're trying to do. If you're trying to eke out that last percent
62 | * of precision and recall regardless of computational difficulty so that you can win a TREC
63 | * competition, then the techniques I mention above are useless. But if you're trying to
64 | * provide a "more like this" button on a search results page that does a decent job and has
65 | * good performance, such techniques might be useful.
66 | *
67 | * An efficient, effective "more-like-this" query generator would be a great contribution, if
68 | * anyone's interested. I'd imagine that it would take a Reader or a String (the document's
69 | * text), an Analyzer, and return a set of representative terms using heuristics like those
70 | * above. The frequency and length thresholds could be parameters, etc.
71 | *
72 | * Doug
73 | *
74 | *
75 | * <h3>Initial Usage</h3>
76 | *
77 | * This class has lots of options to try to make it efficient and flexible.
78 | * The simplest possible usage is as follows. The bold
79 | * fragment is specific to this class.
80 | *
81 | * <pre>
82 | * IndexReader ir = ...
83 | * IndexSearcher is = ...
84 | *
85 | * RelevancyFeedback relevancyFeedback = new RelevancyFeedback(ir);
86 | * Reader target = ... // orig source of doc you want to find similarities to
87 | * Query query = relevancyFeedback.queryFromDocuments( target);
88 | *
89 | * Hits hits = is.search(query);
90 | * // now the usual iteration thru 'hits' - the only thing to watch for is to make sure
91 | * // you ignore the doc if it matches your 'target' document, as it should be similar to itself
92 | * </pre>
93 | *
94 | * Thus you:
95 | * <ol>
96 | * <li> do your normal, Lucene setup for searching,
97 | * <li> create a RelevancyFeedback,
98 | * <li> get the text of the doc you want to find similarities to
99 | * <li> then call one of the queryFromDocuments() calls to generate a similarity query
100 | * <li> call the searcher to find the similar docs
101 | * </ol>
102 | *
103 | * <h3>More Advanced Usage</h3>
104 | *
105 | * You may want to use {@link #setFieldNames setFieldNames(...)} so you can examine
106 | * multiple fields (e.g. body and title) for similarity.
107 | *
108 | * Depending on the size of your index and the size and makeup of your documents you
109 | * may want to call the other set methods to control how the similarity queries are
110 | * generated:
130 | * Changes: Mark Harwood 29/02/04
131 | * Some bugfixing, some refactoring, some optimisation.
132 | * - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
133 | * - bugfix: No significant terms being created for fields with a termvector - because
134 | * was only counting one occurrence per term/field pair in calculations(ie not including frequency info from TermVector)
135 | * - refactor: moved common code into isNoiseWord()
136 | * - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
137 | *
138 | */
139 | public final class RelevancyFeedback {
140 |
141 | /**
142 | * Default mm (minimum should match) value applied to the generated feedback query.
143 | *
144 | * @see #setMm(String)
145 | */
146 | public static final String DEFAULT_MM = "1";
147 |
148 | /**
149 | * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
150 | *
151 | * @see #getMaxNumTokensParsedPerField
152 | */
153 | public static final int DEFAULT_MAX_NUM_TOKENS_PARSED_PER_FIELD = 5000;
154 |
155 | /**
156 | * Ignore terms with less than this frequency in the source doc.
157 | *
158 | * @see #getMinTermFreq
159 | * @see #setMinTermFreq
160 | */
161 | public static final int DEFAULT_MIN_TERM_FREQ = 1;
162 |
163 | /**
164 | * Ignore words which do not occur in at least this many docs.
165 | *
166 | * @see #getMinDocFreq
167 | * @see #setMinDocFreq
168 | */
169 | public static final int DEFAULT_MIN_DOC_FREQ = 5;
170 |
171 | /**
172 | * Ignore words which occur in more than this many docs.
173 | *
174 | * @see #getMaxDocFreq
175 | * @see #setMaxDocFreq
176 | * @see #setMaxDocFreqPct
177 | */
178 | public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
179 |
180 | /**
181 | * Boost terms in query based on score.
182 | *
183 | * @see #isBoost
184 | * @see #setBoost
185 | */
186 | public static final boolean DEFAULT_BOOST = true;
187 |
188 | /**
189 | * Normalize field boosts
190 | *
191 | * @see #isNormalizeFieldBoosts
192 | * @see #setNormalizeFieldBoosts
193 | */
194 | public static final boolean DEFAULT_NORMALIZE_FIELD_BOOSTS = true;
195 |
196 | /**
197 | * Log the term frequency or use the raw frequency?
198 | *
199 | * @see #isLogTf
200 | * @see #setLogTf
201 | */
202 | public static final boolean DEFAULT_IS_LOG_TF = false;
203 |
204 | /**
205 | * Default field names ("contents"). If the field names are set to null, they are looked
206 | * up at runtime from the provided reader.
207 | */
208 | public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"};
209 |
210 | /**
211 | * Ignore words less than this length or if 0 then this has no effect.
212 | *
213 | * @see #getMinWordLen
214 | * @see #setMinWordLen
215 | */
216 | public static final int DEFAULT_MIN_WORD_LENGTH = 0;
217 |
218 | /**
219 | * Ignore words greater than this length or if 0 then this has no effect.
220 | *
221 | * @see #getMaxWordLen
222 | * @see #setMaxWordLen
223 | */
224 | public static final int DEFAULT_MAX_WORD_LENGTH = 0;
225 |
226 | /**
227 | * Default set of stopwords.
228 | * If null means to allow stop words.
229 | *
230 | * @see #setStopWords
231 | * @see #getStopWords
232 | */
233 | public static final Set<?> DEFAULT_STOP_WORDS = null;
234 |
235 | /**
236 | * Current set of stop words.
237 | */
238 | private Set<?> stopWords = DEFAULT_STOP_WORDS;
239 |
240 | /**
241 | * Return a Query with no more than this many terms.
242 | *
243 | * @see org.apache.lucene.search.BooleanQuery#getMaxClauseCount
244 | * @see #getMaxQueryTermsPerField
245 | * @see #setMaxQueryTermsPerField
246 | */
247 | public static final int DEFAULT_MAX_QUERY_TERMS_PER_FIELD = 100;
248 |
249 |
250 | /**
251 | * mm setting for RF query
252 | */
253 | private String mm = null;
254 |
255 | /**
256 | * Analyzer that will be used to parse the doc.
257 | */
258 | private Analyzer analyzer = null;
259 |
260 | /**
261 | * Ignore words less frequent than this.
262 | */
263 | private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
264 |
265 | /**
266 | * Ignore words which do not occur in at least this many docs.
267 | */
268 | private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
269 |
270 | /**
271 | * Ignore words which occur in more than this many docs.
272 | */
273 | private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
274 |
275 | /**
276 | * Should we apply a boost to the Query based on the scores?
277 | */
278 | private boolean boost = DEFAULT_BOOST;
279 |
280 | /**
281 | * Should we normalize the field boosts per field?
282 | */
283 | private boolean normalizeFieldBoosts = DEFAULT_NORMALIZE_FIELD_BOOSTS;
284 |
285 | /**
286 | * Should we log the term frequency or use the raw frequency?
287 | */
288 | private boolean isLogTf = DEFAULT_IS_LOG_TF;
289 |
290 | /**
291 | * Field names we'll analyze.
292 | */
293 | private String[] fieldNames = DEFAULT_FIELD_NAMES;
294 | private String[] matchFieldNames = new String[]{};
295 | private String[] differentFieldNames = new String[]{};
296 |
297 | private String streamHead = null;
298 |
299 | private String[] streamBodyfieldNames = new String[0];
300 | private String[] streamHeadfieldNames = new String[0];
301 |
302 | private HashSet<String> payloadFields = new HashSet<String>();
303 |
304 | private Map<String, Float> boostFields;
305 | private Map<String, Float> streamBoostFields;
306 |
307 |
308 | /**
309 | * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
310 | */
311 | private int maxNumTokensParsedPerField = DEFAULT_MAX_NUM_TOKENS_PARSED_PER_FIELD;
312 |
313 | /**
314 | * Ignore words if less than this len.
315 | */
316 | private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
317 |
318 | /**
319 | * Ignore words if greater than this len.
320 | */
321 | private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
322 |
323 | /**
324 | * Don't return a query longer than this.
325 | */
326 | private int maxQueryTermsPerField = DEFAULT_MAX_QUERY_TERMS_PER_FIELD;
327 |
328 | /**
329 | * For idf() calculations.
330 | */
331 | private TFIDFSimilarity similarity;
332 |
333 | /**
334 | * IndexReader to use
335 | */
336 | private final IndexReader ir;
337 |
338 |
339 |
340 | /**
341 | * Gets the value of the relevancyFeedback.mm parameter (mm for the RF query)
342 | *
343 | * @return - the minimum should match parameter string - follows the normal mm syntax
344 | * @see #setMm(String)
345 | **/
346 | public String getMm() {
347 | return this.mm;
348 | }
349 |
350 | /**
351 | * Sets the text for the relevancyFeedback.mm parameter (mm for the RF query)
352 | *
353 | * @param mm - minimum should match parameter string - follows the normal mm syntax
354 | * @see #getMm()
355 | **/
356 | public void setMm(String mm) {
357 | this.mm = mm;
358 | }
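// Illustrative values only (rf is a hypothetical RelevancyFeedback instance); mm follows
// Solr's standard minimum-should-match syntax:
//   rf.setMm("2");     // at least 2 of the generated query terms must match
//   rf.setMm("75%");   // at least 75% of the generated query terms must match
//   rf.setMm("3<90%"); // all terms required up to 3 terms, 90% of them beyond that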
359 |
360 |
361 | /**
362 | * Multiplicative boost function applied to the RF query
363 | **/
364 | private String boostFn = "";
365 |
366 | /**
367 | * Gets the text for the Multiplicative Boost Function
368 | *
369 | * @return the multiplicative boostFunction used in the RF query
370 | * @see #setBoostFn(String)
371 | **/
372 | public String getBoostFn() {
373 | return boostFn;
374 | }
375 |
376 | /**
377 | * Sets the text for the Multiplicative Boost Function
378 | * @param boostFn the multiplicative boost function used in the RF query
379 | * @see #getBoostFn()
380 | **/
381 | public void setBoostFn(String boostFn) {
382 | this.boostFn = boostFn;
383 | }
384 |
385 | /**
386 | * Constructor requiring an IndexReader.
387 | */
388 | public RelevancyFeedback(IndexReader ir) {
389 | this(ir, new ClassicSimilarity());
390 | }
391 |
392 | public RelevancyFeedback(IndexReader ir, TFIDFSimilarity sim) {
393 | this.ir = ir;
394 | this.similarity = sim;
395 |
396 | }
397 |
398 |
399 | public TFIDFSimilarity getSimilarity() {
400 | return similarity;
401 | }
402 |
403 | public void setSimilarity(TFIDFSimilarity similarity) {
404 | this.similarity = similarity;
405 | }
406 |
407 | /**
408 | * Returns the analyzer that will be used to parse the source doc. No analyzer
409 | * is set by default.
410 | *
411 | * @return the analyzer that will be used to parse the source doc.
412 | */
413 | public Analyzer getAnalyzer() {
414 | return analyzer;
415 | }
416 |
417 | /**
418 | * Sets the analyzer to use. An analyzer is not required for generating a query with the
419 | * {@link #like(List)} method; all other 'like' methods require an analyzer.
420 | *
421 | * @param analyzer the analyzer to use to tokenize text.
422 | */
423 | public void setAnalyzer(Analyzer analyzer) {
424 | this.analyzer = analyzer;
425 | }
426 |
427 | /**
428 | * Returns the frequency below which terms will be ignored in the source doc. The default
429 | * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
430 | *
431 | * @return the frequency below which terms will be ignored in the source doc.
432 | */
433 | public int getMinTermFreq() {
434 | return minTermFreq;
435 | }
436 |
437 | /**
438 | * Sets the frequency below which terms will be ignored in the source doc.
439 | *
440 | * @param minTermFreq the frequency below which terms will be ignored in the source doc.
441 | */
442 | public void setMinTermFreq(int minTermFreq) {
443 | this.minTermFreq = minTermFreq;
444 | }
445 |
446 | /**
447 | * Returns the frequency at which words will be ignored which do not occur in at least this
448 | * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}.
449 | *
450 | * @return the frequency at which words will be ignored which do not occur in at least this
451 | * many docs.
452 | */
453 | public int getMinDocFreq() {
454 | return minDocFreq;
455 | }
456 |
457 | /**
458 | * Sets the frequency at which words will be ignored which do not occur in at least this
459 | * many docs.
460 | *
461 | * @param minDocFreq the frequency at which words will be ignored which do not occur in at
462 | * least this many docs.
463 | */
464 | public void setMinDocFreq(int minDocFreq) {
465 | this.minDocFreq = minDocFreq;
466 | }
467 |
468 | /**
469 | * Returns the maximum document frequency at which words may still appear.
470 | * Words that appear in more than this many docs will be ignored. The default frequency is
471 | * {@link #DEFAULT_MAX_DOC_FREQ}.
472 | *
473 | * @return get the maximum frequency at which words are still allowed,
474 | * words which occur in more docs than this are ignored.
475 | */
476 | public int getMaxDocFreq() {
477 | return maxDocFreq;
478 | }
479 |
480 | /**
481 | * Set the maximum document frequency at which words may still appear. Words that appear
482 | * in more than this many docs will be ignored.
483 | *
484 | * @param maxFreq the maximum count of documents that a term may appear
485 | * in to be still considered relevant
486 | */
487 | public void setMaxDocFreq(int maxFreq) {
488 | this.maxDocFreq = maxFreq;
489 | }
490 |
491 | /**
492 | * Set the maximum percentage of documents in which words may still appear. Words that appear
493 | * in more than this many percent of all docs will be ignored.
494 | *
495 | * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear
496 | * in to be still considered relevant
497 | */
498 | public void setMaxDocFreqPct(int maxPercentage) {
499 | this.maxDocFreq = (int) (maxPercentage * (long) ir.numDocs() / 100); // long math avoids int overflow on very large indexes
500 | }
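// Worked example (hypothetical index size): with ir.numDocs() == 1,000,000,
// setMaxDocFreqPct(5) sets maxDocFreq to 50,000, so any term appearing in more than
// 50,000 documents is ignored as too common to discriminate between documents.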
501 |
502 | /**
503 | * Returns whether to boost terms in query based on "score" or not. The default is
504 | * {@link #DEFAULT_BOOST}.
505 | *
506 | * @return whether to boost terms in query based on "score" or not.
507 | * @see #setBoost
508 | */
509 | public boolean isBoost() {
510 | return boost;
511 | }
512 |
513 | /**
514 | * Sets whether to boost terms in query based on "score" or not.
515 | *
516 | * @param boost true to boost terms in query based on "score", false otherwise.
517 | * @see #isBoost
518 | */
519 | public void setBoost(boolean boost) {
520 | this.boost = boost;
521 | }
522 |
523 | /**
524 | * Returns whether to normalize the size of field level boosts across all field terms.
525 | * The default is {@link #DEFAULT_NORMALIZE_FIELD_BOOSTS}.
526 | *
527 | * @return whether to normalize field boosts to unit length, or not
528 | * @see #setNormalizeFieldBoosts(boolean)
529 | */
530 | public boolean isNormalizeFieldBoosts() {
531 | return normalizeFieldBoosts;
532 | }
533 |
534 | /**
535 | * Sets whether to normalize the size of field level boosts across all field terms or not
536 | *
537 | * @param normalizeFieldBoosts true to normalize field boosts to unit length, false otherwise.
538 | * @see #isNormalizeFieldBoosts
539 | */
540 | public void setNormalizeFieldBoosts(boolean normalizeFieldBoosts) {
541 | this.normalizeFieldBoosts = normalizeFieldBoosts;
542 | }
543 |
544 | /**
545 | * Returns whether to take the logarithm of the term frequency of the fields.
546 | * The default is {@link #DEFAULT_IS_LOG_TF}.
547 | *
548 | * @return whether to take the logarithm of the term frequency or not
549 | * @see #setLogTf(boolean)
550 | */
551 | public boolean isLogTf() {
552 | return isLogTf;
553 | }
554 |
555 | /**
556 | * Sets whether to log the term frequency of the fields
557 | *
558 | * @param isLogTf true to take the logarithm of the term frequency, false to use the raw frequency
559 | * @see #isLogTf
560 | */
561 | public void setLogTf(boolean isLogTf) {
562 | this.isLogTf = isLogTf;
563 | }
564 |
565 | /**
566 | * Returns the field names that will be used when generating the 'More Like This' query.
567 | * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}.
568 | *
569 | * @return the field names that will be used when generating the 'More Like This' query.
570 | */
571 | public String[] getFieldNames() {
572 | if (fieldNames == null) {
573 | // gather list of all valid fields from lucene, if none specified
574 | Collection<String> fields = MultiFields.getIndexedFields(ir);
575 | fieldNames = fields.toArray(new String[fields.size()]);
576 | }
577 |
578 | return fieldNames;
579 | }
580 |
581 | /**
582 | * Returns the field names that must be matched in the target document
583 | *
584 | * @return the field names that must be matched in the target document
585 | */
586 | public String[] getMatchFieldNames() {
587 | return matchFieldNames;
588 | }
589 |
590 | /**
591 | * Returns the field names that must NOT be matched in the target document
592 | *
593 | * @return the field names that must NOT be matched in the target document
594 | */
595 | public String[] getDifferentFieldNames() {
596 | return differentFieldNames;
597 | }
598 |
599 | /**
600 | * Sets the field names that will be used when generating the 'More Like This' query.
601 | * Set this to null for the field names to be determined at runtime from the IndexReader
602 | * provided in the constructor.
603 | *
604 | * @param fieldNames the field names that will be used when generating the 'More Like This'
605 | * query.
606 | */
607 | public void setFieldNames(String[] fieldNames) {
608 | this.fieldNames = fieldNames;
609 | }
610 |
611 | /**
612 | * Sets the field names that must match the target document in the RF query
613 | *
614 | * @param fieldNames the field names that will be used
615 | */
616 | public void setMatchFieldNames(String[] fieldNames) {
617 | this.matchFieldNames = fieldNames;
618 | }
619 |
620 | /**
621 | * Sets the field names that must NOT match the target document in the RF query
622 | *
623 | * @param fieldNames the field names that will be used
624 | */
625 | public void setDifferentFieldNames(String[] fieldNames) {
626 | this.differentFieldNames = fieldNames;
627 | }
628 |
629 | /**
630 | * Returns the field names for processing the stream body.
631 | *
632 | * @return the field names used when parsing terms from the stream.body parameter
633 | */
634 | public String[] getStreamBodyfieldNames() {
635 | if(streamBodyfieldNames.length == 0){
636 | // don't potentially return every field by calling the getter
637 | return fieldNames;
638 | }
639 | return streamBodyfieldNames;
640 | }
641 |
642 | /**
643 | * Sets the field names used for processing the stream body.
644 | *
645 | * @param streamBodyfieldNames the field names used when parsing terms from the stream.body parameter
646 | */
647 | public void setStreamBodyfieldNames(String[] streamBodyfieldNames) {
648 | this.streamBodyfieldNames = streamBodyfieldNames;
649 | }
650 |
651 | /**
652 | * Gets the field names used for processing the stream head.
653 | *
654 | * @return the field names used when parsing terms from the stream.head parameter
655 | */
656 | public String[] getStreamHeadfieldNames() {
657 | if(streamHeadfieldNames.length == 0){
658 | return fieldNames;
659 | }
660 | return streamHeadfieldNames;
661 | }
662 |
663 | /**
664 | * Sets the field names used for processing the stream.head parameter.
665 | *
666 | * @param streamHeadfieldNames the field names used when parsing terms from the stream.head parameter
667 | */
668 | public void setStreamHeadfieldNames(String[] streamHeadfieldNames) {
669 | this.streamHeadfieldNames = streamHeadfieldNames;
670 | }
671 |
672 | /**
673 | * Gets the stream.head value, if specified. This is a string to be parsed if the q parameter is null
674 | * (assumes a document stream as input from stream.body and optionally from stream.head)
675 | *
676 | * @return stream.head value
677 | */
678 | public String getStreamHead() {
679 | return streamHead;
680 | }
681 |
682 | /**
683 | * Sets the stream.head value, if specified. This is a string to be parsed if the q parameter is null
684 | * (assumes a document stream as input from stream.body and optionally from stream.head)
685 | *
686 | * @param streamHead stream.head value
687 | */
688 | public void setStreamHead(String streamHead) {
689 | this.streamHead = streamHead;
690 | }
691 |
692 |
693 | /**
694 | * Returns the minimum word length below which words will be ignored. Set this to 0 for no
695 | * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
696 | *
697 | * @return the minimum word length below which words will be ignored.
698 | */
699 | public int getMinWordLen() {
700 | return minWordLen;
701 | }
702 |
703 | /**
704 | * Sets the minimum word length below which words will be ignored.
705 | *
706 | * @param minWordLen the minimum word length below which words will be ignored.
707 | */
708 | public void setMinWordLen(int minWordLen) {
709 | this.minWordLen = minWordLen;
710 | }
711 |
712 | /**
713 | * Returns the maximum word length above which words will be ignored. Set this to 0 for no
714 | * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.
715 | *
716 | * @return the maximum word length above which words will be ignored.
717 | */
718 | public int getMaxWordLen() {
719 | return maxWordLen;
720 | }
721 |
722 | /**
723 | * Sets the maximum word length above which words will be ignored.
724 | *
725 | * @param maxWordLen the maximum word length above which words will be ignored.
726 | */
727 | public void setMaxWordLen(int maxWordLen) {
728 | this.maxWordLen = maxWordLen;
729 | }
730 |
731 | /**
732 | * Set the set of stopwords.
733 | * Any word in this set is considered "uninteresting" and ignored.
734 | * Even if your Analyzer allows stopwords, you might want to tell the RelevancyFeedback code to ignore them, as
735 | * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
736 | *
737 | * @param stopWords set of stopwords, if null it means to allow stop words
738 | * @see #getStopWords
739 | */
740 | public void setStopWords(Set<?> stopWords) {
741 | this.stopWords = stopWords;
742 | }
743 |
744 | /**
745 | * Get the current stop words being used.
746 | *
747 | * @see #setStopWords
748 | */
749 | public Set<?> getStopWords() {
750 | return stopWords;
751 | }
752 |
753 |
754 | /**
755 | * Returns the maximum number of query terms that will be included in any generated query.
756 | * The default is {@link #DEFAULT_MAX_QUERY_TERMS_PER_FIELD}.
757 | *
758 | * @return the maximum number of query terms that will be included in any generated query.
759 | */
760 | public int getMaxQueryTermsPerField() {
761 | return maxQueryTermsPerField;
762 | }
763 |
764 | /**
765 | * Sets the maximum number of query terms that will be included in any generated query.
766 | *
767 | * @param maxQueryTermsPerField the maximum number of query terms that will be included in any
768 | * generated query.
769 | */
770 | public void setMaxQueryTermsPerField(int maxQueryTermsPerField) {
771 | this.maxQueryTermsPerField = maxQueryTermsPerField;
772 | }
773 |
774 | /**
775 | * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
776 | * @see #DEFAULT_MAX_NUM_TOKENS_PARSED_PER_FIELD
777 | */
778 | public int getMaxNumTokensParsedPerField() {
779 | return maxNumTokensParsedPerField;
780 | }
781 |
782 | /**
783 | * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
784 | */
785 | public void setMaxNumTokensParsedPerField(int i) {
786 | maxNumTokensParsedPerField = i;
787 | }
788 |
789 | /**
790 | * Gets the field level boosts specified in the request
791 | *
792 | * @return The field level boosts specified in the request
793 | */
794 | public Map<String, Float> getBoostFields() {
795 | return this.boostFields;
796 | }
797 |
798 | private float getFieldBoost(String fieldName) {
799 | Float boost = this.boostFields.get(fieldName);
800 | return boost == null? 1.0f: boost;
801 | }
802 |
803 | private float getStreamFieldBoost(String fieldName) {
804 | Float streamBodyBoost = this.streamBoostFields.get(fieldName);
805 | if(streamBodyBoost == null)
806 | {
807 | streamBodyBoost = this.boostFields.get(fieldName);
808 | }
809 | return streamBodyBoost == null? 1.0f: streamBodyBoost;
810 | }
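// Illustrative fallback behaviour (field names and boost values are hypothetical):
//   streamBoostFields = {title=2.0}, boostFields = {title=1.5, skills=3.0}
//   getStreamFieldBoost("title")  -> 2.0f (stream-specific boost takes precedence)
//   getStreamFieldBoost("skills") -> 3.0f (falls back to the regular field boost)
//   getStreamFieldBoost("body")   -> 1.0f (no boost configured anywhere)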
811 |
812 | /**
813 | * Sets the field level boosts
814 | *
815 | * @param boostFields The field level boosts specified in the request
816 | */
817 | public void setBoostFields(Map<String, Float> boostFields) {
818 | this.boostFields = boostFields;
819 | }
820 |
821 | /**
822 | * Sets the field level boosts
823 | *
824 | * @param boostFields The field level boosts specified in the request
825 | */
826 | public void setStreamBoostFields(Map<String, Float> boostFields) {
827 | this.streamBoostFields = boostFields;
828 | }
829 |
830 | /**
831 | * Gets the payload fields, if specified
832 | *
833 | * @return array of payload fields
834 | */
835 | public String[] getPayloadFields() {
836 | String[] arr = new String[this.payloadFields.size()];
837 | return this.payloadFields.toArray(arr);
838 | }
839 |
840 | /**
841 | * Sets the payload fields. These fields use the stored payload value to apply a multiplicative boost to the term values
842 | *
843 | * @param payloadFields the array of payload field names
844 | */
845 | public void setPayloadFields(String[] payloadFields) {
846 | if(payloadFields == null) {
847 | return;
848 | }
849 | for(String fieldname: payloadFields){
850 | this.payloadFields.add(fieldname.trim().toLowerCase());
851 | }
852 | }
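// Note: names are trimmed and lower-cased here, and isPayloadField(String) applies the
// same normalisation, so the lookup is effectively case-insensitive. Hypothetical example:
// setPayloadFields(new String[]{" Skills "}) marks the field "skills" as payload-boosted.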
853 |
854 | /**
855 | * Return a query that will return docs like the passed lucene document IDs.
856 | *
857 | * @param docNums the document IDs of the lucene docs to generate the 'More Like This' query for.
858 | * @return a query that will return docs like the passed lucene document IDs.
859 | */
860 | public RFQuery like(List<Integer> docNums) throws IOException {
861 |
862 | Map<String, Map<String, Flt>> fieldTermFreq = new HashMap<String, Map<String, Flt>>();
863 | Map<String, Map<String, Flt>> mustMatchTerms = new HashMap<String, Map<String, Flt>>();
864 | Map<String, Map<String, Flt>> mustNOTMatchTerms = new HashMap<String, Map<String, Flt>>();
865 | // note: callers are expected to pass distinct document ids; duplicates are not filtered here
866 | for(Integer docNum: docNums){
867 | retrieveTerms(docNum, getFieldNames(), fieldTermFreq);
868 | retrieveTerms(docNum, getMatchFieldNames(), mustMatchTerms);
869 | retrieveTerms(docNum, getDifferentFieldNames(), mustNOTMatchTerms);
870 | }
871 |
872 | RFQuery rfResult = buildQueryFromFieldTermFrequencies(fieldTermFreq, false);
873 | if(mustMatchTerms.size() > 0){
874 | rfResult.setMustMatchQuery(buildMustMatchQuery(mustMatchTerms, true));
875 | }
876 | if(mustNOTMatchTerms.size() > 0){
877 | rfResult.setMustNOTMatchQuery(buildMustMatchQuery(mustNOTMatchTerms, false));
878 | }
879 | return rfResult;
880 | }
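// Minimal usage sketch (variable names and field names are hypothetical):
//   RelevancyFeedback rf = new RelevancyFeedback(indexReader);
//   rf.setAnalyzer(analyzer); // needed only for fields stored without term vectors
//   rf.setFieldNames(new String[]{"title", "skills"});
//   RFQuery rfQuery = rf.like(Arrays.asList(42, 99)); // seed from two lucene doc ids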
881 |
882 | /**
883 | * Return a query that will return docs like the passed Reader.
884 | *
885 | * @param reader a stream reader for the document stream (from the stream.body parameter)
886 | * @return a query that will return docs like the passed Reader.
887 | */
888 | public RFQuery like(Reader reader) throws IOException {
889 |
890 | return like(getStreamHeadfieldNames(), getStreamBodyfieldNames(), reader);
891 | }
892 |
893 | private RFQuery like(String[] streamHeadfields, String[] streamBodyfields, Reader reader) throws IOException {
894 |
895 | if(streamBodyfields == null){
896 | throw new UnsupportedOperationException(
897 | String.format("To use RelevancyFeedback to process a document stream, a field list must be specified "
898 | + "using either the %s parameter or the %s parameter",
899 | RFParams.SIMILARITY_FIELDS, RFParams.STREAM_BODY_FL));
900 | }
901 |
902 | Map<String, Map<String, Flt>> fieldTermFreq = new HashMap<String, Map<String, Flt>>();
903 | String streamBody = org.apache.commons.io.IOUtils.toString(reader);
904 | for(String fieldName: streamBodyfields){
905 | Map<String, Flt> words = new HashMap<String, Flt>();
906 | fieldTermFreq.put(fieldName, words);
907 | addTermWeights(new StringReader(streamBody), words, fieldName);
908 | }
909 | if(getStreamHead() != null){
910 | if(streamHeadfields == null){
911 | throw new UnsupportedOperationException(
912 | String.format("To use RelevancyFeedback to process a document stream using the stream.head as input,"
913 | +"a field list must be specified using either the %s parameter or the %s parameter",
914 | RFParams.SIMILARITY_FIELDS, RFParams.STREAM_HEAD_FL));
915 | }
916 | for(String fieldName: streamHeadfields){
917 | Map<String, Flt> words = null;
918 | if(fieldTermFreq.containsKey(fieldName)) {
919 | words = fieldTermFreq.get(fieldName);
920 | }
921 | else{
922 | words = new HashMap<String, Flt>();
923 | fieldTermFreq.put(fieldName, words);
924 | }
925 | addTermWeights(new StringReader(getStreamHead()), words, fieldName);
926 | }
927 | }
928 | return buildQueryFromFieldTermFrequencies(fieldTermFreq, true);
929 | }
930 |
931 | private RFQuery buildQueryFromFieldTermFrequencies(Map<String, Map<String, Flt>> fieldTermFreq, boolean contentStreamQuery) throws IOException {
932 |
933 | List<RFTerm> interestingTerms = new ArrayList<RFTerm>();
934 | for(String fieldName: fieldTermFreq.keySet()){
935 | Map<String, Flt> words = fieldTermFreq.get(fieldName);
936 | PriorityQueue<RFTerm> queue = createQueue(fieldName, words, contentStreamQuery);
937 | interestingTerms.addAll(getMostInterestingTerms(queue));
938 | }
939 |
940 | RFQuery rfResult = new RFQuery(interestingTerms, getMm());
941 | return rfResult;
942 | }
943 |
944 | /**
945 | * Compute the top most interesting terms from the priority queue of all RF Terms
946 | */
947 | private List<RFTerm> getMostInterestingTerms(PriorityQueue<RFTerm> q) {
948 |
949 | int maxTerms = (maxQueryTermsPerField <= 0) ? Integer.MAX_VALUE : maxQueryTermsPerField;
950 | double sumSquaredBoost = 0.0f;
951 |
952 | List<RFTerm> interestingTerms = new ArrayList<RFTerm>();
953 | RFTerm currentTerm = null;
954 | while ((currentTerm = q.pop()) != null
955 | && interestingTerms.size() < maxTerms) {
956 | // if not boost, then set score to 1.0 not tf.idf
957 | // now implemented inside RFTerm
958 |
959 | // if not boost, boostValue == 1.0, so this just adds 1 as desired
960 | sumSquaredBoost += Math.pow(currentTerm.getTermWeight(), 2);
961 | interestingTerms.add(currentTerm);
962 | }
963 |
964 | float vectorLength = (float) Math.sqrt(sumSquaredBoost);
965 | if(vectorLength <= 0.0){
966 | return new ArrayList<RFTerm>();
967 | }
968 |
969 | if(this.isNormalizeFieldBoosts()){
970 | for(RFTerm term: interestingTerms){
971 | term.setVectorLength(vectorLength);
972 | }
973 | }
974 | return interestingTerms;
975 | }
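// Normalisation sketch: if a field's terms carry weights {3.0, 4.0}, the vector length is
// sqrt(3^2 + 4^2) = 5.0; with normalizeFieldBoosts enabled each RFTerm is handed that
// length so its weight can be scaled to the unit vector ({0.6, 0.8} here), preventing
// fields with many terms from dominating fields with few.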
976 |
977 | /**
978 | * Create a PriorityQueue from a word->tf map.
979 | *
980 | * @param words a map of words keyed on the word (String) with Flt objects as the values.
981 | */
982 | private PriorityQueue<RFTerm> createQueue(String fieldName, Map<String, Flt> words, boolean contentStreamQuery) throws IOException {
983 | // have collected all words in doc and their freqs
984 | int numDocs = ir.numDocs();
985 | FreqQ res = new FreqQ(words.size()); // will order words by score
986 |
987 | for (String word : words.keySet()) { // for every word
988 | if(word.trim().length() == 0)
989 | {
990 | continue;
991 | }
992 |
993 | float tf = words.get(word).x; // term freq in the source doc
994 |
995 | if (minTermFreq > 0 && tf < minTermFreq) {
996 | continue; // filter out words that don't occur enough times in the source
997 | }
998 |
999 | int docFreq = ir.docFreq(new Term(fieldName, word));
1000 | if (minDocFreq > 0 && docFreq < minDocFreq) {
1001 | continue; // filter out words that don't occur in enough docs
1002 | }
1003 |
1005 | if (docFreq > maxDocFreq) {
1006 | continue; // filter out words that occur in too many docs
1007 | }
1008 |
1009 | float idf = similarity.idf(docFreq, numDocs);
1010 | final float fieldBoost = contentStreamQuery? this.getStreamFieldBoost(fieldName): this.getFieldBoost(fieldName);
1011 | final RFTerm rfTerm;
1012 | if(isPayloadField(fieldName)){
1013 | rfTerm = new RFTerm(
1014 | word, // the word
1015 | fieldName, // the field name
1016 | tf, // tf
1017 | idf, // idf
1018 | docFreq, // freq in all docs
1019 | isLogTf(),
1020 | fieldBoost,
1021 | tf, // this is the payload score if a payload field. Code could better reflect this admittedly
1022 | this.boost,
1023 | true
1024 | );
1025 | }
1026 | else{
1027 | rfTerm = new RFTerm(
1028 | word, // the word
1029 | fieldName, // the field name
1030 | tf, // tf
1031 | idf, // idf
1032 | docFreq, // freq in all docs
1033 | this.isLogTf(),
1034 | fieldBoost,
1035 | this.boost
1036 | );
1037 | }
1038 | res.insertWithOverflow(rfTerm);
1039 | }
1040 | return res;
1041 | }
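// Filter walk-through (hypothetical numbers): for the word "hadoop" with tf=2 in the
// source doc, docFreq=40 in a 1,000-doc index, and the defaults minTermFreq=1,
// minDocFreq=5, maxDocFreq=Integer.MAX_VALUE, every filter passes and the term is
// queued with a weight derived from tf and idf = similarity.idf(40, 1000) (see RFTerm).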
1042 |
1043 | private BooleanQuery buildMustMatchQuery(Map<String, Map<String, Flt>> fieldValues, boolean mustMatch){
1044 | BooleanQuery.Builder qryBuilder = new BooleanQuery.Builder();
1045 | for(Map.Entry<String, Map<String, Flt>> entry: fieldValues.entrySet()){
1046 | String fieldName = entry.getKey();
1047 | for(Map.Entry<String, Flt> fieldValue: entry.getValue().entrySet()){
1048 | String value = fieldValue.getKey();
1049 | TermQuery tq = new TermQuery(new Term(fieldName, value));
1050 | if(mustMatch) {
1051 | qryBuilder.add(tq, BooleanClause.Occur.MUST);
1052 | }
1053 | else{
1054 | qryBuilder.add(tq, BooleanClause.Occur.MUST_NOT);
1055 | }
1056 | }
1057 | }
1058 | return qryBuilder.build();
1059 | }
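// Example result (hypothetical field/value): for fieldValues = {country={US}},
// mustMatch=true builds the BooleanQuery "+country:US" while mustMatch=false builds
// "-country:US", forcing candidates to share (or differ in) that field value.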
1060 |
1061 | /**
1062 | * Describe the parameters that control how the "more like this" query is formed.
1063 | */
1064 | public String describeParams() {
1065 | StringBuilder sb = new StringBuilder();
1066 | sb.append("\t").append("maxQueryTermsPerField : ").append(maxQueryTermsPerField).append("\n");
1067 | sb.append("\t").append("minWordLen : ").append(minWordLen).append("\n");
1068 | sb.append("\t").append("maxWordLen : ").append(maxWordLen).append("\n");
1069 | sb.append("\t").append("fieldNames : ");
1070 | String delim = "";
1071 | for (String fieldName : getFieldNames()) {
1072 | sb.append(delim).append(fieldName);
1073 | delim = ", ";
1074 | }
1075 | sb.append("\n");
1076 | sb.append("\t").append("boost : ").append(boost).append("\n");
1077 | sb.append("\t").append("minTermFreq : ").append(minTermFreq).append("\n");
1078 | sb.append("\t").append("minDocFreq : ").append(minDocFreq).append("\n");
1079 | return sb.toString();
1080 | }
1081 |
1082 | /**
1083 | * Find words for a more-like-this query former.
1084 | *
1085 | * @param docNum the id of the lucene document from which to find terms
1086 | * @param fields the list of field of the lucene document from which to extract terms
1087 | * @param fieldToTermFreqMap data structure to populate with term frequencies
1088 | */
1089 | public Map<String, Map<String, Flt>> retrieveTerms(int docNum, String[] fields, Map<String, Map<String, Flt>> fieldToTermFreqMap) throws IOException {
1090 |
1091 | if(fieldToTermFreqMap == null) {
1092 | fieldToTermFreqMap = new HashMap<String, Map<String, Flt>>();
1093 | }
1094 |
1095 | if(fields == null || fields.length == 0){
1096 | return fieldToTermFreqMap;
1097 | }
1098 |
1099 | final Fields vectors = ir.getTermVectors(docNum);
1100 | final Document document = ir.document(docNum);
1101 |
1102 | for (String fieldName : fields) {
1103 |
1104 | Map<String, Flt> termFreqMap = null;
1105 | if(fieldToTermFreqMap.containsKey(fieldName)){
1106 | termFreqMap = fieldToTermFreqMap.get(fieldName);
1107 | }
1108 | else{
1109 | termFreqMap = new HashMap<String, Flt>();
1110 | fieldToTermFreqMap.put(fieldName, termFreqMap);
1111 | }
1112 |
1113 | Terms vector = null;
1114 | if (vectors != null) {
1115 | vector = vectors.terms(fieldName);
1116 | }
1117 |
1118 | // field does not store term vector info
1119 | // even if term vectors enabled, need to extract payload from regular field reader
1120 | if (vector == null || isPayloadField(fieldName)) {
1121 | IndexableField docFields[] = document.getFields(fieldName);
1122 | for (IndexableField field : docFields) {
1123 | final String stringValue = field.stringValue();
1124 | if (stringValue != null) {
1125 | addTermWeights(new StringReader(stringValue), termFreqMap, fieldName);
1126 | }
1127 | }
1128 | } else {
1129 | addTermWeights(termFreqMap, vector);
1130 | }
1131 | }
1132 |
1133 | return fieldToTermFreqMap;
1134 | }
1135 |
1136 | /**
1137 | * Adds terms and frequencies found in vector into the Map termWeightMap
1138 | *
1139 | * @param termWeightMap a Map of terms and their weights
1140 | * @param vector List of terms and their weights for a doc/field
1141 | */
1142 | private void addTermWeights(Map<String, Flt> termWeightMap, Terms vector) throws IOException {
1143 | final TermsEnum termsEnum = vector.iterator();
1144 | CharsRefBuilder spare = new CharsRefBuilder();
1145 | BytesRef text;
1146 | while((text = termsEnum.next()) != null) {
1147 | spare.copyUTF8Bytes(text);
1148 | final String term = spare.toString();
1149 | if (isNoiseWord(term)) {
1150 | continue;
1151 | }
1152 | final int freq = (int) termsEnum.totalTermFreq();
1153 |
1154 | //TODO try this
1155 | //termsEnum.docsAndPositions(.....).getPayload()
1156 |
1157 | // increment frequency
1158 | Flt cnt = termWeightMap.get(term);
1159 | if (cnt == null) {
1160 | termWeightMap.put(term, new Flt(freq));
1161 | } else {
1162 | cnt.x += freq;
1163 | }
1164 | }
1165 | }
1166 |
1167 | /**
1168 | * Adds term weights found by tokenizing text from reader into the Map words
1169 | *
1170 | * @param reader a source of text to be tokenized
1171 | * @param termWeightMap a Map of terms and their weights
1172 | * @param fieldName Used by analyzer for any special per-field analysis
1173 | */
1174 | private void addTermWeights(Reader reader, Map<String, Flt> termWeightMap, String fieldName)
1175 | throws IOException {
1176 | if (analyzer == null) {
1177 | throw new UnsupportedOperationException("To use RelevancyFeedback without " +
1178 | "term vectors, you must provide an Analyzer");
1179 | }
1180 |
1181 | TokenStream ts = analyzer.tokenStream(fieldName, reader);
1182 | try {
1183 | int tokenCount = 0;
1184 | // for every token
1185 | CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
1186 | PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
1187 |
1188 | ts.reset();
1189 | while (ts.incrementToken()) {
1190 | String word = termAtt.toString();
1191 | tokenCount++;
1192 | if (tokenCount > maxNumTokensParsedPerField) {
1193 | break;
1194 | }
1195 | if(word.trim().length() == 0){
1196 | continue;
1197 | }
1198 | if (isNoiseWord(word)) {
1199 | continue;
1200 | }
1201 |
1202 | BytesRef payload = payloadAttr.getPayload();
1203 | float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
1204 | if(isPayloadField(fieldName) && payload != null){
1205 | tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
1206 | }
1207 | // increment frequency
1208 | Flt termWeight = termWeightMap.get(word);
1209 | if (termWeight == null) {
1210 | termWeightMap.put(word, new Flt(tokenWeight));
1211 | } else {
1212 | termWeight.x += tokenWeight;
1213 | }
1214 | }
1215 | ts.end();
1216 | } finally {
1217 | IOUtils.closeWhileHandlingException(ts);
1218 | }
1219 | }
1220 |
1221 | /**
1222 | * determines if the passed term is likely to be of interest in "more like this" comparisons
1223 | *
1224 | * @param term The word being considered
1225 | * @return true if should be ignored, false if should be used in further analysis
1226 | */
1227 | private boolean isNoiseWord(String term) {
1228 | int len = term.length();
1229 | if (minWordLen > 0 && len < minWordLen) {
1230 | return true;
1231 | }
1232 | if (maxWordLen > 0 && len > maxWordLen) {
1233 | return true;
1234 | }
1235 | return stopWords != null && stopWords.contains(term);
1236 | }
1237 |
1238 | private boolean isPayloadField(String fieldName){
1239 | return this.payloadFields.contains(fieldName.trim().toLowerCase());
1240 | }
1241 |
1242 | /**
1243 | * PriorityQueue that orders words by score.
1244 | */
1245 | private static class FreqQ extends PriorityQueue<RFTerm> {
1246 | FreqQ(int s) {
1247 | super(s);
1248 | }
1249 |
1250 | @Override
1251 | protected boolean lessThan(RFTerm aa, RFTerm bb) {
1252 | return aa.getFinalScore() > bb.getFinalScore();
1253 | }
1254 | }
1255 |
1256 | /**
1257 | * Used for term frequencies/weights; avoids repeatedly allocating boxed values.
1258 | */
1259 |
1260 | private static class Flt {
1261 | float x;
1262 |
1263 | Flt(float x) {
1264 | this.x = x;
1265 | }
1266 | }
1267 | }
--------------------------------------------------------------------------------
/src/main/java/org/dice/solrenhancements/relevancyfeedback/RelevancyFeedbackHandler.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.dice.solrenhancements.relevancyfeedback;
19 |
20 | import com.google.common.base.Strings;
21 | import org.apache.lucene.search.Query;
22 | import org.apache.solr.common.SolrException;
23 | import org.apache.solr.common.params.*;
24 | import org.apache.solr.common.util.ContentStream;
25 | import org.apache.solr.common.util.NamedList;
26 | import org.apache.solr.handler.RequestHandlerBase;
27 | import org.apache.solr.handler.component.FacetComponent;
28 | import org.apache.solr.request.SimpleFacets;
29 | import org.apache.solr.request.SolrQueryRequest;
30 | import org.apache.solr.response.SolrQueryResponse;
31 | import org.apache.solr.schema.SchemaField;
32 | import org.apache.solr.search.*;
33 | import org.apache.solr.util.SolrPluginUtils;
34 | import org.dice.solrenhancements.JarVersion;
35 |
36 | import org.slf4j.Logger;
37 | import org.slf4j.LoggerFactory;
38 |
39 | import java.io.IOException;
40 | import java.io.Reader;
41 | import java.net.MalformedURLException;
42 | import java.net.URL;
43 | import java.util.*;
44 |
45 | /**
46 | * Solr RelevancyFeedback --
47 | *
48 | * Return similar documents either based on a single document or based on posted text.
49 | *
50 | * @since solr 1.3
51 | */
52 | public class RelevancyFeedbackHandler extends RequestHandlerBase
53 | {
54 | private final static String EDISMAX = ExtendedDismaxQParserPlugin.NAME;
55 | private String version = null;
56 |
57 | private static final Logger log = LoggerFactory.getLogger( RelevancyFeedbackHandler.class );
58 |
59 |
60 | @Override
61 | public void init(NamedList args) {
62 | super.init(args);
63 | }
64 |
65 | @Override
66 | public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception
67 | {
68 | // set and override parameters
69 | SolrIndexSearcher searcher = req.getSearcher();
70 | SchemaField uniqueKeyField = searcher.getSchema().getUniqueKeyField();
71 | ModifiableSolrParams params = new ModifiableSolrParams(req.getParams());
72 | configureSolrParameters(req, params, uniqueKeyField.getName());
73 |
74 | // Set field flags
75 | ReturnFields returnFields = new SolrReturnFields( req );
76 | rsp.setReturnFields( returnFields );
77 | int flags = 0;
78 | if (returnFields.wantsScore()) {
79 | flags |= SolrIndexSearcher.GET_SCORES;
80 | }
81 | // note: set in configureSolrParameters
82 | String userQdefType = params.get(QueryParsing.DEFTYPE, EDISMAX);
83 | String rfDefType = params.get(RFParams.RF_DEFTYPE, EDISMAX);
84 |
85 | String userQ = params.get( CommonParams.Q );
86 | String rfQ = params.get(RFParams.RF_QUERY);
87 |
88 | Query rfQuery = null;
89 | Query userQuery = null;
90 |
91 | SortSpec sortSpec = null;
92 | QParser rfQueryParser = null;
93 | QParser userQueryParser = null;
94 |
95 | List<Query> targetFqFilters = null;
96 | List<Query> rfFqFilters = null;
97 |
98 | try {
99 | if (rfQ != null) {
100 | rfQueryParser = QParser.getParser(rfQ, rfDefType, req);
101 | rfQuery = rfQueryParser.getQuery();
102 | sortSpec = rfQueryParser.getSort(true);
103 | }
104 | else{
105 | rfQueryParser = QParser.getParser(null, rfDefType, req);
106 | sortSpec = rfQueryParser.getSort(true);
107 | }
108 |
109 | targetFqFilters = getFilters(req, CommonParams.FQ);
110 | rfFqFilters = getFilters(req, RFParams.FQ);
111 | } catch (SyntaxError e) {
112 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
113 | }
114 |
115 | try {
116 | if (userQ != null) {
117 | userQueryParser = QParser.getParser(userQ, userQdefType, req);
118 | userQuery = userQueryParser.getQuery();
119 | }
120 |
121 | } catch (SyntaxError e) {
122 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
123 | }
124 |
125 | RFHelper rfhelper = new RFHelper( params, searcher, uniqueKeyField, rfQueryParser );
126 |
127 | // Hold on to the interesting terms if relevant
128 | RFParams.TermStyle termStyle = RFParams.TermStyle.get(params.get(RFParams.INTERESTING_TERMS));
129 |
130 | RFResult RFResult = null;
131 | DocListAndSet rfDocs = null;
132 |
133 | // Parse Required Params
134 | // This will either have a single Reader or valid query
135 | Reader reader = null;
136 | try {
137 | int start = params.getInt(CommonParams.START, 0);
138 | int rows = params.getInt(CommonParams.ROWS, 10);
139 |
140 | // for use when passed a content stream
141 | if (rfQ == null || rfQ.trim().length() < 1) {
142 | reader = getContentStreamReader(req, reader);
143 | }
144 | // Find documents RelevancyFeedback - either with a reader or a query
145 | // --------------------------------------------------------------------------------
146 | if (reader != null) {
147 | // this will only be initialized if used with a content stream (see above)
148 | rfQ = "NULL - from content stream";
149 | RFResult = rfhelper.getMatchesFromContentSteam(reader, start, rows, rfFqFilters, flags, sortSpec.getSort(), userQuery);
150 | } else if (rfQ != null) {
151 | // Matching options
152 | RFResult = getMatchesFromQuery(rsp, params, flags, rfQ, rfQuery, userQuery, sortSpec,
153 | targetFqFilters, rfFqFilters, searcher, rfhelper, start, rows);
154 | } else {
155 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
156 | "RelevancyFeedback requires either a query (?rf.q=) or text (using stream.head and stream.body fields in a POST) to find similar documents.");
157 | }
158 | if(RFResult != null)
159 | {
160 | rfDocs = RFResult.getResults();
161 | }
162 |
163 | } finally {
164 | if (reader != null) {
165 | reader.close();
166 | }
167 | }
168 |
169 | if( rfDocs == null ) {
170 | rfDocs = new DocListAndSet(); // avoid NPE
171 | }
172 | rsp.add( "response", rfDocs.docList );
173 | if(RFResult != null && RFResult.getQuery() != null) {
174 | rsp.add(RFParams.PREFIX + "query:", RFResult.getQuery().toString());
175 | }
176 |
177 | if( RFResult != null && termStyle != RFParams.TermStyle.NONE) {
178 | addInterestingTerms(rsp, termStyle, RFResult);
179 | }
180 |
181 | // maybe facet the results
182 | if (params.getBool(FacetParams.FACET,false)) {
183 | addFacet(req, rsp, params, rfDocs);
184 | }
185 |
186 | addDebugInfo(req, rsp, rfQ, rfFqFilters, rfhelper, RFResult, rfDocs);
187 | }
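// Illustrative requests (the handler path is an assumption; rf.q, stream.head and
// stream.body are the parameter names referenced by this handler's error messages):
//   GET  /solr/<core>/rf?rf.q=id:12345&rows=10         -> seed from an indexed document
//   POST /solr/<core>/rf?rows=10 with a content stream -> seed from posted text
//        (terms are read from stream.body, and optionally stream.head)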
188 |
189 | private void configureSolrParameters(SolrQueryRequest req, ModifiableSolrParams params, String uniqueKeyField){
190 |
191 | // default to the edismax parser
192 | String defType = params.get(QueryParsing.DEFTYPE, EDISMAX);
193 | // allow usage of custom edismax implementations, such as our own
194 | if(defType.toLowerCase().contains(EDISMAX.toLowerCase())){
195 | params.set(DisMaxParams.MM, 0);
196 | // edismax blows up without df field, even if you specify the field to match on in the query
197 | params.set(CommonParams.DF, uniqueKeyField);
198 | }
199 | params.set(QueryParsing.DEFTYPE, defType);
200 | req.setParams(params);
201 | }
202 |
203 | private Reader getContentStreamReader(SolrQueryRequest req, Reader reader) throws IOException {
204 | Iterable<ContentStream> streams = req.getContentStreams();
205 | if (streams != null) {
206 | Iterator<ContentStream> iter = streams.iterator();
207 | if (iter.hasNext()) {
208 | reader = iter.next().getReader();
209 | }
210 | if (iter.hasNext()) {
211 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
212 | "RelevancyFeedback does not support multiple ContentStreams");
213 | }
214 | }
215 | return reader;
216 | }
217 |
218 | private RFResult getMatchesFromQuery(SolrQueryResponse rsp, SolrParams params, int flags, String q, Query query, Query userQuery, SortSpec sortSpec, List<Query> targetFqFilters, List<Query> rfFqFilters, SolrIndexSearcher searcher, RFHelper rfHelper, int start, int rows) throws IOException, SyntaxError {
219 |
220 | boolean includeMatch = params.getBool(RFParams.MATCH_INCLUDE, true);
221 | int matchOffset = params.getInt(RFParams.MATCH_OFFSET, 0);
222 | // Find the base match
223 | DocList match = searcher.getDocList(query, targetFqFilters, null, matchOffset, 10000, flags); // only get the first one...
224 | if(match.matches() == 0 && userQuery == null){
225 | throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
226 | String.format("RelevancyFeedback was unable to find any documents matching the query: '%s'.", q));
227 | }
228 |
229 | if (includeMatch) {
230 | rsp.add("match", match);
231 | }
232 |
233 | // This is an iterator, but we only handle the first match
234 | DocIterator iterator = match.iterator();
235 | if (iterator.hasNext() || userQuery != null) {
236 | // do a RelevancyFeedback query for each document in results
237 | return rfHelper.getMatchesFromDocs(iterator, start, rows, rfFqFilters, flags, sortSpec.getSort(), userQuery);
238 | }
239 | return null;
240 | }
241 |
242 | private List<InterestingTerm> extractInterestingTerms(List<RFTerm> RFTerms){
243 | List<InterestingTerm> terms = new ArrayList<InterestingTerm>();
244 | for( RFTerm term : RFTerms) {
245 | InterestingTerm it = new InterestingTerm();
246 | it.term = term.getTerm();
247 | it.boost = term.getFinalScore();
248 | terms.add(it);
249 | }
250 | Collections.sort(terms, InterestingTerm.BOOST_ORDER);
251 | return terms;
252 | }
253 |
254 | private void addInterestingTerms(SolrQueryResponse rsp, RFParams.TermStyle termStyle, RFResult RFResult) {
255 |
256 | List<RFTerm> RFTerms = RFResult.getRFTerms();
257 | Collections.sort(RFTerms, RFTerm.FLD_BOOST_X_SCORE_ORDER);
258 |
259 | if( termStyle == RFParams.TermStyle.DETAILS ) {
260 | List<InterestingTerm> interesting = extractInterestingTerms(RFResult.getRFTerms());
261 |
262 | int longest = 0;
263 | for( InterestingTerm t : interesting ) {
264 | longest = Math.max(t.term.toString().length(), longest);
265 | }
266 |
267 | NamedList<Float> it = new NamedList<Float>();
268 | for( InterestingTerm t : interesting ) {
269 | it.add( Strings.padEnd(t.term.toString(), longest, ' '), t.boost );
270 | }
271 | rsp.add( "interestingTerms", it );
272 | }
273 | else {
274 | List<String> it = new ArrayList<String>( RFTerms.size() );
275 | for( RFTerm RFTerm : RFTerms) {
276 | it.add(RFTerm.getWord());
277 | }
278 | rsp.add( "interestingTerms", it );
279 | }
280 | }
281 |
282 | private void addFacet(SolrQueryRequest req, SolrQueryResponse rsp, SolrParams params, DocListAndSet rfDocs) {
283 | if( rfDocs.docSet == null ) {
284 | rsp.add( "facet_counts", null );
285 | }
286 | else {
287 | FacetComponent fct = new FacetComponent();
288 | rsp.add( "facet_counts", fct.getFacetCounts(new SimpleFacets(req, rfDocs.docSet, params )) );
289 | }
290 | }
291 |
292 | private void addDebugInfo(SolrQueryRequest req, SolrQueryResponse rsp, String q, List<Query> rfFqFilters, RFHelper rfHelper, RFResult RFResult, DocListAndSet rfDocs) {
293 |
294 | boolean dbg = req.getParams().getBool(CommonParams.DEBUG_QUERY, false);
295 | boolean dbgQuery = false, dbgResults = false;
296 | if (dbg == false) { // if it's true, we are doing everything anyway.
297 | String[] dbgParams = req.getParams().getParams(CommonParams.DEBUG);
298 | if (dbgParams != null) {
299 | for (int i = 0; i < dbgParams.length; i++) {
300 | if (dbgParams[i].equals(CommonParams.QUERY)){
301 | dbgQuery = true;
302 | } else if (dbgParams[i].equals(CommonParams.RESULTS)){
303 | dbgResults = true;
304 | }
305 | }
306 | }
307 | } else {
308 | dbgQuery = true;
309 | dbgResults = true;
310 | }
311 | // Copied from StandardRequestHandler... perhaps it should be added to doStandardDebug?
312 | if (dbg == true && RFResult != null) {
313 | try {
314 |
315 | NamedList it = getRFTermsForDebug(RFResult);
316 |
317 | NamedList