├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── pom.xml
└── src
├── main
├── assemblies
│ └── plugin.xml
├── java
│ └── org
│ │ └── elasticsearch
│ │ ├── index
│ │ └── analysis
│ │ │ ├── URLPart.java
│ │ │ ├── URLPartComparator.java
│ │ │ ├── URLTokenFilterFactory.java
│ │ │ ├── URLTokenizerFactory.java
│ │ │ └── url
│ │ │ ├── Token.java
│ │ │ ├── URLTokenFilter.java
│ │ │ ├── URLTokenizer.java
│ │ │ └── URLUtils.java
│ │ └── plugin
│ │ └── analysis
│ │ └── AnalysisURLPlugin.java
└── resources
│ └── plugin-descriptor.properties
└── test
├── java
└── org
│ └── elasticsearch
│ └── index
│ └── analysis
│ └── url
│ ├── IsTokenStreamWithTokenAndPosition.java
│ ├── IsTokenizerWithToken.java
│ ├── OptionalMatchers.java
│ ├── URLAnalysisTestCase.java
│ ├── URLTokenFilterIntegrationTest.java
│ ├── URLTokenFilterTest.java
│ ├── URLTokenizerIntegrationTest.java
│ ├── URLTokenizerTest.java
│ └── URLUtilsTest.java
└── resources
├── log4j.properties
├── test-mapping.json
└── test-settings.json
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.gitignore.io
2 |
3 | ### Elasticsearch ###
4 | /data
5 |
6 | ### Java ###
7 | *.class
8 |
9 | # Mobile Tools for Java (J2ME)
10 | .mtj.tmp/
11 |
12 | # Package Files #
13 | *.jar
14 | *.war
15 | *.ear
16 |
17 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
18 | hs_err_pid*
19 |
20 |
21 | ### Maven ###
22 | target/
23 | pom.xml.tag
24 | pom.xml.releaseBackup
25 | pom.xml.versionsBackup
26 | pom.xml.next
27 | release.properties
28 |
29 |
30 | ### Intellij ###
31 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm
32 |
33 | *.iml
34 |
35 | ## Directory-based project format:
36 | .idea/
37 | # if you remove the above rule, at least ignore the following:
38 |
39 | # User-specific stuff:
40 | # .idea/workspace.xml
41 | # .idea/tasks.xml
42 | # .idea/dictionaries
43 |
44 | # Sensitive or high-churn files:
45 | # .idea/dataSources.ids
46 | # .idea/dataSources.xml
47 | # .idea/sqlDataSources.xml
48 | # .idea/dynamic.xml
49 | # .idea/uiDesigner.xml
50 |
51 | # Gradle:
52 | # .idea/gradle.xml
53 | # .idea/libraries
54 |
55 | # Mongo Explorer plugin:
56 | # .idea/mongoSettings.xml
57 |
58 | ## File-based project format:
59 | *.ipr
60 | *.iws
61 |
62 | ## Plugin-specific files:
63 |
64 | # IntelliJ
65 | out/
66 |
67 | # mpeltonen/sbt-idea plugin
68 | .idea_modules/
69 |
70 | # JIRA plugin
71 | atlassian-ide-plugin.xml
72 |
73 | # Crashlytics plugin (for Android Studio and IntelliJ)
74 | com_crashlytics_export_strings.xml
75 | crashlytics.properties
76 | crashlytics-build.properties
77 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 |
3 | jdk:
4 | - oraclejdk8
5 |
6 | script: mvn test -Dtests.security.manager=false
7 |
8 | sudo: false
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Elasticsearch URL Tokenizer and URL Token Filter
2 | ==============================
3 |
4 | This plugin enables URL tokenization and token filtering by URL part.
5 |
6 | [](http://travis-ci.org/jlinn/elasticsearch-analysis-url)
7 |
8 | ## Compatibility
9 |
10 | | Elasticsearch Version | Plugin Version |
11 | |-----------------------|----------------|
12 | | 5.6.3 | 5.6.3.0 |
13 | | 5.6.1 | 5.6.1.0 |
14 | | 5.5.1 | 5.5.1.0 |
15 | | 5.5.0 | 5.5.0.0 |
16 | | 5.2.2 | 5.2.2.0 |
17 | | 5.2.1 | 5.2.1.1 |
18 | | 5.1.1 | 5.1.1.0 |
19 | | 5.0.0 | 5.0.0.1 |
20 | | 2.4.3 | 2.4.3.0 |
21 | | 2.4.1 | 2.4.1.0 |
22 | | 2.4.0 | 2.4.0.0 |
23 | | 2.3.5 | 2.3.5.0 |
24 | | 2.3.4 | 2.3.4.3 |
25 | | 2.3.3 | 2.3.3.5 |
26 | | 2.3.2 | 2.3.2.1 |
27 | | 2.3.1 | 2.3.1.1 |
28 | | 2.3.0 | 2.3.0.1 |
29 | | 2.2.2 | 2.2.3 |
30 | | 2.2.1 | 2.2.2.1 |
31 | | 2.2.0 | 2.2.1 |
32 | | 2.1.1 | 2.2.0 |
33 | | 2.1.1 | 2.1.1 |
34 | | 2.0.0 | 2.1.0 |
35 | | 1.6.x, 1.7.x | 2.0.0 |
36 | | 1.6.0 | 1.2.1 |
37 | | 1.5.2 | 1.1.0 |
38 | | 1.4.2 | 1.0.0 |
39 |
40 | ## Installation
41 | ### Elasticsearch v5
42 | ```bash
43 | bin/elasticsearch-plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v5.6.3.0/elasticsearch-analysis-url-5.6.3.0.zip
44 | ```
45 |
46 | ### Elasticsearch v2
47 | ```bash
48 | bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.4.3.0/elasticsearch-analysis-url-2.4.3.0.zip
49 | ```
50 |
51 | ## Usage
52 | ### URL Tokenizer
53 | #### Options:
54 | * `part`: Defaults to `null`. If left `null`, all URL parts will be tokenized, and some additional tokens (`host:port` and `protocol://host`) will be included. Can be either a string (single URL part) or an array of multiple URL parts. Options are `whole`, `protocol`, `host`, `port`, `path`, `query`, and `ref`.
55 | * `url_decode`: Defaults to `false`. If `true`, URL tokens will be URL decoded.
56 | * `allow_malformed`: Defaults to `false`. If `true`, malformed URLs will not be rejected, but will be passed through without being tokenized.
57 | * `tokenize_malformed`: Defaults to `false`. Has no effect if `allow_malformed` is `false`. If both are `true`, an attempt will be made to tokenize malformed URLs using regular expressions.
58 | * `tokenize_host`: Defaults to `true`. If `true`, the host will be further tokenized using a [reverse path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `.`.
59 | * `tokenize_path`: Defaults to `true`. If `true`, the path will be tokenized using a [path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `/`.
60 | * `tokenize_query`: Defaults to `true`. If `true`, the query string will be split on `&`.
61 |
62 | #### Example:
63 | Index settings:
64 | ```json
65 | {
66 | "settings": {
67 | "analysis": {
68 | "tokenizer": {
69 | "url_host": {
70 | "type": "url",
71 | "part": "host"
72 | }
73 | },
74 | "analyzer": {
75 | "url_host": {
76 | "tokenizer": "url_host"
77 | }
78 | }
79 | }
80 | }
81 | }
82 | ```
83 |
84 | Make an analysis request:
85 | ```bash
86 | curl 'http://localhost:9200/index_name/_analyze?analyzer=url_host&pretty' -d 'https://foo.bar.com/baz.html'
87 |
88 | {
89 | "tokens" : [ {
90 | "token" : "foo.bar.com",
91 | "start_offset" : 8,
92 | "end_offset" : 19,
93 | "type" : "host",
94 | "position" : 1
95 | }, {
96 | "token" : "bar.com",
97 | "start_offset" : 12,
98 | "end_offset" : 19,
99 | "type" : "host",
100 | "position" : 2
101 | }, {
102 | "token" : "com",
103 | "start_offset" : 16,
104 | "end_offset" : 19,
105 | "type" : "host",
106 | "position" : 3
107 | } ]
108 | }
109 | ```
110 |
111 | ### URL Token Filter
112 | #### Options:
113 | * `part`: This option defaults to `whole`, which will cause the entire URL to be returned. In this case, the filter only serves to validate incoming URLs. Other possible values are:
114 | `protocol`, `host`, `port`, `path`, `query`, and `ref`. Can be either a single URL part (string) or an array of URL parts.
115 | * `url_decode`: Defaults to `false`. If `true`, the desired portion of the URL will be URL decoded.
116 | * `allow_malformed`: Defaults to `false`. If `true`, documents containing malformed URLs will not be rejected, and an attempt will be made to parse the desired URL part from the malformed URL string.
117 | If the desired part cannot be found, no value will be indexed for that field.
118 | * `passthrough`: Defaults to `false`. If `true`, `allow_malformed` is implied, and any non-URL tokens will be passed through the filter. Valid URLs will be tokenized according to the filter's other settings.
119 | * `tokenize_host`: Defaults to `true`. If `true`, the host will be further tokenized using a [reverse path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `.`.
120 | * `tokenize_path`: Defaults to `true`. If `true`, the path will be tokenized using a [path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `/`.
121 | * `tokenize_query`: Defaults to `true`. If `true`, the query string will be split on `&`.
122 |
123 | #### Example:
124 | Set up your index like so:
125 | ```json
126 | {
127 | "settings": {
128 | "analysis": {
129 | "filter": {
130 | "url_host": {
131 | "type": "url",
132 | "part": "host",
133 | "url_decode": true,
134 | "tokenize_host": false
135 | }
136 | },
137 | "analyzer": {
138 | "url_host": {
139 | "filter": ["url_host"],
140 | "tokenizer": "whitespace"
141 | }
142 | }
143 | }
144 | },
145 | "mappings": {
146 | "example_type": {
147 | "properties": {
148 | "url": {
149 | "type": "multi_field",
150 | "fields": {
151 | "url": {"type": "string"},
152 | "host": {"type": "string", "analyzer": "url_host"}
153 | }
154 | }
155 | }
156 | }
157 | }
158 | }
159 | ```
160 |
161 | Make an analysis request:
162 | ```bash
163 | curl 'http://localhost:9200/index_name/_analyze?analyzer=url_host&pretty' -d 'https://foo.bar.com/baz.html'
164 |
165 | {
166 | "tokens" : [ {
167 | "token" : "foo.bar.com",
168 | "start_offset" : 0,
169 | "end_offset" : 32,
170 | "type" : "word",
171 | "position" : 1
172 | } ]
173 | }
174 | ```
175 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | org.elasticsearch
8 | elasticsearch-analysis-url
9 | 5.6.3.0
10 | jar
11 | Elasticsearch URL token filter plugin
12 |
13 |
14 | org.sonatype.oss
15 | oss-parent
16 | 9
17 |
18 |
19 |
20 | UTF-8
21 | 5.6.3
22 | 6.6.1
23 | 1.3
24 | 19.0
25 | onerror
26 | true
27 | elasticsearch.yml
28 | INFO
29 |
30 |
31 |
32 |
33 | sonatype
34 | http://oss.sonatype.org/content/repositories/releases
35 |
36 |
37 |
38 |
39 |
40 | com.google.guava
41 | guava
42 | ${guava.version}
43 |
44 |
45 |
46 |
47 |
48 | org.elasticsearch
49 | elasticsearch
50 | ${elasticsearch.version}
51 | compile
52 |
53 |
54 |
55 | org.elasticsearch.test
56 | framework
57 | ${elasticsearch.version}
58 | test
59 |
60 |
61 | org.hamcrest
62 | hamcrest-all
63 |
64 |
65 | junit
66 | junit
67 |
68 |
69 |
70 |
71 |
72 | org.hamcrest
73 | hamcrest-all
74 | ${hamcrest.version}
75 | test
76 |
77 |
78 |
79 | junit
80 | junit
81 | 4.12
82 | test
83 |
84 |
85 | org.hamcrest
86 | hamcrest-core
87 |
88 |
89 |
90 |
91 |
92 | org.apache.logging.log4j
93 | log4j-core
94 | 2.9.1
95 | test
96 |
97 |
98 |
99 | org.slf4j
100 | slf4j-simple
101 | 1.7.12
102 | test
103 |
104 |
105 |
106 |
107 |
108 |
109 | src/main/resources
110 | true
111 |
112 | *.properties
113 |
114 |
115 |
116 | src/main/resources
117 | false
118 |
119 | *.properties
120 |
121 |
122 |
123 |
124 |
125 | ${basedir}/src/test/java
126 |
127 | **/*.json
128 | **/*.yml
129 | **/*.txt
130 | **/*.properties
131 |
132 | true
133 |
134 |
135 | ${basedir}/src/test/resources
136 |
137 | **/*.*
138 |
139 |
140 |
141 |
142 |
143 |
144 | org.apache.maven.plugins
145 | maven-compiler-plugin
146 | 3.2
147 |
148 | 1.8
149 | 1.8
150 |
151 |
152 |
153 |
154 | com.carrotsearch.randomizedtesting
155 | junit4-maven-plugin
156 | 2.1.11
157 |
158 |
159 |
160 | org.apache.maven.plugins
161 | maven-surefire-plugin
162 | 2.19.1
163 |
164 |
165 | -Dtests.security.manager=false
166 |
167 |
168 |
169 |
170 | org.apache.maven.plugins
171 | maven-source-plugin
172 | 2.4
173 |
174 |
175 |
176 | org.apache.maven.plugins
177 | maven-assembly-plugin
178 | 2.5.3
179 |
180 | false
181 | ${project.build.directory}/releases/
182 |
183 | ${basedir}/src/main/assemblies/plugin.xml
184 |
185 |
186 |
187 |
188 | package
189 |
190 | single
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
--------------------------------------------------------------------------------
/src/main/assemblies/plugin.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | plugin
4 |
5 | zip
6 |
7 | false
8 |
9 |
10 | src/main/resources/plugin-descriptor.properties
11 | /elasticsearch/
12 | true
13 |
14 |
15 |
16 |
17 | /elasticsearch/
18 | true
19 | true
20 |
21 | org.elasticsearch:elasticsearch
22 |
23 |
24 |
25 | /elasticsearch/
26 | true
27 | true
28 |
29 | ${project.name}-${project.version}.jar
30 |
31 |
32 |
33 | /elasticsearch/
34 | true
35 | true
36 |
37 | com.google.guava:guava
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/URLPart.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
/**
 * Enumerates the recognizable parts of a URL. Each constant carries an
 * {@code order} value used to sort tokens so that they appear in the order
 * in which their parts occur within a URL.
 *
 * Joe Linn
 * 1/17/2015
 */
public enum URLPart {
    PROTOCOL((short) 1),
    HOST((short) 2),
    PORT((short) 3),
    PATH((short) 4),
    REF((short) 5),
    QUERY((short) 6),
    WHOLE((short) 7);

    // relative position of this part when ordering tokens by URL part
    private final short order;

    URLPart(short order) {
        this.order = order;
    }

    /**
     * @return the sort order of this URL part
     */
    public short getOrder() {
        return order;
    }

    /**
     * Resolves a string such as {@code "host"} to its {@link URLPart}
     * constant, ignoring case.
     *
     * @param part case-insensitive name of a URL part
     * @return the matching {@link URLPart}
     * @throws IllegalArgumentException if no constant matches the given string
     */
    public static URLPart fromString(String part) {
        for (URLPart candidate : values()) {
            if (candidate.name().equalsIgnoreCase(part)) {
                return candidate;
            }
        }
        throw new IllegalArgumentException(String.format("Unrecognized URL part: %s", part));
    }
}
35 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/URLPartComparator.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import java.util.Comparator;
4 |
5 | /**
6 | * @author Joe Linn
7 | * 11/13/2016
8 | */
9 | public class URLPartComparator implements Comparator {
10 | @Override
11 | public int compare(URLPart o1, URLPart o2) {
12 | return o1.getOrder() - o2.getOrder();
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.apache.lucene.analysis.TokenStream;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.index.IndexSettings;
7 | import org.elasticsearch.index.analysis.url.URLTokenFilter;
8 |
9 | import java.util.Arrays;
10 | import java.util.List;
11 | import java.util.stream.Collectors;
12 |
13 | /**
14 | * Joe Linn
15 | * 1/17/2015
16 | */
17 | public class URLTokenFilterFactory extends AbstractTokenFilterFactory {
18 | private final List parts;
19 | private final boolean urlDecode;
20 | private boolean tokenizeHost;
21 | private boolean tokenizePath;
22 | private boolean tokenizeQuery;
23 | private final boolean allowMalformed;
24 | private final boolean tokenizeMalformed;
25 | private final boolean passthrough;
26 |
27 |
28 | public URLTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
29 | super(indexSettings, name, settings);
30 |
31 | this.parts = Arrays.stream(settings.getAsArray("part", new String[]{"whole"}))
32 | .map(URLPart::fromString)
33 | .collect(Collectors.toList());
34 |
35 | this.urlDecode = settings.getAsBoolean("url_decode", false);
36 | this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
37 | this.tokenizePath = settings.getAsBoolean("tokenize_path", true);
38 | this.tokenizeQuery = settings.getAsBoolean("tokenize_query", true);
39 | this.allowMalformed = settings.getAsBoolean("allow_malformed", false);
40 | this.tokenizeMalformed = settings.getAsBoolean("tokenize_malformed", false);
41 | this.passthrough = settings.getAsBoolean("passthrough", false);
42 | }
43 |
44 |
45 | @Override
46 | public TokenStream create(TokenStream tokenStream) {
47 | return new URLTokenFilter(tokenStream, null, urlDecode, allowMalformed, passthrough)
48 | .setParts(parts)
49 | .setTokenizeMalformed(tokenizeMalformed)
50 | .setTokenizeHost(tokenizeHost)
51 | .setTokenizePath(tokenizePath)
52 | .setTokenizeQuery(tokenizeQuery);
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/URLTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.apache.lucene.analysis.Tokenizer;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.index.IndexSettings;
7 | import org.elasticsearch.index.analysis.url.URLTokenizer;
8 |
9 | import java.util.Arrays;
10 | import java.util.List;
11 | import java.util.stream.Collectors;
12 |
13 | /**
14 | * Joe Linn
15 | * 8/1/2015
16 | */
17 | public class URLTokenizerFactory extends AbstractTokenizerFactory {
18 | private List parts;
19 | private boolean urlDecode;
20 | private boolean tokenizeHost;
21 | private boolean tokenizePath;
22 | private boolean tokenizeQuery;
23 | private boolean allowMalformed;
24 | private boolean tokenizeMalformed;
25 |
26 |
27 | public URLTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
28 | super(indexSettings, name, settings);
29 |
30 | String[] parts = settings.getAsArray("part");
31 | if (parts != null && parts.length > 0) {
32 | this.parts = Arrays.stream(parts)
33 | .map(URLPart::fromString)
34 | .collect(Collectors.toList());
35 | }
36 | this.urlDecode = settings.getAsBoolean("url_decode", false);
37 | this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
38 | this.tokenizePath = settings.getAsBoolean("tokenize_path", true);
39 | this.tokenizeQuery = settings.getAsBoolean("tokenize_query", true);
40 | this.allowMalformed = settings.getAsBoolean("allow_malformed", false);
41 | this.tokenizeMalformed = settings.getAsBoolean("tokenize_malformed", false);
42 | }
43 |
44 |
45 | @Override
46 | public Tokenizer create() {
47 | URLTokenizer tokenizer = new URLTokenizer();
48 | tokenizer.setParts(parts);
49 | tokenizer.setUrlDecode(urlDecode);
50 | tokenizer.setTokenizeHost(tokenizeHost);
51 | tokenizer.setTokenizePath(tokenizePath);
52 | tokenizer.setTokenizeQuery(tokenizeQuery);
53 | tokenizer.setAllowMalformed(allowMalformed);
54 | tokenizer.setTokenizeMalformed(tokenizeMalformed);
55 | return tokenizer;
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/url/Token.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import com.google.common.base.Objects;
4 | import org.elasticsearch.index.analysis.URLPart;
5 |
6 | /**
7 | * @author Joe Linn
8 | * 8/14/2016
9 | */
10 | class Token {
11 | private final String token;
12 | private final URLPart part;
13 | private final int start;
14 | private final int end;
15 |
16 | public Token(String token, URLPart part, int start, int end) {
17 | this.token = token;
18 | this.part = part;
19 | this.start = start;
20 | this.end = end;
21 | }
22 |
23 | public String getToken() {
24 | return token;
25 | }
26 |
27 | public URLPart getPart() {
28 | return part;
29 | }
30 |
31 | public int getStart() {
32 | return start;
33 | }
34 |
35 | public int getEnd() {
36 | return end;
37 | }
38 |
39 |
40 | @Override
41 | public boolean equals(Object obj) {
42 | if (obj == null || !(obj instanceof Token)) {
43 | return false;
44 | }
45 | Token that = (Token) obj;
46 | return this.start == that.start
47 | && this.end == that.end
48 | && Objects.equal(this.token, that.token)
49 | && Objects.equal(this.part, that.part);
50 | }
51 |
52 | @Override
53 | public int hashCode() {
54 | int result = token != null ? token.hashCode() : 0;
55 | result = 31 * result + part.hashCode();
56 | result = 31 * result + start;
57 | result = 31 * result + end;
58 | return result;
59 | }
60 |
61 |
62 | @Override
63 | public String toString() {
64 | return "Token{" +
65 | "token='" + token + '\'' +
66 | ", part=" + part +
67 | ", start=" + start +
68 | ", end=" + end +
69 | '}';
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.apache.lucene.analysis.TokenFilter;
4 | import org.apache.lucene.analysis.TokenStream;
5 | import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
6 | import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
8 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
9 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
10 | import org.elasticsearch.common.Strings;
11 | import org.elasticsearch.index.analysis.URLPart;
12 |
13 | import java.io.IOException;
14 | import java.io.StringReader;
15 | import java.net.MalformedURLException;
16 | import java.util.ArrayList;
17 | import java.util.Collections;
18 | import java.util.Iterator;
19 | import java.util.List;
20 | import java.util.regex.Matcher;
21 | import java.util.regex.Pattern;
22 |
23 | /**
24 | * Joe Linn
25 | * 1/17/2015
26 | */
27 | public final class URLTokenFilter extends TokenFilter {
28 | public static final String NAME = "url";
29 |
30 | private List parts;
31 |
32 | private boolean urlDeocde;
33 |
34 | /**
35 | * If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer}
36 | */
37 | private boolean tokenizeHost = true;
38 |
39 | /**
40 | * If true, the url's path will be tokenized using a {@link PathHierarchyTokenizer}
41 | */
42 | private boolean tokenizePath = true;
43 |
44 | /**
45 | * If true, the url's query string will be split on &
46 | */
47 | private boolean tokenizeQuery = true;
48 |
49 | private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
50 | private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
51 | private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
52 |
53 | private final boolean allowMalformed;
54 |
55 | private boolean tokenizeMalformed;
56 |
57 | private boolean passthrough;
58 |
59 | private List tokens;
60 | private Iterator iterator;
61 |
62 | public URLTokenFilter(TokenStream input, URLPart part) {
63 | this(input, part, false);
64 | }
65 |
66 | public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode) {
67 | this(input, part, urlDecode, false);
68 | }
69 |
70 | public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed) {
71 | this(input, part, urlDecode, allowMalformed, false);
72 | }
73 |
74 | public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed, boolean passthrough) {
75 | super(input);
76 | if (part != null) {
77 | this.parts = Collections.singletonList(part);
78 | } else {
79 | parts = null;
80 | }
81 | this.urlDeocde = urlDecode;
82 | this.allowMalformed = allowMalformed;
83 | this.passthrough = passthrough;
84 | }
85 |
86 |
87 | public URLTokenFilter setParts(List parts) {
88 | this.parts = parts;
89 | return this;
90 | }
91 |
92 | public URLTokenFilter setTokenizeHost(boolean tokenizeHost) {
93 | this.tokenizeHost = tokenizeHost;
94 | return this;
95 | }
96 |
97 | public URLTokenFilter setTokenizePath(boolean tokenizePath) {
98 | this.tokenizePath = tokenizePath;
99 | return this;
100 | }
101 |
102 | public URLTokenFilter setTokenizeQuery(boolean tokenizeQuery) {
103 | this.tokenizeQuery = tokenizeQuery;
104 | return this;
105 | }
106 |
107 |
108 | public URLTokenFilter setTokenizeMalformed(boolean tokenizeMalformed) {
109 | this.tokenizeMalformed = tokenizeMalformed;
110 | return this;
111 | }
112 |
113 | public URLTokenFilter setUrlDeocde(boolean urlDeocde) {
114 | this.urlDeocde = urlDeocde;
115 | return this;
116 | }
117 |
118 |
119 | @Override
120 | public boolean incrementToken() throws IOException {
121 | if (iterator == null || !iterator.hasNext()) {
122 | if ((iterator != null && !iterator.hasNext() && !passthrough) || !advance()) {
123 | return false;
124 | }
125 | }
126 | clearAttributes();
127 | Token next = iterator.next();
128 | termAttribute.append(next.getToken());
129 | typeAttribute.setType(next.getPart().name().toLowerCase());
130 | offsetAttribute.setOffset(next.getStart(), next.getEnd());
131 | return true;
132 | }
133 |
134 |
135 | /**
136 | * Advance to the next token, if any
137 | * @return true if more tokens are forthcoming, false otherwise
138 | * @throws IOException
139 | */
140 | private boolean advance() throws IOException {
141 | if (input.incrementToken()) {
142 | String urlString = termAttribute.toString();
143 | if ((Strings.isNullOrEmpty(urlString) || "null".equals(urlString)) && !allowMalformed && !passthrough) {
144 | return false;
145 | }
146 | try {
147 | tokens = tokenize(urlString);
148 | } catch (IOException e) {
149 | if (e.getMessage().contains("Malformed URL")) {
150 | if (allowMalformed) {
151 | tokens = Collections.singletonList(new Token(urlString, URLPart.WHOLE, 0, urlString.length()));
152 | } else {
153 | throw new MalformedURLException("Malformed URL: " + urlString);
154 | }
155 | }
156 | throw e;
157 | }
158 | if (tokens.isEmpty()) {
159 | return false;
160 | }
161 | iterator = tokens.iterator();
162 | return true;
163 | } else {
164 | return false;
165 | }
166 | }
167 |
168 |
169 | /**
170 | * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
171 | * will be passed along to the tokenizer.
172 | * @param input a string to be tokenized
173 | * @return a list of tokens extracted from the input string
174 | * @throws IOException
175 | */
176 | private List tokenize(String input) throws IOException {
177 | List tokens = new ArrayList<>();
178 | URLTokenizer tokenizer = new URLTokenizer();
179 | // create a copy of the parts list to avoid ConcurrentModificationException when sorting
180 | tokenizer.setParts(new ArrayList<>(parts));
181 | tokenizer.setUrlDecode(urlDeocde);
182 | tokenizer.setTokenizeHost(tokenizeHost);
183 | tokenizer.setTokenizePath(tokenizePath);
184 | tokenizer.setTokenizeQuery(tokenizeQuery);
185 | tokenizer.setAllowMalformed(allowMalformed || passthrough);
186 | tokenizer.setTokenizeMalformed(tokenizeMalformed);
187 | tokenizer.setReader(new StringReader(input));
188 | tokenizer.reset();
189 |
190 | String term;
191 | URLPart part;
192 | OffsetAttribute offset;
193 | while (tokenizer.incrementToken()) {
194 | term = tokenizer.getAttribute(CharTermAttribute.class).toString();
195 | part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
196 | offset = tokenizer.getAttribute(OffsetAttribute.class);
197 | tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
198 | }
199 | return tokens;
200 | }
201 |
202 |
203 | @Override
204 | public void reset() throws IOException {
205 | super.reset();
206 | tokens = null;
207 | iterator = null;
208 | }
209 |
210 | private static final Pattern REGEX_PROTOCOL = Pattern.compile("^([a-zA-Z]+)(?=://)");
211 | private static final Pattern REGEX_PORT = Pattern.compile(":([0-9]{1,5})");
212 | private static final Pattern REGEX_QUERY = Pattern.compile("\\?(.+)");
213 |
214 | /**
215 | * Attempt to parse a malformed url string
216 | * @param urlString the malformed url string
217 | * @return the url part if it can be parsed, null otherwise
218 | * @deprecated parsing of malformed URLs is now delegated to {@link URLTokenizer}
219 | */
220 | private String parseMalformed(String urlString) {
221 | if (parts != null && !parts.isEmpty()) {
222 | String ret;
223 | for (URLPart part : parts) {
224 | switch (part) {
225 | case PROTOCOL:
226 | ret = applyPattern(REGEX_PROTOCOL, urlString);
227 | break;
228 | case PORT:
229 | ret = applyPattern(REGEX_PORT, urlString);
230 | break;
231 | case QUERY:
232 | ret = applyPattern(REGEX_QUERY, urlString);
233 | break;
234 | case WHOLE:
235 | ret = urlString;
236 | break;
237 | default:
238 | ret = urlString;
239 | }
240 | if (!Strings.isNullOrEmpty(ret)) {
241 | return ret;
242 | }
243 | }
244 | }
245 | return urlString;
246 | }
247 |
248 | /**
249 | * Apply the given regex pattern to the given malformed url string and return the first match
250 | * @param pattern the pattern to match
251 | * @param urlString the malformed url to which the pattern should be applied
252 | * @return the first match if one exists, null otherwise
253 | */
254 | private String applyPattern(Pattern pattern, String urlString) {
255 | Matcher matcher = pattern.matcher(urlString);
256 | if (matcher.find()) {
257 | return matcher.group(1);
258 | }
259 | return null;
260 | }
261 | }
262 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import com.google.common.base.Strings;
4 | import com.google.common.collect.Lists;
5 | import com.google.common.net.InetAddresses;
6 | import org.apache.lucene.analysis.Tokenizer;
7 | import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
8 | import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
9 | import org.apache.lucene.analysis.pattern.PatternTokenizer;
10 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
11 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
12 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
13 | import org.apache.lucene.util.AttributeFactory;
14 | import org.elasticsearch.index.analysis.URLPart;
15 | import org.elasticsearch.index.analysis.URLPartComparator;
16 |
17 | import java.io.IOException;
18 | import java.io.Reader;
19 | import java.io.StringReader;
20 | import java.net.MalformedURLException;
21 | import java.net.URL;
22 | import java.net.URLDecoder;
23 | import java.util.*;
24 | import java.util.regex.Pattern;
25 |
26 | import static org.elasticsearch.index.analysis.url.URLUtils.getPart;
27 |
28 | /**
29 | * Joe Linn
30 | * 7/30/2015
31 | */
32 | public final class URLTokenizer extends Tokenizer {
33 | private static final URLPartComparator PART_COMPARATOR = new URLPartComparator();
34 |
35 | /**
36 | * If set, only the given part of the url will be tokenized.
37 | */
38 | private List parts;
39 |
40 | /**
41 | * If true, url parts will be url decoded prior to tokenization.
42 | */
43 | private boolean urlDecode;
44 |
45 | /**
46 | * If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer}
47 | */
48 | private boolean tokenizeHost = true;
49 |
50 | /**
51 | * If true, the url's path will be tokenized using a {@link PathHierarchyTokenizer}
52 | */
53 | private boolean tokenizePath = true;
54 |
55 | /**
56 | * If true, the url's query string will be split on &
57 | */
58 | private boolean tokenizeQuery = true;
59 |
60 | /**
61 | * If true, {@link MalformedURLException} will be suppressed, and the given string will be returned as a single token
62 | */
63 | private boolean allowMalformed;
64 |
65 | /**
66 | * Has no effect if {@link #allowMalformed} is false. If both are true, an attempt will be made to tokenize malformed
67 | * URLs using regular expressions.
68 | */
69 | private boolean tokenizeMalformed;
70 |
71 |
72 | private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
73 | private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
74 | private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
75 |
76 | private List tokens;
77 | private Iterator iterator;
78 |
79 |
80 | public URLTokenizer() {
81 |
82 | }
83 |
84 | public URLTokenizer(URLPart part) {
85 | setPart(part);
86 | }
87 |
88 |
89 | public URLTokenizer(AttributeFactory factory) {
90 | super(factory);
91 | }
92 |
93 | public void setParts(List parts) {
94 | if (parts != null) {
95 | parts.sort(PART_COMPARATOR);
96 | this.parts = parts;
97 | }
98 | }
99 |
100 | public void setPart(URLPart part) {
101 | if (part != null) {
102 | this.parts = Collections.singletonList(part);
103 | }
104 | }
105 |
106 | public void setUrlDecode(boolean urlDecode) { this.urlDecode = urlDecode; }
107 |
108 | public void setTokenizeHost(boolean tokenizeHost) { this.tokenizeHost = tokenizeHost; }
109 |
110 | public void setTokenizePath(boolean tokenizePath) { this.tokenizePath = tokenizePath; }
111 |
112 | public void setTokenizeQuery(boolean tokenizeQuery) { this.tokenizeQuery = tokenizeQuery; }
113 |
114 | public void setAllowMalformed(boolean allowMalformed) { this.allowMalformed = allowMalformed; }
115 |
116 | public void setTokenizeMalformed(boolean tokenizeMalformed) { this.tokenizeMalformed = tokenizeMalformed; }
117 |
118 | @Override
119 | public boolean incrementToken() throws IOException {
120 | if (iterator == null) {
121 | String urlString = readerToString(input);
122 | if (Strings.isNullOrEmpty(urlString)) {
123 | return false;
124 | }
125 | tokens = tokenize(urlString);
126 | iterator = tokens.iterator();
127 | }
128 | if (!iterator.hasNext()) {
129 | return false;
130 | }
131 |
132 | clearAttributes();
133 | Token token = iterator.next();
134 | termAttribute.append(token.getToken());
135 | typeAttribute.setType(token.getPart().name().toLowerCase());
136 | offsetAttribute.setOffset(token.getStart(), token.getEnd());
137 | return true;
138 | }
139 |
140 |
141 | @Override
142 | public void reset() throws IOException {
143 | super.reset();
144 | tokens = null;
145 | iterator = null;
146 | }
147 |
148 |
149 | /**
150 | * Read the contents of a {@link Reader} into a string
151 | * @param reader the reader to be converted
152 | * @return the entire contents of the given reader
153 | * @throws IOException
154 | */
155 | private String readerToString(Reader reader) throws IOException {
156 | char[] arr = new char[8 * 1024];
157 | StringBuilder buffer = new StringBuilder();
158 | int numCharsRead;
159 | while ((numCharsRead = reader.read(arr, 0, arr.length)) != -1) {
160 | buffer.append(arr, 0, numCharsRead);
161 | }
162 | return buffer.toString();
163 | }
164 |
165 |
166 | /**
167 | * Tokenize the given URL string according to the options which have been set.
168 | * @param urlString the string to be tokenized
169 | * @return a list of {@link Token}s parsed from the string
170 | * @throws IOException
171 | */
172 | private List tokenize(String urlString) throws IOException {
173 | try {
174 | URL url = new URL(urlString);
175 | if (parts != null && !parts.isEmpty()) {
176 | List tokensList = new ArrayList<>();
177 | for (URLPart part : parts) {
178 | tokensList.addAll(tokenize(url, part));
179 | }
180 | return tokensList;
181 | }
182 | // No part is specified. Tokenize all parts.
183 | Set tokens = new LinkedHashSet<>();
184 | for (URLPart urlPart : URLPart.values()) {
185 | tokens.addAll(tokenize(url, urlPart));
186 | }
187 | tokens.addAll(tokenizeSpecial(url));
188 | return Lists.newArrayList(tokens);
189 | } catch (MalformedURLException e) {
190 | if (allowMalformed) {
191 | if (tokenizeMalformed && parts != null && !parts.isEmpty()) {
192 | return tokenizePartsMalformed(urlString, parts);
193 | }
194 | return tokenizeMalformed(urlString, (parts == null || parts.isEmpty()) ? null : URLPart.WHOLE);
195 | }
196 | throw new IOException("Malformed URL: " + urlString, e);
197 | }
198 | }
199 |
200 |
201 | /**
202 | * Tokenize all given parts of the given URL while ensuring that duplicate tokens are not created when the whole
203 | * malformed URL is is identical to a single part token.
204 | * @param urlString the malformed URL to be tokenized
205 | * @param parts the desired {@link URLPart}s in proper part order
206 | * @return a list of {@link Token}s
207 | * @throws IOException
208 | */
209 | private List tokenizePartsMalformed(String urlString, List parts) throws IOException {
210 | List tokens = new ArrayList<>();
211 | Set tokenStrings = new HashSet<>();
212 | for (URLPart part : parts) {
213 | for (Token token : tokenizeMalformed(urlString, part)) {
214 | if (part != URLPart.WHOLE) {
215 | tokens.add(token);
216 | tokenStrings.add(token.getToken());
217 | } else if (tokenStrings.isEmpty()) {
218 | // If we couldn't tokenize any of the parts, add the whole thing.
219 | tokens.add(token);
220 | }
221 | }
222 | }
223 | return tokens;
224 | }
225 |
226 |
227 | /**
228 | * Attempt to tokenize the given malformed URL.
229 | * @param url the URL to be tokenized
230 | * @param part the desired part of the URL
231 | * @return {@link List} of {@link Token}s gleaned from the given URL
232 | * @throws IOException
233 | */
234 | private List tokenizeMalformed(String url, URLPart part) throws IOException {
235 | if (part == null) {
236 | // No part is specified. Tokenize all parts.
237 | List urlParts = Arrays.asList(URLPart.values());
238 | urlParts.sort(new URLPartComparator());
239 | return tokenizePartsMalformed(url, urlParts);
240 | }
241 | Optional partOptional = getPart(url, part);
242 | if (!partOptional.isPresent() || partOptional.get().equals("")) {
243 | // desired part was not found
244 | return new ArrayList<>();
245 | }
246 | final String partStringRaw = partOptional.get();
247 | int start = 0;
248 | int end = 0;
249 | String partString = urlDecode(partOptional.get());
250 | switch (part) {
251 | case HOST:
252 | return getHostTokens(url, partStringRaw, partString);
253 | case PORT:
254 | return getPortTokens(url, partStringRaw);
255 | case PATH:
256 | return getPathTokens(url, partStringRaw, partString);
257 | case REF:
258 | return getRefTokens(url, partStringRaw, partString);
259 | case QUERY:
260 | return getQueryTokens(url, partStringRaw, partString);
261 | case PROTOCOL:
262 | return Collections.singletonList(new Token(partString, part, start, partString.length()));
263 | case WHOLE:
264 | return Collections.singletonList(new Token(url, URLPart.WHOLE, 0, url.length() - 1));
265 | default:
266 | }
267 | return Collections.singletonList(new Token(partString, part, start, end));
268 | }
269 |
270 |
271 | /**
272 | * URL decode the given string if {@link #urlDecode} is true. The given partString
is passed through
273 | * unaltered otherwise.
274 | * @param partString string to be URL decoded
275 | * @return URL decoded string if {@link #urlDecode} is true; unaltered string otherwise.
276 | * @throws IOException if malformed URL encoding is present and {@link #allowMalformed} is false.
277 | */
278 | private String urlDecode(String partString) throws IOException {
279 | if (urlDecode) {
280 | try {
281 | partString = URLDecoder.decode(partString, "UTF-8");
282 | } catch (IllegalArgumentException e) {
283 | if (!allowMalformed) {
284 | throw new IOException("Error performing URL decoding on string: " + partString, e);
285 | }
286 | }
287 | }
288 | return partString;
289 | }
290 |
291 |
292 | private static final Pattern QUERY_SEPARATOR = Pattern.compile("&");
293 |
294 | /**
295 | * Tokenize the given {@link URL} based on the desired {@link URLPart} and currently set tokenizer options.
296 | * @param url the url to be tokenized
297 | * @param part the desired part of the url
298 | * @return a list of {@link Token}s parsed from the given url
299 | * @throws IOException
300 | */
301 | private List tokenize(URL url, URLPart part) throws IOException {
302 | String partString = getPart(url, part);
303 | if (Strings.isNullOrEmpty(partString)) {
304 | // desired part was not found
305 | return new ArrayList<>();
306 | }
307 | final String partStringRaw = partString;
308 | int start = 0;
309 | int end = 0;
310 | partString = urlDecode(partString);
311 | switch (part) {
312 | case HOST:
313 | return getHostTokens(url, partStringRaw, partString);
314 | case PORT:
315 | return getPortTokens(url, getPart(url, part));
316 | case PATH:
317 | return getPathTokens(url, partStringRaw, partString);
318 | case QUERY:
319 | return getQueryTokens(url, partStringRaw, partString);
320 | case PROTOCOL:
321 | case WHOLE:
322 | end = partString.length();
323 | break;
324 | case REF:
325 | return getRefTokens(url, partStringRaw, partString);
326 | default:
327 | }
328 | return Collections.singletonList(new Token(partString, part, start, end));
329 | }
330 |
331 |
332 | /**
333 | * Retrieve tokens representing the host of the given URL
334 | * @param url URL to be tokenized
335 | * @param partStringRaw raw (not url decoded) string containing the host
336 | * @param partString potentially url decoded string containing the host
337 | * @return host tokens
338 | * @throws IOException
339 | */
340 | private List getHostTokens(URL url, String partStringRaw, String partString) throws IOException {
341 | return getHostTokens(url.toString(), partStringRaw, partString);
342 | }
343 |
344 |
345 | /**
346 | * Retrieve tokens representing the host of the given URL
347 | * @param url URL to be tokenized
348 | * @param partStringRaw raw (not url decoded) string containing the host
349 | * @param partString potentially url decoded string containing the host
350 | * @return host tokens
351 | * @throws IOException
352 | */
353 | private List getHostTokens(String url, String partStringRaw, String partString) throws IOException {
354 | int start = getStartIndex(url, partStringRaw);
355 | if (!tokenizeHost || InetAddresses.isInetAddress(partString)) {
356 | int end = getEndIndex(start, partStringRaw);
357 | return Collections.singletonList(new Token(partString, URLPart.HOST, start, end));
358 | }
359 | return tokenize(URLPart.HOST, addReader(new ReversePathHierarchyTokenizer('.', '.'), new StringReader(partString)), start);
360 | }
361 |
362 |
363 | private List getPortTokens(URL url, String port) {
364 | return getPortTokens(url.toString(), port);
365 | }
366 |
367 |
368 | private List getPortTokens(String url, String port) {
369 | int start = url.indexOf(":" + port);
370 | int end = 0;
371 | if (start == -1) {
372 | // port was inferred
373 | start = 0;
374 | } else {
375 | // explicit port
376 | start++; // account for :
377 | end = getEndIndex(start, port);
378 | }
379 | return Collections.singletonList(new Token(port, URLPart.PORT, start, end));
380 | }
381 |
382 |
383 | private List getPathTokens(URL url, String partStringRaw, String partString) throws IOException {
384 | return getPathTokens(url.toString(), partStringRaw, partString);
385 | }
386 |
387 |
388 | private List getPathTokens(String url, String partStringRaw, String partString) throws IOException {
389 | int start = getStartIndex(url, partStringRaw);
390 | if (!tokenizePath) {
391 | int end = getEndIndex(start, partStringRaw);
392 | return Collections.singletonList(new Token(partString, URLPart.PATH, start, end));
393 | }
394 | return tokenize(URLPart.PATH, addReader(new PathHierarchyTokenizer('/', '/'), new StringReader(partString)), start);
395 | }
396 |
397 |
398 | private List getRefTokens(URL url, String partStringRaw, String partString) {
399 | return getRefTokens(url.toString(), partStringRaw, partString);
400 | }
401 |
402 |
403 | private List getRefTokens(String url, String partStringRaw, String partString) {
404 | int start = getStartIndex(url, "#" + partStringRaw) + 1;
405 | int end = url.length();
406 | return Collections.singletonList(new Token(partString, URLPart.REF, start, end));
407 | }
408 |
409 |
410 | private List getQueryTokens(URL url, String partStringRaw, String partString) throws IOException {
411 | return getQueryTokens(url.toString(), partStringRaw, partString);
412 | }
413 |
414 |
415 | private List getQueryTokens(String url, String partStringRaw, String partString) throws IOException {
416 | int start = getStartIndex(url, partStringRaw);
417 | if (!tokenizeQuery) {
418 | int end = getEndIndex(start, partStringRaw);
419 | return Collections.singletonList(new Token(partString, URLPart.QUERY, start, end));
420 | }
421 | return tokenize(URLPart.QUERY, addReader(new PatternTokenizer(QUERY_SEPARATOR, -1), new StringReader(partString)), start);
422 | }
423 |
424 |
425 | /**
426 | * Set the given reader on the given tokenizer
427 | * @param tokenizer tokenizer on which the reader is to be set
428 | * @param input the reader to set
429 | * @return the given tokenizer with the given reader set
430 | * @throws IOException
431 | */
432 | private Tokenizer addReader(Tokenizer tokenizer, Reader input) throws IOException {
433 | tokenizer.setReader(input);
434 | return tokenizer;
435 | }
436 |
437 |
438 | /**
439 | * Get the start index of the given string in the given url
440 | * @param url the url
441 | * @param partStringRaw the url part
442 | * @return the starting index of the part string if it is found in the given url, -1 if it is not found
443 | */
444 | private int getStartIndex(URL url, String partStringRaw) {
445 | return getStartIndex(url.toString(), partStringRaw);
446 | }
447 |
448 |
449 | private int getStartIndex(String url, String partStringRaw) {
450 | return url.indexOf(partStringRaw);
451 | }
452 |
453 |
454 | /**
455 | * Get the end index of the given part string
456 | * @param start the start index of the part string
457 | * @param partStringRaw the part string
458 | * @return the end index
459 | */
460 | private int getEndIndex(int start, String partStringRaw) {
461 | return start + partStringRaw.length();
462 | }
463 |
464 |
465 | /**
466 | * Get a list of {@link Token}s from the given {@link Tokenizer}
467 | * @param part the url part which should be used in {@link Token} creation
468 | * @param tokenizer the tokenizer from which tokens will be gleaned
469 | * @return a list of tokens
470 | * @throws IOException
471 | */
472 | private List tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
473 | tokenizer.reset();
474 | List tokens = new ArrayList<>();
475 | OffsetAttribute offset;
476 | String token;
477 | while (tokenizer.incrementToken()) {
478 | token = tokenizer.getAttribute(CharTermAttribute.class).toString();
479 | offset = tokenizer.getAttribute(OffsetAttribute.class);
480 | tokens.add(new Token(token, part, start + offset.startOffset(), start + offset.endOffset()));
481 | }
482 | return tokens;
483 | }
484 |
485 |
486 | /**
487 | * Perform non-standard tokenization.
488 | * @param url the URL to be tokenized
489 | * @return a list of {@link Token}s. Since tokens created in this method do not pertain to a specific part of the url,
490 | * {@link URLPart#WHOLE} will be used.
491 | */
492 | private List tokenizeSpecial(URL url) {
493 | List tokens = new ArrayList<>();
494 | // host:port
495 | String token = getPart(url, URLPart.HOST) + ":" + getPart(url, URLPart.PORT);
496 | int start = getStartIndex(url, token);
497 | int end = 0;
498 | if(start == -1){
499 | // implicit port
500 | start = 0;
501 | } else {
502 | end = getEndIndex(start, token);
503 | }
504 | tokens.add(new Token(token, URLPart.WHOLE, start, end));
505 |
506 | // protocol://host
507 | token = getPart(url, URLPart.PROTOCOL) + "://" + getPart(url, URLPart.HOST);
508 | start = getStartIndex(url, token);
509 | end = getEndIndex(start, token);
510 | tokens.add(new Token(token, URLPart.WHOLE, start, end));
511 | return tokens;
512 | }
513 |
514 |
515 | }
516 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/url/URLUtils.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.elasticsearch.index.analysis.URLPart;
4 |
5 | import java.net.URL;
6 | import java.util.Optional;
7 | import java.util.regex.Matcher;
8 | import java.util.regex.Pattern;
9 |
10 | /**
11 | * Joe Linn
12 | * 7/30/2015
13 | */
14 | public class URLUtils {
15 | private static final Pattern PATTERN_PROTOCOL = Pattern.compile("(^[a-zA-Z]*)://");
16 | private static final Pattern PATTERN_HOST = Pattern.compile("^(?:^[a-zA-Z]*://)?((?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?(?:\\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?)*\\.?)/?(?:.*)");
17 | private static final Pattern PATTERN_PORT = Pattern.compile("^(?:^[a-zA-Z]*://)?(?:(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?(?:\\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?)*\\.?)(?::([0-9]*))?/?(?:.*)");
18 | private static final Pattern PATTERN_PATH = Pattern.compile("(?:^[a-zA-Z]*://)?(?:(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?(?:\\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?)*\\.?)?(?::[0-9]*)?([^\\?\\#&]*)");
19 | private static final Pattern PATTERN_REF = Pattern.compile("(?:^[a-zA-Z]*://)?(?:(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?(?:\\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?)*\\.?)?(?::[0-9]*)?(?:[^\\?\\#&]*)(#[^\\?\\&]*)?");
20 | private static final Pattern PATTERN_QUERY = Pattern.compile("(?:^[a-zA-Z]*://)?(?:(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?(?:\\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?)*\\.?)?(?::[0-9]*)?(?:[^\\?\\#&]*)(?:#[^\\?\\&]*)?(\\?.*)");
21 |
22 | private URLUtils() {}
23 |
24 |
25 | /**
26 | * Attempt to retrieve the desired part of the given URL
27 | * @param url URL to parse
28 | * @param part desired URL part
29 | * @return the part of the URL, if it could be found
30 | */
31 | public static Optional getPart(String url, URLPart part) {
32 | switch (part) {
33 | case PROTOCOL:
34 | return captureFirst(url, PATTERN_PROTOCOL);
35 | case HOST:
36 | return captureFirst(url, PATTERN_HOST);
37 | case PORT:
38 | return getPort(url);
39 | case PATH:
40 | return captureFirst(url, PATTERN_PATH);
41 | case REF:
42 | Optional refOptional = captureFirst(url, PATTERN_REF);
43 | if (refOptional.isPresent()) {
44 | refOptional = Optional.of(refOptional.get().replaceFirst("#", ""));
45 | }
46 | return refOptional;
47 | case QUERY:
48 | Optional queryOptional = captureFirst(url, PATTERN_QUERY);
49 | if (queryOptional.isPresent()) {
50 | queryOptional = Optional.of(queryOptional.get().replaceFirst("\\?", ""));
51 | }
52 | return queryOptional;
53 | case WHOLE:
54 | default:
55 | return Optional.of(url);
56 | }
57 | }
58 |
59 |
60 | private static Optional captureFirst(String input, Pattern pattern) {
61 | Matcher matcher = pattern.matcher(input);
62 | if (matcher.find()) {
63 | String group = matcher.group(1);
64 | if (group == null) {
65 | return Optional.empty();
66 | }
67 | return Optional.of(group);
68 | }
69 | return Optional.empty();
70 | }
71 |
72 |
73 | /**
74 | * Retrieve the given {@link URLPart} from the given {@link URL}
75 | * @param url the url from which a part is to be taken
76 | * @param part the part to be taken from the url
77 | * @return a part of the given url
78 | */
79 | public static String getPart(URL url, URLPart part) {
80 | switch (part) {
81 | case PROTOCOL:
82 | return url.getProtocol();
83 | case HOST:
84 | return url.getHost();
85 | case PORT:
86 | return getPort(url);
87 | case PATH:
88 | return url.getPath();
89 | case REF:
90 | return url.getRef();
91 | case QUERY:
92 | return url.getQuery();
93 | case WHOLE:
94 | default:
95 | return url.toString();
96 | }
97 | }
98 |
99 |
100 | /**
101 | * Parse the port from the given {@link URL}. If the port is not explicitly given, it will be inferred from the
102 | * protocol.
103 | *
104 | * @param url the url
105 | * @return the port
106 | */
107 | public static String getPort(URL url) {
108 | int port = url.getPort();
109 | if (port == -1) {
110 | // infer port from protocol
111 | Optional portOptional = portFromProtocol(url.getProtocol());
112 | return portOptional.orElse(null);
113 | }
114 | return String.valueOf(port);
115 | }
116 |
117 |
118 | public static Optional getPort(String url) {
119 | Optional portOptional = captureFirst(url, PATTERN_PORT);
120 | if (portOptional.isPresent()) {
121 | return portOptional;
122 | }
123 | // attempt to infer port form protocol
124 | Optional protocolOptional = getPart(url, URLPart.PROTOCOL);
125 | if (protocolOptional.isPresent()) {
126 | return portFromProtocol(protocolOptional.get());
127 | }
128 | return Optional.empty();
129 | }
130 |
131 |
132 | private static Optional portFromProtocol(final String protocol) {
133 | int port = -1;
134 | if (protocol.equals("http")) {
135 | port = 80;
136 | } else if (protocol.equals("https")) {
137 | port = 443;
138 | }
139 | if (port == -1) {
140 | // port could not be inferred
141 | return Optional.empty();
142 | }
143 | return Optional.of(String.valueOf(port));
144 | }
145 | }
146 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/AnalysisURLPlugin.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.analysis;
2 |
3 | import org.elasticsearch.index.analysis.TokenFilterFactory;
4 | import org.elasticsearch.index.analysis.TokenizerFactory;
5 | import org.elasticsearch.index.analysis.URLTokenFilterFactory;
6 | import org.elasticsearch.index.analysis.URLTokenizerFactory;
7 | import org.elasticsearch.indices.analysis.AnalysisModule;
8 | import org.elasticsearch.plugins.AnalysisPlugin;
9 | import org.elasticsearch.plugins.Plugin;
10 |
11 | import java.util.Map;
12 |
13 | import static java.util.Collections.singletonMap;
14 |
15 | /**
16 | * Joe Linn
17 | * 1/17/2015
18 | */
19 | public class AnalysisURLPlugin extends Plugin implements AnalysisPlugin {
20 | @Override
21 | public Map> getTokenFilters() {
22 | return singletonMap("url", URLTokenFilterFactory::new);
23 | }
24 |
25 | @Override
26 | public Map> getTokenizers() {
27 | return singletonMap("url", URLTokenizerFactory::new);
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | #plugin=org.elasticsearch.plugin.analysis.AnalysisURLPlugin
2 | version=${project.version}
3 | description=URL tokenizer and token filter.
4 | name=analysis-url
5 | site=false
6 | jvm=true
7 | classname=org.elasticsearch.plugin.analysis.AnalysisURLPlugin
8 | java.version=1.8
9 | elasticsearch.version=${elasticsearch.version}
10 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/IsTokenStreamWithTokenAndPosition.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.apache.lucene.analysis.TokenStream;
4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
6 | import org.hamcrest.Description;
7 | import org.hamcrest.Factory;
8 | import org.hamcrest.TypeSafeMatcher;
9 | import org.slf4j.Logger;
10 | import org.slf4j.LoggerFactory;
11 |
12 | import java.io.IOException;
13 |
14 | /**
15 | * Joe Linn
16 | * 8/2/2015
17 | */
18 | public class IsTokenStreamWithTokenAndPosition extends TypeSafeMatcher {
19 | private static final Logger log = LoggerFactory.getLogger(IsTokenStreamWithTokenAndPosition.class);
20 |
21 | private final String token;
22 | private final int start;
23 | private final int end;
24 |
25 | private boolean foundToken;
26 | private int actualStart;
27 | private int actualEnd;
28 |
29 | public IsTokenStreamWithTokenAndPosition(String token, int start, int end) {
30 | this.token = token;
31 | this.start = start;
32 | this.end = end;
33 | }
34 |
35 | @Override
36 | protected boolean matchesSafely(TokenStream tokenizer) {
37 | CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
38 | OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);
39 | try {
40 | tokenizer.reset();
41 | } catch (IOException e) {
42 | log.error("Unable to reset tokenizer.", e);
43 | return false;
44 | }
45 | tokenizer.clearAttributes();
46 | try {
47 | while (tokenizer.incrementToken()) {
48 | if (termAttribute.toString().equals(token)) {
49 | foundToken = true;
50 | actualStart = offset.startOffset();
51 | actualEnd = offset.endOffset();
52 | if (actualStart == start && actualEnd == end) {
53 | return true;
54 | }
55 | }
56 | }
57 | } catch (IOException e) {
58 | log.error("Unable to increment tokenizer.", e);
59 | }
60 | return false;
61 | }
62 |
63 | @Override
64 | public void describeTo(Description description) {
65 | description.appendText("tokenizer containing token '")
66 | .appendText(token)
67 | .appendText("' starting at offset ")
68 | .appendValue(start)
69 | .appendText(" and ending at offset ")
70 | .appendValue(end);
71 | }
72 |
73 |
74 | @Override
75 | protected void describeMismatchSafely(TokenStream item, Description mismatchDescription) {
76 | if(!foundToken){
77 | mismatchDescription.appendText("tokenizer which did not contain token ").appendValue(token);
78 | } else {
79 | mismatchDescription.appendText("tokenizer containing token ")
80 | .appendValue(token)
81 | .appendText(" starting at offset ")
82 | .appendValue(actualStart)
83 | .appendText(" and ending at offset ")
84 | .appendValue(actualEnd);
85 | }
86 | }
87 |
88 | @Factory
89 | public static IsTokenStreamWithTokenAndPosition hasTokenAtOffset(String token, int start, int end) {
90 | return new IsTokenStreamWithTokenAndPosition(token, start, end);
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/IsTokenizerWithToken.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.apache.lucene.analysis.Tokenizer;
4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
5 | import org.hamcrest.Description;
6 | import org.hamcrest.Factory;
7 | import org.hamcrest.TypeSafeMatcher;
8 | import org.slf4j.Logger;
9 | import org.slf4j.LoggerFactory;
10 |
11 | import java.io.IOException;
12 |
13 | /**
14 | * Joe Linn
15 | * 8/2/2015
16 | */
17 | public class IsTokenizerWithToken extends TypeSafeMatcher {
18 | private static final Logger log = LoggerFactory.getLogger(IsTokenizerWithToken.class);
19 |
20 | private final String token;
21 |
22 |
23 | public IsTokenizerWithToken(String token) {
24 | this.token = token;
25 | }
26 |
27 |
28 | @Override
29 | protected boolean matchesSafely(Tokenizer tokenizer) {
30 | CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
31 | try {
32 | tokenizer.reset();
33 | } catch (IOException e) {
34 | log.error("Unable to reset tokenizer.", e);
35 | return false;
36 | }
37 | tokenizer.clearAttributes();
38 | try {
39 | while (tokenizer.incrementToken()) {
40 | if (termAttribute.toString().equals(token)) {
41 | return true;
42 | }
43 | }
44 | } catch (IOException e) {
45 | log.error("Unable to increment tokenizer.", e);
46 | }
47 | return false;
48 | }
49 |
50 |
51 | @Override
52 | public void describeTo(Description description) {
53 | description.appendText("tokenized the string '").appendText(token).appendText("'");
54 | }
55 |
56 |
57 | @Factory
58 | public static IsTokenizerWithToken hasToken(String token){
59 | return new IsTokenizerWithToken(token);
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/OptionalMatchers.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.hamcrest.Description;
4 | import org.hamcrest.Matcher;
5 | import org.hamcrest.TypeSafeMatcher;
6 |
7 | import java.util.Optional;
8 |
9 | /**
10 | * @author Joe Linn
11 | * 6/25/2016
12 | */
13 | public class OptionalMatchers {
14 | public static Matcher> isPresent() {
15 | return new PresenceMatcher();
16 | }
17 |
18 |
19 | @SuppressWarnings("OptionalUsedAsFieldOrParameterType")
20 | private static class PresenceMatcher extends TypeSafeMatcher> {
21 |
22 | @Override
23 | protected boolean matchesSafely(Optional> optional) {
24 | return optional.isPresent();
25 | }
26 |
27 | @Override
28 | public void describeTo(Description description) {
29 | description.appendText("is ");
30 | }
31 |
32 |
33 | @Override
34 | protected void describeMismatchSafely(Optional> item, Description mismatchDescription) {
35 | mismatchDescription.appendText("was ");
36 | }
37 | }
38 |
39 |
40 | public static Matcher> isEmpty() {
41 | return new EmptyMatcher();
42 | }
43 |
44 |
45 | @SuppressWarnings("OptionalUsedAsFieldOrParameterType")
46 | private static class EmptyMatcher extends PresenceMatcher {
47 | @Override
48 | protected boolean matchesSafely(Optional> optional) {
49 | return !super.matchesSafely(optional);
50 | }
51 |
52 | @Override
53 | public void describeTo(Description description) {
54 | description.appendText("is ");
55 | }
56 |
57 | @SuppressWarnings("OptionalGetWithoutIsPresent")
58 | @Override
59 | protected void describeMismatchSafely(Optional> item, Description mismatchDescription) {
60 | mismatchDescription.appendText("had value ")
61 | .appendValue(item.get());
62 | }
63 | }
64 |
65 |
66 | public static Matcher> hasValue(Matcher super T> matcher) {
67 | return new HasValue<>(matcher);
68 | }
69 |
70 |
71 | @SuppressWarnings("OptionalUsedAsFieldOrParameterType")
72 | private static class HasValue extends TypeSafeMatcher> {
73 | private final Matcher super T> matcher;
74 |
75 |
76 | private HasValue(Matcher super T> matcher) {
77 | this.matcher = matcher;
78 | }
79 |
80 |
81 | @Override
82 | protected boolean matchesSafely(Optional tOptional) {
83 | return tOptional.isPresent() && matcher.matches(tOptional.get());
84 | }
85 |
86 | @Override
87 | public void describeTo(Description description) {
88 | description.appendText("has value that is ");
89 | matcher.describeTo(description);
90 | }
91 |
92 |
93 | @Override
94 | protected void describeMismatchSafely(Optional item, Description mismatchDescription) {
95 | if (item.isPresent()) {
96 | mismatchDescription.appendText("value ")
97 | .appendValue(item.get());
98 | matcher.describeTo(mismatchDescription);
99 | } else {
100 | mismatchDescription.appendText("was ");
101 | }
102 | }
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/URLAnalysisTestCase.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
4 | import org.elasticsearch.plugin.analysis.AnalysisURLPlugin;
5 | import org.elasticsearch.plugins.Plugin;
6 | import org.elasticsearch.test.ESIntegTestCase;
7 | import org.elasticsearch.test.StreamsUtils;
8 | import org.junit.Before;
9 |
10 | import java.util.Collection;
11 | import java.util.Collections;
12 | import java.util.List;
13 |
14 | /**
15 | * Joe Linn
16 | * 8/1/2015
17 | */
18 | public abstract class URLAnalysisTestCase extends ESIntegTestCase {
19 | protected static final String INDEX = "url_token_filter";
20 | protected static final String TYPE = "test";
21 |
22 |
23 | @Override
24 | protected Collection> nodePlugins() {
25 | return Collections.singletonList(AnalysisURLPlugin.class);
26 | }
27 |
28 | /**
29 | * For subclasses to override. Overrides must call {@code super.setUp()}.
30 | */
31 | @Before
32 | @Override
33 | public void setUp() throws Exception {
34 | super.setUp();
35 | String settings = StreamsUtils.copyToStringFromClasspath("/test-settings.json");
36 | String mapping = StreamsUtils.copyToStringFromClasspath("/test-mapping.json");
37 | client().admin().indices().prepareCreate(INDEX).setSettings(settings).addMapping(TYPE, mapping).get();
38 | refresh();
39 | Thread.sleep(75); // Ensure that the shard is available before we start making analyze requests.
40 | }
41 |
42 | protected List analyzeURL(String url, String analyzer) {
43 | return client().admin().indices().prepareAnalyze(INDEX, url).setAnalyzer(analyzer).get().getTokens();
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterIntegrationTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.elasticsearch.ElasticsearchException;
4 | import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
5 | import org.elasticsearch.index.query.QueryBuilders;
6 | import org.elasticsearch.search.SearchHits;
7 | import org.junit.Test;
8 |
9 | import java.util.HashMap;
10 | import java.util.List;
11 | import java.util.Map;
12 |
13 | import static org.elasticsearch.index.analysis.url.URLTokenFilterTest.TEST_HTTPS_URL;
14 | import static org.elasticsearch.index.analysis.url.URLTokenFilterTest.TEST_HTTP_URL;
15 | import static org.hamcrest.Matchers.equalTo;
16 | import static org.hamcrest.Matchers.hasSize;
17 |
18 | /**
19 | * Joe Linn
20 | * 1/17/2015
21 | */
22 | public class URLTokenFilterIntegrationTest extends URLAnalysisTestCase {
23 |
24 | @Test
25 | public void testAnalyze() throws InterruptedException {
26 |
27 | assertURLAnalyzesTo(TEST_HTTP_URL, "url_protocol", "http");
28 | assertURLAnalyzesTo(TEST_HTTPS_URL, "url_protocol", "https");
29 |
30 | assertURLAnalyzesTo(TEST_HTTP_URL, "url_host", "www.foo.bar.com");
31 |
32 | assertURLAnalyzesTo(TEST_HTTP_URL, "url_port", "9200");
33 | }
34 |
35 | @Test(expected = ElasticsearchException.class)
36 | public void testInvalidURL() {
37 | analyzeURL("foobar", "url_protocol");
38 | }
39 |
40 | @Test
41 | public void testEmptyString() {
42 | List tokens = analyzeURL("", "url_protocol");
43 | assertThat("no tokens", tokens, hasSize(0));
44 | }
45 |
46 | @Test
47 | public void testUrlDecode() {
48 | assertURLAnalyzesTo("https://foo.bar.com?email=foo%40bar.com", "url_query", "email=foo@bar.com");
49 | assertURLAnalyzesTo("https://ssl.google-analytics.com/r/__utm.gif?utmwv=5.6.4&utms=1&utmn=1031590447&utmhn=www.linkedin.com&utmcs=-&utmsr=1024x768&utmvp=1256x2417&utmsc=24-bit&utmul=en-us&utmje=1&utmfl=-&utmdt=Wells%20Fargo%20Capital%20Finance%20%7C%20LinkedIn&utmhid=735221740&utmr=http%3A%2F%2Fwww.google.com%2Fsearch%3Fq%3Dsite%253Alinkedin.com%2Bwells%2Bfargo%26rls%3Dcom.microsoft%3Aen-us%26ie%3DUTF-8%26oe%3DUTF-8%26startIndex%3D%26startPage%3D1&utmp=biz-overview-public&utmht=1428449620694&utmac=UA-3242811-1&utmcc=__utma%3D23068709.1484257758.1428449621.1428449621.1428449621.1%3B%2B__utmz%3D23068709.1428449621.1.1.utmcsr%3Dgoogle%7Cutmccn%3D(organic)%7Cutmcmd%3Dorganic%7Cutmctr%3Dsite%253Alinkedin.com%2520wells%2520fargo%3B&utmjid=1336170366&utmredir=1&utmu=qBCAAAAAAAAAAAAAAAAAAAAE~", "url_port", "443");
50 | }
51 |
52 | @Test
53 | public void testMalformed() {
54 | assertURLAnalyzesTo("foo.bar.com:444/baz", "url_port_malformed", "444");
55 |
56 | Map doc = new HashMap<>();
57 | doc.put("url_malformed", "foo.bar/baz/bat");
58 | client().prepareIndex(INDEX, "test").setSource(doc).get();
59 | refresh();
60 |
61 | SearchHits hits = client()
62 | .prepareSearch(INDEX)
63 | .setQuery(QueryBuilders.boolQuery().mustNot(QueryBuilders.existsQuery("http_malformed.port")))
64 | .get()
65 | .getHits();
66 | assertEquals("found a doc missing http_malformed.port", 1, hits.getTotalHits());
67 | }
68 |
69 |
70 | @Test
71 | public void testPassthrough() {
72 | List tokens = analyzeURL("http://foo.com:9200/foo.bar baz bat.blah", "url_host_passthrough");
73 | assertThat(tokens, hasSize(4));
74 | assertThat(tokens.get(0).getTerm(), equalTo("foo.com"));
75 | assertThat(tokens.get(1).getTerm(), equalTo("com"));
76 | assertThat(tokens.get(2).getTerm(), equalTo("baz"));
77 | assertThat(tokens.get(3).getTerm(), equalTo("bat.blah"));
78 | }
79 |
80 |
81 | @Test
82 | public void testIndex() {
83 | Map doc = new HashMap<>();
84 | doc.put("url", "http://foo.bar/baz/bat");
85 | client().prepareIndex(INDEX, "test").setSource(doc).get();
86 | doc.put("url", "https://foo.bar.com");
87 | client().prepareIndex(INDEX, "test").setSource(doc).get();
88 | refresh();
89 |
90 | SearchHits hits = client().prepareSearch(INDEX).setQuery(QueryBuilders.matchAllQuery()).get().getHits();
91 | assertEquals("both docs indexed", 2, hits.getTotalHits());
92 | }
93 |
94 | private void assertURLAnalyzesTo(String url, String analyzer, String expected) {
95 | List tokens = analyzeURL(url, analyzer);
96 | assertThat("a URL part was parsed", tokens, hasSize(1));
97 | assertEquals("term value", expected, tokens.get(0).getTerm());
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.apache.lucene.analysis.BaseTokenStreamTestCase;
4 | import org.apache.lucene.analysis.CannedTokenStream;
5 | import org.apache.lucene.analysis.Token;
6 | import org.apache.lucene.analysis.TokenStream;
7 | import org.elasticsearch.index.analysis.URLPart;
8 | import org.junit.Test;
9 |
10 | import java.io.IOException;
11 | import java.net.MalformedURLException;
12 |
13 | import static org.elasticsearch.index.analysis.url.IsTokenStreamWithTokenAndPosition.hasTokenAtOffset;
14 |
/**
 * Unit tests for the URL token filter. Each test wraps a single canned token containing
 * the URL under test (see createFilter) and asserts on the terms the filter emits.
 */
public class URLTokenFilterTest extends BaseTokenStreamTestCase {
    public static final String TEST_HTTP_URL = "http://www.foo.bar.com:9200/index_name/type_name/_search.html?foo=bar&baz=bat#tag";
    public static final String TEST_HTTP_URL2 = "http://www.foo.bar.com";
    public static final String TEST_HTTPS_URL = "https://www.foo.bar.com:9200/index_name/type_name/_search.html?foo=bar&baz=bat#tag";

    @Test
    public void testFilterProtocol() throws IOException {
        URLTokenFilter filter = createFilter(TEST_HTTP_URL, URLPart.PROTOCOL);
        assertTokenStreamContents(filter, "http");

        filter = createFilter(TEST_HTTPS_URL, URLPart.PROTOCOL);
        assertTokenStreamContents(filter, "https");
    }

    @Test
    public void testFilterHost() throws IOException {
        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.HOST).setTokenizeHost(false), "www.foo.bar.com");

        // "setUrlDeocde" (sic) matches the production setter's spelling
        URLTokenFilter filter = createFilter(TEST_HTTP_URL, URLPart.HOST)
                .setUrlDeocde(false);
        assertThat(filter, hasTokenAtOffset("www.foo.bar.com", 7, 22));
        filter = createFilter(TEST_HTTP_URL, URLPart.HOST)
                .setUrlDeocde(false);
        assertThat(filter, hasTokenAtOffset("foo.bar.com", 11, 22));
        filter = createFilter(TEST_HTTP_URL, URLPart.HOST)
                .setUrlDeocde(false);
        assertThat(filter, hasTokenAtOffset("bar.com", 15, 22));
        filter = createFilter(TEST_HTTP_URL, URLPart.HOST)
                .setUrlDeocde(false);
        assertThat(filter, hasTokenAtOffset("com", 19, 22));
    }

    @Test
    public void testFilterPort() throws IOException {
        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.PORT), "9200");
    }

    @Test
    public void testFilterPath() throws IOException {
        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.PATH).setTokenizePath(false), "/index_name/type_name/_search.html");
        // a URL with no path yields no tokens
        assertTokenStreamContents(createFilter(TEST_HTTP_URL2, URLPart.PATH).setTokenizePath(false), new String[]{});
    }

    @Test
    public void testFilterRef() throws IOException {
        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.REF), "tag");
    }

    @Test
    public void testFilterQuery() throws IOException {
        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.QUERY).setTokenizeQuery(false), "foo=bar&baz=bat");
    }

    @Test(expected = MalformedURLException.class)
    public void testInvalidURL() throws IOException {
        URLTokenFilter filter = createFilter("foobar", URLPart.HOST);
        filter.incrementToken();
    }

    @Test
    public void testNullURL() throws IOException {
        // a null URL must not throw
        URLTokenFilter filter = createFilter(null, URLPart.HOST);
        filter.incrementToken();
    }

    @Test
    public void testUrlDecode() throws IOException {
        assertTokenStreamContents(createFilter("https://www.foo.com?email=foo%40bar.com", URLPart.QUERY, true), "email=foo@bar.com");
    }

    @Test
    public void testInferPort() throws IOException {
        // when the port is absent it is inferred from the protocol
        assertTokenStreamContents(createFilter("http://www.foo.bar.com/baz/bat.html", URLPart.PORT), "80");
        assertTokenStreamContents(createFilter("https://www.foo.bar.com/baz/bat.html", URLPart.PORT), "443");
        assertTokenStreamContents(createFilter("https://foo.bar.com", URLPart.PORT), "443");
    }

    @Test
    public void testMalformed() throws IOException {
        URLTokenFilter filter = createFilter("http://:::::::/baz", URLPart.PROTOCOL, false, true);
        filter.setTokenizeMalformed(true);
        assertTokenStreamContents(filter, "http");

        filter = createFilter("foo.com/bar?baz=bat", URLPart.QUERY, false, true);
        filter.setTokenizeMalformed(true);
        assertTokenStreamContents(filter, "baz=bat");

        filter = createFilter("baz.com:3456/foo", URLPart.PORT, false, true);
        filter.setTokenizeMalformed(true);
        assertTokenStreamContents(filter, "3456");
    }

    // convenience overload: no URL decoding, malformed URLs not allowed
    private URLTokenFilter createFilter(final String url, final URLPart part) {
        return createFilter(url, part, false);
    }

    // convenience overload: malformed URLs not allowed
    private URLTokenFilter createFilter(final String url, final URLPart part, final boolean urlDecode) {
        return createFilter(url, part, urlDecode, false);
    }

    // Builds a filter over a single canned token spanning the whole URL string.
    private URLTokenFilter createFilter(final String url, final URLPart part, final boolean urlDecode, final boolean allowMalformed) {
        int length = 0;
        if (url != null) {
            length = url.length();
        }
        return new URLTokenFilter(new CannedTokenStream(new Token(url, 0, length)), part, urlDecode, allowMalformed);
    }

    // single-token convenience wrapper around BaseTokenStreamTestCase's array form
    private static void assertTokenStreamContents(TokenStream in, String output) throws IOException {
        assertTokenStreamContents(in, new String[]{output});
    }
}
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
4 | import org.elasticsearch.action.bulk.BulkRequestBuilder;
5 | import org.elasticsearch.action.bulk.BulkResponse;
6 | import org.elasticsearch.action.search.SearchResponse;
7 | import org.elasticsearch.common.text.Text;
8 | import org.elasticsearch.index.query.QueryBuilders;
9 | import org.elasticsearch.search.SearchHit;
10 | import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
11 | import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
12 | import org.hamcrest.Matchers;
13 | import org.junit.Test;
14 |
15 | import java.util.HashMap;
16 | import java.util.List;
17 | import java.util.Map;
18 |
19 | import static org.hamcrest.CoreMatchers.equalTo;
20 | import static org.hamcrest.CoreMatchers.notNullValue;
21 | import static org.hamcrest.collection.IsCollectionWithSize.hasSize;
22 | import static org.hamcrest.collection.IsMapContaining.hasKey;
23 | import static org.hamcrest.core.IsCollectionContaining.hasItem;
24 |
25 | /**
26 | * Joe Linn
27 | * 8/1/2015
28 | */
29 | public class URLTokenizerIntegrationTest extends URLAnalysisTestCase {
30 | @Test
31 | public void testAnalyze() {
32 | assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_protocol", "http");
33 | assertTokensContain(URLTokenizerTest.TEST_HTTPS_URL, "tokenizer_url_protocol", "https");
34 |
35 | assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_host", "www.foo.bar.com", "foo.bar.com", "bar.com", "com");
36 | List hostTokens = assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_host_single", "www.foo.bar.com");
37 | assertThat(hostTokens, hasSize(1));
38 |
39 | assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_all", "www.foo.bar.com:9200", "http://www.foo.bar.com");
40 |
41 | assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_protocol_and_host", "http", "www.foo.bar.com", "foo.bar.com", "bar.com", "com");
42 |
43 | assertTokensContain("foo.bar.com/baz.html/query?a=1", "tokenizer_url_all_malformed", "foo.bar.com", "/baz.html/query");
44 | }
45 |
46 |
47 | @Test
48 | public void testAnalyzeWhole() throws Exception {
49 | List tokens = analyzeURL("http://foo.bar.com", "tokenizer_url_all_malformed");
50 | assertThat(tokens, notNullValue());
51 | assertThat(tokens, hasSize(7));
52 | }
53 |
54 |
55 | @Test
56 | public void testHighlight() throws Exception {
57 | final String field = "url_highlight_test";
58 | Map docContent = new HashMap<>();
59 | final String url = "http://www.foo.bar.com:8080/baz/bat?bob=blah";
60 | docContent.put(field, url);
61 | client().prepareIndex(INDEX, TYPE).setSource(docContent).get();
62 | refresh(INDEX);
63 |
64 | SearchResponse response = client().prepareSearch(INDEX).setQuery(QueryBuilders.matchQuery(field, "www.foo.bar.com:8080"))
65 | .highlighter(new HighlightBuilder().preTags("").postTags("").field("*").forceSource(true))
66 | .get();
67 |
68 | SearchHit[] hits = response.getHits().getHits();
69 | assertThat(hits.length, equalTo(1));
70 |
71 | SearchHit hit = hits[0];
72 | Map source = hit.getSource();
73 | assertThat(source.size(), equalTo(1));
74 | assertThat(source, hasKey(field));
75 | assertThat("URL was stored correctly", source.get(field), equalTo(url));
76 | assertThat(hit.highlightFields(), hasKey(field));
77 | HighlightField highlightField = hit.highlightFields().get(field);
78 | Text[] fragments = highlightField.getFragments();
79 | assertThat(fragments.length, equalTo(1));
80 | Text fragment = fragments[0];
81 | assertThat("URL was highlighted correctly", fragment.string(), equalTo("http://www.foo.bar.com:8080/baz/bat?bob=blah"));
82 | }
83 |
84 |
85 | @Test
86 | public void testBulkIndexing() throws Exception {
87 | final String field = "bulk_indexing_test";
88 | Map content;
89 | final int numDocs = 100;
90 | BulkRequestBuilder bulkBuilder = client().prepareBulk();
91 | for (int i = 0; i < numDocs; i++) {
92 | content = new HashMap<>();
93 | content.put(field, "http://domain" + i + ".com/foo" + i + "/bar.html");
94 | bulkBuilder.add(client().prepareIndex(INDEX, TYPE).setSource(content));
95 | }
96 | BulkResponse bulkResponse = bulkBuilder.get();
97 | assertThat(bulkResponse.buildFailureMessage(), bulkResponse.hasFailures(), equalTo(false));
98 | }
99 |
100 |
101 | private List assertTokensContain(String url, String analyzer, String... expected) {
102 | List tokens = analyzeURL(url, analyzer);
103 | for (String e : expected) {
104 | assertThat(tokens, hasItem(Matchers.hasProperty("term", equalTo(e))));
105 | }
106 | return tokens;
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import com.google.common.collect.Lists;
4 | import org.apache.lucene.analysis.BaseTokenStreamTestCase;
5 | import org.apache.lucene.analysis.TokenStream;
6 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
7 | import org.elasticsearch.index.analysis.URLPart;
8 | import org.junit.Test;
9 |
10 | import java.io.IOException;
11 | import java.io.StringReader;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 |
15 | import static org.elasticsearch.index.analysis.url.IsTokenStreamWithTokenAndPosition.hasTokenAtOffset;
16 | import static org.hamcrest.CoreMatchers.equalTo;
17 | import static org.hamcrest.core.IsCollectionContaining.hasItem;
18 |
19 | /**
20 | * Joe Linn
21 | * 7/30/2015
22 | */
23 | public class URLTokenizerTest extends BaseTokenStreamTestCase {
24 | public static final String TEST_HTTP_URL = "http://www.foo.bar.com:9200/index_name/type_name/_search.html?foo=bar&baz=bat#tag";
25 | public static final String TEST_HTTPS_URL = "https://www.foo.bar.com:9200/index_name/type_name/_search.html?foo=bar&baz=bat#tag";
26 |
27 |
28 | @Test
29 | public void testTokenizeProtocol() throws IOException {
30 | URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PROTOCOL);
31 | assertTokenStreamContents(tokenizer, "http");
32 |
33 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PROTOCOL);
34 | assertThat(tokenizer, hasTokenAtOffset("http", 0, 4));
35 |
36 | tokenizer = createTokenizer(TEST_HTTPS_URL, URLPart.PROTOCOL);
37 | assertTokenStreamContents(tokenizer, "https");
38 | }
39 |
40 |
41 | @Test
42 | public void testTokenizeHost() throws IOException {
43 | URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.HOST);
44 | assertTokenStreamContents(tokenizer, stringArray("www.foo.bar.com", "foo.bar.com", "bar.com", "com"));
45 |
46 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.HOST);
47 | assertThat(tokenizer, hasTokenAtOffset("www.foo.bar.com", 7, 22));
48 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.HOST);
49 | assertThat(tokenizer, hasTokenAtOffset("foo.bar.com", 11, 22));
50 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.HOST);
51 | assertThat(tokenizer, hasTokenAtOffset("bar.com", 15, 22));
52 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.HOST);
53 | assertThat(tokenizer, hasTokenAtOffset("com", 19, 22));
54 | }
55 |
56 |
57 | @Test
58 | public void testTokenizePort() throws IOException {
59 | URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PORT);
60 | assertThat(tokenizer, hasTokenAtOffset("9200", 23, 27));
61 |
62 | tokenizer = createTokenizer("http://foo.bar.com", URLPart.PORT);
63 | assertThat(tokenizer, hasTokenAtOffset("80", 0, 0));
64 | }
65 |
66 |
67 | @Test
68 | public void testTokenizePath() throws IOException {
69 | URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PATH);
70 | assertTokenStreamContents(tokenizer, stringArray("/index_name", "/index_name/type_name", "/index_name/type_name/_search.html"));
71 |
72 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PATH);
73 | assertThat(tokenizer, hasTokenAtOffset("/index_name", 27, 38));
74 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PATH);
75 | assertThat(tokenizer, hasTokenAtOffset("/index_name/type_name", 27, 48));
76 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PATH);
77 | assertThat(tokenizer, hasTokenAtOffset("/index_name/type_name/_search.html", 27, 61));
78 |
79 | tokenizer.reset();
80 | tokenizer.setReader(new StringReader(TEST_HTTPS_URL));
81 | tokenizer.setTokenizePath(false);
82 |
83 | assertTokenStreamContents(tokenizer, stringArray("/index_name/type_name/_search.html"));
84 | }
85 |
86 |
87 | @Test
88 | public void testTokenizeNoPath() throws Exception {
89 | final String url = "http://www.foo.bar.com:9200";
90 | URLTokenizer tokenizer = createTokenizer(url, URLPart.PATH);
91 | assertTokenStreamContents(tokenizer, stringArray());
92 | }
93 |
94 |
95 | @Test
96 | public void testTokenizeQuery() throws IOException {
97 | URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.QUERY);
98 | assertTokenStreamContents(tokenizer, stringArray("foo=bar", "baz=bat"));
99 |
100 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.QUERY);
101 | assertThat(tokenizer, hasTokenAtOffset("foo=bar", 62, 69));
102 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.QUERY);
103 | assertThat(tokenizer, hasTokenAtOffset("baz=bat", 70, 77));
104 | }
105 |
106 |
107 | @Test
108 | public void testTokenizeRef() throws IOException {
109 | URLTokenizer tokenizer = createTokenizer("http://foo.com#baz", URLPart.REF);
110 | assertThat(tokenizer, hasTokenAtOffset("baz", 15, 18));
111 | }
112 |
113 |
114 | @Test
115 | public void testAll() throws IOException {
116 | URLTokenizer tokenizer = new URLTokenizer();
117 | tokenizer.setReader(new StringReader(TEST_HTTPS_URL));
118 | CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
119 | tokenizer.reset();
120 | tokenizer.clearAttributes();
121 | List tokens = new ArrayList<>();
122 | while(tokenizer.incrementToken()){
123 | tokens.add(termAttribute.toString());
124 | }
125 |
126 | assertThat(tokens, hasItem(equalTo("https")));
127 | assertThat(tokens, hasItem(equalTo("foo.bar.com")));
128 | assertThat(tokens, hasItem(equalTo("www.foo.bar.com:9200")));
129 | assertThat(tokens, hasItem(equalTo("https://www.foo.bar.com")));
130 |
131 | tokenizer = createTokenizer("https://foo.com", null);
132 | assertThat(tokenizer, hasTokenAtOffset("https", 0, 5));
133 | }
134 |
135 |
136 | @Test(expected = IOException.class)
137 | public void testMalformed() throws IOException {
138 | URLTokenizer tokenizer = createTokenizer("://foo.com", URLPart.QUERY);
139 | assertTokenStreamContents(tokenizer, stringArray("foo=bar", "baz=bat"));
140 | }
141 |
142 |
143 | @Test
144 | public void testAllowMalformed() throws IOException {
145 | URLTokenizer tokenizer = createTokenizer("://foo.com", URLPart.QUERY);
146 | tokenizer.setAllowMalformed(true);
147 | assertTokenStreamContents(tokenizer, stringArray("://foo.com"));
148 | }
149 |
150 |
151 | @Test
152 | public void testUrlDecode() throws Exception {
153 | String url = "http://foo.com?baz=foo%20bat";
154 | URLTokenizer tokenizer = createTokenizer(url, URLPart.QUERY);
155 | tokenizer.setUrlDecode(true);
156 | assertTokenStreamContents(tokenizer, stringArray("baz=foo bat"));
157 | }
158 |
159 |
160 | @Test(expected = IOException.class)
161 | public void testUrlDecodeIllegalCharacters() throws Exception {
162 | String url = "http://foo.com?baz=foo%2vbat";
163 | URLTokenizer tokenizer = createTokenizer(url, URLPart.QUERY);
164 | tokenizer.setUrlDecode(true);
165 | assertTokenStreamContents(tokenizer, "");
166 | }
167 |
168 |
169 | @Test
170 | public void testUrlDecodeAllowMalformed() throws Exception {
171 | String url = "http://foo.com?baz=foo%2vbat";
172 | URLTokenizer tokenizer = createTokenizer(url, URLPart.QUERY);
173 | tokenizer.setUrlDecode(true);
174 | tokenizer.setAllowMalformed(true);
175 | assertTokenStreamContents(tokenizer, "baz=foo%2vbat");
176 | }
177 |
178 |
179 | @Test
180 | public void testPartialUrl() throws Exception {
181 | final String url = "http://";
182 | URLTokenizer tokenizer = createTokenizer(url, URLPart.QUERY);
183 | assertTokenStreamContents(tokenizer, new String[]{});
184 | }
185 |
186 |
187 | @Test
188 | public void testNoProtocol() throws Exception {
189 | final String url = "foo.bar.baz/bat/blah.html";
190 | URLTokenizer tokenizer = createTokenizer(url, URLPart.PATH);
191 | tokenizer.setAllowMalformed(true);
192 | tokenizer.setTokenizeMalformed(true);
193 | assertTokenStreamContents(tokenizer, stringArray("/bat", "/bat/blah.html"));
194 | }
195 |
196 |
197 | @Test
198 | public void testMalformedGetRef() throws Exception {
199 | String url = "/bat/blah.html#tag?baz=bat";
200 | URLTokenizer tokenizer = createTokenizer(url, URLPart.REF);
201 | tokenizer.setAllowMalformed(true);
202 | tokenizer.setTokenizeMalformed(true);
203 | assertTokenStreamContents(tokenizer, stringArray("tag"));
204 | }
205 |
206 |
207 | @Test
208 | public void testMalformedWhole() throws Exception {
209 | String url = "foo.bar.com/baz.html/query?a=1";
210 | URLTokenizer tokenizer = createTokenizer(url, URLPart.WHOLE);
211 | tokenizer.setAllowMalformed(true);
212 | tokenizer.setTokenizeMalformed(true);
213 | assertTokenStreamContents(tokenizer, stringArray("foo.bar.com/baz.html/query?a=1"));
214 | }
215 |
216 |
217 | @Test
218 | public void testProtocolAndPort() throws Exception {
219 | URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PROTOCOL, URLPart.PORT);
220 | assertTokenStreamContents(tokenizer, stringArray("http", "9200"));
221 | }
222 |
223 |
224 | @Test
225 | public void testMalformedHostAndWhole() throws Exception {
226 | URLTokenizer tokenizer = createTokenizer("example.com", URLPart.WHOLE, URLPart.HOST);
227 | tokenizer.setAllowMalformed(true);
228 | tokenizer.setTokenizeMalformed(true);
229 | tokenizer.setTokenizeHost(false);
230 | assertTokenStreamContents(tokenizer, stringArray("example.com"));
231 | }
232 |
233 |
234 | @Test
235 | public void testTokenizeMalformedNoPartSpecified() throws Exception {
236 | URLTokenizer tokenizer = createTokenizer("example.com");
237 | tokenizer.setAllowMalformed(true);
238 | tokenizer.setTokenizeMalformed(true);
239 | tokenizer.setTokenizeHost(false);
240 | assertTokenStreamContents(tokenizer, stringArray("example.com"));
241 | }
242 |
243 |
244 | @Test
245 | public void testAllowMalformedNoPartsSpecified() throws Exception {
246 | URLTokenizer tokenizer = createTokenizer("example.com");
247 | tokenizer.setAllowMalformed(true);
248 | tokenizer.setTokenizeHost(false);
249 | assertTokenStreamContents(tokenizer, stringArray("example.com"));
250 | }
251 |
252 |
253 | @Test
254 | public void testTokenizeSpecial() throws Exception {
255 | final String url = "http://www.foo.bar.com:8080/baz/bat?bob=blah";
256 | URLTokenizer tokenizer = createEverythingTokenizer(url);
257 | assertThat(tokenizer, hasTokenAtOffset("www.foo.bar.com:8080", 7, 27));
258 | tokenizer = createEverythingTokenizer(url);
259 | assertThat(tokenizer, hasTokenAtOffset("www.foo.bar.com", 7, 22));
260 | tokenizer = createEverythingTokenizer(url);
261 | assertThat(tokenizer, hasTokenAtOffset("foo.bar.com", 11, 22));
262 | tokenizer = createEverythingTokenizer(url);
263 | assertThat(tokenizer, hasTokenAtOffset("bar.com", 15, 22));
264 | }
265 |
266 |
267 | private URLTokenizer createEverythingTokenizer(String input) throws IOException {
268 | URLTokenizer tokenizer = createTokenizer(input);
269 | tokenizer.setAllowMalformed(true);
270 | tokenizer.setUrlDecode(true);
271 | tokenizer.setTokenizeMalformed(true);
272 | tokenizer.setTokenizeHost(true);
273 | tokenizer.setTokenizePath(true);
274 | tokenizer.setTokenizeQuery(true);
275 | return tokenizer;
276 | }
277 |
278 |
279 | private URLTokenizer createTokenizer(String input, URLPart... parts) throws IOException {
280 | URLTokenizer tokenizer = new URLTokenizer();
281 | if (parts != null) {
282 | tokenizer.setParts(Lists.newArrayList(parts));
283 | }
284 | tokenizer.setReader(new StringReader(input));
285 | return tokenizer;
286 | }
287 |
288 |
289 | private String[] stringArray(String... strings) {
290 | return strings;
291 | }
292 |
293 |
294 | private static void assertTokenStreamContents(TokenStream in, String output) throws IOException {
295 | assertTokenStreamContents(in, new String[]{output});
296 | }
297 | }
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/URLUtilsTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.elasticsearch.index.analysis.URLPart;
4 | import org.junit.Test;
5 |
6 | import static org.elasticsearch.index.analysis.url.OptionalMatchers.hasValue;
7 | import static org.elasticsearch.index.analysis.url.OptionalMatchers.isEmpty;
8 | import static org.elasticsearch.index.analysis.url.URLUtils.getPart;
9 | import static org.hamcrest.CoreMatchers.equalTo;
10 | import static org.hamcrest.MatcherAssert.assertThat;
11 |
12 | /**
13 | * @author Joe Linn
14 | * 6/25/2016
15 | */
16 | public class URLUtilsTest {
17 | private static final String URL_1 = "http://foo.bar.com/baz/bat.html#whee?bob=loblaw&this=that";
18 | private static final String URL_2 = "foo.bar.com/baz/bat.html#whee?bob=loblaw&this=that";
19 | private static final String URL_3 = "/baz/bat.html#whee?bob=loblaw&this=that";
20 | private static final String URL_4 = "/baz/bat.html?bob=loblaw&this=that";
21 |
22 | @Test
23 | public void testGetProtocol() {
24 | final URLPart part = URLPart.PROTOCOL;
25 | assertThat(getPart(URL_1, part), hasValue(equalTo("http")));
26 | assertThat(getPart(URL_2, part), isEmpty());
27 | }
28 |
29 |
30 | @Test
31 | public void testGetHost() {
32 | final URLPart part = URLPart.HOST;
33 | assertThat(getPart(URL_1, part), hasValue(equalTo("foo.bar.com")));
34 | assertThat(getPart(URL_2, part), hasValue(equalTo("foo.bar.com")));
35 | }
36 |
37 |
38 | @Test
39 | public void testGetPort() {
40 | final URLPart part = URLPart.PORT;
41 | assertThat(getPart(URL_1, part), hasValue(equalTo("80")));
42 | assertThat(getPart(URL_2, part), isEmpty());
43 | }
44 |
45 |
46 | @Test
47 | public void testGetPath() {
48 | final URLPart part = URLPart.PATH;
49 | assertThat(getPart(URL_1, part), hasValue(equalTo("/baz/bat.html")));
50 | assertThat(getPart(URL_2, part), hasValue(equalTo("/baz/bat.html")));
51 | assertThat(getPart(URL_3, part), hasValue(equalTo("/baz/bat.html")));
52 | }
53 |
54 |
55 | @Test
56 | public void testGetRef() {
57 | final URLPart part = URLPart.REF;
58 | assertThat(getPart(URL_1, part), hasValue(equalTo("whee")));
59 | assertThat(getPart(URL_2, part), hasValue(equalTo("whee")));
60 | assertThat(getPart(URL_3, part), hasValue(equalTo("whee")));
61 | }
62 |
63 |
64 | @Test
65 | public void testGetQuery() {
66 | final URLPart part = URLPart.QUERY;
67 | assertThat(getPart(URL_1, part), hasValue(equalTo("bob=loblaw&this=that")));
68 | assertThat(getPart(URL_2, part), hasValue(equalTo("bob=loblaw&this=that")));
69 | assertThat(getPart(URL_3, part), hasValue(equalTo("bob=loblaw&this=that")));
70 | assertThat(getPart(URL_4, part), hasValue(equalTo("bob=loblaw&this=that")));
71 | }
72 | }
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | es.logger.level=INFO
2 | log4j.rootLogger=${es.logger.level}, out
3 |
4 | log4j.appender.out=org.apache.log4j.ConsoleAppender
5 | log4j.appender.out.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.out.layout.conversionPattern=[%d{ISO8601}][%-5p][%-25c] %m%n
--------------------------------------------------------------------------------
/src/test/resources/test-mapping.json:
--------------------------------------------------------------------------------
1 | {
2 | "properties": {
3 | "url": {
4 | "type": "string",
5 | "fields": {
6 | "url": {
7 | "type": "string",
8 | "index": "not_analyzed"
9 | },
10 | "port": {
11 | "type": "string",
12 | "analyzer": "url_port"
13 | }
14 | }
15 | },
16 | "url_tokenized": {
17 | "type": "string",
18 | "fields": {
19 | "url_tokenized": {
20 | "type": "string",
21 | "index": "not_analyzed"
22 | },
23 | "protocol": {
24 | "type": "string",
25 | "analyzer": "tokenizer_url_protocol"
26 | }
27 | }
28 | },
29 | "url_malformed": {
30 | "type": "string",
31 | "fields": {
32 | "url": {
33 | "type": "string",
34 | "index": "not_analyzed"
35 | },
36 | "port": {
37 | "type": "string",
38 | "analyzer": "url_port_malformed"
39 | }
40 | }
41 | },
42 | "url_highlight_test": {
43 | "type": "string",
44 | "analyzer": "url_highlight_test"
45 | },
46 | "bulk_indexing_test": {
47 | "type": "string",
48 | "analyzer": "bulk_indexing_test"
49 | }
50 | }
51 | }
--------------------------------------------------------------------------------
/src/test/resources/test-settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "analysis": {
3 | "tokenizer": {
4 | "url_protocol": {
5 | "type": "url",
6 | "part": "protocol"
7 | },
8 | "url_host": {
9 | "type": "url",
10 | "part": "host"
11 | },
12 | "url_host_single": {
13 | "type": "url",
14 | "part": "host",
15 | "tokenize_host": false
16 | },
17 | "url_protocol_and_host": {
18 | "type": "url",
19 | "part": ["protocol", "host"]
20 | },
21 | "url_all": {
22 | "type": "url"
23 | },
24 | "url_all_malformed": {
25 | "type": "url",
26 | "allow_malformed": true,
27 | "tokenize_malformed": true
28 | },
29 | "url_highlight_test": {
30 | "type": "url",
31 | "url_decode": true,
32 | "allow_malformed": true,
33 | "tokenize_malformed": true,
34 | "tokenize_host": true,
35 | "tokenize_path": true,
36 | "tokenize_query": true
37 | }
38 | },
39 | "filter": {
40 | "url_protocol": {
41 | "type": "url",
42 | "part": "protocol"
43 | },
44 | "url_host": {
45 | "type": "url",
46 | "part": "host",
47 | "tokenize_host": false
48 | },
49 | "url_port": {
50 | "type": "url",
51 | "part": "port"
52 | },
53 | "url_query": {
54 | "type": "url",
55 | "part": "query",
56 | "url_decode": true,
57 | "tokenize_query": false
58 | },
59 | "url_port_malformed": {
60 | "type": "url",
61 | "part": "port",
62 | "allow_malformed": true,
63 | "tokenize_malformed": true
64 | },
65 | "url_host_passthrough": {
66 | "type": "url",
67 | "part": "host",
68 | "passthrough": "true"
69 | },
70 | "bulk_indexing_test": {
71 | "type": "url",
72 | "part": ["protocol", "host", "port", "path", "query", "ref"],
73 | "url_decode": true,
74 | "allow_malformed": true,
75 | "tokenize_malformed": true
76 | }
77 | },
78 | "analyzer": {
79 | "url_protocol": {
80 | "filter": [
81 | "url_protocol"
82 | ],
83 | "tokenizer": "whitespace"
84 | },
85 | "url_host": {
86 | "filter": [
87 | "url_host"
88 | ],
89 | "tokenizer": "whitespace"
90 | },
91 | "url_port": {
92 | "filter": [
93 | "url_port"
94 | ],
95 | "tokenizer": "whitespace"
96 | },
97 | "url_query": {
98 | "filter": [
99 | "url_query"
100 | ],
101 | "tokenizer": "whitespace"
102 | },
103 | "url_port_malformed": {
104 | "filter": [
105 | "url_port_malformed"
106 | ],
107 | "tokenizer": "whitespace"
108 | },
109 | "url_host_passthrough": {
110 | "filter": [
111 | "url_host_passthrough"
112 | ],
113 | "tokenizer": "whitespace"
114 | },
115 | "tokenizer_url_protocol": {
116 | "tokenizer": "url_protocol"
117 | },
118 | "tokenizer_url_host": {
119 | "tokenizer": "url_host"
120 | },
121 | "tokenizer_url_host_single": {
122 | "tokenizer": "url_host_single"
123 | },
124 | "tokenizer_url_protocol_and_host": {
125 | "tokenizer": "url_protocol_and_host"
126 | },
127 | "tokenizer_url_all": {
128 | "tokenizer": "url_all"
129 | },
130 | "tokenizer_url_all_malformed": {
131 | "tokenizer": "url_all_malformed"
132 | },
133 | "url_highlight_test": {
134 | "tokenizer": "url_highlight_test"
135 | },
136 | "bulk_indexing_test": {
137 | "type": "custom",
138 | "tokenizer": "whitespace",
139 | "filter": ["bulk_indexing_test"]
140 | }
141 | }
142 | }
143 | }
--------------------------------------------------------------------------------