├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── pom.xml
└── src
├── main
├── assemblies
│ └── plugin.xml
├── java
│ └── org
│ │ └── elasticsearch
│ │ ├── index
│ │ └── analysis
│ │ │ ├── URLPart.java
│ │ │ ├── URLPartComparator.java
│ │ │ ├── URLTokenFilterFactory.java
│ │ │ ├── URLTokenizerFactory.java
│ │ │ └── url
│ │ │ ├── Token.java
│ │ │ ├── URLTokenFilter.java
│ │ │ ├── URLTokenizer.java
│ │ │ └── URLUtils.java
│ │ └── plugin
│ │ └── analysis
│ │ └── AnalysisURLPlugin.java
└── resources
│ └── plugin-descriptor.properties
└── test
├── java
└── org
│ └── elasticsearch
│ └── index
│ └── analysis
│ └── url
│ ├── IsTokenStreamWithTokenAndPosition.java
│ ├── IsTokenizerWithToken.java
│ ├── OptionalMatchers.java
│ ├── URLAnalysisTestCase.java
│ ├── URLTokenFilterIntegrationTest.java
│ ├── URLTokenFilterTest.java
│ ├── URLTokenizerIntegrationTest.java
│ ├── URLTokenizerTest.java
│ └── URLUtilsTest.java
└── resources
├── log4j.properties
├── test-mapping.json
└── test-settings.json
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.gitignore.io
2 |
3 | ### Elasticsearch ###
4 | /data
5 |
6 | ### Java ###
7 | *.class
8 |
9 | # Mobile Tools for Java (J2ME)
10 | .mtj.tmp/
11 |
12 | # Package Files #
13 | *.jar
14 | *.war
15 | *.ear
16 |
17 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
18 | hs_err_pid*
19 |
20 |
21 | ### Maven ###
22 | target/
23 | pom.xml.tag
24 | pom.xml.releaseBackup
25 | pom.xml.versionsBackup
26 | pom.xml.next
27 | release.properties
28 |
29 |
30 | ### Intellij ###
31 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm
32 |
33 | *.iml
34 |
35 | ## Directory-based project format:
36 | .idea/
37 | # if you remove the above rule, at least ignore the following:
38 |
39 | # User-specific stuff:
40 | # .idea/workspace.xml
41 | # .idea/tasks.xml
42 | # .idea/dictionaries
43 |
44 | # Sensitive or high-churn files:
45 | # .idea/dataSources.ids
46 | # .idea/dataSources.xml
47 | # .idea/sqlDataSources.xml
48 | # .idea/dynamic.xml
49 | # .idea/uiDesigner.xml
50 |
51 | # Gradle:
52 | # .idea/gradle.xml
53 | # .idea/libraries
54 |
55 | # Mongo Explorer plugin:
56 | # .idea/mongoSettings.xml
57 |
58 | ## File-based project format:
59 | *.ipr
60 | *.iws
61 |
62 | ## Plugin-specific files:
63 |
64 | # IntelliJ
65 | out/
66 |
67 | # mpeltonen/sbt-idea plugin
68 | .idea_modules/
69 |
70 | # JIRA plugin
71 | atlassian-ide-plugin.xml
72 |
73 | # Crashlytics plugin (for Android Studio and IntelliJ)
74 | com_crashlytics_export_strings.xml
75 | crashlytics.properties
76 | crashlytics-build.properties
77 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 |
3 | jdk:
4 | - oraclejdk8
5 |
6 | script: mvn test -Dtests.security.manager=false
7 |
8 | sudo: false
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Elasticsearch URL Tokenizer and URL Token Filter
2 | ==============================
3 |
4 | This plugin enables URL tokenization and token filtering by URL part.
5 |
6 | [](http://travis-ci.org/jlinn/elasticsearch-analysis-url)
7 |
8 | ## Compatibility
9 |
10 | | Elasticsearch Version | Plugin Version |
11 | |-----------------------|----------------|
12 | | 5.6.3 | 5.6.3.0 |
13 | | 5.6.1 | 5.6.1.0 |
14 | | 5.5.1 | 5.5.1.0 |
15 | | 5.5.0 | 5.5.0.0 |
16 | | 5.2.2 | 5.2.2.0 |
17 | | 5.2.1 | 5.2.1.1 |
18 | | 5.1.1 | 5.1.1.0 |
19 | | 5.0.0 | 5.0.0.1 |
20 | | 2.4.3 | 2.4.3.0 |
21 | | 2.4.1 | 2.4.1.0 |
22 | | 2.4.0 | 2.4.0.0 |
23 | | 2.3.5 | 2.3.5.0 |
24 | | 2.3.4 | 2.3.4.3 |
25 | | 2.3.3 | 2.3.3.5 |
26 | | 2.3.2 | 2.3.2.1 |
27 | | 2.3.1 | 2.3.1.1 |
28 | | 2.3.0 | 2.3.0.1 |
29 | | 2.2.2 | 2.2.3 |
30 | | 2.2.1 | 2.2.2.1 |
31 | | 2.2.0 | 2.2.1 |
32 | | 2.1.1 | 2.2.0 |
33 | | 2.1.1 | 2.1.1 |
34 | | 2.0.0 | 2.1.0 |
35 | | 1.6.x, 1.7.x | 2.0.0 |
36 | | 1.6.0 | 1.2.1 |
37 | | 1.5.2 | 1.1.0 |
38 | | 1.4.2 | 1.0.0 |
39 |
40 | ## Installation
41 | ### Elasticsearch v5
42 | ```bash
43 | bin/elasticsearch-plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v5.6.3.0/elasticsearch-analysis-url-5.6.3.0.zip
44 | ```
45 |
46 | ### Elasticsearch v2
47 | ```bash
48 | bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.4.3.0/elasticsearch-analysis-url-2.4.3.0.zip
49 | ```
50 |
51 | ## Usage
52 | ### URL Tokenizer
53 | #### Options:
54 | * `part`: Defaults to `null`. If left `null`, all URL parts will be tokenized, and some additional tokens (`host:port` and `protocol://host`) will be included. Can be either a string (single URL part) or an array of multiple URL parts. Options are `whole`, `protocol`, `host`, `port`, `path`, `query`, and `ref`.
55 | * `url_decode`: Defaults to `false`. If `true`, URL tokens will be URL decoded.
56 | * `allow_malformed`: Defaults to `false`. If `true`, malformed URLs will not be rejected, but will be passed through without being tokenized.
57 | * `tokenize_malformed`: Defaults to `false`. Has no effect if `allow_malformed` is `false`. If both are `true`, an attempt will be made to tokenize malformed URLs using regular expressions.
58 | * `tokenize_host`: Defaults to `true`. If `true`, the host will be further tokenized using a [reverse path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `.`.
59 | * `tokenize_path`: Defaults to `true`. If `true`, the path will be tokenized using a [path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `/`.
60 | * `tokenize_query`: Defaults to `true`. If `true`, the query string will be split on `&`.
61 |
62 | #### Example:
63 | Index settings:
64 | ```json
65 | {
66 | "settings": {
67 | "analysis": {
68 | "tokenizer": {
69 | "url_host": {
70 | "type": "url",
71 | "part": "host"
72 | }
73 | },
74 | "analyzer": {
75 | "url_host": {
76 | "tokenizer": "url_host"
77 | }
78 | }
79 | }
80 | }
81 | }
82 | ```
83 |
84 | Make an analysis request:
85 | ```bash
86 | curl 'http://localhost:9200/index_name/_analyze?analyzer=url_host&pretty' -d 'https://foo.bar.com/baz.html'
87 |
88 | {
89 | "tokens" : [ {
90 | "token" : "foo.bar.com",
91 | "start_offset" : 8,
92 | "end_offset" : 19,
93 | "type" : "host",
94 | "position" : 1
95 | }, {
96 | "token" : "bar.com",
97 | "start_offset" : 12,
98 | "end_offset" : 19,
99 | "type" : "host",
100 | "position" : 2
101 | }, {
102 | "token" : "com",
103 | "start_offset" : 16,
104 | "end_offset" : 19,
105 | "type" : "host",
106 | "position" : 3
107 | } ]
108 | }
109 | ```
110 |
111 | ### URL Token Filter
112 | #### Options:
113 | * `part`: This option defaults to `whole`, which will cause the entire URL to be returned. In this case, the filter only serves to validate incoming URLs. Other possible values are:
114 | `protocol`, `host`, `port`, `path`, `query`, and `ref`. Can be either a single URL part (string) or an array of URL parts.
115 | * `url_decode`: Defaults to `false`. If `true`, the desired portion of the URL will be URL decoded.
116 | * `allow_malformed`: Defaults to `false`. If `true`, documents containing malformed URLs will not be rejected, and an attempt will be made to parse the desired URL part from the malformed URL string.
117 | If the desired part cannot be found, no value will be indexed for that field.
118 | * `passthrough`: Defaults to `false`. If `true`, `allow_malformed` is implied, and any non-URL tokens will be passed through the filter. Valid URLs will be tokenized according to the filter's other settings.
119 | * `tokenize_host`: Defaults to `true`. If `true`, the host will be further tokenized using a [reverse path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `.`.
120 | * `tokenize_path`: Defaults to `true`. If `true`, the path will be tokenized using a [path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `/`.
121 | * `tokenize_query`: Defaults to `true`. If `true`, the query string will be split on `&`.
122 |
123 | #### Example:
124 | Set up your index like so:
125 | ```json
126 | {
127 | "settings": {
128 | "analysis": {
129 | "filter": {
130 | "url_host": {
131 | "type": "url",
132 | "part": "host",
133 | "url_decode": true,
134 | "tokenize_host": false
135 | }
136 | },
137 | "analyzer": {
138 | "url_host": {
139 | "filter": ["url_host"],
140 | "tokenizer": "whitespace"
141 | }
142 | }
143 | }
144 | },
145 | "mappings": {
146 | "example_type": {
147 | "properties": {
148 | "url": {
149 | "type": "multi_field",
150 | "fields": {
151 | "url": {"type": "string"},
152 | "host": {"type": "string", "analyzer": "url_host"}
153 | }
154 | }
155 | }
156 | }
157 | }
158 | }
159 | ```
160 |
161 | Make an analysis request:
162 | ```bash
163 | curl 'http://localhost:9200/index_name/_analyze?analyzer=url_host&pretty' -d 'https://foo.bar.com/baz.html'
164 |
165 | {
166 | "tokens" : [ {
167 | "token" : "foo.bar.com",
168 | "start_offset" : 0,
169 | "end_offset" : 32,
170 | "type" : "word",
171 | "position" : 1
172 | } ]
173 | }
174 | ```
175 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | org.elasticsearch
8 | elasticsearch-analysis-url
9 | 5.6.3.0
10 | jar
11 | Elasticsearch URL token filter plugin
12 |
13 |
14 | org.sonatype.oss
15 | oss-parent
16 | 9
17 |
18 |
19 |
20 | UTF-8
21 | 5.6.3
22 | 6.6.1
23 | 1.3
24 | 19.0
25 | onerror
26 | true
27 | elasticsearch.yml
28 | INFO
29 |
30 |
31 |
32 |
33 | sonatype
34 | http://oss.sonatype.org/content/repositories/releases
35 |
36 |
37 |
38 |
39 |
40 | com.google.guava
41 | guava
42 | ${guava.version}
43 |
44 |
45 |
46 |
47 |
48 | org.elasticsearch
49 | elasticsearch
50 | ${elasticsearch.version}
51 | compile
52 |
53 |
54 |
55 | org.elasticsearch.test
56 | framework
57 | ${elasticsearch.version}
58 | test
59 |
60 |
61 | org.hamcrest
62 | hamcrest-all
63 |
64 |
65 | junit
66 | junit
67 |
68 |
69 |
70 |
71 |
72 | org.hamcrest
73 | hamcrest-all
74 | ${hamcrest.version}
75 | test
76 |
77 |
78 |
79 | junit
80 | junit
81 | 4.12
82 | test
83 |
84 |
85 | org.hamcrest
86 | hamcrest-core
87 |
88 |
89 |
90 |
91 |
92 | org.apache.logging.log4j
93 | log4j-core
94 | 2.9.1
95 | test
96 |
97 |
98 |
99 | org.slf4j
100 | slf4j-simple
101 | 1.7.12
102 | test
103 |
104 |
105 |
106 |
107 |
108 |
109 | src/main/resources
110 | true
111 |
112 | *.properties
113 |
114 |
115 |
116 | src/main/resources
117 | false
118 |
119 | *.properties
120 |
121 |
122 |
123 |
124 |
125 | ${basedir}/src/test/java
126 |
127 | **/*.json
128 | **/*.yml
129 | **/*.txt
130 | **/*.properties
131 |
132 | true
133 |
134 |
135 | ${basedir}/src/test/resources
136 |
137 | **/*.*
138 |
139 |
140 |
141 |
142 |
143 |
144 | org.apache.maven.plugins
145 | maven-compiler-plugin
146 | 3.2
147 |
148 | 1.8
149 | 1.8
150 |
151 |
152 |
153 |
154 | com.carrotsearch.randomizedtesting
155 | junit4-maven-plugin
156 | 2.1.11
157 |
158 |
159 |
160 | org.apache.maven.plugins
161 | maven-surefire-plugin
162 | 2.19.1
163 |
164 |
165 | -Dtests.security.manager=false
166 |
167 |
168 |
169 |
170 | org.apache.maven.plugins
171 | maven-source-plugin
172 | 2.4
173 |
174 |
175 |
176 | org.apache.maven.plugins
177 | maven-assembly-plugin
178 | 2.5.3
179 |
180 | false
181 | ${project.build.directory}/releases/
182 |
183 | ${basedir}/src/main/assemblies/plugin.xml
184 |
185 |
186 |
187 |
188 | package
189 |
190 | single
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
--------------------------------------------------------------------------------
/src/main/assemblies/plugin.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | plugin
4 |
5 | zip
6 |
7 | false
8 |
9 |
10 | src/main/resources/plugin-descriptor.properties
11 | /elasticsearch/
12 | true
13 |
14 |
15 |
16 |
17 | /elasticsearch/
18 | true
19 | true
20 |
21 | org.elasticsearch:elasticsearch
22 |
23 |
24 |
25 | /elasticsearch/
26 | true
27 | true
28 |
29 | ${project.name}-${project.version}.jar
30 |
31 |
32 |
33 | /elasticsearch/
34 | true
35 | true
36 |
37 | com.google.guava:guava
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/URLPart.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
/**
 * Enumerates the recognizable parts of a URL. Each constant carries an
 * {@code order} value used to sort tokens so that they appear in the order
 * in which their parts occur within a URL.
 *
 * Joe Linn
 * 1/17/2015
 */
public enum URLPart {
    PROTOCOL((short) 1),
    HOST((short) 2),
    PORT((short) 3),
    PATH((short) 4),
    REF((short) 5),
    QUERY((short) 6),
    WHOLE((short) 7);

    // relative position of this part when ordering tokens by URL part
    private final short order;

    URLPart(short order) {
        this.order = order;
    }

    /**
     * @return the sort order of this URL part
     */
    public short getOrder() {
        return order;
    }

    /**
     * Resolves a string such as {@code "host"} to its {@link URLPart}
     * constant, ignoring case.
     *
     * @param part case-insensitive name of a URL part
     * @return the matching {@link URLPart}
     * @throws IllegalArgumentException if no constant matches the given string
     */
    public static URLPart fromString(String part) {
        for (URLPart candidate : values()) {
            if (candidate.name().equalsIgnoreCase(part)) {
                return candidate;
            }
        }
        throw new IllegalArgumentException(String.format("Unrecognized URL part: %s", part));
    }
}
35 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/URLPartComparator.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import java.util.Comparator;
4 |
5 | /**
6 | * @author Joe Linn
7 | * 11/13/2016
8 | */
9 | public class URLPartComparator implements Comparator {
10 | @Override
11 | public int compare(URLPart o1, URLPart o2) {
12 | return o1.getOrder() - o2.getOrder();
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.apache.lucene.analysis.TokenStream;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.index.IndexSettings;
7 | import org.elasticsearch.index.analysis.url.URLTokenFilter;
8 |
9 | import java.util.Arrays;
10 | import java.util.List;
11 | import java.util.stream.Collectors;
12 |
13 | /**
14 | * Joe Linn
15 | * 1/17/2015
16 | */
17 | public class URLTokenFilterFactory extends AbstractTokenFilterFactory {
18 | private final List parts;
19 | private final boolean urlDecode;
20 | private boolean tokenizeHost;
21 | private boolean tokenizePath;
22 | private boolean tokenizeQuery;
23 | private final boolean allowMalformed;
24 | private final boolean tokenizeMalformed;
25 | private final boolean passthrough;
26 |
27 |
28 | public URLTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
29 | super(indexSettings, name, settings);
30 |
31 | this.parts = Arrays.stream(settings.getAsArray("part", new String[]{"whole"}))
32 | .map(URLPart::fromString)
33 | .collect(Collectors.toList());
34 |
35 | this.urlDecode = settings.getAsBoolean("url_decode", false);
36 | this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
37 | this.tokenizePath = settings.getAsBoolean("tokenize_path", true);
38 | this.tokenizeQuery = settings.getAsBoolean("tokenize_query", true);
39 | this.allowMalformed = settings.getAsBoolean("allow_malformed", false);
40 | this.tokenizeMalformed = settings.getAsBoolean("tokenize_malformed", false);
41 | this.passthrough = settings.getAsBoolean("passthrough", false);
42 | }
43 |
44 |
45 | @Override
46 | public TokenStream create(TokenStream tokenStream) {
47 | return new URLTokenFilter(tokenStream, null, urlDecode, allowMalformed, passthrough)
48 | .setParts(parts)
49 | .setTokenizeMalformed(tokenizeMalformed)
50 | .setTokenizeHost(tokenizeHost)
51 | .setTokenizePath(tokenizePath)
52 | .setTokenizeQuery(tokenizeQuery);
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/URLTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.apache.lucene.analysis.Tokenizer;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.index.IndexSettings;
7 | import org.elasticsearch.index.analysis.url.URLTokenizer;
8 |
9 | import java.util.Arrays;
10 | import java.util.List;
11 | import java.util.stream.Collectors;
12 |
13 | /**
14 | * Joe Linn
15 | * 8/1/2015
16 | */
17 | public class URLTokenizerFactory extends AbstractTokenizerFactory {
18 | private List parts;
19 | private boolean urlDecode;
20 | private boolean tokenizeHost;
21 | private boolean tokenizePath;
22 | private boolean tokenizeQuery;
23 | private boolean allowMalformed;
24 | private boolean tokenizeMalformed;
25 |
26 |
27 | public URLTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
28 | super(indexSettings, name, settings);
29 |
30 | String[] parts = settings.getAsArray("part");
31 | if (parts != null && parts.length > 0) {
32 | this.parts = Arrays.stream(parts)
33 | .map(URLPart::fromString)
34 | .collect(Collectors.toList());
35 | }
36 | this.urlDecode = settings.getAsBoolean("url_decode", false);
37 | this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
38 | this.tokenizePath = settings.getAsBoolean("tokenize_path", true);
39 | this.tokenizeQuery = settings.getAsBoolean("tokenize_query", true);
40 | this.allowMalformed = settings.getAsBoolean("allow_malformed", false);
41 | this.tokenizeMalformed = settings.getAsBoolean("tokenize_malformed", false);
42 | }
43 |
44 |
45 | @Override
46 | public Tokenizer create() {
47 | URLTokenizer tokenizer = new URLTokenizer();
48 | tokenizer.setParts(parts);
49 | tokenizer.setUrlDecode(urlDecode);
50 | tokenizer.setTokenizeHost(tokenizeHost);
51 | tokenizer.setTokenizePath(tokenizePath);
52 | tokenizer.setTokenizeQuery(tokenizeQuery);
53 | tokenizer.setAllowMalformed(allowMalformed);
54 | tokenizer.setTokenizeMalformed(tokenizeMalformed);
55 | return tokenizer;
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/url/Token.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import com.google.common.base.Objects;
4 | import org.elasticsearch.index.analysis.URLPart;
5 |
6 | /**
7 | * @author Joe Linn
8 | * 8/14/2016
9 | */
10 | class Token {
11 | private final String token;
12 | private final URLPart part;
13 | private final int start;
14 | private final int end;
15 |
16 | public Token(String token, URLPart part, int start, int end) {
17 | this.token = token;
18 | this.part = part;
19 | this.start = start;
20 | this.end = end;
21 | }
22 |
23 | public String getToken() {
24 | return token;
25 | }
26 |
27 | public URLPart getPart() {
28 | return part;
29 | }
30 |
31 | public int getStart() {
32 | return start;
33 | }
34 |
35 | public int getEnd() {
36 | return end;
37 | }
38 |
39 |
40 | @Override
41 | public boolean equals(Object obj) {
42 | if (obj == null || !(obj instanceof Token)) {
43 | return false;
44 | }
45 | Token that = (Token) obj;
46 | return this.start == that.start
47 | && this.end == that.end
48 | && Objects.equal(this.token, that.token)
49 | && Objects.equal(this.part, that.part);
50 | }
51 |
52 | @Override
53 | public int hashCode() {
54 | int result = token != null ? token.hashCode() : 0;
55 | result = 31 * result + part.hashCode();
56 | result = 31 * result + start;
57 | result = 31 * result + end;
58 | return result;
59 | }
60 |
61 |
62 | @Override
63 | public String toString() {
64 | return "Token{" +
65 | "token='" + token + '\'' +
66 | ", part=" + part +
67 | ", start=" + start +
68 | ", end=" + end +
69 | '}';
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.apache.lucene.analysis.TokenFilter;
4 | import org.apache.lucene.analysis.TokenStream;
5 | import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
6 | import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
8 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
9 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
10 | import org.elasticsearch.common.Strings;
11 | import org.elasticsearch.index.analysis.URLPart;
12 |
13 | import java.io.IOException;
14 | import java.io.StringReader;
15 | import java.net.MalformedURLException;
16 | import java.util.ArrayList;
17 | import java.util.Collections;
18 | import java.util.Iterator;
19 | import java.util.List;
20 | import java.util.regex.Matcher;
21 | import java.util.regex.Pattern;
22 |
23 | /**
24 | * Joe Linn
25 | * 1/17/2015
26 | */
27 | public final class URLTokenFilter extends TokenFilter {
28 | public static final String NAME = "url";
29 |
30 | private List parts;
31 |
32 | private boolean urlDeocde;
33 |
34 | /**
35 | * If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer}
36 | */
37 | private boolean tokenizeHost = true;
38 |
39 | /**
40 | * If true, the url's path will be tokenized using a {@link PathHierarchyTokenizer}
41 | */
42 | private boolean tokenizePath = true;
43 |
44 | /**
45 | * If true, the url's query string will be split on &
46 | */
47 | private boolean tokenizeQuery = true;
48 |
49 | private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
50 | private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
51 | private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
52 |
53 | private final boolean allowMalformed;
54 |
55 | private boolean tokenizeMalformed;
56 |
57 | private boolean passthrough;
58 |
59 | private List tokens;
60 | private Iterator iterator;
61 |
62 | public URLTokenFilter(TokenStream input, URLPart part) {
63 | this(input, part, false);
64 | }
65 |
66 | public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode) {
67 | this(input, part, urlDecode, false);
68 | }
69 |
70 | public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed) {
71 | this(input, part, urlDecode, allowMalformed, false);
72 | }
73 |
74 | public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed, boolean passthrough) {
75 | super(input);
76 | if (part != null) {
77 | this.parts = Collections.singletonList(part);
78 | } else {
79 | parts = null;
80 | }
81 | this.urlDeocde = urlDecode;
82 | this.allowMalformed = allowMalformed;
83 | this.passthrough = passthrough;
84 | }
85 |
86 |
87 | public URLTokenFilter setParts(List parts) {
88 | this.parts = parts;
89 | return this;
90 | }
91 |
92 | public URLTokenFilter setTokenizeHost(boolean tokenizeHost) {
93 | this.tokenizeHost = tokenizeHost;
94 | return this;
95 | }
96 |
97 | public URLTokenFilter setTokenizePath(boolean tokenizePath) {
98 | this.tokenizePath = tokenizePath;
99 | return this;
100 | }
101 |
102 | public URLTokenFilter setTokenizeQuery(boolean tokenizeQuery) {
103 | this.tokenizeQuery = tokenizeQuery;
104 | return this;
105 | }
106 |
107 |
108 | public URLTokenFilter setTokenizeMalformed(boolean tokenizeMalformed) {
109 | this.tokenizeMalformed = tokenizeMalformed;
110 | return this;
111 | }
112 |
113 | public URLTokenFilter setUrlDeocde(boolean urlDeocde) {
114 | this.urlDeocde = urlDeocde;
115 | return this;
116 | }
117 |
118 |
119 | @Override
120 | public boolean incrementToken() throws IOException {
121 | if (iterator == null || !iterator.hasNext()) {
122 | if ((iterator != null && !iterator.hasNext() && !passthrough) || !advance()) {
123 | return false;
124 | }
125 | }
126 | clearAttributes();
127 | Token next = iterator.next();
128 | termAttribute.append(next.getToken());
129 | typeAttribute.setType(next.getPart().name().toLowerCase());
130 | offsetAttribute.setOffset(next.getStart(), next.getEnd());
131 | return true;
132 | }
133 |
134 |
135 | /**
136 | * Advance to the next token, if any
137 | * @return true if more tokens are forthcoming, false otherwise
138 | * @throws IOException
139 | */
140 | private boolean advance() throws IOException {
141 | if (input.incrementToken()) {
142 | String urlString = termAttribute.toString();
143 | if ((Strings.isNullOrEmpty(urlString) || "null".equals(urlString)) && !allowMalformed && !passthrough) {
144 | return false;
145 | }
146 | try {
147 | tokens = tokenize(urlString);
148 | } catch (IOException e) {
149 | if (e.getMessage().contains("Malformed URL")) {
150 | if (allowMalformed) {
151 | tokens = Collections.singletonList(new Token(urlString, URLPart.WHOLE, 0, urlString.length()));
152 | } else {
153 | throw new MalformedURLException("Malformed URL: " + urlString);
154 | }
155 | }
156 | throw e;
157 | }
158 | if (tokens.isEmpty()) {
159 | return false;
160 | }
161 | iterator = tokens.iterator();
162 | return true;
163 | } else {
164 | return false;
165 | }
166 | }
167 |
168 |
169 | /**
170 | * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
171 | * will be passed along to the tokenizer.
172 | * @param input a string to be tokenized
173 | * @return a list of tokens extracted from the input string
174 | * @throws IOException
175 | */
176 | private List tokenize(String input) throws IOException {
177 | List tokens = new ArrayList<>();
178 | URLTokenizer tokenizer = new URLTokenizer();
179 | // create a copy of the parts list to avoid ConcurrentModificationException when sorting
180 | tokenizer.setParts(new ArrayList<>(parts));
181 | tokenizer.setUrlDecode(urlDeocde);
182 | tokenizer.setTokenizeHost(tokenizeHost);
183 | tokenizer.setTokenizePath(tokenizePath);
184 | tokenizer.setTokenizeQuery(tokenizeQuery);
185 | tokenizer.setAllowMalformed(allowMalformed || passthrough);
186 | tokenizer.setTokenizeMalformed(tokenizeMalformed);
187 | tokenizer.setReader(new StringReader(input));
188 | tokenizer.reset();
189 |
190 | String term;
191 | URLPart part;
192 | OffsetAttribute offset;
193 | while (tokenizer.incrementToken()) {
194 | term = tokenizer.getAttribute(CharTermAttribute.class).toString();
195 | part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
196 | offset = tokenizer.getAttribute(OffsetAttribute.class);
197 | tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
198 | }
199 | return tokens;
200 | }
201 |
202 |
203 | @Override
204 | public void reset() throws IOException {
205 | super.reset();
206 | tokens = null;
207 | iterator = null;
208 | }
209 |
210 | private static final Pattern REGEX_PROTOCOL = Pattern.compile("^([a-zA-Z]+)(?=://)");
211 | private static final Pattern REGEX_PORT = Pattern.compile(":([0-9]{1,5})");
212 | private static final Pattern REGEX_QUERY = Pattern.compile("\\?(.+)");
213 |
214 | /**
215 | * Attempt to parse a malformed url string
216 | * @param urlString the malformed url string
217 | * @return the url part if it can be parsed, null otherwise
218 | * @deprecated parsing of malformed URLs is now delegated to {@link URLTokenizer}
219 | */
220 | private String parseMalformed(String urlString) {
221 | if (parts != null && !parts.isEmpty()) {
222 | String ret;
223 | for (URLPart part : parts) {
224 | switch (part) {
225 | case PROTOCOL:
226 | ret = applyPattern(REGEX_PROTOCOL, urlString);
227 | break;
228 | case PORT:
229 | ret = applyPattern(REGEX_PORT, urlString);
230 | break;
231 | case QUERY:
232 | ret = applyPattern(REGEX_QUERY, urlString);
233 | break;
234 | case WHOLE:
235 | ret = urlString;
236 | break;
237 | default:
238 | ret = urlString;
239 | }
240 | if (!Strings.isNullOrEmpty(ret)) {
241 | return ret;
242 | }
243 | }
244 | }
245 | return urlString;
246 | }
247 |
248 | /**
249 | * Apply the given regex pattern to the given malformed url string and return the first match
250 | * @param pattern the pattern to match
251 | * @param urlString the malformed url to which the pattern should be applied
252 | * @return the first match if one exists, null otherwise
253 | */
254 | private String applyPattern(Pattern pattern, String urlString) {
255 | Matcher matcher = pattern.matcher(urlString);
256 | if (matcher.find()) {
257 | return matcher.group(1);
258 | }
259 | return null;
260 | }
261 | }
262 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import com.google.common.base.Strings;
4 | import com.google.common.collect.Lists;
5 | import com.google.common.net.InetAddresses;
6 | import org.apache.lucene.analysis.Tokenizer;
7 | import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
8 | import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
9 | import org.apache.lucene.analysis.pattern.PatternTokenizer;
10 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
11 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
12 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
13 | import org.apache.lucene.util.AttributeFactory;
14 | import org.elasticsearch.index.analysis.URLPart;
15 | import org.elasticsearch.index.analysis.URLPartComparator;
16 |
17 | import java.io.IOException;
18 | import java.io.Reader;
19 | import java.io.StringReader;
20 | import java.net.MalformedURLException;
21 | import java.net.URL;
22 | import java.net.URLDecoder;
23 | import java.util.*;
24 | import java.util.regex.Pattern;
25 |
26 | import static org.elasticsearch.index.analysis.url.URLUtils.getPart;
27 |
28 | /**
29 | * Joe Linn
30 | * 7/30/2015
31 | */
32 | public final class URLTokenizer extends Tokenizer {
33 | private static final URLPartComparator PART_COMPARATOR = new URLPartComparator();
34 |
35 | /**
36 | * If set, only the given part of the url will be tokenized.
37 | */
38 | private List parts;
39 |
40 | /**
41 | * If true, url parts will be url decoded prior to tokenization.
42 | */
43 | private boolean urlDecode;
44 |
45 | /**
46 | * If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer}
47 | */
48 | private boolean tokenizeHost = true;
49 |
50 | /**
51 | * If true, the url's path will be tokenized using a {@link PathHierarchyTokenizer}
52 | */
53 | private boolean tokenizePath = true;
54 |
55 | /**
56 | * If true, the url's query string will be split on &
57 | */
58 | private boolean tokenizeQuery = true;
59 |
60 | /**
61 | * If true, {@link MalformedURLException} will be suppressed, and the given string will be returned as a single token
62 | */
63 | private boolean allowMalformed;
64 |
65 | /**
66 | * Has no effect if {@link #allowMalformed} is false. If both are true, an attempt will be made to tokenize malformed
67 | * URLs using regular expressions.
68 | */
69 | private boolean tokenizeMalformed;
70 |
71 |
72 | private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
73 | private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
74 | private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
75 |
76 | private List tokens;
77 | private Iterator iterator;
78 |
79 |
80 | public URLTokenizer() {
81 |
82 | }
83 |
84 | public URLTokenizer(URLPart part) {
85 | setPart(part);
86 | }
87 |
88 |
89 | public URLTokenizer(AttributeFactory factory) {
90 | super(factory);
91 | }
92 |
93 | public void setParts(List parts) {
94 | if (parts != null) {
95 | parts.sort(PART_COMPARATOR);
96 | this.parts = parts;
97 | }
98 | }
99 |
100 | public void setPart(URLPart part) {
101 | if (part != null) {
102 | this.parts = Collections.singletonList(part);
103 | }
104 | }
105 |
106 | public void setUrlDecode(boolean urlDecode) { this.urlDecode = urlDecode; }
107 |
108 | public void setTokenizeHost(boolean tokenizeHost) { this.tokenizeHost = tokenizeHost; }
109 |
110 | public void setTokenizePath(boolean tokenizePath) { this.tokenizePath = tokenizePath; }
111 |
112 | public void setTokenizeQuery(boolean tokenizeQuery) { this.tokenizeQuery = tokenizeQuery; }
113 |
114 | public void setAllowMalformed(boolean allowMalformed) { this.allowMalformed = allowMalformed; }
115 |
116 | public void setTokenizeMalformed(boolean tokenizeMalformed) { this.tokenizeMalformed = tokenizeMalformed; }
117 |
118 | @Override
119 | public boolean incrementToken() throws IOException {
120 | if (iterator == null) {
121 | String urlString = readerToString(input);
122 | if (Strings.isNullOrEmpty(urlString)) {
123 | return false;
124 | }
125 | tokens = tokenize(urlString);
126 | iterator = tokens.iterator();
127 | }
128 | if (!iterator.hasNext()) {
129 | return false;
130 | }
131 |
132 | clearAttributes();
133 | Token token = iterator.next();
134 | termAttribute.append(token.getToken());
135 | typeAttribute.setType(token.getPart().name().toLowerCase());
136 | offsetAttribute.setOffset(token.getStart(), token.getEnd());
137 | return true;
138 | }
139 |
140 |
141 | @Override
142 | public void reset() throws IOException {
143 | super.reset();
144 | tokens = null;
145 | iterator = null;
146 | }
147 |
148 |
149 | /**
150 | * Read the contents of a {@link Reader} into a string
151 | * @param reader the reader to be converted
152 | * @return the entire contents of the given reader
153 | * @throws IOException
154 | */
155 | private String readerToString(Reader reader) throws IOException {
156 | char[] arr = new char[8 * 1024];
157 | StringBuilder buffer = new StringBuilder();
158 | int numCharsRead;
159 | while ((numCharsRead = reader.read(arr, 0, arr.length)) != -1) {
160 | buffer.append(arr, 0, numCharsRead);
161 | }
162 | return buffer.toString();
163 | }
164 |
165 |
166 | /**
167 | * Tokenize the given URL string according to the options which have been set.
168 | * @param urlString the string to be tokenized
169 | * @return a list of {@link Token}s parsed from the string
170 | * @throws IOException
171 | */
172 | private List tokenize(String urlString) throws IOException {
173 | try {
174 | URL url = new URL(urlString);
175 | if (parts != null && !parts.isEmpty()) {
176 | List tokensList = new ArrayList<>();
177 | for (URLPart part : parts) {
178 | tokensList.addAll(tokenize(url, part));
179 | }
180 | return tokensList;
181 | }
182 | // No part is specified. Tokenize all parts.
183 | Set tokens = new LinkedHashSet<>();
184 | for (URLPart urlPart : URLPart.values()) {
185 | tokens.addAll(tokenize(url, urlPart));
186 | }
187 | tokens.addAll(tokenizeSpecial(url));
188 | return Lists.newArrayList(tokens);
189 | } catch (MalformedURLException e) {
190 | if (allowMalformed) {
191 | if (tokenizeMalformed && parts != null && !parts.isEmpty()) {
192 | return tokenizePartsMalformed(urlString, parts);
193 | }
194 | return tokenizeMalformed(urlString, (parts == null || parts.isEmpty()) ? null : URLPart.WHOLE);
195 | }
196 | throw new IOException("Malformed URL: " + urlString, e);
197 | }
198 | }
199 |
200 |
201 | /**
202 | * Tokenize all given parts of the given URL while ensuring that duplicate tokens are not created when the whole
203 | * malformed URL is is identical to a single part token.
204 | * @param urlString the malformed URL to be tokenized
205 | * @param parts the desired {@link URLPart}s in proper part order
206 | * @return a list of {@link Token}s
207 | * @throws IOException
208 | */
209 | private List tokenizePartsMalformed(String urlString, List parts) throws IOException {
210 | List tokens = new ArrayList<>();
211 | Set tokenStrings = new HashSet<>();
212 | for (URLPart part : parts) {
213 | for (Token token : tokenizeMalformed(urlString, part)) {
214 | if (part != URLPart.WHOLE) {
215 | tokens.add(token);
216 | tokenStrings.add(token.getToken());
217 | } else if (tokenStrings.isEmpty()) {
218 | // If we couldn't tokenize any of the parts, add the whole thing.
219 | tokens.add(token);
220 | }
221 | }
222 | }
223 | return tokens;
224 | }
225 |
226 |
227 | /**
228 | * Attempt to tokenize the given malformed URL.
229 | * @param url the URL to be tokenized
230 | * @param part the desired part of the URL
231 | * @return {@link List} of {@link Token}s gleaned from the given URL
232 | * @throws IOException
233 | */
234 | private List tokenizeMalformed(String url, URLPart part) throws IOException {
235 | if (part == null) {
236 | // No part is specified. Tokenize all parts.
237 | List urlParts = Arrays.asList(URLPart.values());
238 | urlParts.sort(new URLPartComparator());
239 | return tokenizePartsMalformed(url, urlParts);
240 | }
241 | Optional partOptional = getPart(url, part);
242 | if (!partOptional.isPresent() || partOptional.get().equals("")) {
243 | // desired part was not found
244 | return new ArrayList<>();
245 | }
246 | final String partStringRaw = partOptional.get();
247 | int start = 0;
248 | int end = 0;
249 | String partString = urlDecode(partOptional.get());
250 | switch (part) {
251 | case HOST:
252 | return getHostTokens(url, partStringRaw, partString);
253 | case PORT:
254 | return getPortTokens(url, partStringRaw);
255 | case PATH:
256 | return getPathTokens(url, partStringRaw, partString);
257 | case REF:
258 | return getRefTokens(url, partStringRaw, partString);
259 | case QUERY:
260 | return getQueryTokens(url, partStringRaw, partString);
261 | case PROTOCOL:
262 | return Collections.singletonList(new Token(partString, part, start, partString.length()));
263 | case WHOLE:
264 | return Collections.singletonList(new Token(url, URLPart.WHOLE, 0, url.length() - 1));
265 | default:
266 | }
267 | return Collections.singletonList(new Token(partString, part, start, end));
268 | }
269 |
270 |
271 | /**
272 | * URL decode the given string if {@link #urlDecode} is true. The given partString
is passed through
273 | * unaltered otherwise.
274 | * @param partString string to be URL decoded
275 | * @return URL decoded string if {@link #urlDecode} is true; unaltered string otherwise.
276 | * @throws IOException if malformed URL encoding is present and {@link #allowMalformed} is false.
277 | */
278 | private String urlDecode(String partString) throws IOException {
279 | if (urlDecode) {
280 | try {
281 | partString = URLDecoder.decode(partString, "UTF-8");
282 | } catch (IllegalArgumentException e) {
283 | if (!allowMalformed) {
284 | throw new IOException("Error performing URL decoding on string: " + partString, e);
285 | }
286 | }
287 | }
288 | return partString;
289 | }
290 |
291 |
292 | private static final Pattern QUERY_SEPARATOR = Pattern.compile("&");
293 |
294 | /**
295 | * Tokenize the given {@link URL} based on the desired {@link URLPart} and currently set tokenizer options.
296 | * @param url the url to be tokenized
297 | * @param part the desired part of the url
298 | * @return a list of {@link Token}s parsed from the given url
299 | * @throws IOException
300 | */
301 | private List tokenize(URL url, URLPart part) throws IOException {
302 | String partString = getPart(url, part);
303 | if (Strings.isNullOrEmpty(partString)) {
304 | // desired part was not found
305 | return new ArrayList<>();
306 | }
307 | final String partStringRaw = partString;
308 | int start = 0;
309 | int end = 0;
310 | partString = urlDecode(partString);
311 | switch (part) {
312 | case HOST:
313 | return getHostTokens(url, partStringRaw, partString);
314 | case PORT:
315 | return getPortTokens(url, getPart(url, part));
316 | case PATH:
317 | return getPathTokens(url, partStringRaw, partString);
318 | case QUERY:
319 | return getQueryTokens(url, partStringRaw, partString);
320 | case PROTOCOL:
321 | case WHOLE:
322 | end = partString.length();
323 | break;
324 | case REF:
325 | return getRefTokens(url, partStringRaw, partString);
326 | default:
327 | }
328 | return Collections.singletonList(new Token(partString, part, start, end));
329 | }
330 |
331 |
332 | /**
333 | * Retrieve tokens representing the host of the given URL
334 | * @param url URL to be tokenized
335 | * @param partStringRaw raw (not url decoded) string containing the host
336 | * @param partString potentially url decoded string containing the host
337 | * @return host tokens
338 | * @throws IOException
339 | */
340 | private List getHostTokens(URL url, String partStringRaw, String partString) throws IOException {
341 | return getHostTokens(url.toString(), partStringRaw, partString);
342 | }
343 |
344 |
345 | /**
346 | * Retrieve tokens representing the host of the given URL
347 | * @param url URL to be tokenized
348 | * @param partStringRaw raw (not url decoded) string containing the host
349 | * @param partString potentially url decoded string containing the host
350 | * @return host tokens
351 | * @throws IOException
352 | */
353 | private List getHostTokens(String url, String partStringRaw, String partString) throws IOException {
354 | int start = getStartIndex(url, partStringRaw);
355 | if (!tokenizeHost || InetAddresses.isInetAddress(partString)) {
356 | int end = getEndIndex(start, partStringRaw);
357 | return Collections.singletonList(new Token(partString, URLPart.HOST, start, end));
358 | }
359 | return tokenize(URLPart.HOST, addReader(new ReversePathHierarchyTokenizer('.', '.'), new StringReader(partString)), start);
360 | }
361 |
362 |
363 | private List getPortTokens(URL url, String port) {
364 | return getPortTokens(url.toString(), port);
365 | }
366 |
367 |
368 | private List getPortTokens(String url, String port) {
369 | int start = url.indexOf(":" + port);
370 | int end = 0;
371 | if (start == -1) {
372 | // port was inferred
373 | start = 0;
374 | } else {
375 | // explicit port
376 | start++; // account for :
377 | end = getEndIndex(start, port);
378 | }
379 | return Collections.singletonList(new Token(port, URLPart.PORT, start, end));
380 | }
381 |
382 |
383 | private List getPathTokens(URL url, String partStringRaw, String partString) throws IOException {
384 | return getPathTokens(url.toString(), partStringRaw, partString);
385 | }
386 |
387 |
388 | private List getPathTokens(String url, String partStringRaw, String partString) throws IOException {
389 | int start = getStartIndex(url, partStringRaw);
390 | if (!tokenizePath) {
391 | int end = getEndIndex(start, partStringRaw);
392 | return Collections.singletonList(new Token(partString, URLPart.PATH, start, end));
393 | }
394 | return tokenize(URLPart.PATH, addReader(new PathHierarchyTokenizer('/', '/'), new StringReader(partString)), start);
395 | }
396 |
397 |
398 | private List getRefTokens(URL url, String partStringRaw, String partString) {
399 | return getRefTokens(url.toString(), partStringRaw, partString);
400 | }
401 |
402 |
403 | private List getRefTokens(String url, String partStringRaw, String partString) {
404 | int start = getStartIndex(url, "#" + partStringRaw) + 1;
405 | int end = url.length();
406 | return Collections.singletonList(new Token(partString, URLPart.REF, start, end));
407 | }
408 |
409 |
410 | private List getQueryTokens(URL url, String partStringRaw, String partString) throws IOException {
411 | return getQueryTokens(url.toString(), partStringRaw, partString);
412 | }
413 |
414 |
415 | private List getQueryTokens(String url, String partStringRaw, String partString) throws IOException {
416 | int start = getStartIndex(url, partStringRaw);
417 | if (!tokenizeQuery) {
418 | int end = getEndIndex(start, partStringRaw);
419 | return Collections.singletonList(new Token(partString, URLPart.QUERY, start, end));
420 | }
421 | return tokenize(URLPart.QUERY, addReader(new PatternTokenizer(QUERY_SEPARATOR, -1), new StringReader(partString)), start);
422 | }
423 |
424 |
425 | /**
426 | * Set the given reader on the given tokenizer
427 | * @param tokenizer tokenizer on which the reader is to be set
428 | * @param input the reader to set
429 | * @return the given tokenizer with the given reader set
430 | * @throws IOException
431 | */
432 | private Tokenizer addReader(Tokenizer tokenizer, Reader input) throws IOException {
433 | tokenizer.setReader(input);
434 | return tokenizer;
435 | }
436 |
437 |
438 | /**
439 | * Get the start index of the given string in the given url
440 | * @param url the url
441 | * @param partStringRaw the url part
442 | * @return the starting index of the part string if it is found in the given url, -1 if it is not found
443 | */
444 | private int getStartIndex(URL url, String partStringRaw) {
445 | return getStartIndex(url.toString(), partStringRaw);
446 | }
447 |
448 |
449 | private int getStartIndex(String url, String partStringRaw) {
450 | return url.indexOf(partStringRaw);
451 | }
452 |
453 |
454 | /**
455 | * Get the end index of the given part string
456 | * @param start the start index of the part string
457 | * @param partStringRaw the part string
458 | * @return the end index
459 | */
460 | private int getEndIndex(int start, String partStringRaw) {
461 | return start + partStringRaw.length();
462 | }
463 |
464 |
465 | /**
466 | * Get a list of {@link Token}s from the given {@link Tokenizer}
467 | * @param part the url part which should be used in {@link Token} creation
468 | * @param tokenizer the tokenizer from which tokens will be gleaned
469 | * @return a list of tokens
470 | * @throws IOException
471 | */
472 | private List tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
473 | tokenizer.reset();
474 | List tokens = new ArrayList<>();
475 | OffsetAttribute offset;
476 | String token;
477 | while (tokenizer.incrementToken()) {
478 | token = tokenizer.getAttribute(CharTermAttribute.class).toString();
479 | offset = tokenizer.getAttribute(OffsetAttribute.class);
480 | tokens.add(new Token(token, part, start + offset.startOffset(), start + offset.endOffset()));
481 | }
482 | return tokens;
483 | }
484 |
485 |
486 | /**
487 | * Perform non-standard tokenization.
488 | * @param url the URL to be tokenized
489 | * @return a list of {@link Token}s. Since tokens created in this method do not pertain to a specific part of the url,
490 | * {@link URLPart#WHOLE} will be used.
491 | */
492 | private List tokenizeSpecial(URL url) {
493 | List tokens = new ArrayList<>();
494 | // host:port
495 | String token = getPart(url, URLPart.HOST) + ":" + getPart(url, URLPart.PORT);
496 | int start = getStartIndex(url, token);
497 | int end = 0;
498 | if(start == -1){
499 | // implicit port
500 | start = 0;
501 | } else {
502 | end = getEndIndex(start, token);
503 | }
504 | tokens.add(new Token(token, URLPart.WHOLE, start, end));
505 |
506 | // protocol://host
507 | token = getPart(url, URLPart.PROTOCOL) + "://" + getPart(url, URLPart.HOST);
508 | start = getStartIndex(url, token);
509 | end = getEndIndex(start, token);
510 | tokens.add(new Token(token, URLPart.WHOLE, start, end));
511 | return tokens;
512 | }
513 |
514 |
515 | }
516 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/url/URLUtils.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.elasticsearch.index.analysis.URLPart;
4 |
5 | import java.net.URL;
6 | import java.util.Optional;
7 | import java.util.regex.Matcher;
8 | import java.util.regex.Pattern;
9 |
10 | /**
11 | * Joe Linn
12 | * 7/30/2015
13 | */
14 | public class URLUtils {
15 | private static final Pattern PATTERN_PROTOCOL = Pattern.compile("(^[a-zA-Z]*)://");
16 | private static final Pattern PATTERN_HOST = Pattern.compile("^(?:^[a-zA-Z]*://)?((?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?(?:\\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?)*\\.?)/?(?:.*)");
17 | private static final Pattern PATTERN_PORT = Pattern.compile("^(?:^[a-zA-Z]*://)?(?:(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?(?:\\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?)*\\.?)(?::([0-9]*))?/?(?:.*)");
18 | private static final Pattern PATTERN_PATH = Pattern.compile("(?:^[a-zA-Z]*://)?(?:(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?(?:\\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?)*\\.?)?(?::[0-9]*)?([^\\?\\#&]*)");
19 | private static final Pattern PATTERN_REF = Pattern.compile("(?:^[a-zA-Z]*://)?(?:(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?(?:\\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?)*\\.?)?(?::[0-9]*)?(?:[^\\?\\#&]*)(#[^\\?\\&]*)?");
20 | private static final Pattern PATTERN_QUERY = Pattern.compile("(?:^[a-zA-Z]*://)?(?:(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?(?:\\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|-){0,61}[0-9A-Za-z])?)*\\.?)?(?::[0-9]*)?(?:[^\\?\\#&]*)(?:#[^\\?\\&]*)?(\\?.*)");
21 |
22 | private URLUtils() {}
23 |
24 |
25 | /**
26 | * Attempt to retrieve the desired part of the given URL
27 | * @param url URL to parse
28 | * @param part desired URL part
29 | * @return the part of the URL, if it could be found
30 | */
31 | public static Optional getPart(String url, URLPart part) {
32 | switch (part) {
33 | case PROTOCOL:
34 | return captureFirst(url, PATTERN_PROTOCOL);
35 | case HOST:
36 | return captureFirst(url, PATTERN_HOST);
37 | case PORT:
38 | return getPort(url);
39 | case PATH:
40 | return captureFirst(url, PATTERN_PATH);
41 | case REF:
42 | Optional refOptional = captureFirst(url, PATTERN_REF);
43 | if (refOptional.isPresent()) {
44 | refOptional = Optional.of(refOptional.get().replaceFirst("#", ""));
45 | }
46 | return refOptional;
47 | case QUERY:
48 | Optional queryOptional = captureFirst(url, PATTERN_QUERY);
49 | if (queryOptional.isPresent()) {
50 | queryOptional = Optional.of(queryOptional.get().replaceFirst("\\?", ""));
51 | }
52 | return queryOptional;
53 | case WHOLE:
54 | default:
55 | return Optional.of(url);
56 | }
57 | }
58 |
59 |
60 | private static Optional captureFirst(String input, Pattern pattern) {
61 | Matcher matcher = pattern.matcher(input);
62 | if (matcher.find()) {
63 | String group = matcher.group(1);
64 | if (group == null) {
65 | return Optional.empty();
66 | }
67 | return Optional.of(group);
68 | }
69 | return Optional.empty();
70 | }
71 |
72 |
73 | /**
74 | * Retrieve the given {@link URLPart} from the given {@link URL}
75 | * @param url the url from which a part is to be taken
76 | * @param part the part to be taken from the url
77 | * @return a part of the given url
78 | */
79 | public static String getPart(URL url, URLPart part) {
80 | switch (part) {
81 | case PROTOCOL:
82 | return url.getProtocol();
83 | case HOST:
84 | return url.getHost();
85 | case PORT:
86 | return getPort(url);
87 | case PATH:
88 | return url.getPath();
89 | case REF:
90 | return url.getRef();
91 | case QUERY:
92 | return url.getQuery();
93 | case WHOLE:
94 | default:
95 | return url.toString();
96 | }
97 | }
98 |
99 |
100 | /**
101 | * Parse the port from the given {@link URL}. If the port is not explicitly given, it will be inferred from the
102 | * protocol.
103 | *
104 | * @param url the url
105 | * @return the port
106 | */
107 | public static String getPort(URL url) {
108 | int port = url.getPort();
109 | if (port == -1) {
110 | // infer port from protocol
111 | Optional portOptional = portFromProtocol(url.getProtocol());
112 | return portOptional.orElse(null);
113 | }
114 | return String.valueOf(port);
115 | }
116 |
117 |
118 | public static Optional getPort(String url) {
119 | Optional portOptional = captureFirst(url, PATTERN_PORT);
120 | if (portOptional.isPresent()) {
121 | return portOptional;
122 | }
123 | // attempt to infer port form protocol
124 | Optional protocolOptional = getPart(url, URLPart.PROTOCOL);
125 | if (protocolOptional.isPresent()) {
126 | return portFromProtocol(protocolOptional.get());
127 | }
128 | return Optional.empty();
129 | }
130 |
131 |
132 | private static Optional portFromProtocol(final String protocol) {
133 | int port = -1;
134 | if (protocol.equals("http")) {
135 | port = 80;
136 | } else if (protocol.equals("https")) {
137 | port = 443;
138 | }
139 | if (port == -1) {
140 | // port could not be inferred
141 | return Optional.empty();
142 | }
143 | return Optional.of(String.valueOf(port));
144 | }
145 | }
146 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/AnalysisURLPlugin.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.analysis;
2 |
3 | import org.elasticsearch.index.analysis.TokenFilterFactory;
4 | import org.elasticsearch.index.analysis.TokenizerFactory;
5 | import org.elasticsearch.index.analysis.URLTokenFilterFactory;
6 | import org.elasticsearch.index.analysis.URLTokenizerFactory;
7 | import org.elasticsearch.indices.analysis.AnalysisModule;
8 | import org.elasticsearch.plugins.AnalysisPlugin;
9 | import org.elasticsearch.plugins.Plugin;
10 |
11 | import java.util.Map;
12 |
13 | import static java.util.Collections.singletonMap;
14 |
15 | /**
16 | * Joe Linn
17 | * 1/17/2015
18 | */
19 | public class AnalysisURLPlugin extends Plugin implements AnalysisPlugin {
20 | @Override
21 | public Map> getTokenFilters() {
22 | return singletonMap("url", URLTokenFilterFactory::new);
23 | }
24 |
25 | @Override
26 | public Map> getTokenizers() {
27 | return singletonMap("url", URLTokenizerFactory::new);
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | #plugin=org.elasticsearch.plugin.analysis.AnalysisURLPlugin
2 | version=${project.version}
3 | description=URL tokenizer and token filter.
4 | name=analysis-url
5 | site=false
6 | jvm=true
7 | classname=org.elasticsearch.plugin.analysis.AnalysisURLPlugin
8 | java.version=1.8
9 | elasticsearch.version=${elasticsearch.version}
10 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/IsTokenStreamWithTokenAndPosition.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.apache.lucene.analysis.TokenStream;
4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
6 | import org.hamcrest.Description;
7 | import org.hamcrest.Factory;
8 | import org.hamcrest.TypeSafeMatcher;
9 | import org.slf4j.Logger;
10 | import org.slf4j.LoggerFactory;
11 |
12 | import java.io.IOException;
13 |
14 | /**
15 | * Joe Linn
16 | * 8/2/2015
17 | */
18 | public class IsTokenStreamWithTokenAndPosition extends TypeSafeMatcher {
19 | private static final Logger log = LoggerFactory.getLogger(IsTokenStreamWithTokenAndPosition.class);
20 |
21 | private final String token;
22 | private final int start;
23 | private final int end;
24 |
25 | private boolean foundToken;
26 | private int actualStart;
27 | private int actualEnd;
28 |
29 | public IsTokenStreamWithTokenAndPosition(String token, int start, int end) {
30 | this.token = token;
31 | this.start = start;
32 | this.end = end;
33 | }
34 |
35 | @Override
36 | protected boolean matchesSafely(TokenStream tokenizer) {
37 | CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
38 | OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);
39 | try {
40 | tokenizer.reset();
41 | } catch (IOException e) {
42 | log.error("Unable to reset tokenizer.", e);
43 | return false;
44 | }
45 | tokenizer.clearAttributes();
46 | try {
47 | while (tokenizer.incrementToken()) {
48 | if (termAttribute.toString().equals(token)) {
49 | foundToken = true;
50 | actualStart = offset.startOffset();
51 | actualEnd = offset.endOffset();
52 | if (actualStart == start && actualEnd == end) {
53 | return true;
54 | }
55 | }
56 | }
57 | } catch (IOException e) {
58 | log.error("Unable to increment tokenizer.", e);
59 | }
60 | return false;
61 | }
62 |
63 | @Override
64 | public void describeTo(Description description) {
65 | description.appendText("tokenizer containing token '")
66 | .appendText(token)
67 | .appendText("' starting at offset ")
68 | .appendValue(start)
69 | .appendText(" and ending at offset ")
70 | .appendValue(end);
71 | }
72 |
73 |
74 | @Override
75 | protected void describeMismatchSafely(TokenStream item, Description mismatchDescription) {
76 | if(!foundToken){
77 | mismatchDescription.appendText("tokenizer which did not contain token ").appendValue(token);
78 | } else {
79 | mismatchDescription.appendText("tokenizer containing token ")
80 | .appendValue(token)
81 | .appendText(" starting at offset ")
82 | .appendValue(actualStart)
83 | .appendText(" and ending at offset ")
84 | .appendValue(actualEnd);
85 | }
86 | }
87 |
88 | @Factory
89 | public static IsTokenStreamWithTokenAndPosition hasTokenAtOffset(String token, int start, int end) {
90 | return new IsTokenStreamWithTokenAndPosition(token, start, end);
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/IsTokenizerWithToken.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.apache.lucene.analysis.Tokenizer;
4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
5 | import org.hamcrest.Description;
6 | import org.hamcrest.Factory;
7 | import org.hamcrest.TypeSafeMatcher;
8 | import org.slf4j.Logger;
9 | import org.slf4j.LoggerFactory;
10 |
11 | import java.io.IOException;
12 |
13 | /**
14 | * Joe Linn
15 | * 8/2/2015
16 | */
17 | public class IsTokenizerWithToken extends TypeSafeMatcher {
18 | private static final Logger log = LoggerFactory.getLogger(IsTokenizerWithToken.class);
19 |
20 | private final String token;
21 |
22 |
23 | public IsTokenizerWithToken(String token) {
24 | this.token = token;
25 | }
26 |
27 |
28 | @Override
29 | protected boolean matchesSafely(Tokenizer tokenizer) {
30 | CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
31 | try {
32 | tokenizer.reset();
33 | } catch (IOException e) {
34 | log.error("Unable to reset tokenizer.", e);
35 | return false;
36 | }
37 | tokenizer.clearAttributes();
38 | try {
39 | while (tokenizer.incrementToken()) {
40 | if (termAttribute.toString().equals(token)) {
41 | return true;
42 | }
43 | }
44 | } catch (IOException e) {
45 | log.error("Unable to increment tokenizer.", e);
46 | }
47 | return false;
48 | }
49 |
50 |
51 | @Override
52 | public void describeTo(Description description) {
53 | description.appendText("tokenized the string '").appendText(token).appendText("'");
54 | }
55 |
56 |
57 | @Factory
58 | public static IsTokenizerWithToken hasToken(String token){
59 | return new IsTokenizerWithToken(token);
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/OptionalMatchers.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.hamcrest.Description;
4 | import org.hamcrest.Matcher;
5 | import org.hamcrest.TypeSafeMatcher;
6 |
7 | import java.util.Optional;
8 |
9 | /**
10 | * @author Joe Linn
11 | * 6/25/2016
12 | */
13 | public class OptionalMatchers {
14 | public static Matcher> isPresent() {
15 | return new PresenceMatcher();
16 | }
17 |
18 |
19 | @SuppressWarnings("OptionalUsedAsFieldOrParameterType")
20 | private static class PresenceMatcher extends TypeSafeMatcher> {
21 |
22 | @Override
23 | protected boolean matchesSafely(Optional> optional) {
24 | return optional.isPresent();
25 | }
26 |
27 | @Override
28 | public void describeTo(Description description) {
29 | description.appendText("is ");
30 | }
31 |
32 |
33 | @Override
34 | protected void describeMismatchSafely(Optional> item, Description mismatchDescription) {
35 | mismatchDescription.appendText("was ");
36 | }
37 | }
38 |
39 |
40 | public static Matcher> isEmpty() {
41 | return new EmptyMatcher();
42 | }
43 |
44 |
45 | @SuppressWarnings("OptionalUsedAsFieldOrParameterType")
46 | private static class EmptyMatcher extends PresenceMatcher {
47 | @Override
48 | protected boolean matchesSafely(Optional> optional) {
49 | return !super.matchesSafely(optional);
50 | }
51 |
52 | @Override
53 | public void describeTo(Description description) {
54 | description.appendText("is ");
55 | }
56 |
57 | @SuppressWarnings("OptionalGetWithoutIsPresent")
58 | @Override
59 | protected void describeMismatchSafely(Optional> item, Description mismatchDescription) {
60 | mismatchDescription.appendText("had value ")
61 | .appendValue(item.get());
62 | }
63 | }
64 |
65 |
66 | public static Matcher> hasValue(Matcher super T> matcher) {
67 | return new HasValue<>(matcher);
68 | }
69 |
70 |
71 | @SuppressWarnings("OptionalUsedAsFieldOrParameterType")
72 | private static class HasValue extends TypeSafeMatcher> {
73 | private final Matcher super T> matcher;
74 |
75 |
76 | private HasValue(Matcher super T> matcher) {
77 | this.matcher = matcher;
78 | }
79 |
80 |
81 | @Override
82 | protected boolean matchesSafely(Optional tOptional) {
83 | return tOptional.isPresent() && matcher.matches(tOptional.get());
84 | }
85 |
86 | @Override
87 | public void describeTo(Description description) {
88 | description.appendText("has value that is ");
89 | matcher.describeTo(description);
90 | }
91 |
92 |
93 | @Override
94 | protected void describeMismatchSafely(Optional item, Description mismatchDescription) {
95 | if (item.isPresent()) {
96 | mismatchDescription.appendText("value ")
97 | .appendValue(item.get());
98 | matcher.describeTo(mismatchDescription);
99 | } else {
100 | mismatchDescription.appendText("was ");
101 | }
102 | }
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/URLAnalysisTestCase.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
4 | import org.elasticsearch.plugin.analysis.AnalysisURLPlugin;
5 | import org.elasticsearch.plugins.Plugin;
6 | import org.elasticsearch.test.ESIntegTestCase;
7 | import org.elasticsearch.test.StreamsUtils;
8 | import org.junit.Before;
9 |
10 | import java.util.Collection;
11 | import java.util.Collections;
12 | import java.util.List;
13 |
14 | /**
15 | * Joe Linn
16 | * 8/1/2015
17 | */
18 | public abstract class URLAnalysisTestCase extends ESIntegTestCase {
19 | protected static final String INDEX = "url_token_filter";
20 | protected static final String TYPE = "test";
21 |
22 |
23 | @Override
24 | protected Collection> nodePlugins() {
25 | return Collections.singletonList(AnalysisURLPlugin.class);
26 | }
27 |
28 | /**
29 | * For subclasses to override. Overrides must call {@code super.setUp()}.
30 | */
31 | @Before
32 | @Override
33 | public void setUp() throws Exception {
34 | super.setUp();
35 | String settings = StreamsUtils.copyToStringFromClasspath("/test-settings.json");
36 | String mapping = StreamsUtils.copyToStringFromClasspath("/test-mapping.json");
37 | client().admin().indices().prepareCreate(INDEX).setSettings(settings).addMapping(TYPE, mapping).get();
38 | refresh();
39 | Thread.sleep(75); // Ensure that the shard is available before we start making analyze requests.
40 | }
41 |
42 | protected List analyzeURL(String url, String analyzer) {
43 | return client().admin().indices().prepareAnalyze(INDEX, url).setAnalyzer(analyzer).get().getTokens();
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterIntegrationTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.elasticsearch.ElasticsearchException;
4 | import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
5 | import org.elasticsearch.index.query.QueryBuilders;
6 | import org.elasticsearch.search.SearchHits;
7 | import org.junit.Test;
8 |
9 | import java.util.HashMap;
10 | import java.util.List;
11 | import java.util.Map;
12 |
13 | import static org.elasticsearch.index.analysis.url.URLTokenFilterTest.TEST_HTTPS_URL;
14 | import static org.elasticsearch.index.analysis.url.URLTokenFilterTest.TEST_HTTP_URL;
15 | import static org.hamcrest.Matchers.equalTo;
16 | import static org.hamcrest.Matchers.hasSize;
17 |
18 | /**
19 | * Joe Linn
20 | * 1/17/2015
21 | */
22 | public class URLTokenFilterIntegrationTest extends URLAnalysisTestCase {
23 |
24 | @Test
25 | public void testAnalyze() throws InterruptedException {
26 |
27 | assertURLAnalyzesTo(TEST_HTTP_URL, "url_protocol", "http");
28 | assertURLAnalyzesTo(TEST_HTTPS_URL, "url_protocol", "https");
29 |
30 | assertURLAnalyzesTo(TEST_HTTP_URL, "url_host", "www.foo.bar.com");
31 |
32 | assertURLAnalyzesTo(TEST_HTTP_URL, "url_port", "9200");
33 | }
34 |
35 | @Test(expected = ElasticsearchException.class)
36 | public void testInvalidURL() {
37 | analyzeURL("foobar", "url_protocol");
38 | }
39 |
40 | @Test
41 | public void testEmptyString() {
42 | List tokens = analyzeURL("", "url_protocol");
43 | assertThat("no tokens", tokens, hasSize(0));
44 | }
45 |
46 | @Test
47 | public void testUrlDecode() {
48 | assertURLAnalyzesTo("https://foo.bar.com?email=foo%40bar.com", "url_query", "email=foo@bar.com");
49 | assertURLAnalyzesTo("https://ssl.google-analytics.com/r/__utm.gif?utmwv=5.6.4&utms=1&utmn=1031590447&utmhn=www.linkedin.com&utmcs=-&utmsr=1024x768&utmvp=1256x2417&utmsc=24-bit&utmul=en-us&utmje=1&utmfl=-&utmdt=Wells%20Fargo%20Capital%20Finance%20%7C%20LinkedIn&utmhid=735221740&utmr=http%3A%2F%2Fwww.google.com%2Fsearch%3Fq%3Dsite%253Alinkedin.com%2Bwells%2Bfargo%26rls%3Dcom.microsoft%3Aen-us%26ie%3DUTF-8%26oe%3DUTF-8%26startIndex%3D%26startPage%3D1&utmp=biz-overview-public&utmht=1428449620694&utmac=UA-3242811-1&utmcc=__utma%3D23068709.1484257758.1428449621.1428449621.1428449621.1%3B%2B__utmz%3D23068709.1428449621.1.1.utmcsr%3Dgoogle%7Cutmccn%3D(organic)%7Cutmcmd%3Dorganic%7Cutmctr%3Dsite%253Alinkedin.com%2520wells%2520fargo%3B&utmjid=1336170366&utmredir=1&utmu=qBCAAAAAAAAAAAAAAAAAAAAE~", "url_port", "443");
50 | }
51 |
52 | @Test
53 | public void testMalformed() {
54 | assertURLAnalyzesTo("foo.bar.com:444/baz", "url_port_malformed", "444");
55 |
56 | Map doc = new HashMap<>();
57 | doc.put("url_malformed", "foo.bar/baz/bat");
58 | client().prepareIndex(INDEX, "test").setSource(doc).get();
59 | refresh();
60 |
61 | SearchHits hits = client()
62 | .prepareSearch(INDEX)
63 | .setQuery(QueryBuilders.boolQuery().mustNot(QueryBuilders.existsQuery("http_malformed.port")))
64 | .get()
65 | .getHits();
66 | assertEquals("found a doc missing http_malformed.port", 1, hits.getTotalHits());
67 | }
68 |
69 |
70 | @Test
71 | public void testPassthrough() {
72 | List tokens = analyzeURL("http://foo.com:9200/foo.bar baz bat.blah", "url_host_passthrough");
73 | assertThat(tokens, hasSize(4));
74 | assertThat(tokens.get(0).getTerm(), equalTo("foo.com"));
75 | assertThat(tokens.get(1).getTerm(), equalTo("com"));
76 | assertThat(tokens.get(2).getTerm(), equalTo("baz"));
77 | assertThat(tokens.get(3).getTerm(), equalTo("bat.blah"));
78 | }
79 |
80 |
81 | @Test
82 | public void testIndex() {
83 | Map doc = new HashMap<>();
84 | doc.put("url", "http://foo.bar/baz/bat");
85 | client().prepareIndex(INDEX, "test").setSource(doc).get();
86 | doc.put("url", "https://foo.bar.com");
87 | client().prepareIndex(INDEX, "test").setSource(doc).get();
88 | refresh();
89 |
90 | SearchHits hits = client().prepareSearch(INDEX).setQuery(QueryBuilders.matchAllQuery()).get().getHits();
91 | assertEquals("both docs indexed", 2, hits.getTotalHits());
92 | }
93 |
94 | private void assertURLAnalyzesTo(String url, String analyzer, String expected) {
95 | List tokens = analyzeURL(url, analyzer);
96 | assertThat("a URL part was parsed", tokens, hasSize(1));
97 | assertEquals("term value", expected, tokens.get(0).getTerm());
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.apache.lucene.analysis.BaseTokenStreamTestCase;
4 | import org.apache.lucene.analysis.CannedTokenStream;
5 | import org.apache.lucene.analysis.Token;
6 | import org.apache.lucene.analysis.TokenStream;
7 | import org.elasticsearch.index.analysis.URLPart;
8 | import org.junit.Test;
9 |
10 | import java.io.IOException;
11 | import java.net.MalformedURLException;
12 |
13 | import static org.elasticsearch.index.analysis.url.IsTokenStreamWithTokenAndPosition.hasTokenAtOffset;
14 |
/**
 * Unit tests for the URL token filter. Each test wraps a single canned token containing
 * the URL under test (see createFilter) and asserts on the terms the filter emits.
 */
public class URLTokenFilterTest extends BaseTokenStreamTestCase {
    public static final String TEST_HTTP_URL = "http://www.foo.bar.com:9200/index_name/type_name/_search.html?foo=bar&baz=bat#tag";
    public static final String TEST_HTTP_URL2 = "http://www.foo.bar.com";
    public static final String TEST_HTTPS_URL = "https://www.foo.bar.com:9200/index_name/type_name/_search.html?foo=bar&baz=bat#tag";

    @Test
    public void testFilterProtocol() throws IOException {
        URLTokenFilter filter = createFilter(TEST_HTTP_URL, URLPart.PROTOCOL);
        assertTokenStreamContents(filter, "http");

        filter = createFilter(TEST_HTTPS_URL, URLPart.PROTOCOL);
        assertTokenStreamContents(filter, "https");
    }

    @Test
    public void testFilterHost() throws IOException {
        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.HOST).setTokenizeHost(false), "www.foo.bar.com");

        // "setUrlDeocde" (sic) matches the production setter's spelling
        URLTokenFilter filter = createFilter(TEST_HTTP_URL, URLPart.HOST)
                .setUrlDeocde(false);
        assertThat(filter, hasTokenAtOffset("www.foo.bar.com", 7, 22));
        filter = createFilter(TEST_HTTP_URL, URLPart.HOST)
                .setUrlDeocde(false);
        assertThat(filter, hasTokenAtOffset("foo.bar.com", 11, 22));
        filter = createFilter(TEST_HTTP_URL, URLPart.HOST)
                .setUrlDeocde(false);
        assertThat(filter, hasTokenAtOffset("bar.com", 15, 22));
        filter = createFilter(TEST_HTTP_URL, URLPart.HOST)
                .setUrlDeocde(false);
        assertThat(filter, hasTokenAtOffset("com", 19, 22));
    }

    @Test
    public void testFilterPort() throws IOException {
        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.PORT), "9200");
    }

    @Test
    public void testFilterPath() throws IOException {
        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.PATH).setTokenizePath(false), "/index_name/type_name/_search.html");
        // a URL with no path yields no tokens
        assertTokenStreamContents(createFilter(TEST_HTTP_URL2, URLPart.PATH).setTokenizePath(false), new String[]{});
    }

    @Test
    public void testFilterRef() throws IOException {
        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.REF), "tag");
    }

    @Test
    public void testFilterQuery() throws IOException {
        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.QUERY).setTokenizeQuery(false), "foo=bar&baz=bat");
    }

    @Test(expected = MalformedURLException.class)
    public void testInvalidURL() throws IOException {
        URLTokenFilter filter = createFilter("foobar", URLPart.HOST);
        filter.incrementToken();
    }

    @Test
    public void testNullURL() throws IOException {
        // a null URL must not throw
        URLTokenFilter filter = createFilter(null, URLPart.HOST);
        filter.incrementToken();
    }

    @Test
    public void testUrlDecode() throws IOException {
        assertTokenStreamContents(createFilter("https://www.foo.com?email=foo%40bar.com", URLPart.QUERY, true), "email=foo@bar.com");
    }

    @Test
    public void testInferPort() throws IOException {
        // when the port is absent it is inferred from the protocol
        assertTokenStreamContents(createFilter("http://www.foo.bar.com/baz/bat.html", URLPart.PORT), "80");
        assertTokenStreamContents(createFilter("https://www.foo.bar.com/baz/bat.html", URLPart.PORT), "443");
        assertTokenStreamContents(createFilter("https://foo.bar.com", URLPart.PORT), "443");
    }

    @Test
    public void testMalformed() throws IOException {
        URLTokenFilter filter = createFilter("http://:::::::/baz", URLPart.PROTOCOL, false, true);
        filter.setTokenizeMalformed(true);
        assertTokenStreamContents(filter, "http");

        filter = createFilter("foo.com/bar?baz=bat", URLPart.QUERY, false, true);
        filter.setTokenizeMalformed(true);
        assertTokenStreamContents(filter, "baz=bat");

        filter = createFilter("baz.com:3456/foo", URLPart.PORT, false, true);
        filter.setTokenizeMalformed(true);
        assertTokenStreamContents(filter, "3456");
    }

    // convenience overload: no URL decoding, malformed URLs not allowed
    private URLTokenFilter createFilter(final String url, final URLPart part) {
        return createFilter(url, part, false);
    }

    // convenience overload: malformed URLs not allowed
    private URLTokenFilter createFilter(final String url, final URLPart part, final boolean urlDecode) {
        return createFilter(url, part, urlDecode, false);
    }

    // Builds a filter over a single canned token spanning the whole URL string.
    private URLTokenFilter createFilter(final String url, final URLPart part, final boolean urlDecode, final boolean allowMalformed) {
        int length = 0;
        if (url != null) {
            length = url.length();
        }
        return new URLTokenFilter(new CannedTokenStream(new Token(url, 0, length)), part, urlDecode, allowMalformed);
    }

    // single-token convenience wrapper around BaseTokenStreamTestCase's array form
    private static void assertTokenStreamContents(TokenStream in, String output) throws IOException {
        assertTokenStreamContents(in, new String[]{output});
    }
}
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
4 | import org.elasticsearch.action.bulk.BulkRequestBuilder;
5 | import org.elasticsearch.action.bulk.BulkResponse;
6 | import org.elasticsearch.action.search.SearchResponse;
7 | import org.elasticsearch.common.text.Text;
8 | import org.elasticsearch.index.query.QueryBuilders;
9 | import org.elasticsearch.search.SearchHit;
10 | import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
11 | import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
12 | import org.hamcrest.Matchers;
13 | import org.junit.Test;
14 |
15 | import java.util.HashMap;
16 | import java.util.List;
17 | import java.util.Map;
18 |
19 | import static org.hamcrest.CoreMatchers.equalTo;
20 | import static org.hamcrest.CoreMatchers.notNullValue;
21 | import static org.hamcrest.collection.IsCollectionWithSize.hasSize;
22 | import static org.hamcrest.collection.IsMapContaining.hasKey;
23 | import static org.hamcrest.core.IsCollectionContaining.hasItem;
24 |
25 | /**
26 | * Joe Linn
27 | * 8/1/2015
28 | */
29 | public class URLTokenizerIntegrationTest extends URLAnalysisTestCase {
30 | @Test
31 | public void testAnalyze() {
32 | assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_protocol", "http");
33 | assertTokensContain(URLTokenizerTest.TEST_HTTPS_URL, "tokenizer_url_protocol", "https");
34 |
35 | assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_host", "www.foo.bar.com", "foo.bar.com", "bar.com", "com");
36 | List hostTokens = assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_host_single", "www.foo.bar.com");
37 | assertThat(hostTokens, hasSize(1));
38 |
39 | assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_all", "www.foo.bar.com:9200", "http://www.foo.bar.com");
40 |
41 | assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_protocol_and_host", "http", "www.foo.bar.com", "foo.bar.com", "bar.com", "com");
42 |
43 | assertTokensContain("foo.bar.com/baz.html/query?a=1", "tokenizer_url_all_malformed", "foo.bar.com", "/baz.html/query");
44 | }
45 |
46 |
47 | @Test
48 | public void testAnalyzeWhole() throws Exception {
49 | List tokens = analyzeURL("http://foo.bar.com", "tokenizer_url_all_malformed");
50 | assertThat(tokens, notNullValue());
51 | assertThat(tokens, hasSize(7));
52 | }
53 |
54 |
55 | @Test
56 | public void testHighlight() throws Exception {
57 | final String field = "url_highlight_test";
58 | Map docContent = new HashMap<>();
59 | final String url = "http://www.foo.bar.com:8080/baz/bat?bob=blah";
60 | docContent.put(field, url);
61 | client().prepareIndex(INDEX, TYPE).setSource(docContent).get();
62 | refresh(INDEX);
63 |
64 | SearchResponse response = client().prepareSearch(INDEX).setQuery(QueryBuilders.matchQuery(field, "www.foo.bar.com:8080"))
65 | .highlighter(new HighlightBuilder().preTags("").postTags("").field("*").forceSource(true))
66 | .get();
67 |
68 | SearchHit[] hits = response.getHits().getHits();
69 | assertThat(hits.length, equalTo(1));
70 |
71 | SearchHit hit = hits[0];
72 | Map source = hit.getSource();
73 | assertThat(source.size(), equalTo(1));
74 | assertThat(source, hasKey(field));
75 | assertThat("URL was stored correctly", source.get(field), equalTo(url));
76 | assertThat(hit.highlightFields(), hasKey(field));
77 | HighlightField highlightField = hit.highlightFields().get(field);
78 | Text[] fragments = highlightField.getFragments();
79 | assertThat(fragments.length, equalTo(1));
80 | Text fragment = fragments[0];
81 | assertThat("URL was highlighted correctly", fragment.string(), equalTo("http://www.foo.bar.com:8080/baz/bat?bob=blah"));
82 | }
83 |
84 |
85 | @Test
86 | public void testBulkIndexing() throws Exception {
87 | final String field = "bulk_indexing_test";
88 | Map content;
89 | final int numDocs = 100;
90 | BulkRequestBuilder bulkBuilder = client().prepareBulk();
91 | for (int i = 0; i < numDocs; i++) {
92 | content = new HashMap<>();
93 | content.put(field, "http://domain" + i + ".com/foo" + i + "/bar.html");
94 | bulkBuilder.add(client().prepareIndex(INDEX, TYPE).setSource(content));
95 | }
96 | BulkResponse bulkResponse = bulkBuilder.get();
97 | assertThat(bulkResponse.buildFailureMessage(), bulkResponse.hasFailures(), equalTo(false));
98 | }
99 |
100 |
101 | private List assertTokensContain(String url, String analyzer, String... expected) {
102 | List tokens = analyzeURL(url, analyzer);
103 | for (String e : expected) {
104 | assertThat(tokens, hasItem(Matchers.hasProperty("term", equalTo(e))));
105 | }
106 | return tokens;
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import com.google.common.collect.Lists;
4 | import org.apache.lucene.analysis.BaseTokenStreamTestCase;
5 | import org.apache.lucene.analysis.TokenStream;
6 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
7 | import org.elasticsearch.index.analysis.URLPart;
8 | import org.junit.Test;
9 |
10 | import java.io.IOException;
11 | import java.io.StringReader;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 |
15 | import static org.elasticsearch.index.analysis.url.IsTokenStreamWithTokenAndPosition.hasTokenAtOffset;
16 | import static org.hamcrest.CoreMatchers.equalTo;
17 | import static org.hamcrest.core.IsCollectionContaining.hasItem;
18 |
19 | /**
20 | * Joe Linn
21 | * 7/30/2015
22 | */
23 | public class URLTokenizerTest extends BaseTokenStreamTestCase {
24 | public static final String TEST_HTTP_URL = "http://www.foo.bar.com:9200/index_name/type_name/_search.html?foo=bar&baz=bat#tag";
25 | public static final String TEST_HTTPS_URL = "https://www.foo.bar.com:9200/index_name/type_name/_search.html?foo=bar&baz=bat#tag";
26 |
27 |
28 | @Test
29 | public void testTokenizeProtocol() throws IOException {
30 | URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PROTOCOL);
31 | assertTokenStreamContents(tokenizer, "http");
32 |
33 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PROTOCOL);
34 | assertThat(tokenizer, hasTokenAtOffset("http", 0, 4));
35 |
36 | tokenizer = createTokenizer(TEST_HTTPS_URL, URLPart.PROTOCOL);
37 | assertTokenStreamContents(tokenizer, "https");
38 | }
39 |
40 |
41 | @Test
42 | public void testTokenizeHost() throws IOException {
43 | URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.HOST);
44 | assertTokenStreamContents(tokenizer, stringArray("www.foo.bar.com", "foo.bar.com", "bar.com", "com"));
45 |
46 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.HOST);
47 | assertThat(tokenizer, hasTokenAtOffset("www.foo.bar.com", 7, 22));
48 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.HOST);
49 | assertThat(tokenizer, hasTokenAtOffset("foo.bar.com", 11, 22));
50 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.HOST);
51 | assertThat(tokenizer, hasTokenAtOffset("bar.com", 15, 22));
52 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.HOST);
53 | assertThat(tokenizer, hasTokenAtOffset("com", 19, 22));
54 | }
55 |
56 |
57 | @Test
58 | public void testTokenizePort() throws IOException {
59 | URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PORT);
60 | assertThat(tokenizer, hasTokenAtOffset("9200", 23, 27));
61 |
62 | tokenizer = createTokenizer("http://foo.bar.com", URLPart.PORT);
63 | assertThat(tokenizer, hasTokenAtOffset("80", 0, 0));
64 | }
65 |
66 |
67 | @Test
68 | public void testTokenizePath() throws IOException {
69 | URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PATH);
70 | assertTokenStreamContents(tokenizer, stringArray("/index_name", "/index_name/type_name", "/index_name/type_name/_search.html"));
71 |
72 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PATH);
73 | assertThat(tokenizer, hasTokenAtOffset("/index_name", 27, 38));
74 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PATH);
75 | assertThat(tokenizer, hasTokenAtOffset("/index_name/type_name", 27, 48));
76 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PATH);
77 | assertThat(tokenizer, hasTokenAtOffset("/index_name/type_name/_search.html", 27, 61));
78 |
79 | tokenizer.reset();
80 | tokenizer.setReader(new StringReader(TEST_HTTPS_URL));
81 | tokenizer.setTokenizePath(false);
82 |
83 | assertTokenStreamContents(tokenizer, stringArray("/index_name/type_name/_search.html"));
84 | }
85 |
86 |
87 | @Test
88 | public void testTokenizeNoPath() throws Exception {
89 | final String url = "http://www.foo.bar.com:9200";
90 | URLTokenizer tokenizer = createTokenizer(url, URLPart.PATH);
91 | assertTokenStreamContents(tokenizer, stringArray());
92 | }
93 |
94 |
95 | @Test
96 | public void testTokenizeQuery() throws IOException {
97 | URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.QUERY);
98 | assertTokenStreamContents(tokenizer, stringArray("foo=bar", "baz=bat"));
99 |
100 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.QUERY);
101 | assertThat(tokenizer, hasTokenAtOffset("foo=bar", 62, 69));
102 | tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.QUERY);
103 | assertThat(tokenizer, hasTokenAtOffset("baz=bat", 70, 77));
104 | }
105 |
106 |
107 | @Test
108 | public void testTokenizeRef() throws IOException {
109 | URLTokenizer tokenizer = createTokenizer("http://foo.com#baz", URLPart.REF);
110 | assertThat(tokenizer, hasTokenAtOffset("baz", 15, 18));
111 | }
112 |
113 |
114 | @Test
115 | public void testAll() throws IOException {
116 | URLTokenizer tokenizer = new URLTokenizer();
117 | tokenizer.setReader(new StringReader(TEST_HTTPS_URL));
118 | CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
119 | tokenizer.reset();
120 | tokenizer.clearAttributes();
121 | List tokens = new ArrayList<>();
122 | while(tokenizer.incrementToken()){
123 | tokens.add(termAttribute.toString());
124 | }
125 |
126 | assertThat(tokens, hasItem(equalTo("https")));
127 | assertThat(tokens, hasItem(equalTo("foo.bar.com")));
128 | assertThat(tokens, hasItem(equalTo("www.foo.bar.com:9200")));
129 | assertThat(tokens, hasItem(equalTo("https://www.foo.bar.com")));
130 |
131 | tokenizer = createTokenizer("https://foo.com", null);
132 | assertThat(tokenizer, hasTokenAtOffset("https", 0, 5));
133 | }
134 |
135 |
136 | @Test(expected = IOException.class)
137 | public void testMalformed() throws IOException {
138 | URLTokenizer tokenizer = createTokenizer("://foo.com", URLPart.QUERY);
139 | assertTokenStreamContents(tokenizer, stringArray("foo=bar", "baz=bat"));
140 | }
141 |
142 |
143 | @Test
144 | public void testAllowMalformed() throws IOException {
145 | URLTokenizer tokenizer = createTokenizer("://foo.com", URLPart.QUERY);
146 | tokenizer.setAllowMalformed(true);
147 | assertTokenStreamContents(tokenizer, stringArray("://foo.com"));
148 | }
149 |
150 |
151 | @Test
152 | public void testUrlDecode() throws Exception {
153 | String url = "http://foo.com?baz=foo%20bat";
154 | URLTokenizer tokenizer = createTokenizer(url, URLPart.QUERY);
155 | tokenizer.setUrlDecode(true);
156 | assertTokenStreamContents(tokenizer, stringArray("baz=foo bat"));
157 | }
158 |
159 |
160 | @Test(expected = IOException.class)
161 | public void testUrlDecodeIllegalCharacters() throws Exception {
162 | String url = "http://foo.com?baz=foo%2vbat";
163 | URLTokenizer tokenizer = createTokenizer(url, URLPart.QUERY);
164 | tokenizer.setUrlDecode(true);
165 | assertTokenStreamContents(tokenizer, "");
166 | }
167 |
168 |
169 | @Test
170 | public void testUrlDecodeAllowMalformed() throws Exception {
171 | String url = "http://foo.com?baz=foo%2vbat";
172 | URLTokenizer tokenizer = createTokenizer(url, URLPart.QUERY);
173 | tokenizer.setUrlDecode(true);
174 | tokenizer.setAllowMalformed(true);
175 | assertTokenStreamContents(tokenizer, "baz=foo%2vbat");
176 | }
177 |
178 |
179 | @Test
180 | public void testPartialUrl() throws Exception {
181 | final String url = "http://";
182 | URLTokenizer tokenizer = createTokenizer(url, URLPart.QUERY);
183 | assertTokenStreamContents(tokenizer, new String[]{});
184 | }
185 |
186 |
187 | @Test
188 | public void testNoProtocol() throws Exception {
189 | final String url = "foo.bar.baz/bat/blah.html";
190 | URLTokenizer tokenizer = createTokenizer(url, URLPart.PATH);
191 | tokenizer.setAllowMalformed(true);
192 | tokenizer.setTokenizeMalformed(true);
193 | assertTokenStreamContents(tokenizer, stringArray("/bat", "/bat/blah.html"));
194 | }
195 |
196 |
197 | @Test
198 | public void testMalformedGetRef() throws Exception {
199 | String url = "/bat/blah.html#tag?baz=bat";
200 | URLTokenizer tokenizer = createTokenizer(url, URLPart.REF);
201 | tokenizer.setAllowMalformed(true);
202 | tokenizer.setTokenizeMalformed(true);
203 | assertTokenStreamContents(tokenizer, stringArray("tag"));
204 | }
205 |
206 |
207 | @Test
208 | public void testMalformedWhole() throws Exception {
209 | String url = "foo.bar.com/baz.html/query?a=1";
210 | URLTokenizer tokenizer = createTokenizer(url, URLPart.WHOLE);
211 | tokenizer.setAllowMalformed(true);
212 | tokenizer.setTokenizeMalformed(true);
213 | assertTokenStreamContents(tokenizer, stringArray("foo.bar.com/baz.html/query?a=1"));
214 | }
215 |
216 |
217 | @Test
218 | public void testProtocolAndPort() throws Exception {
219 | URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PROTOCOL, URLPart.PORT);
220 | assertTokenStreamContents(tokenizer, stringArray("http", "9200"));
221 | }
222 |
223 |
224 | @Test
225 | public void testMalformedHostAndWhole() throws Exception {
226 | URLTokenizer tokenizer = createTokenizer("example.com", URLPart.WHOLE, URLPart.HOST);
227 | tokenizer.setAllowMalformed(true);
228 | tokenizer.setTokenizeMalformed(true);
229 | tokenizer.setTokenizeHost(false);
230 | assertTokenStreamContents(tokenizer, stringArray("example.com"));
231 | }
232 |
233 |
234 | @Test
235 | public void testTokenizeMalformedNoPartSpecified() throws Exception {
236 | URLTokenizer tokenizer = createTokenizer("example.com");
237 | tokenizer.setAllowMalformed(true);
238 | tokenizer.setTokenizeMalformed(true);
239 | tokenizer.setTokenizeHost(false);
240 | assertTokenStreamContents(tokenizer, stringArray("example.com"));
241 | }
242 |
243 |
244 | @Test
245 | public void testAllowMalformedNoPartsSpecified() throws Exception {
246 | URLTokenizer tokenizer = createTokenizer("example.com");
247 | tokenizer.setAllowMalformed(true);
248 | tokenizer.setTokenizeHost(false);
249 | assertTokenStreamContents(tokenizer, stringArray("example.com"));
250 | }
251 |
252 |
253 | @Test
254 | public void testTokenizeSpecial() throws Exception {
255 | final String url = "http://www.foo.bar.com:8080/baz/bat?bob=blah";
256 | URLTokenizer tokenizer = createEverythingTokenizer(url);
257 | assertThat(tokenizer, hasTokenAtOffset("www.foo.bar.com:8080", 7, 27));
258 | tokenizer = createEverythingTokenizer(url);
259 | assertThat(tokenizer, hasTokenAtOffset("www.foo.bar.com", 7, 22));
260 | tokenizer = createEverythingTokenizer(url);
261 | assertThat(tokenizer, hasTokenAtOffset("foo.bar.com", 11, 22));
262 | tokenizer = createEverythingTokenizer(url);
263 | assertThat(tokenizer, hasTokenAtOffset("bar.com", 15, 22));
264 | }
265 |
266 |
267 | private URLTokenizer createEverythingTokenizer(String input) throws IOException {
268 | URLTokenizer tokenizer = createTokenizer(input);
269 | tokenizer.setAllowMalformed(true);
270 | tokenizer.setUrlDecode(true);
271 | tokenizer.setTokenizeMalformed(true);
272 | tokenizer.setTokenizeHost(true);
273 | tokenizer.setTokenizePath(true);
274 | tokenizer.setTokenizeQuery(true);
275 | return tokenizer;
276 | }
277 |
278 |
279 | private URLTokenizer createTokenizer(String input, URLPart... parts) throws IOException {
280 | URLTokenizer tokenizer = new URLTokenizer();
281 | if (parts != null) {
282 | tokenizer.setParts(Lists.newArrayList(parts));
283 | }
284 | tokenizer.setReader(new StringReader(input));
285 | return tokenizer;
286 | }
287 |
288 |
289 | private String[] stringArray(String... strings) {
290 | return strings;
291 | }
292 |
293 |
294 | private static void assertTokenStreamContents(TokenStream in, String output) throws IOException {
295 | assertTokenStreamContents(in, new String[]{output});
296 | }
297 | }
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/url/URLUtilsTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis.url;
2 |
3 | import org.elasticsearch.index.analysis.URLPart;
4 | import org.junit.Test;
5 |
6 | import static org.elasticsearch.index.analysis.url.OptionalMatchers.hasValue;
7 | import static org.elasticsearch.index.analysis.url.OptionalMatchers.isEmpty;
8 | import static org.elasticsearch.index.analysis.url.URLUtils.getPart;
9 | import static org.hamcrest.CoreMatchers.equalTo;
10 | import static org.hamcrest.MatcherAssert.assertThat;
11 |
12 | /**
13 | * @author Joe Linn
14 | * 6/25/2016
15 | */
16 | public class URLUtilsTest {
17 | private static final String URL_1 = "http://foo.bar.com/baz/bat.html#whee?bob=loblaw&this=that";
18 | private static final String URL_2 = "foo.bar.com/baz/bat.html#whee?bob=loblaw&this=that";
19 | private static final String URL_3 = "/baz/bat.html#whee?bob=loblaw&this=that";
20 | private static final String URL_4 = "/baz/bat.html?bob=loblaw&this=that";
21 |
22 | @Test
23 | public void testGetProtocol() {
24 | final URLPart part = URLPart.PROTOCOL;
25 | assertThat(getPart(URL_1, part), hasValue(equalTo("http")));
26 | assertThat(getPart(URL_2, part), isEmpty());
27 | }
28 |
29 |
30 | @Test
31 | public void testGetHost() {
32 | final URLPart part = URLPart.HOST;
33 | assertThat(getPart(URL_1, part), hasValue(equalTo("foo.bar.com")));
34 | assertThat(getPart(URL_2, part), hasValue(equalTo("foo.bar.com")));
35 | }
36 |
37 |
38 | @Test
39 | public void testGetPort() {
40 | final URLPart part = URLPart.PORT;
41 | assertThat(getPart(URL_1, part), hasValue(equalTo("80")));
42 | assertThat(getPart(URL_2, part), isEmpty());
43 | }
44 |
45 |
46 | @Test
47 | public void testGetPath() {
48 | final URLPart part = URLPart.PATH;
49 | assertThat(getPart(URL_1, part), hasValue(equalTo("/baz/bat.html")));
50 | assertThat(getPart(URL_2, part), hasValue(equalTo("/baz/bat.html")));
51 | assertThat(getPart(URL_3, part), hasValue(equalTo("/baz/bat.html")));
52 | }
53 |
54 |
55 | @Test
56 | public void testGetRef() {
57 | final URLPart part = URLPart.REF;
58 | assertThat(getPart(URL_1, part), hasValue(equalTo("whee")));
59 | assertThat(getPart(URL_2, part), hasValue(equalTo("whee")));
60 | assertThat(getPart(URL_3, part), hasValue(equalTo("whee")));
61 | }
62 |
63 |
64 | @Test
65 | public void testGetQuery() {
66 | final URLPart part = URLPart.QUERY;
67 | assertThat(getPart(URL_1, part), hasValue(equalTo("bob=loblaw&this=that")));
68 | assertThat(getPart(URL_2, part), hasValue(equalTo("bob=loblaw&this=that")));
69 | assertThat(getPart(URL_3, part), hasValue(equalTo("bob=loblaw&this=that")));
70 | assertThat(getPart(URL_4, part), hasValue(equalTo("bob=loblaw&this=that")));
71 | }
72 | }
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | es.logger.level=INFO
2 | log4j.rootLogger=${es.logger.level}, out
3 |
4 | log4j.appender.out=org.apache.log4j.ConsoleAppender
5 | log4j.appender.out.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.out.layout.conversionPattern=[%d{ISO8601}][%-5p][%-25c] %m%n
--------------------------------------------------------------------------------
/src/test/resources/test-mapping.json:
--------------------------------------------------------------------------------
1 | {
2 | "properties": {
3 | "url": {
4 | "type": "string",
5 | "fields": {
6 | "url": {
7 | "type": "string",
8 | "index": "not_analyzed"
9 | },
10 | "port": {
11 | "type": "string",
12 | "analyzer": "url_port"
13 | }
14 | }
15 | },
16 | "url_tokenized": {
17 | "type": "string",
18 | "fields": {
19 | "url_tokenized": {
20 | "type": "string",
21 | "index": "not_analyzed"
22 | },
23 | "protocol": {
24 | "type": "string",
25 | "analyzer": "tokenizer_url_protocol"
26 | }
27 | }
28 | },
29 | "url_malformed": {
30 | "type": "string",
31 | "fields": {
32 | "url": {
33 | "type": "string",
34 | "index": "not_analyzed"
35 | },
36 | "port": {
37 | "type": "string",
38 | "analyzer": "url_port_malformed"
39 | }
40 | }
41 | },
42 | "url_highlight_test": {
43 | "type": "string",
44 | "analyzer": "url_highlight_test"
45 | },
46 | "bulk_indexing_test": {
47 | "type": "string",
48 | "analyzer": "bulk_indexing_test"
49 | }
50 | }
51 | }
--------------------------------------------------------------------------------
/src/test/resources/test-settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "analysis": {
3 | "tokenizer": {
4 | "url_protocol": {
5 | "type": "url",
6 | "part": "protocol"
7 | },
8 | "url_host": {
9 | "type": "url",
10 | "part": "host"
11 | },
12 | "url_host_single": {
13 | "type": "url",
14 | "part": "host",
15 | "tokenize_host": false
16 | },
17 | "url_protocol_and_host": {
18 | "type": "url",
19 | "part": ["protocol", "host"]
20 | },
21 | "url_all": {
22 | "type": "url"
23 | },
24 | "url_all_malformed": {
25 | "type": "url",
26 | "allow_malformed": true,
27 | "tokenize_malformed": true
28 | },
29 | "url_highlight_test": {
30 | "type": "url",
31 | "url_decode": true,
32 | "allow_malformed": true,
33 | "tokenize_malformed": true,
34 | "tokenize_host": true,
35 | "tokenize_path": true,
36 | "tokenize_query": true
37 | }
38 | },
39 | "filter": {
40 | "url_protocol": {
41 | "type": "url",
42 | "part": "protocol"
43 | },
44 | "url_host": {
45 | "type": "url",
46 | "part": "host",
47 | "tokenize_host": false
48 | },
49 | "url_port": {
50 | "type": "url",
51 | "part": "port"
52 | },
53 | "url_query": {
54 | "type": "url",
55 | "part": "query",
56 | "url_decode": true,
57 | "tokenize_query": false
58 | },
59 | "url_port_malformed": {
60 | "type": "url",
61 | "part": "port",
62 | "allow_malformed": true,
63 | "tokenize_malformed": true
64 | },
65 | "url_host_passthrough": {
66 | "type": "url",
67 | "part": "host",
68 | "passthrough": "true"
69 | },
70 | "bulk_indexing_test": {
71 | "type": "url",
72 | "part": ["protocol", "host", "port", "path", "query", "ref"],
73 | "url_decode": true,
74 | "allow_malformed": true,
75 | "tokenize_malformed": true
76 | }
77 | },
78 | "analyzer": {
79 | "url_protocol": {
80 | "filter": [
81 | "url_protocol"
82 | ],
83 | "tokenizer": "whitespace"
84 | },
85 | "url_host": {
86 | "filter": [
87 | "url_host"
88 | ],
89 | "tokenizer": "whitespace"
90 | },
91 | "url_port": {
92 | "filter": [
93 | "url_port"
94 | ],
95 | "tokenizer": "whitespace"
96 | },
97 | "url_query": {
98 | "filter": [
99 | "url_query"
100 | ],
101 | "tokenizer": "whitespace"
102 | },
103 | "url_port_malformed": {
104 | "filter": [
105 | "url_port_malformed"
106 | ],
107 | "tokenizer": "whitespace"
108 | },
109 | "url_host_passthrough": {
110 | "filter": [
111 | "url_host_passthrough"
112 | ],
113 | "tokenizer": "whitespace"
114 | },
115 | "tokenizer_url_protocol": {
116 | "tokenizer": "url_protocol"
117 | },
118 | "tokenizer_url_host": {
119 | "tokenizer": "url_host"
120 | },
121 | "tokenizer_url_host_single": {
122 | "tokenizer": "url_host_single"
123 | },
124 | "tokenizer_url_protocol_and_host": {
125 | "tokenizer": "url_protocol_and_host"
126 | },
127 | "tokenizer_url_all": {
128 | "tokenizer": "url_all"
129 | },
130 | "tokenizer_url_all_malformed": {
131 | "tokenizer": "url_all_malformed"
132 | },
133 | "url_highlight_test": {
134 | "tokenizer": "url_highlight_test"
135 | },
136 | "bulk_indexing_test": {
137 | "type": "custom",
138 | "tokenizer": "whitespace",
139 | "filter": ["bulk_indexing_test"]
140 | }
141 | }
142 | }
143 | }
--------------------------------------------------------------------------------