├── .gitignore ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── README_ver1.5.md ├── README_ver1.md ├── bin └── riverweb ├── config └── log4j.xml ├── pom.xml └── src ├── main ├── assemblies │ ├── common-bin.xml │ ├── targz-bin.xml │ └── zip-bin.xml ├── java │ └── org │ │ └── codelibs │ │ └── riverweb │ │ ├── RiverWeb.java │ │ ├── ScriptExecutionException.java │ │ ├── WebRiverConstants.java │ │ ├── app │ │ └── service │ │ │ └── ScriptService.java │ │ ├── config │ │ ├── RiverConfig.java │ │ └── RiverConfigManager.java │ │ ├── crawler │ │ └── RwCrawlerThread.java │ │ ├── entity │ │ └── ScrapingRule.java │ │ ├── interval │ │ └── WebRiverIntervalController.java │ │ ├── transformer │ │ └── ScrapingTransformer.java │ │ └── util │ │ ├── ConfigProperties.java │ │ ├── ConversionUtil.java │ │ ├── ScriptUtils.java │ │ └── SettingsUtils.java └── resources │ ├── .gitkeep │ ├── app.xml │ ├── config.xml │ ├── crawler │ ├── interval+.xml │ ├── rule+.xml │ └── transformer+.xml │ ├── crawler_es+crawlerThread.xml │ ├── lasta_di.properties │ └── riverweb.properties └── test ├── java └── org │ └── codelibs │ └── riverweb │ ├── RiverWebTest.java │ ├── app │ └── service │ │ └── ScriptServiceTest.java │ └── transformer │ └── ScrapingTransformerTest.java └── resources ├── html └── fess_codelibs_org.html └── log4j.xml /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .settings 3 | .classpath 4 | .project 5 | .idea 6 | /target 7 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 
36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 
85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [yyyy] [name of copyright owner] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | CodeLibs 2 | Copyright 2011-2015 CodeLibs 3 | 4 | This product includes software developed by The Apache Software 5 | Foundation (http://www.apache.org/). 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | (__River Web does not sync up with the latest elasticsearch. 
Fess is Enterprise Search Server and contains the same features as River Web. See [Fess](https://github.com/codelibs/fess)__) 2 | 3 | Elasticsearch River Web 4 | ======================= 5 | 6 | ## Overview 7 | 8 | Elasticsearch River Web is a web crawler application for Elasticsearch. 9 | This application provides a feature to crawl web sites and extract the content by CSS Query. 10 | (As of version 1.5, River Web is not Elasticsearch plugin) 11 | 12 | If you want to use Full Text Search Server, please see [Fess](https://github.com/codelibs/fess "Fess"). 13 | 14 | ## Version 15 | 16 | | River Web | Tested on ES | Download | 17 | |:---------:|:-------------:|:--------:| 18 | | master | 2.4.X | [Snapshot](http://maven.codelibs.org/org/codelibs/river-web/ "Snapshot") | 19 | | 2.4.0 | 2.4.0 | [Download](https://github.com/codelibs/elasticsearch-river-web/releases/tag/river-web-2.4.0 "2.4.0") | 20 | | 2.0.2 | 2.3.1 | [Download](https://github.com/codelibs/elasticsearch-river-web/releases/tag/river-web-2.0.2 "2.0.2") | 21 | | 2.0.1 | 2.2.0 | [Download](https://github.com/codelibs/elasticsearch-river-web/releases/tag/river-web-2.0.1 "2.0.1") | 22 | | 2.0.0 | 2.1.2 | [Download](https://github.com/codelibs/elasticsearch-river-web/releases/tag/river-web-2.0.0 "2.0.0") | 23 | 24 | For old version, see [README\_ver1.md](https://github.com/codelibs/elasticsearch-river-web/blob/master/README_ver1.md "README_ver1.md") or [README\_ver1.5.md](https://github.com/codelibs/elasticsearch-river-web/blob/master/README_ver1.5.md "README_ver1.5.md"). 25 | 26 | ### Issues/Questions 27 | 28 | Please file an [issue](https://github.com/codelibs/elasticsearch-river-web/issues "issue"). 29 | (Japanese forum is [here](https://github.com/codelibs/codelibs-ja-forum "here").) 30 | 31 | ## Installation 32 | 33 | ### Install River Web 34 | 35 | #### Zip File 36 | 37 | $ unzip elasticsearch-river-web-[VERSION].zip 38 | 39 | #### Tar.GZ File 40 | 41 | $ tar zxvf elasticsearch-river-web-[VERSION].tar.gz 42 | 43 | ## Usage 44 | 45 | ### Create Index To Store Crawl Data 46 | 47 | An index for storing crawl data is needed before starting River Web. 48 | For example, to store data to "webindex/my_web", create it as below: 49 | 50 | $ curl -XPUT 'localhost:9200/webindex' -d ' 51 | { 52 | "settings":{ 53 | "index":{ 54 | "refresh_interval":"1s", 55 | "number_of_shards":"10", 56 | "number_of_replicas" : "0" 57 | } 58 | }, 59 | "mappings":{ 60 | "my_web":{ 61 | "properties":{ 62 | "url":{ 63 | "type":"string", 64 | "index":"not_analyzed" 65 | }, 66 | "method":{ 67 | "type":"string", 68 | "index":"not_analyzed" 69 | }, 70 | "charSet":{ 71 | "type":"string", 72 | "index":"not_analyzed" 73 | }, 74 | "mimeType":{ 75 | "type":"string", 76 | "index":"not_analyzed" 77 | } 78 | } 79 | } 80 | } 81 | }' 82 | 83 | Feel free to add any properties other than the above if you need them. 84 | 85 | ### Register Crawl Config Data 86 | 87 | A crawling configuration is created by registering a document to .river\_web index as below. 88 | This example crawls sites of http://www.codelibs.org/ and http://fess.codelibs.org/. 
89 | 90 | $ curl -XPUT 'localhost:9200/.river_web/config/my_web' -d '{ 91 | "index" : "webindex", 92 | "type" : "my_web", 93 | "urls" : ["http://www.codelibs.org/", "http://fess.codelibs.org/"], 94 | "include_urls" : ["http://www.codelibs.org/.*", "http://fess.codelibs.org/.*"], 95 | "max_depth" : 3, 96 | "max_access_count" : 100, 97 | "num_of_thread" : 5, 98 | "interval" : 1000, 99 | "target" : [ 100 | { 101 | "pattern" : { 102 | "url" : "http://www.codelibs.org/.*", 103 | "mimeType" : "text/html" 104 | }, 105 | "properties" : { 106 | "title" : { 107 | "text" : "title" 108 | }, 109 | "body" : { 110 | "text" : "body" 111 | }, 112 | "bodyAsHtml" : { 113 | "html" : "body" 114 | }, 115 | "projects" : { 116 | "text" : "ul.nav-list li a", 117 | "isArray" : true 118 | } 119 | } 120 | }, 121 | { 122 | "pattern" : { 123 | "url" : "http://fess.codelibs.org/.*", 124 | "mimeType" : "text/html" 125 | }, 126 | "properties" : { 127 | "title" : { 128 | "text" : "title" 129 | }, 130 | "body" : { 131 | "text" : "body", 132 | "trimSpaces" : true 133 | }, 134 | "menus" : { 135 | "text" : "ul.nav-list li a", 136 | "isArray" : true 137 | } 138 | } 139 | } 140 | ] 141 | }' 142 | 143 | The configuration is: 144 | 145 | | Property | Type | Description | 146 | |:------------------------------|:-------:|:------------------------------------------------| 147 | | index | string | Stored index name. | 148 | | type | string | Stored type name. | 149 | | urls | array | Start point of URL for crawling. | 150 | | include\_urls | array | White list of URL for crawling. | 151 | | exclude\_urls | array | Black list of URL for crawling. | 152 | | max\_depth | int | Depth of crawling documents. | 153 | | max\_access\_count | int | The number of crawling documents. | 154 | | num\_of\_thread | int | The number of crawler threads. | 155 | | interval | int | Interval time (ms) to crawl documents. | 156 | | incremental | boolean | Incremental crawling. | 157 | | overwrite | boolean | Delete documents of old duplicated url. | 158 | | user\_agent | string | User-agent name when crawling. | 159 | | robots\_txt | boolean | If you want to ignore robots.txt, false. | 160 | | authentications | object | Specify BASIC/DIGEST/NTLM authentication info. | 161 | | target.urlPattern | string | URL pattern to extract contents by CSS Query. | 162 | | target.properties.name | string | "name" is used as a property name in the index. | 163 | | target.properties.name.text | string | CSS Query for the property value. | 164 | | target.properties.name.html | string | CSS Query for the property value. | 165 | | target.properties.name.script | string | Rewrite the property value by Script(Groovy). | 166 | 167 | ### Start Crawler 168 | 169 | ./bin/riverweb --config-id [config doc id] --cluster-name [Elasticsearch Cluster Name] --cleanup 170 | 171 | For example, 172 | 173 | ./bin/riverweb --config-id my_web --cluster-name elasticsearch --cleanup 174 | 175 | ### Unregister Crawl Config Data 176 | 177 | If you want to stop the crawler, kill the crawler process and then delete the config document as below: 178 | 179 | $ curl -XDELETE 'localhost:9200/.river_web/config/my_web' 180 | 181 | ## Examples 182 | 183 | ### Full Text Search for Your site (ex. 
http://fess.codelibs.org/) 184 | 185 | $ curl -XPUT 'localhost:9200/.river_web/fess/fess_site' -d '{ 186 | "index" : "webindex", 187 | "type" : "fess_site", 188 | "urls" : ["http://fess.codelibs.org/"], 189 | "include_urls" : ["http://fess.codelibs.org/.*"], 190 | "max_depth" : 3, 191 | "max_access_count" : 1000, 192 | "num_of_thread" : 5, 193 | "interval" : 1000, 194 | "target" : [ 195 | { 196 | "pattern" : { 197 | "url" : "http://fess.codelibs.org/.*", 198 | "mimeType" : "text/html" 199 | }, 200 | "properties" : { 201 | "title" : { 202 | "text" : "title" 203 | }, 204 | "body" : { 205 | "text" : "body", 206 | "trimSpaces" : true 207 | } 208 | } 209 | } 210 | ] 211 | }' 212 | 213 | 214 | ### Aggregate a title/content from news.yahoo.com 215 | 216 | $ curl -XPUT 'localhost:9200/.river_web/config/yahoo_site' -d '{ 217 | "index" : "webindex", 218 | "type" : "my_web", 219 | "urls" : ["http://news.yahoo.com/"], 220 | "include_urls" : ["http://news.yahoo.com/.*"], 221 | "max_depth" : 1, 222 | "max_access_count" : 10, 223 | "num_of_thread" : 3, 224 | "interval" : 3000, 225 | "user_agent" : "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko", 226 | "target" : [ 227 | { 228 | "pattern" : { 229 | "url" : "http://news.yahoo.com/video/.*html", 230 | "mimeType" : "text/html" 231 | }, 232 | "properties" : { 233 | "title" : { 234 | "text" : "title" 235 | } 236 | } 237 | }, 238 | { 239 | "pattern" : { 240 | "url" : "http://news.yahoo.com/.*html", 241 | "mimeType" : "text/html" 242 | }, 243 | "properties" : { 244 | "title" : { 245 | "text" : "h1.headline" 246 | }, 247 | "content" : { 248 | "text" : "section#mediacontentstory p" 249 | } 250 | } 251 | } 252 | ] 253 | }' 254 | 255 | (if news.yahoo.com is updated, the above example needs to be updated.) 256 | 257 | ## Others 258 | 259 | ### BASIC/DIGEST/NTLM authentication 260 | 261 | River Web supports BASIC/DIGEST/NTLM authentication. 262 | Set authentications object. 263 | 264 | ... 265 | "num_of_thread" : 5, 266 | "interval" : 1000, 267 | "authentications":[ 268 | { 269 | "scope": { 270 | "scheme":"BASIC" 271 | }, 272 | "credentials": { 273 | "username":"testuser", 274 | "password":"secret" 275 | } 276 | }], 277 | "target" : [ 278 | ... 279 | 280 | The configuration is: 281 | 282 | | Property | Type | Description | 283 | |:----------------------------------------|:-------:|:------------------------------------------------| 284 | | authentications.scope.scheme | string | BASIC, DIGEST or NTLM | 285 | | authentications.scope.host | string | (Optional)Target hostname. | 286 | | authentications.scope.port | int | (Optional)Port number. | 287 | | authentications.scope.realm | string | (Optional)Realm name. | 288 | | authentications.credentials.username | string | Username. | 289 | | authentications.credentials.password | string | Password. | 290 | | authentications.credentials.workstation | string | (Optional)Workstation for NTLM. | 291 | | authentications.credentials.domain | string | (Optional)Domain for NTLM. | 292 | 293 | For example, if you want to use an user in ActiveDirectory, the configuration is below: 294 | 295 | "authentications":[ 296 | { 297 | "scope": { 298 | "scheme":"NTLM" 299 | }, 300 | "credentials": { 301 | "domain":"your.ad.domain", 302 | "username":"taro", 303 | "password":"himitsu" 304 | } 305 | }], 306 | 307 | 308 | ### Use attachment type 309 | 310 | River Web supports [attachment type](https://github.com/elasticsearch/elasticsearch-mapper-attachments). 
311 | For example, create a mapping with attachment type: 312 | 313 | curl -XPUT "localhost:9200/web/test/_mapping?pretty" -d '{ 314 | "test" : { 315 | "properties" : { 316 | ... 317 | "my_attachment" : { 318 | "type" : "attachment", 319 | "fields" : { 320 | "file" : { "index" : "no" }, 321 | "title" : { "store" : "yes" }, 322 | "date" : { "store" : "yes" }, 323 | "author" : { "store" : "yes" }, 324 | "keywords" : { "store" : "yes" }, 325 | "content_type" : { "store" : "yes" }, 326 | "content_length" : { "store" : "yes" } 327 | } 328 | } 329 | ... 330 | 331 | and then start your river. In "properties" object, when a value of "type" is "attachment", the crawled url is stored as base64-encoded data. 332 | 333 | curl -XPUT localhost:9200/.river_web/config/2 -d '{ 334 | "index" : "web", 335 | "type" : "data", 336 | "urls" : "http://...", 337 | ... 338 | "target" : [ 339 | ... 340 | { 341 | "settings" : { 342 | "html" : false 343 | }, 344 | "pattern" : { 345 | "url" : "http://.../.*" 346 | }, 347 | "properties" : { 348 | "my_attachment" : { 349 | "type" : "attachment" 350 | } 351 | } 352 | } 353 | ] 354 | ... 355 | 356 | ### Use Multibyte Characters 357 | 358 | An example in Japanese environment is below. 359 | First, put some configuration file into conf directory of Elasticsearch. 360 | 361 | $ cd $ES_HOME/conf # ex. /etc/elasticsearch if using rpm package 362 | $ sudo wget https://raw.github.com/codelibs/fess-server/master/src/tomcat/solr/core1/conf/mapping_ja.txt 363 | $ sudo wget http://svn.apache.org/repos/asf/lucene/dev/trunk/solr/example/solr/collection1/conf/lang/stopwords_ja.txt 364 | 365 | and then create "webindex" index with analyzers for Japanese. 366 | (If you want to use uni-gram, remove cjk\_bigram in filter) 367 | 368 | $ curl -XPUT "localhost:9200/webindex" -d ' 369 | { 370 | "settings" : { 371 | "analysis" : { 372 | "analyzer" : { 373 | "default" : { 374 | "type" : "custom", 375 | "char_filter" : ["mappingJa"], 376 | "tokenizer" : "standard", 377 | "filter" : ["word_delimiter", "lowercase", "cjk_width", "cjk_bigram"] 378 | } 379 | }, 380 | "char_filter" : { 381 | "mappingJa": { 382 | "type" : "mapping", 383 | "mappings_path" : "mapping_ja.txt" 384 | } 385 | }, 386 | "filter" : { 387 | "stopJa" : { 388 | "type" : "stop", 389 | "stopwords_path" : "stopwords_ja.txt" 390 | } 391 | } 392 | } 393 | } 394 | }' 395 | 396 | ### Rewrite a property value by Script 397 | 398 | River Web allows you to rewrite crawled data by Java's ScriptEngine. 399 | "javascript" is available. 400 | In "properties" object, put "script" value to a property you want to rewrite. 401 | 402 | ... 403 | "properties" : { 404 | ... 405 | "flag" : { 406 | "text" : "body", 407 | "script" : "value.indexOf('Elasticsearch') > 0 ? 'yes' : 'no';" 408 | }, 409 | 410 | The above is, if a string value of body element in HTML contains "Elasticsearch", set "yes" to "flag" property. 411 | 412 | ### Use HTTP proxy 413 | 414 | Put "proxy" property in "crawl" property. 415 | 416 | curl -XPUT 'localhost:9200/.river_web/config/my_web' -d '{ 417 | "index" : "webindex", 418 | "type" : "my_web", 419 | ... 420 | "proxy" : { 421 | "host" : "proxy.server.com", 422 | "port" : 8080 423 | }, 424 | 425 | ### Specify next crawled urls when crawling 426 | 427 | To set "isChildUrl" property to true, the property values is used as next crawled urls. 428 | 429 | ... 430 | "target" : [ 431 | { 432 | ... 
433 | "properties" : { 434 | "childUrl" : { 435 | "value" : ["http://fess.codelibs.org/","http://fess.codelibs.org/ja/"], 436 | "isArray" : true, 437 | "isChildUrl" : true 438 | }, 439 | 440 | ### Intercept start/execute/finish/close actions 441 | 442 | You can insert your script to Executing Crawler(execute)/Finished Crawler(finish). 443 | To insert scripts, put "script" property as below: 444 | 445 | curl -XPUT 'localhost:9200/.river_web/config/my_web' -d '{ 446 | "script":{ 447 | "execute":"your script...", 448 | "finish":"your script...", 449 | }, 450 | ... 451 | 452 | ## FAQ 453 | 454 | ### What does "No scraping rule." mean? 455 | 456 | In a river setting, "url" is starting urls to crawl a site, "include_urls" filters urls whether are crawled or not, and "target.pattern.url" is a rule to store extracted web data. 457 | If a crawling url does not match "target.pattern.url", you would see the message. 458 | Therefore, it means the crawled url does not have an extraction rule. 459 | 460 | ### How to extract an attribute of meta tag 461 | 462 | For example, if you want to grab a content of description's meta tag, the configuration is below: 463 | 464 | ... 465 | "target" : [ 466 | ... 467 | "properties" : { 468 | ... 469 | "meta" : { 470 | "attr" : "meta[name=description]", 471 | "args" : [ "content" ] 472 | }, 473 | 474 | ### Incremental crawling dose not work? 475 | 476 | "url" field needs to be "not\_analyzed" in a mapping of your stored index. 477 | See [Create Index To Store Crawl Data](https://github.com/codelibs/elasticsearch-river-web#create-index-to-store-crawl-data "Create Index To Store Crawl Data"). 478 | 479 | 480 | ### Where is crawled data stored? 481 | 482 | crawled data are stored to ".s2robot" index during cralwing, data extracted from them are stored to your index specified by a river setting, and then data in "robot" index are removed when the crawler is finished. 483 | 484 | ## Powered By 485 | 486 | * [Lasta Di](https://github.com/lastaflute/lasta-di "Lasta Di"): DI Container 487 | * [Fess Crawler](https://github.com/codelibs/fess-crawler "Fess Crawler"): Web Crawler 488 | -------------------------------------------------------------------------------- /README_ver1.5.md: -------------------------------------------------------------------------------- 1 | Elasticsearch River Web 2 | ======================= 3 | 4 | ## Overview 5 | 6 | Elasticsearch River Web is a web crawler application for Elasticsearch. 7 | This application provides a feature to crawl web sites and extract the content by CSS Query. 8 | (As of version 1.5, River Web is not Elasticsearch plugin) 9 | 10 | ## Version 11 | 12 | | River Web | Tested on ES | Download | 13 | |:---------:|:-------------:|:--------:| 14 | | master | 1.5.X | [Snapshot](http://maven.codelibs.org/org/codelibs/elasticsearch-river-web/ "Snapshot") | 15 | | 1.5.1 | 1.5.2 | [ZIP](http://maven.codelibs.org/org/codelibs/elasticsearch-river-web/1.5.1/elasticsearch-river-web-1.5.1.zip "ZIP"),[TGZ](http://maven.codelibs.org/org/codelibs/elasticsearch-river-web/1.5.1/elasticsearch-river-web-1.5.1.tar.gz "TGZ") | 16 | 17 | For old plugin version, see [README_ver1.md](https://github.com/codelibs/elasticsearch-river-web/blob/master/README_ver1.md "README_ver1.md"). 18 | 19 | ### Issues/Questions 20 | 21 | Please file an [issue](https://github.com/codelibs/elasticsearch-river-web/issues "issue"). 22 | (Japanese forum is [here](https://github.com/codelibs/codelibs-ja-forum "here").) 
23 | 24 | ## Installation 25 | 26 | ### Install River Web 27 | 28 | #### Zip File 29 | 30 | $ unzip elasticsearch-river-web-[VERSION].zip 31 | 32 | #### Tar.GZ File 33 | 34 | $ tar zxvf elasticsearch-river-web-[VERSION].tar.gz 35 | 36 | ## Usage 37 | 38 | ### Create Index To Store Crawl Data 39 | 40 | An index for storing crawl data is needed before starting River Web. 41 | For example, to store data to "webindex/my_web", create it as below: 42 | 43 | $ curl -XPUT 'localhost:9200/webindex' -d ' 44 | { 45 | "settings":{ 46 | "index":{ 47 | "refresh_interval":"1s", 48 | "number_of_shards":"10", 49 | "number_of_replicas" : "0" 50 | } 51 | }, 52 | "mappings":{ 53 | "my_web":{ 54 | "properties":{ 55 | "url":{ 56 | "type":"string", 57 | "index":"not_analyzed" 58 | }, 59 | "method":{ 60 | "type":"string", 61 | "index":"not_analyzed" 62 | }, 63 | "charSet":{ 64 | "type":"string", 65 | "index":"not_analyzed" 66 | }, 67 | "mimeType":{ 68 | "type":"string", 69 | "index":"not_analyzed" 70 | } 71 | } 72 | } 73 | } 74 | }' 75 | 76 | Feel free to add any properties other than the above if you need them. 77 | 78 | ### Register Crawl Config Data 79 | 80 | A crawling configuration is created by registering a document to .river_web index as below. 81 | This example crawls sites of http://www.codelibs.org/ and http://fess.codelibs.org/. 82 | 83 | $ curl -XPUT 'localhost:9200/.river_web/config/my_web' -d '{ 84 | "index" : "webindex", 85 | "type" : "my_web", 86 | "url" : ["http://www.codelibs.org/", "http://fess.codelibs.org/"], 87 | "includeFilter" : ["http://www.codelibs.org/.*", "http://fess.codelibs.org/.*"], 88 | "maxDepth" : 3, 89 | "maxAccessCount" : 100, 90 | "numOfThread" : 5, 91 | "interval" : 1000, 92 | "target" : [ 93 | { 94 | "pattern" : { 95 | "url" : "http://www.codelibs.org/.*", 96 | "mimeType" : "text/html" 97 | }, 98 | "properties" : { 99 | "title" : { 100 | "text" : "title" 101 | }, 102 | "body" : { 103 | "text" : "body" 104 | }, 105 | "bodyAsHtml" : { 106 | "html" : "body" 107 | }, 108 | "projects" : { 109 | "text" : "ul.nav-list li a", 110 | "isArray" : true 111 | } 112 | } 113 | }, 114 | { 115 | "pattern" : { 116 | "url" : "http://fess.codelibs.org/.*", 117 | "mimeType" : "text/html" 118 | }, 119 | "properties" : { 120 | "title" : { 121 | "text" : "title" 122 | }, 123 | "body" : { 124 | "text" : "body", 125 | "trimSpaces" : true 126 | }, 127 | "menus" : { 128 | "text" : "ul.nav-list li a", 129 | "isArray" : true 130 | } 131 | } 132 | } 133 | ] 134 | }' 135 | 136 | The configuration is: 137 | 138 | | Property | Type | Description | 139 | |:------------------------------|:-------:|:------------------------------------------------| 140 | | index | string | Stored index name. | 141 | | type | string | Stored type name. | 142 | | url | array | Start point of URL for crawling. | 143 | | includeFilter | array | White list of URL for crawling. | 144 | | excludeFilter | array | Black list of URL for crawling. | 145 | | maxDepth | int | Depth of crawling documents. | 146 | | maxAccessCount | int | The number of crawling documents. | 147 | | numOfThread | int | The number of crawler threads. | 148 | | interval | int | Interval time (ms) to crawl documents. | 149 | | incremental | boolean | Incremental crawling. | 150 | | overwrite | boolean | Delete documents of old duplicated url. | 151 | | userAgent | string | User-agent name when crawling. | 152 | | robotsTxt | boolean | If you want to ignore robots.txt, false. | 153 | | authentications | object | Specify BASIC/DIGEST/NTLM authentication info. 
| 154 | | target.urlPattern | string | URL pattern to extract contents by CSS Query. | 155 | | target.properties.name | string | "name" is used as a property name in the index. | 156 | | target.properties.name.text | string | CSS Query for the property value. | 157 | | target.properties.name.html | string | CSS Query for the property value. | 158 | | target.properties.name.script | string | Rewrite the property value by Script(Groovy). | 159 | 160 | ### Start Crawler 161 | 162 | ./bin/riverweb --config-id [config doc id] --cluster-name [Elasticsearch Cluster Name] --cleanup 163 | 164 | For example, 165 | 166 | ./bin/riverweb --config-id my_web --cluster-name elasticsearch --cleanup 167 | 168 | ### Unregister Crawl Config Data 169 | 170 | If you want to stop the crawler, kill the crawler process and then delete the config document as below: 171 | 172 | $ curl -XDELETE 'localhost:9200/.river_web/config/my_web' 173 | 174 | ## Examples 175 | 176 | ### Full Text Search for Your site (ex. http://fess.codelibs.org/) 177 | 178 | $ curl -XPUT 'localhost:9200/.river_web/fess/fess_site' -d '{ 179 | "index" : "webindex", 180 | "type" : "fess_site", 181 | "url" : ["http://fess.codelibs.org/"], 182 | "includeFilter" : ["http://fess.codelibs.org/.*"], 183 | "maxDepth" : 3, 184 | "maxAccessCount" : 1000, 185 | "numOfThread" : 5, 186 | "interval" : 1000, 187 | "target" : [ 188 | { 189 | "pattern" : { 190 | "url" : "http://fess.codelibs.org/.*", 191 | "mimeType" : "text/html" 192 | }, 193 | "properties" : { 194 | "title" : { 195 | "text" : "title" 196 | }, 197 | "body" : { 198 | "text" : "body", 199 | "trimSpaces" : true 200 | } 201 | } 202 | } 203 | ] 204 | }' 205 | 206 | 207 | ### Aggregate a title/content from news.yahoo.com 208 | 209 | $ curl -XPUT 'localhost:9200/.river_web/config/yahoo_site' -d '{ 210 | "index" : "webindex", 211 | "type" : "my_web", 212 | "url" : ["http://news.yahoo.com/"], 213 | "includeFilter" : ["http://news.yahoo.com/.*"], 214 | "maxDepth" : 1, 215 | "maxAccessCount" : 10, 216 | "numOfThread" : 3, 217 | "interval" : 3000, 218 | "userAgent" : "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko", 219 | "target" : [ 220 | { 221 | "pattern" : { 222 | "url" : "http://news.yahoo.com/video/.*html", 223 | "mimeType" : "text/html" 224 | }, 225 | "properties" : { 226 | "title" : { 227 | "text" : "title" 228 | } 229 | } 230 | }, 231 | { 232 | "pattern" : { 233 | "url" : "http://news.yahoo.com/.*html", 234 | "mimeType" : "text/html" 235 | }, 236 | "properties" : { 237 | "title" : { 238 | "text" : "h1.headline" 239 | }, 240 | "content" : { 241 | "text" : "section#mediacontentstory p" 242 | } 243 | } 244 | } 245 | ] 246 | }' 247 | 248 | (if news.yahoo.com is updated, the above example needs to be updated.) 249 | 250 | ## Others 251 | 252 | ### BASIC/DIGEST/NTLM authentication 253 | 254 | River Web supports BASIC/DIGEST/NTLM authentication. 255 | Set authentications object. 256 | 257 | ... 258 | "numOfThread" : 5, 259 | "interval" : 1000, 260 | "authentications":[ 261 | { 262 | "scope": { 263 | "scheme":"BASIC" 264 | }, 265 | "credentials": { 266 | "username":"testuser", 267 | "password":"secret" 268 | } 269 | }], 270 | "target" : [ 271 | ... 272 | 273 | The configuration is: 274 | 275 | | Property | Type | Description | 276 | |:----------------------------------------|:-------:|:------------------------------------------------| 277 | | authentications.scope.scheme | string | BASIC, DIGEST or NTLM | 278 | | authentications.scope.host | string | (Optional)Target hostname. 
| 279 | | authentications.scope.port | int | (Optional)Port number. | 280 | | authentications.scope.realm | string | (Optional)Realm name. | 281 | | authentications.credentials.username | string | Username. | 282 | | authentications.credentials.password | string | Password. | 283 | | authentications.credentials.workstation | string | (Optional)Workstation for NTLM. | 284 | | authentications.credentials.domain | string | (Optional)Domain for NTLM. | 285 | 286 | For example, if you want to use an user in ActiveDirectory, the configuration is below: 287 | 288 | "authentications":[ 289 | { 290 | "scope": { 291 | "scheme":"NTLM" 292 | }, 293 | "credentials": { 294 | "domain":"your.ad.domain", 295 | "username":"taro", 296 | "password":"himitsu" 297 | } 298 | }], 299 | 300 | 301 | ### Use attachment type 302 | 303 | River Web supports [attachment type](https://github.com/elasticsearch/elasticsearch-mapper-attachments). 304 | For example, create a mapping with attachment type: 305 | 306 | curl -XPUT "localhost:9200/web/test/_mapping?pretty" -d '{ 307 | "test" : { 308 | "properties" : { 309 | ... 310 | "my_attachment" : { 311 | "type" : "attachment", 312 | "fields" : { 313 | "file" : { "index" : "no" }, 314 | "title" : { "store" : "yes" }, 315 | "date" : { "store" : "yes" }, 316 | "author" : { "store" : "yes" }, 317 | "keywords" : { "store" : "yes" }, 318 | "content_type" : { "store" : "yes" }, 319 | "content_length" : { "store" : "yes" } 320 | } 321 | } 322 | ... 323 | 324 | and then start your river. In "properties" object, when a value of "type" is "attachment", the crawled url is stored as base64-encoded data. 325 | 326 | curl -XPUT localhost:9200/.river_web/config/2 -d '{ 327 | "index" : "web", 328 | "type" : "data", 329 | "url" : "http://...", 330 | ... 331 | "target" : [ 332 | ... 333 | { 334 | "settings" : { 335 | "html" : false 336 | }, 337 | "pattern" : { 338 | "url" : "http://.../.*" 339 | }, 340 | "properties" : { 341 | "my_attachment" : { 342 | "type" : "attachment" 343 | } 344 | } 345 | } 346 | ] 347 | ... 348 | 349 | ### Use Multibyte Characters 350 | 351 | An example in Japanese environment is below. 352 | First, put some configuration file into conf directory of Elasticsearch. 353 | 354 | $ cd $ES_HOME/conf # ex. /etc/elasticsearch if using rpm package 355 | $ sudo wget https://raw.github.com/codelibs/fess-server/master/src/tomcat/solr/core1/conf/mapping_ja.txt 356 | $ sudo wget http://svn.apache.org/repos/asf/lucene/dev/trunk/solr/example/solr/collection1/conf/lang/stopwords_ja.txt 357 | 358 | and then create "webindex" index with analyzers for Japanese. 359 | (If you want to use uni-gram, remove cjk\_bigram in filter) 360 | 361 | $ curl -XPUT "localhost:9200/webindex" -d ' 362 | { 363 | "settings" : { 364 | "analysis" : { 365 | "analyzer" : { 366 | "default" : { 367 | "type" : "custom", 368 | "char_filter" : ["mappingJa"], 369 | "tokenizer" : "standard", 370 | "filter" : ["word_delimiter", "lowercase", "cjk_width", "cjk_bigram"] 371 | } 372 | }, 373 | "char_filter" : { 374 | "mappingJa": { 375 | "type" : "mapping", 376 | "mappings_path" : "mapping_ja.txt" 377 | } 378 | }, 379 | "filter" : { 380 | "stopJa" : { 381 | "type" : "stop", 382 | "stopwords_path" : "stopwords_ja.txt" 383 | } 384 | } 385 | } 386 | } 387 | }' 388 | 389 | ### Rewrite a property value by Script 390 | 391 | River Web allows you to rewrite crawled data by Java's ScriptEngine. 392 | "javascript" is available. 393 | In "properties" object, put "script" value to a property you want to rewrite. 394 | 395 | ... 
396 | "properties" : { 397 | ... 398 | "flag" : { 399 | "text" : "body", 400 | "script" : "value.indexOf('Elasticsearch') > 0 ? 'yes' : 'no';" 401 | }, 402 | 403 | The above is, if a string value of body element in HTML contains "Elasticsearch", set "yes" to "flag" property. 404 | 405 | ### Use HTTP proxy 406 | 407 | Put "proxy" property in "crawl" property. 408 | 409 | curl -XPUT 'localhost:9200/.river_web/config/my_web' -d '{ 410 | "index" : "webindex", 411 | "type" : "my_web", 412 | ... 413 | "proxy" : { 414 | "host" : "proxy.server.com", 415 | "port" : 8080 416 | }, 417 | 418 | ### Specify next crawled urls when crawling 419 | 420 | To set "isChildUrl" property to true, the property values is used as next crawled urls. 421 | 422 | ... 423 | "target" : [ 424 | { 425 | ... 426 | "properties" : { 427 | "childUrl" : { 428 | "value" : ["http://fess.codelibs.org/","http://fess.codelibs.org/ja/"], 429 | "isArray" : true, 430 | "isChildUrl" : true 431 | }, 432 | 433 | ### Intercept start/execute/finish/close actions 434 | 435 | You can insert your script to Executing Crawler(execute)/Finished Crawler(finish). 436 | To insert scripts, put "script" property as below: 437 | 438 | curl -XPUT 'localhost:9200/.river_web/config/my_web' -d '{ 439 | "script":{ 440 | "execute":"your script...", 441 | "finish":"your script...", 442 | }, 443 | ... 444 | 445 | ## FAQ 446 | 447 | ### What does "No scraping rule." mean? 448 | 449 | In a river setting, "url" is starting urls to crawl a site, "includeFilter" filters urls whether are crawled or not, and "target.pattern.url" is a rule to store extracted web data. 450 | If a crawling url does not match "target.pattern.url", you would see the message. 451 | Therefore, it means the crawled url does not have an extraction rule. 452 | 453 | ### How to extract an attribute of meta tag 454 | 455 | For example, if you want to grab a content of description's meta tag, the configuration is below: 456 | 457 | ... 458 | "target" : [ 459 | ... 460 | "properties" : { 461 | ... 462 | "meta" : { 463 | "attr" : "meta[name=description]", 464 | "args" : [ "content" ] 465 | }, 466 | 467 | ### Incremental crawling dose not work? 468 | 469 | "url" field needs to be "not_analyzed" in a mapping of your stored index. 470 | See [Create Index To Store Crawl Data](https://github.com/codelibs/elasticsearch-river-web#create-index-to-store-crawl-data "Create Index To Store Crawl Data"). 471 | 472 | 473 | ### Where is crawled data stored? 474 | 475 | crawled data are stored to ".s2robot" index during cralwing, data extracted from them are stored to your index specified by a river setting, and then data in "robot" index are removed when the crawler is finished. 476 | 477 | ## Powered by 478 | 479 | * [Lasta Di](https://github.com/lastaflute/lasta-di "Lasta Di"): DI Container 480 | * [S2Robot](https://github.com/codelibs/s2robot "S2Robot"): Web Crawler 481 | -------------------------------------------------------------------------------- /README_ver1.md: -------------------------------------------------------------------------------- 1 | Elasticsearch River Web 2 | ======================= 3 | 4 | ## Overview 5 | 6 | Elasticsearch River Web Plugin is a web crawler for Elasticsearch. 7 | This plugin provides a feature to crawl web sites and extract the content by CSS Query. 
8 | 9 | ## Version 10 | 11 | | River Web | elasticsearch | 12 | |:---------:|:-------------:| 13 | | master | 1.4.X | 14 | | 1.4.0 | 1.4.1 | 15 | | 1.3.1 | 1.3.4 | 16 | | 1.2.0 | 1.2.1 | 17 | | 1.1.2 | 1.1.1 | 18 | | 1.1.1 | 1.0.2 | 19 | | 1.0.1 | 0.90.7 | 20 | 21 | ### Issues/Questions 22 | 23 | Please file an [issue](https://github.com/codelibs/elasticsearch-river-web/issues "issue"). 24 | (Japanese forum is [here](https://github.com/codelibs/codelibs-ja-forum "here").) 25 | 26 | ## Installation 27 | 28 | ### Install Quartz Plugin 29 | 30 | River Web plugin depends on Quartz plugin. 31 | [Quartz plugin](https://github.com/codelibs/elasticsearch-quartz) needs to be installed before installing River Web plugin. 32 | 33 | $ $ES_HOME/bin/plugin --install org.codelibs/elasticsearch-quartz/1.0.1 34 | 35 | ### Install River Web Plugin 36 | 37 | $ $ES_HOME/bin/plugin --install org.codelibs/elasticsearch-river-web/1.4.0 38 | 39 | ## Usage 40 | 41 | ### Create Index To Store Crawl Data 42 | 43 | An index is needed to store crawl data before starting a river. 44 | For example, to store data to "webindex", create it as below: 45 | 46 | $ curl -XPUT 'localhost:9200/webindex' 47 | 48 | and then create a mapping setting if using "overwrite" option: 49 | 50 | $ curl -XPUT "localhost:9200/webindex/my_web/_mapping" -d ' 51 | { 52 | "my_web" : { 53 | "dynamic_templates" : [ 54 | { 55 | "url" : { 56 | "match" : "url", 57 | "mapping" : { 58 | "type" : "string", 59 | "store" : "yes", 60 | "index" : "not_analyzed" 61 | } 62 | } 63 | }, 64 | { 65 | "method" : { 66 | "match" : "method", 67 | "mapping" : { 68 | "type" : "string", 69 | "store" : "yes", 70 | "index" : "not_analyzed" 71 | } 72 | } 73 | }, 74 | { 75 | "charSet" : { 76 | "match" : "charSet", 77 | "mapping" : { 78 | "type" : "string", 79 | "store" : "yes", 80 | "index" : "not_analyzed" 81 | } 82 | } 83 | }, 84 | { 85 | "mimeType" : { 86 | "match" : "mimeType", 87 | "mapping" : { 88 | "type" : "string", 89 | "store" : "yes", 90 | "index" : "not_analyzed" 91 | } 92 | } 93 | } 94 | ] 95 | } 96 | }' 97 | 98 | "my\_web" is a type given by your river name or "crawl.type". 99 | 100 | ### Register Crawl Data 101 | 102 | A crawling configuration is created by registering a river as below. 103 | This example crawls sites of http://www.codelibs.org/ and http://fess.codelibs.org/ at 6:00am. 
104 | 105 | $ curl -XPUT 'localhost:9200/_river/my_web/_meta' -d '{ 106 | "type" : "web", 107 | "crawl" : { 108 | "index" : "webindex", 109 | "url" : ["http://www.codelibs.org/", "http://fess.codelibs.org/"], 110 | "includeFilter" : ["http://www.codelibs.org/.*", "http://fess.codelibs.org/.*"], 111 | "maxDepth" : 3, 112 | "maxAccessCount" : 100, 113 | "numOfThread" : 5, 114 | "interval" : 1000, 115 | "target" : [ 116 | { 117 | "pattern" : { 118 | "url" : "http://www.codelibs.org/.*", 119 | "mimeType" : "text/html" 120 | }, 121 | "properties" : { 122 | "title" : { 123 | "text" : "title" 124 | }, 125 | "body" : { 126 | "text" : "body" 127 | }, 128 | "bodyAsHtml" : { 129 | "html" : "body" 130 | }, 131 | "projects" : { 132 | "text" : "ul.nav-list li a", 133 | "isArray" : true 134 | } 135 | } 136 | }, 137 | { 138 | "pattern" : { 139 | "url" : "http://fess.codelibs.org/.*", 140 | "mimeType" : "text/html" 141 | }, 142 | "properties" : { 143 | "title" : { 144 | "text" : "title" 145 | }, 146 | "body" : { 147 | "text" : "body", 148 | "trimSpaces" : true 149 | }, 150 | "menus" : { 151 | "text" : "ul.nav-list li a", 152 | "isArray" : true 153 | } 154 | } 155 | } 156 | ] 157 | }, 158 | "schedule" : { 159 | "cron" : "0 0 6 * * ?" 160 | } 161 | }' 162 | 163 | "my\_web" is a configuration name for River, and you can replace it with one you want. 164 | 165 | The configuration is: 166 | 167 | | Property | Type | Description | 168 | |:------------------------------------|:-------:|:------------------------------------------------| 169 | | crawl.index | string | Stored index name. | 170 | | crawl.type | string | Stored type name. | 171 | | crawl.url | array | Start point of URL for crawling. | 172 | | crawl.includeFilter | array | White list of URL for crawling. | 173 | | crawl.excludeFilter | array | Black list of URL for crawling. | 174 | | crawl.maxDepth | int | Depth of crawling documents. | 175 | | crawl.maxAccessCount | int | The number of crawling documents. | 176 | | crawl.numOfThread | int | The number of crawler threads. | 177 | | crawl.interval | int | Interval time (ms) to crawl documents. | 178 | | crawl.incremental | boolean | Incremental crawling. | 179 | | crawl.overwrite | boolean | Delete documents of old duplicated url. | 180 | | crawl.userAgent | string | User-agent name when crawling. | 181 | | crawl.robotsTxt | boolean | If you want to ignore robots.txt, false. | 182 | | crawl.authentications | object | Specify BASIC/DIGEST/NTLM authentication info. | 183 | | crawl.target.urlPattern | string | URL pattern to extract contents by CSS Query. | 184 | | crawl.target.properties.name | string | "name" is used as a property name in the index. | 185 | | crawl.target.properties.name.text | string | CSS Query for the property value. | 186 | | crawl.target.properties.name.html | string | CSS Query for the property value. | 187 | | crawl.target.properties.name.script | string | Rewrite the property value by Script(Groovy). | 188 | | schedule.cron | string | [Cron format](http://quartz-scheduler.org/api/2.2.0/org/quartz/CronExpression.html) to start a crawler. | 189 | 190 | 191 | ### Unregister Crawl Data 192 | 193 | If you want to stop the crawler, type as below: (replace my\_web with your river name) 194 | 195 | $ curl -XDELETE 'localhost:9200/_river/my_web/' 196 | 197 | ## Examples 198 | 199 | ### Full Text Search for Your site (ex. 
http://fess.codelibs.org/) 200 | 201 | $ curl -XPUT 'localhost:9200/_river/fess/_meta' -d '{ 202 | "type" : "web", 203 | "crawl" : { 204 | "index" : "webindex", 205 | "url" : ["http://fess.codelibs.org/"], 206 | "includeFilter" : ["http://fess.codelibs.org/.*"], 207 | "maxDepth" : 3, 208 | "maxAccessCount" : 1000, 209 | "numOfThread" : 5, 210 | "interval" : 1000, 211 | "target" : [{ 212 | "pattern" : { 213 | "url" : "http://fess.codelibs.org/.*", 214 | "mimeType" : "text/html" 215 | }, 216 | "properties" : { 217 | "title" : { 218 | "text" : "title" 219 | }, 220 | "body" : { 221 | "text" : "body", 222 | "trimSpaces" : true 223 | } 224 | } 225 | }] 226 | }, 227 | "schedule" : { 228 | "cron" : "0 0 0 * * ?" 229 | } 230 | }' 231 | 232 | 233 | ### Aggregate a title/content from news.yahoo.com 234 | 235 | $ curl -XPUT 'localhost:9200/_river/yahoo_com/_meta' -d '{ 236 | "type" : "web", 237 | "crawl" : { 238 | "index" : "webindex", 239 | "url" : ["http://news.yahoo.com/"], 240 | "includeFilter" : ["http://news.yahoo.com/.*"], 241 | "maxDepth" : 1, 242 | "maxAccessCount" : 10, 243 | "numOfThread" : 3, 244 | "interval" : 3000, 245 | "userAgent" : "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko", 246 | "target" : [ 247 | { 248 | "pattern" : { 249 | "url" : "http://news.yahoo.com/video/.*html", 250 | "mimeType" : "text/html" 251 | }, 252 | "properties" : { 253 | "title" : { 254 | "text" : "title" 255 | } 256 | } 257 | }, 258 | { 259 | "pattern" : { 260 | "url" : "http://news.yahoo.com/.*html", 261 | "mimeType" : "text/html" 262 | }, 263 | "properties" : { 264 | "title" : { 265 | "text" : "h1.headline" 266 | }, 267 | "content" : { 268 | "text" : "section#mediacontentstory p" 269 | } 270 | } 271 | } 272 | ] 273 | }, 274 | "schedule" : { 275 | "cron" : "0 0 * * * ?" 276 | } 277 | }' 278 | 279 | (if news.yahoo.com is updated, the above example needs to be updated.) 280 | 281 | ## Others 282 | 283 | ### BASIC/DIGEST/NTLM authentication 284 | 285 | River Web supports BASIC/DIGEST/NTLM authentication. 286 | Set crawl.authentications object. 287 | 288 | ... 289 | "numOfThread" : 5, 290 | "interval" : 1000, 291 | "authentications":[ 292 | { 293 | "scope": { 294 | "scheme":"BASIC" 295 | }, 296 | "credentials": { 297 | "username":"testuser", 298 | "password":"secret" 299 | } 300 | }], 301 | "target" : [ 302 | ... 303 | 304 | The configuration is: 305 | 306 | | Property | Type | Description | 307 | |:----------------------------------------------|:-------:|:------------------------------------------------| 308 | | crawl.authentications.scope.scheme | string | BASIC, DIGEST or NTLM | 309 | | crawl.authentications.scope.host | string | (Optional)Target hostname. | 310 | | crawl.authentications.scope.port | int | (Optional)Port number. | 311 | | crawl.authentications.scope.realm | string | (Optional)Realm name. | 312 | | crawl.authentications.credentials.username | string | Username. | 313 | | crawl.authentications.credentials.password | string | Password. | 314 | | crawl.authentications.credentials.workstation | string | (Optional)Workstation for NTLM. | 315 | | crawl.authentications.credentials.domain | string | (Optional)Domain for NTLM. 
| 316 | 317 | For example, if you want to use an user in ActiveDirectory, the configuration is below: 318 | 319 | "authentications":[ 320 | { 321 | "scope": { 322 | "scheme":"NTLM" 323 | }, 324 | "credentials": { 325 | "domain":"your.ad.domain", 326 | "username":"taro", 327 | "password":"himitsu" 328 | } 329 | }], 330 | 331 | 332 | ### Use attachment type 333 | 334 | River Web supports [attachment type](https://github.com/elasticsearch/elasticsearch-mapper-attachments). 335 | For example, create a mapping with attachment type: 336 | 337 | curl -XPUT "localhost:9200/web/test/_mapping?pretty" -d '{ 338 | "test" : { 339 | "dynamic_templates" : [ 340 | { 341 | ... 342 | "my_attachment" : { 343 | "match" : "my_attachment", 344 | "mapping" : { 345 | "type" : "attachment", 346 | "fields" : { 347 | "file" : { "index" : "no" }, 348 | "title" : { "store" : "yes" }, 349 | "date" : { "store" : "yes" }, 350 | "author" : { "store" : "yes" }, 351 | "keywords" : { "store" : "yes" }, 352 | "content_type" : { "store" : "yes" }, 353 | "content_length" : { "store" : "yes" } 354 | } 355 | } 356 | } 357 | ... 358 | 359 | and then start your river. In "properties" object, when a value of "type" is "attachment", the crawled url is stored as base64-encoded data. 360 | 361 | curl -XPUT 'localhost:9200/_river/test/_meta?pretty' -d '{ 362 | "type" : "web", 363 | "crawl" : { 364 | "index" : "web", 365 | "url" : "http://...", 366 | ... 367 | "target" : [ 368 | ... 369 | { 370 | "settings" : { 371 | "html" : false 372 | }, 373 | "pattern" : { 374 | "url" : "http://.../.*" 375 | }, 376 | "properties" : { 377 | "my_attachment" : { 378 | "type" : "attachment" 379 | } 380 | } 381 | } 382 | ] 383 | ... 384 | 385 | ### Use Multibyte Characters 386 | 387 | An example in Japanese environment is below. 388 | First, put some configuration file into conf directory of Elasticsearch. 389 | 390 | $ cd $ES_HOME/conf # ex. /etc/elasticsearch if using rpm package 391 | $ sudo wget https://raw.github.com/codelibs/fess-server/master/src/tomcat/solr/core1/conf/mapping_ja.txt 392 | $ sudo wget http://svn.apache.org/repos/asf/lucene/dev/trunk/solr/example/solr/collection1/conf/lang/stopwords_ja.txt 393 | 394 | and then create "webindex" index with analyzers for Japanese. 395 | (If you want to use uni-gram, remove cjk\_bigram in filter) 396 | 397 | $ curl -XPUT "localhost:9200/webindex" -d ' 398 | { 399 | "settings" : { 400 | "analysis" : { 401 | "analyzer" : { 402 | "default" : { 403 | "type" : "custom", 404 | "char_filter" : ["mappingJa"], 405 | "tokenizer" : "standard", 406 | "filter" : ["word_delimiter", "lowercase", "cjk_width", "cjk_bigram"] 407 | } 408 | }, 409 | "char_filter" : { 410 | "mappingJa": { 411 | "type" : "mapping", 412 | "mappings_path" : "mapping_ja.txt" 413 | } 414 | }, 415 | "filter" : { 416 | "stopJa" : { 417 | "type" : "stop", 418 | "stopwords_path" : "stopwords_ja.txt" 419 | } 420 | } 421 | } 422 | } 423 | }' 424 | 425 | ### Rewrite a property value by Script 426 | 427 | River Web allows you to rewrite crawled data by [Elasticsearch's scripting](http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/modules-scripting.html). 428 | The default script language is Groovy. 429 | In "properties" object, put "script" value to a property you want to rewrite. 430 | 431 | ... 432 | "properties" : { 433 | ... 434 | "flag" : { 435 | "text" : "body", 436 | "script" : "value.contains(\"Elasticsearch\") ? 
\"yes\" : \"no\"" 437 | }, 438 | 439 | The above is, if a string value of body element in HTML contains "Elasticsearch", set "yes" to "flag" property. 440 | 441 | ### Start a crawler immediately 442 | 443 | To start a crawler immediately, remove "cron" property in a configuration to register a river. 444 | No "cron" property means that the crawler starts right now and the river configuration is removed automatically at the end of the crawling. 445 | 446 | ### Use HTTP proxy 447 | 448 | Put "proxy" property in "crawl" property. 449 | 450 | curl -XPUT 'localhost:9200/_river/my_web/_meta' -d '{ 451 | "type" : "web", 452 | "crawl" : { 453 | ... 454 | "proxy" : { 455 | "host" : "proxy.server.com", 456 | "port" : 8080 457 | }, 458 | 459 | ### Specify next crawled urls when crawling 460 | 461 | To set "isChildUrl" property to true, the property values is used as next crawled urls. 462 | 463 | "crawl" : { 464 | ... 465 | "target" : [ 466 | { 467 | ... 468 | "properties" : { 469 | "childUrl" : { 470 | "value" : ["http://fess.codelibs.org/","http://fess.codelibs.org/ja/"], 471 | "isArray" : true, 472 | "isChildUrl" : true 473 | }, 474 | 475 | ### Intercept start/execute/finish/close actions 476 | 477 | You can insert your script to Starting River(start)/Executing Crawler(execute)/Finished Crawler(finish)/Closed River(close). 478 | To insert scripts, put "script" property to "crawl" property. 479 | 480 | { 481 | "crawl" : { 482 | ... 483 | "script":{ 484 | "start":"your script...", 485 | "execute":"your script...", 486 | "finish":"your script...", 487 | "close":"your script..." 488 | }, 489 | 490 | ### Create Index For Crawling (1.0.0 - 1.1.0) 491 | 492 | River Web Plugin needs 'robot' index for web crawling. 493 | Therefore, in version 1.0.0 - 1.1.0, you need to create it before starting the crawl. 494 | Type the following commands to create 'robot' index: 495 | 496 | $ curl -XPUT 'localhost:9200/robot/' 497 | 498 | As of 1.1.1, "robot" index is created automatically. 499 | 500 | ## FAQ 501 | 502 | ### What does "No scraping rule." mean? 503 | 504 | In a river setting, "crawl.url" is starting urls to crawl a site, "crawl.includeFilter" filters urls whether are crawled or not, and "crawl.target.pattern.url" is a rule to store extracted web data. 505 | If a crawling url does not match "crawl.target.pattern.url", you would see the message. 506 | Therefore, it means the crawled url does not have an extraction rule. 507 | 508 | ### How to extract an attribute of meta tag 509 | 510 | For example, if you want to grab a content of description's meta tag, the configuration is below: 511 | 512 | ... 513 | "target" : [ 514 | ... 515 | "properties" : { 516 | ... 517 | "meta" : { 518 | "attr" : "meta[name=description]", 519 | "args" : [ "content" ] 520 | }, 521 | 522 | ### Incremental crawling dose not work? 523 | 524 | "url" field needs to be "not_analyzed" in a mapping of your stored index. 525 | See [Create Index To Store Crawl Data](https://github.com/codelibs/elasticsearch-river-web#create-index-to-store-crawl-data "Create Index To Store Crawl Data"). 526 | 527 | 528 | ### Where is crawled data stored? 529 | 530 | crawled data are stored to "robot" index during cralwing, data extracted from them are stored to your index specified by a river setting, and then data in "robot" index are removed when the crawler is finished. 
531 | -------------------------------------------------------------------------------- /bin/riverweb: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd `dirname $0` 4 | cd .. 5 | BASE_DIR=`pwd` 6 | 7 | LIB_DIR=$BASE_DIR/lib 8 | 9 | if [ -d /var/log/riverweb ] ; then 10 | RIVERWEB_OPT="$RIVERWEB_OPT -Driverweb.log.file=/var/log/riverweb/riverweb.log" 11 | elif [ -d $BASE_DIR/logs ] ; then 12 | RIVERWEB_OPT="$RIVERWEB_OPT -Driverweb.log.file=$BASE_DIR/logs/riverweb.log" 13 | fi 14 | 15 | if [ -d /etc/riverweb ] ; then 16 | CONFIG_DIR=/etc/riverweb 17 | else 18 | CONFIG_DIR=$BASE_DIR/config 19 | fi 20 | 21 | CP_PATH=$CONFIG_DIR 22 | for JAR_FILE in `ls $LIB_DIR/*.jar 2>/dev/null` ; do 23 | CP_PATH="$CP_PATH:$JAR_FILE" 24 | done 25 | 26 | java $RIVERWEB_OPT -cp $CP_PATH org.codelibs.riverweb.RiverWeb $@ 27 | 28 | -------------------------------------------------------------------------------- /config/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | org.codelibs 5 | river-web 6 | 2.4.0-SNAPSHOT 7 | jar 8 | River Web for Elasticsearch 9 | This application crawls web sites and extracts the content from Web. 10 | 2011 11 | https://github.com/codelibs/elasticsearch-river-web 12 | 13 | 14 | The Apache Software License, Version 2.0 15 | http://www.apache.org/licenses/LICENSE-2.0.txt 16 | repo 17 | 18 | 19 | 20 | CodeLibs 21 | http://www.codelibs.org/ 22 | 23 | 24 | https://github.com/codelibs/elasticsearch-river-web/issues 25 | 26 | 27 | scm:git:git@github.com:codelibs/elasticsearch-river-web.git 28 | scm:git:git@github.com:codelibs/elasticsearch-river-web.git 29 | git@github.com:codelibs/elasticsearch-river-web.git 30 | 31 | 32 | 33 | codelibs-repository 34 | ftp://maven.codelibs.org/home/codelibs/maven/ 35 | 36 | 37 | 38 | 39 | snapshots-repo 40 | https://oss.sonatype.org/content/repositories/snapshots 41 | 42 | false 43 | 44 | 45 | true 46 | 47 | 48 | 49 | 50 | 2.4.0 51 | 1.1.1 52 | 0.7.0 53 | 1.0.10 54 | 1.2 55 | 1.7.21 56 | 5.5.2 57 | 0.6.1A 58 | 59 | 60 | 61 | 62 | ${basedir}/src/main/resources 63 | 64 | **/*.* 65 | 66 | true 67 | 68 | 69 | 70 | 71 | org.apache.maven.plugins 72 | maven-compiler-plugin 73 | 2.3.2 74 | 75 | 1.8 76 | 1.8 77 | UTF-8 78 | 79 | 80 | 81 | org.apache.maven.plugins 82 | maven-surefire-plugin 83 | 2.11 84 | 85 | 86 | **/*Test.java 87 | 88 | 89 | 90 | 91 | org.apache.maven.plugins 92 | maven-source-plugin 93 | 2.1.2 94 | 95 | 96 | attach-sources 97 | 98 | jar 99 | 100 | 101 | 102 | 103 | 104 | maven-resources-plugin 105 | 2.6 106 | 107 | 108 | copy-resources 109 | prepare-package 110 | 111 | copy-resources 112 | 113 | 114 | ${project.build.directory}/bin 115 | 116 | 117 | ${basedir}/bin 118 | true 119 | 120 | *.exe 121 | 122 | 123 | 124 | ${basedir}/bin 125 | false 126 | 127 | *.exe 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | maven-assembly-plugin 137 | 2.4 138 | 139 | false 140 | ${project.build.directory}/releases/ 141 | 142 | ${basedir}/src/main/assemblies/targz-bin.xml 143 | ${basedir}/src/main/assemblies/zip-bin.xml 144 | 145 | 146 | 147 | 148 | package 149 | 150 | single 151 | 152 | 153 | 154 | 155 | 156 | org.codehaus.mojo 
157 | buildnumber-maven-plugin 158 | 1.2 159 | 160 | 161 | validate 162 | 163 | create 164 | 165 | 166 | 167 | 168 | false 169 | false 170 | 171 | 172 | 173 | maven-dependency-plugin 174 | 2.8 175 | 176 | 177 | prepare-package 178 | 179 | copy-dependencies 180 | 181 | 182 | ${project.build.directory}/lib 183 | runtime 184 | 185 | 186 | 187 | 188 | 189 | org.codehaus.mojo 190 | rpm-maven-plugin 191 | 2.1-alpha-3 192 | 193 | 2015, CodeLibs Project 194 | CodeLibs 195 | Application/Internet 196 | CodeLibs 197 | /usr 198 | src/changelog 199 | 200 | _unpackaged_files_terminate_build 0 201 | _binaries_in_noarch_packages_terminate_build 0 202 | 203 | 644 204 | 755 205 | root 206 | root 207 | ${gpg.key} 208 | ${gpg.keyring} 209 | 210 | ${gpg.passphrase} 211 | 212 | 213 | 214 | /var/log/riverweb/ 215 | 755 216 | riverweb 217 | riverweb 218 | 219 | 220 | /usr/share/riverweb/bin/ 221 | 755 222 | 223 | 224 | target/bin 225 | 226 | riverweb 227 | 228 | 229 | 230 | 231 | 232 | /usr/share/riverweb/lib 233 | 234 | 235 | target/lib/ 236 | 237 | 238 | ${project.build.directory}/ 239 | 240 | ${project.build.finalName}.jar 241 | 242 | 243 | lucene-spatial* 244 | lucene-suggest* 245 | lucene-sandbox* 246 | lucene-queryparser* 247 | lucene-queries* 248 | lucene-misc* 249 | lucene-memory* 250 | lucene-join* 251 | lucene-highlighter* 252 | lucene-grouping* 253 | 254 | 255 | 256 | 257 | 258 | /usr/share/riverweb/ 259 | 260 | 261 | . 262 | 263 | LICENSE.txt 264 | NOTICE.txt 265 | README.md 266 | 267 | 268 | 269 | 270 | 271 | /etc/riverweb/ 272 | 273 | 274 | config 275 | 276 | log4j.xml 277 | 278 | 279 | 280 | src/main/resources 281 | 282 | riverweb.properties 283 | 284 | 285 | 286 | 287 | 288 | 289 | src/rpm/scripts/preinstall 290 | utf-8 291 | 292 | 293 | src/rpm/scripts/postinstall 294 | utf-8 295 | 296 | 297 | src/rpm/scripts/preremove 298 | utf-8 299 | 300 | 301 | src/rpm/scripts/postremove 302 | utf-8 303 | 304 | 305 | 306 | 307 | 308 | 309 | org.apache.maven.wagon 310 | wagon-ftp 311 | 1.0-beta-6 312 | 313 | 314 | 315 | 316 | 317 | org.elasticsearch 318 | elasticsearch 319 | ${elasticsearch.version} 320 | 321 | 322 | org.lastaflute 323 | lasta-di 324 | ${lastadi.version} 325 | 326 | 327 | javax.transaction 328 | javax.transaction-api 329 | ${jta.version} 330 | 331 | 332 | args4j 333 | args4j 334 | 2.32 335 | 336 | 337 | org.codelibs 338 | corelib 339 | 0.3.2 340 | 341 | 342 | org.codelibs.fess 343 | fess-crawler-lasta 344 | ${crawler.version} 345 | 346 | 347 | org.codelibs.fess 348 | fess-crawler-es 349 | ${crawler.version} 350 | 351 | 352 | org.codelibs.fess 353 | fess-crawler-webdriver 354 | ${crawler.version} 355 | 356 | 357 | org.slf4j 358 | slf4j-log4j12 359 | ${slf4j.version} 360 | 361 | 362 | org.slf4j 363 | jcl-over-slf4j 364 | ${slf4j.version} 365 | 366 | 367 | org.jsoup 368 | jsoup 369 | 1.8.3 370 | 371 | 372 | junit 373 | junit 374 | 4.11 375 | test 376 | 377 | 378 | org.dbflute.utflute 379 | utflute-lasta-di 380 | ${utflute.version} 381 | test 382 | 383 | 384 | org.codelibs 385 | elasticsearch-cluster-runner 386 | 2.3.0.0 387 | test 388 | 389 | 390 | 391 | -------------------------------------------------------------------------------- /src/main/assemblies/common-bin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /lib 5 | true 6 | 7 | commons-logging:commons-logging 8 | org.apache.lucene:lucene-spatial 9 | org.apache.lucene:lucene-suggest 10 | org.apache.lucene:lucene-sandbox 11 | org.apache.lucene:lucene-queryparser 12 | 
org.apache.lucene:lucene-queries 13 | org.apache.lucene:lucene-misc 14 | org.apache.lucene:lucene-memory 15 | org.apache.lucene:lucene-join 16 | org.apache.lucene:lucene-highlighter 17 | org.apache.lucene:lucene-grouping 18 | 19 | 20 | 21 | /lib 22 | false 23 | 24 | org.codelibs:river-web 25 | 26 | 27 | 28 | 29 | 30 | config 31 | config 32 | 33 | log4j.xml 34 | 35 | 36 | 37 | src/main/resources 38 | config 39 | 40 | riverweb.properties 41 | 42 | 43 | 44 | true 45 | bin 46 | bin 47 | dos 48 | 49 | riverweb.bat 50 | 51 | 52 | 53 | false 54 | bin 55 | bin 56 | 57 | *.exe 58 | 59 | 60 | 61 | true 62 | bin 63 | bin 64 | 0755 65 | 0755 66 | unix 67 | 68 | riverweb 69 | 70 | 71 | 72 | src/main/resources 73 | logs 74 | 75 | .gitkeep 76 | 77 | 78 | 79 | 80 | 81 | README.md 82 | / 83 | 84 | 85 | LICENSE.txt 86 | / 87 | 88 | 89 | NOTICE.txt 90 | / 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /src/main/assemblies/targz-bin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | targz 4 | 5 | tar.gz 6 | 7 | 8 | true 9 | 10 | 11 | src/main/assemblies/common-bin.xml 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/main/assemblies/zip-bin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | zip 4 | 5 | zip 6 | 7 | 8 | true 9 | 10 | 11 | src/main/assemblies/common-bin.xml 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/RiverWeb.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.Locale; 7 | import java.util.Map; 8 | import java.util.UUID; 9 | import java.util.concurrent.ExecutorService; 10 | import java.util.concurrent.Executors; 11 | import java.util.concurrent.Future; 12 | import java.util.concurrent.atomic.AtomicLong; 13 | import java.util.function.IntConsumer; 14 | import java.util.stream.Stream; 15 | 16 | import javax.annotation.Resource; 17 | 18 | import org.apache.http.auth.AuthScheme; 19 | import org.apache.http.auth.AuthScope; 20 | import org.apache.http.auth.Credentials; 21 | import org.apache.http.auth.NTCredentials; 22 | import org.apache.http.auth.UsernamePasswordCredentials; 23 | import org.apache.http.impl.auth.BasicScheme; 24 | import org.apache.http.impl.auth.DigestScheme; 25 | import org.apache.http.impl.auth.NTLMScheme; 26 | import org.codelibs.core.lang.StringUtil; 27 | import org.codelibs.fess.crawler.Crawler; 28 | import org.codelibs.fess.crawler.CrawlerContext; 29 | import org.codelibs.fess.crawler.client.CrawlerClient; 30 | import org.codelibs.fess.crawler.client.CrawlerClientFactory; 31 | import org.codelibs.fess.crawler.client.http.Authentication; 32 | import org.codelibs.fess.crawler.client.http.HcHttpClient; 33 | import org.codelibs.fess.crawler.client.http.RequestHeader; 34 | import org.codelibs.fess.crawler.client.http.impl.AuthenticationImpl; 35 | import org.codelibs.fess.crawler.client.http.ntlm.JcifsEngine; 36 | import org.codelibs.fess.crawler.service.impl.EsDataService; 37 | import org.codelibs.fess.crawler.service.impl.EsUrlFilterService; 38 | import org.codelibs.fess.crawler.service.impl.EsUrlQueueService; 39 | import org.codelibs.riverweb.config.RiverConfig; 40 | import 
org.codelibs.riverweb.config.RiverConfigManager; 41 | import org.codelibs.riverweb.interval.WebRiverIntervalController; 42 | import org.codelibs.riverweb.util.ConfigProperties; 43 | import org.codelibs.riverweb.util.ScriptUtils; 44 | import org.codelibs.riverweb.util.SettingsUtils; 45 | import org.elasticsearch.action.get.GetResponse; 46 | import org.elasticsearch.action.search.SearchRequestBuilder; 47 | import org.elasticsearch.index.IndexNotFoundException; 48 | import org.elasticsearch.index.engine.DocumentAlreadyExistsException; 49 | import org.elasticsearch.index.query.QueryBuilders; 50 | import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders; 51 | import org.kohsuke.args4j.CmdLineParser; 52 | import org.kohsuke.args4j.Option; 53 | import org.kohsuke.args4j.ParserProperties; 54 | import org.lastaflute.di.core.SingletonLaContainer; 55 | import org.lastaflute.di.core.factory.SingletonLaContainerFactory; 56 | import org.slf4j.Logger; 57 | import org.slf4j.LoggerFactory; 58 | 59 | public class RiverWeb { 60 | public static final Logger logger = LoggerFactory.getLogger(RiverWeb.class); 61 | 62 | private static final String NTLM_SCHEME = "NTLM"; 63 | 64 | private static final String DIGEST_SCHEME = "DIGEST"; 65 | 66 | private static final String BASIC_SCHEME = "BASIC"; 67 | 68 | @Option(name = "--queue-timeout") 69 | protected long queueTimeout = 300000; // 5min 70 | 71 | @Option(name = "--threads") 72 | protected int numThreads = 1; 73 | 74 | @Option(name = "--interval") 75 | protected long interval = 1000; 76 | 77 | @Option(name = "--config-id") 78 | protected String configId; 79 | 80 | @Option(name = "--session-id") 81 | protected String sessionId; 82 | 83 | @Option(name = "--cleanup") 84 | protected boolean cleanup; 85 | 86 | @Option(name = "--es-hosts") 87 | protected String esHosts; 88 | 89 | @Option(name = "--cluster-name") 90 | protected String clusterName; 91 | 92 | @Option(name = "--quiet") 93 | protected boolean quiet; 94 | 95 | @Option(name = "--queue-query") 96 | protected String queueQuery; 97 | 98 | @Resource 99 | protected org.codelibs.fess.crawler.client.EsClient esClient; 100 | 101 | @Resource 102 | protected ConfigProperties config; 103 | 104 | @Resource 105 | protected RiverConfigManager riverConfigManager; 106 | 107 | @Resource 108 | protected String defaultUserAgent; 109 | 110 | protected static IntConsumer exitMethod = System::exit; 111 | 112 | public static void main(final String[] args) { 113 | Runtime.getRuntime().addShutdownHook(new Thread() { 114 | @Override 115 | public void run() { 116 | synchronized (this) { 117 | SingletonLaContainerFactory.destroy(); 118 | } 119 | } 120 | }); 121 | 122 | SingletonLaContainerFactory.init(); 123 | final RiverWeb riverWeb = SingletonLaContainer.getComponent(RiverWeb.class); 124 | 125 | final CmdLineParser parser = new CmdLineParser(riverWeb, ParserProperties.defaults().withUsageWidth(80)); 126 | try { 127 | parser.parseArgument(args); 128 | } catch (final Exception e) { 129 | parser.printUsage(System.out); 130 | exitMethod.accept(1); 131 | return; 132 | } 133 | 134 | try { 135 | exitMethod.accept(riverWeb.execute()); 136 | } catch (final Exception e) { 137 | riverWeb.print(e.getMessage()); 138 | exitMethod.accept(1); 139 | logger.error("Failed to process your request.", e); 140 | } finally { 141 | SingletonLaContainerFactory.destroy(); 142 | } 143 | } 144 | 145 | private void print(final String format, final Object... 
args) { 146 | final String log = String.format(format, args); 147 | if (quiet) { 148 | logger.info(log); 149 | } else { 150 | System.out.println(log); 151 | } 152 | } 153 | 154 | private int execute() { 155 | // update esClient 156 | esClient.setClusterName(config.getElasticsearchClusterName(clusterName)); 157 | esClient.setAddresses(config.getElasticsearchHosts(esHosts)); 158 | esClient.connect(); 159 | 160 | if (StringUtil.isNotBlank(configId)) { 161 | return crawl(SingletonLaContainer.getComponent(Crawler.class), configId, sessionId); 162 | } else { 163 | final String configIndex = config.getConfigIndex(); 164 | final String queueType = config.getQueueType(); 165 | final ExecutorService threadPool = Executors.newFixedThreadPool(numThreads); 166 | final Future[] results = new Future[numThreads]; 167 | for (int i = 0; i < numThreads; i++) { 168 | final int threadId = i + 1; 169 | results[i] = threadPool.submit(() -> { 170 | AtomicLong lastProcessed = new AtomicLong(System.currentTimeMillis()); 171 | while (SingletonLaContainerFactory.hasContainer() 172 | && (queueTimeout <= 0 || lastProcessed.get() + queueTimeout > System.currentTimeMillis())) { 173 | logger.debug("Checking queue: {}/{}", configIndex, queueType); 174 | try { 175 | final SearchRequestBuilder builder = esClient.prepareSearch(configIndex).setTypes(queueType); 176 | if (StringUtil.isNotBlank(queueQuery)) { 177 | builder.setQuery(queueQuery); 178 | } else { 179 | builder.setQuery( 180 | QueryBuilders.functionScoreQuery().add(ScoreFunctionBuilders.randomFunction(System.nanoTime()))); 181 | } 182 | builder.setSize(config.getQueueParsingSize()).execute().actionGet().getHits().forEach(hit -> { 183 | if (esClient.prepareDelete(hit.getIndex(), hit.getType(), hit.getId()).execute().actionGet().isFound()) { 184 | Map source = hit.getSource(); 185 | final Object configId = source.get("config_id"); 186 | final String sessionId = (String) source.get("session_id"); 187 | if (configId instanceof String) { 188 | print("Config %s is started with Session %s.", configId, sessionId); 189 | try { 190 | crawl(SingletonLaContainer.getComponent(Crawler.class), configId.toString(), sessionId); 191 | } finally { 192 | print("Config %s is finished.", configId); 193 | lastProcessed.set(System.currentTimeMillis()); 194 | } 195 | } 196 | } else if (logger.isDebugEnabled()) { 197 | logger.debug("No data in queue: " + hit.getIndex() + "/" + hit.getType() + "/" + hit.getId()); 198 | } 199 | }); 200 | } catch (IndexNotFoundException e) { 201 | logger.debug("Index is not found.", e); 202 | } catch (Exception e) { 203 | logger.warn("Failed to process a queue.", e); 204 | } 205 | try { 206 | Thread.sleep(interval); 207 | } catch (InterruptedException e) { 208 | // ignore 209 | } 210 | } 211 | print("Thread %d is finished.", threadId); 212 | }); 213 | } 214 | Stream.of(results).forEach(f -> { 215 | try { 216 | f.get(); 217 | } catch (Exception e) { 218 | // ignore 219 | } 220 | }); 221 | threadPool.shutdown(); 222 | return 0; 223 | } 224 | } 225 | 226 | private int crawl(Crawler crawler, String configId, String sessionId) { 227 | // Load config data 228 | final String configIndex = config.getConfigIndex(); 229 | final String configType = config.getConfigType(); 230 | final GetResponse response = esClient.prepareGet(configIndex, configType, configId).execute().actionGet(); 231 | if (!response.isExists()) { 232 | print("Config ID %s is not found in %s/%s.", configId, configIndex, configType); 233 | return 1; 234 | } 235 | 236 | final Map crawlSettings = 
response.getSource(); 237 | 238 | if (StringUtil.isBlank(sessionId)) { 239 | sessionId = UUID.randomUUID().toString(); 240 | } 241 | 242 | final Map vars = new HashMap(); 243 | vars.put("configId", configId); 244 | vars.put("client", esClient); 245 | vars.put("sessionId", sessionId); 246 | 247 | final RiverConfig riverConfig = riverConfigManager.get(sessionId); 248 | final Map scriptSettings = SettingsUtils.get(crawlSettings, "script"); 249 | try { 250 | // invoke execute event script 251 | ScriptUtils.execute(scriptSettings, "execute", v -> { 252 | v.putAll(vars); 253 | v.put("container", SingletonLaContainerFactory.getContainer()); 254 | v.put("settings", crawlSettings); 255 | v.put("logger", RiverWeb.logger); 256 | }); 257 | 258 | @SuppressWarnings("unchecked") 259 | final List> targetList = (List>) crawlSettings.get("target"); 260 | if (targetList == null || targetList.isEmpty()) { 261 | print("No targets for crawling."); 262 | return 1; 263 | } 264 | 265 | crawler.setSessionId(sessionId); 266 | 267 | // HttpClient Parameters 268 | final Map paramMap = new HashMap(); 269 | final CrawlerClientFactory clientFactory = crawler.getClientFactory(); 270 | 271 | final Integer connectionTimeout = SettingsUtils.get(crawlSettings, "connection_timeout", config.getConnectionTimeout()); 272 | if (connectionTimeout != null) { 273 | paramMap.put(HcHttpClient.CONNECTION_TIMEOUT_PROPERTY, connectionTimeout); 274 | } 275 | 276 | final Integer soTimeout = SettingsUtils.get(crawlSettings, "so_timeout", config.getSoTimeout()); 277 | if (soTimeout != null) { 278 | paramMap.put(HcHttpClient.SO_TIMEOUT_PROPERTY, soTimeout); 279 | } 280 | 281 | // web driver 282 | @SuppressWarnings("unchecked") 283 | final List wdUrlList = (List) crawlSettings.get("web_driver_urls"); 284 | if (wdUrlList != null) { 285 | CrawlerClient client = SingletonLaContainer.getComponent("webDriverClient"); 286 | wdUrlList.stream().forEach(regex -> clientFactory.addClient(regex, client, 0)); 287 | } 288 | 289 | clientFactory.setInitParameterMap(paramMap); 290 | 291 | // user agent 292 | final String userAgent = SettingsUtils.get(crawlSettings, "user_agent", defaultUserAgent); 293 | if (StringUtil.isNotBlank(userAgent)) { 294 | paramMap.put(HcHttpClient.USER_AGENT_PROPERTY, userAgent); 295 | } 296 | 297 | // robots.txt parser 298 | final Boolean robotsTxtEnabled = SettingsUtils.get(crawlSettings, "robots_txt", config.isRobotsTxtEnabled()); 299 | paramMap.put(HcHttpClient.ROBOTS_TXT_ENABLED_PROPERTY, robotsTxtEnabled); 300 | 301 | // redirect automatically 302 | final Boolean redirectsEnabled = SettingsUtils.get(crawlSettings, "auto_redirect", config.isRedirectsEnabled()); 303 | paramMap.put(HcHttpClient.REDIRECTS_ENABLED, redirectsEnabled); 304 | 305 | // proxy 306 | final Map proxyMap = SettingsUtils.get(crawlSettings, "proxy", null); 307 | if (proxyMap != null) { 308 | final Object host = proxyMap.get("host"); 309 | if (host != null) { 310 | paramMap.put(HcHttpClient.PROXY_HOST_PROPERTY, host); 311 | final Object portObj = proxyMap.get("port"); 312 | if (portObj instanceof Integer) { 313 | paramMap.put(HcHttpClient.PROXY_PORT_PROPERTY, portObj); 314 | } else { 315 | paramMap.put(HcHttpClient.PROXY_PORT_PROPERTY, Integer.valueOf(8080)); 316 | } 317 | } 318 | } 319 | 320 | // authentications 321 | // "authentications":[{"scope":{"scheme":"","host":"","port":0,"realm":""}, 322 | // "credentials":{"username":"","password":""}},{...}] 323 | final List> authList = SettingsUtils.get(crawlSettings, "authentications", null); 324 | if (authList != 
null && !authList.isEmpty()) { 325 | final List basicAuthList = new ArrayList(); 326 | for (final Map authObj : authList) { 327 | @SuppressWarnings("unchecked") 328 | final Map scopeMap = (Map) authObj.get("scope"); 329 | String scheme = SettingsUtils.get(scopeMap, "scheme", StringUtil.EMPTY).toUpperCase(Locale.ENGLISH); 330 | if (StringUtil.isBlank(scheme)) { 331 | logger.warn("Invalid authentication: " + authObj); 332 | continue; 333 | } 334 | @SuppressWarnings("unchecked") 335 | final Map credentialMap = (Map) authObj.get("credentials"); 336 | final String username = SettingsUtils.get(credentialMap, "username", null); 337 | if (StringUtil.isBlank(username)) { 338 | logger.warn("Invalid authentication: " + authObj); 339 | continue; 340 | } 341 | final String host = SettingsUtils.get(authObj, "host", AuthScope.ANY_HOST); 342 | final int port = SettingsUtils.get(authObj, "port", AuthScope.ANY_PORT); 343 | final String realm = SettingsUtils.get(authObj, "realm", AuthScope.ANY_REALM); 344 | final String password = SettingsUtils.get(credentialMap, "password", null); 345 | 346 | AuthScheme authScheme = null; 347 | Credentials credentials = null; 348 | if (BASIC_SCHEME.equalsIgnoreCase(scheme)) { 349 | authScheme = new BasicScheme(); 350 | credentials = new UsernamePasswordCredentials(username, password); 351 | } else if (DIGEST_SCHEME.equals(scheme)) { 352 | authScheme = new DigestScheme(); 353 | credentials = new UsernamePasswordCredentials(username, password); 354 | } else if (NTLM_SCHEME.equals(scheme)) { 355 | authScheme = new NTLMScheme(new JcifsEngine()); 356 | scheme = AuthScope.ANY_SCHEME; 357 | final String workstation = SettingsUtils.get(credentialMap, "workstation", null); 358 | final String domain = SettingsUtils.get(credentialMap, "domain", null); 359 | credentials = new NTCredentials(username, password, workstation == null ? StringUtil.EMPTY : workstation, 360 | domain == null ? 
StringUtil.EMPTY : domain); 361 | } 362 | 363 | final AuthenticationImpl auth = 364 | new AuthenticationImpl(new AuthScope(host, port, realm, scheme), credentials, authScheme); 365 | basicAuthList.add(auth); 366 | } 367 | paramMap.put(HcHttpClient.BASIC_AUTHENTICATIONS_PROPERTY, basicAuthList.toArray(new Authentication[basicAuthList.size()])); 368 | } 369 | 370 | // request header 371 | // "headers":[{"name":"","value":""},{}] 372 | final List> headerList = SettingsUtils.get(crawlSettings, "headers", null); 373 | if (headerList != null && !headerList.isEmpty()) { 374 | final List requestHeaderList = new ArrayList(); 375 | for (final Map headerObj : headerList) { 376 | final String name = SettingsUtils.get(headerObj, "name", null); 377 | final String value = SettingsUtils.get(headerObj, "value", null); 378 | if (name != null && value != null) { 379 | requestHeaderList.add(new RequestHeader(name, value)); 380 | } 381 | } 382 | paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, requestHeaderList.toArray(new RequestHeader[requestHeaderList.size()])); 383 | } 384 | 385 | // url 386 | @SuppressWarnings("unchecked") 387 | final List urlList = (List) crawlSettings.get("urls"); 388 | if (urlList == null || urlList.isEmpty()) { 389 | print("No url for crawling."); 390 | return 1; 391 | } 392 | for (final String url : urlList) { 393 | try { 394 | crawler.addUrl(url); 395 | } catch (DocumentAlreadyExistsException e) { 396 | logger.warn(url + " exists in " + sessionId); 397 | } 398 | } 399 | // include regex 400 | @SuppressWarnings("unchecked") 401 | final List includeFilterList = (List) crawlSettings.get("include_urls"); 402 | if (includeFilterList != null) { 403 | for (final String regex : includeFilterList) { 404 | try { 405 | crawler.addIncludeFilter(regex); 406 | } catch (DocumentAlreadyExistsException e) { 407 | logger.warn(regex + " exists in " + sessionId); 408 | } 409 | } 410 | } 411 | // exclude regex 412 | @SuppressWarnings("unchecked") 413 | final List excludeFilterList = (List) crawlSettings.get("exclude_urls"); 414 | if (excludeFilterList != null) { 415 | for (final String regex : excludeFilterList) { 416 | try { 417 | crawler.addExcludeFilter(regex); 418 | } catch (DocumentAlreadyExistsException e) { 419 | logger.warn(regex + " exists in " + sessionId); 420 | } 421 | } 422 | } 423 | 424 | final CrawlerContext robotContext = crawler.getCrawlerContext(); 425 | 426 | // max depth 427 | final int maxDepth = SettingsUtils.get(crawlSettings, "max_depth", -1); 428 | 429 | robotContext.setMaxDepth(maxDepth); 430 | // max access count 431 | final int maxAccessCount = SettingsUtils.get(crawlSettings, "max_access_count", 100); 432 | robotContext.setMaxAccessCount(maxAccessCount); 433 | // num of thread 434 | final int numOfThread = SettingsUtils.get(crawlSettings, "num_of_thread", 5); 435 | robotContext.setNumOfThread(numOfThread); 436 | // interval 437 | final long interval = SettingsUtils.get(crawlSettings, "interval", 1000L); 438 | final WebRiverIntervalController intervalController = (WebRiverIntervalController) crawler.getIntervalController(); 439 | intervalController.setDelayMillisForWaitingNewUrl(interval); 440 | 441 | // river params 442 | riverConfig.setIndex(SettingsUtils.get(crawlSettings, "index", "web")); 443 | riverConfig.setType(SettingsUtils.get(crawlSettings, "type", configId)); 444 | riverConfig.setOverwrite(SettingsUtils.get(crawlSettings, "overwrite", Boolean.FALSE)); 445 | riverConfig.setIncremental(SettingsUtils.get(crawlSettings, "incremental", Boolean.FALSE)); 446 | 
riverConfig.setScriptSettings(scriptSettings); 447 | 448 | // crawl config 449 | for (final Map targetMap : targetList) { 450 | @SuppressWarnings("unchecked") 451 | final Map patternMap = (Map) targetMap.get("pattern"); 452 | @SuppressWarnings("unchecked") 453 | final Map> propMap = (Map>) targetMap.get("properties"); 454 | if (patternMap != null && propMap != null) { 455 | if (logger.isDebugEnabled()) { 456 | logger.debug("patternMap: " + patternMap); 457 | logger.debug("propMap: " + propMap); 458 | } 459 | @SuppressWarnings("unchecked") 460 | final Map settingMap = (Map) targetMap.get("settings"); 461 | riverConfig.addScrapingRule(settingMap, patternMap, propMap); 462 | } else { 463 | logger.warn("Invalid pattern or target: patternMap: " + patternMap + ", propMap: " + propMap); 464 | } 465 | } 466 | 467 | // run s2robot 468 | crawler.execute(); 469 | 470 | crawler.stop(); 471 | 472 | } finally { 473 | // invoke finish event script 474 | ScriptUtils.execute(scriptSettings, "finish", v -> { 475 | v.putAll(vars); 476 | v.put("container", SingletonLaContainerFactory.getContainer()); 477 | v.put("settings", crawlSettings); 478 | v.put("logger", RiverWeb.logger); 479 | }); 480 | riverConfigManager.remove(sessionId); 481 | 482 | if (cleanup) { 483 | final EsUrlFilterService urlFilterService = SingletonLaContainer.getComponent(EsUrlFilterService.class); 484 | final EsUrlQueueService urlQueueService = SingletonLaContainer.getComponent(EsUrlQueueService.class); 485 | final EsDataService dataService = SingletonLaContainer.getComponent(EsDataService.class); 486 | 487 | try { 488 | // clear url filter 489 | urlFilterService.delete(sessionId); 490 | } catch (Exception e) { 491 | logger.warn("Failed to delete UrlFilter for " + sessionId, e); 492 | } 493 | 494 | try { 495 | // clear queue 496 | urlQueueService.clearCache(); 497 | urlQueueService.delete(sessionId); 498 | } catch (Exception e) { 499 | logger.warn("Failed to delete UrlQueue for " + sessionId, e); 500 | } 501 | 502 | try { 503 | // clear 504 | dataService.delete(sessionId); 505 | } catch (Exception e) { 506 | logger.warn("Failed to delete AccessResult for " + sessionId, e); 507 | } 508 | } 509 | } 510 | 511 | return 0; 512 | } 513 | 514 | } 515 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/ScriptExecutionException.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb; 2 | 3 | public class ScriptExecutionException extends RuntimeException { 4 | 5 | private static final long serialVersionUID = 1L; 6 | 7 | public ScriptExecutionException(final String message) { 8 | super(message); 9 | } 10 | 11 | public ScriptExecutionException(final String message, final Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/WebRiverConstants.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb; 2 | 3 | public final class WebRiverConstants { 4 | 5 | private WebRiverConstants() { 6 | } 7 | 8 | public static final String DEFAULT_SCRIPT_LANG = "javascript"; 9 | 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/app/service/ScriptService.java: -------------------------------------------------------------------------------- 1 | package 
org.codelibs.riverweb.app.service; 2 | 3 | import java.io.File; 4 | import java.nio.file.Files; 5 | import java.nio.file.Paths; 6 | import java.util.Map; 7 | 8 | import javax.annotation.Resource; 9 | import javax.script.ScriptEngine; 10 | import javax.script.ScriptEngineManager; 11 | import javax.script.ScriptException; 12 | 13 | import org.codelibs.core.io.FileUtil; 14 | import org.codelibs.fess.crawler.client.EsClient; 15 | import org.codelibs.riverweb.ScriptExecutionException; 16 | import org.elasticsearch.action.get.GetResponse; 17 | import org.elasticsearch.script.ScriptService.ScriptType; 18 | 19 | public class ScriptService { 20 | protected static final String SCRIPT_INDEX = ".scripts"; 21 | 22 | @Resource 23 | protected EsClient esClient; 24 | 25 | public Object execute(final String lang, final String script, final ScriptType scriptType, final Map localVars) { 26 | final ScriptEngineManager manager = new ScriptEngineManager(); 27 | final ScriptEngine engine = manager.getEngineByName(lang); 28 | 29 | for (final Map.Entry entry : localVars.entrySet()) { 30 | engine.put(entry.getKey(), entry.getValue()); 31 | } 32 | try { 33 | return engine.eval(getScriptContent(lang, script, scriptType)); 34 | } catch (final ScriptException e) { 35 | throw new ScriptExecutionException("lang: " + lang + ", script: " + script + ", type: " + scriptType, e); 36 | } 37 | } 38 | 39 | private String getScriptContent(final String lang, final String script, final ScriptType scriptType) { 40 | switch (scriptType) { 41 | case INLINE: 42 | return script; 43 | case FILE: 44 | if (Files.exists(Paths.get(script))) { 45 | return FileUtil.readText(new File(script)); 46 | } else { 47 | return FileUtil.readText(script); 48 | } 49 | case INDEXED: 50 | final GetResponse response = esClient.prepareGet(SCRIPT_INDEX, lang, script).execute().actionGet(); 51 | if (!response.isExists()) { 52 | throw new ScriptExecutionException("/" + SCRIPT_INDEX + "/" + lang + "/" + script + " does not exist."); 53 | } 54 | final Map source = response.getSource(); 55 | if (source != null) { 56 | return (String) source.get("script"); 57 | } 58 | break; 59 | default: 60 | break; 61 | } 62 | return null; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/config/RiverConfig.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb.config; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | import org.codelibs.fess.crawler.entity.ResponseData; 8 | import org.codelibs.riverweb.entity.ScrapingRule; 9 | 10 | public class RiverConfig { 11 | 12 | private String index; 13 | 14 | private String type; 15 | 16 | private boolean overwrite; 17 | 18 | private boolean incremental; 19 | 20 | private Map scriptSettings; 21 | 22 | private final List scrapingRuleList = new ArrayList<>(); 23 | 24 | public String getIndex() { 25 | return index; 26 | } 27 | 28 | public void setIndex(final String index) { 29 | this.index = index; 30 | } 31 | 32 | public String getType() { 33 | return type; 34 | } 35 | 36 | public void setType(final String type) { 37 | this.type = type; 38 | } 39 | 40 | public boolean isOverwrite() { 41 | return overwrite; 42 | } 43 | 44 | public void setOverwrite(final boolean overwrite) { 45 | this.overwrite = overwrite; 46 | } 47 | 48 | public boolean isIncremental() { 49 | return incremental; 50 | } 51 | 52 | public void setIncremental(final boolean 
incremental) { 53 | this.incremental = incremental; 54 | } 55 | 56 | public void addScrapingRule(final Map settingMap, final Map patternMap, 57 | final Map> scrapingRuleMap) { 58 | scrapingRuleList.add(new ScrapingRule(settingMap, patternMap, scrapingRuleMap)); 59 | } 60 | 61 | public ScrapingRule getScrapingRule(final ResponseData responseData) { 62 | for (final ScrapingRule scrapingRule : scrapingRuleList) { 63 | if (scrapingRule.matches(responseData)) { 64 | return scrapingRule; 65 | } 66 | } 67 | return null; 68 | } 69 | 70 | public Map getScriptSettings() { 71 | return scriptSettings; 72 | } 73 | 74 | public void setScriptSettings(final Map scriptSettings) { 75 | this.scriptSettings = scriptSettings; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/config/RiverConfigManager.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb.config; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class RiverConfigManager { 7 | protected Map configMap = new HashMap<>(); 8 | 9 | public RiverConfig get(final String sessionId) { 10 | synchronized (configMap) { 11 | if (configMap.containsKey(sessionId)) { 12 | return configMap.get(sessionId); 13 | } 14 | RiverConfig config = new RiverConfig(); 15 | configMap.put(sessionId, config); 16 | return config; 17 | } 18 | } 19 | 20 | public RiverConfig remove(final String sessionId) { 21 | synchronized (configMap) { 22 | return configMap.remove(sessionId); 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/crawler/RwCrawlerThread.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb.crawler; 2 | 3 | import java.util.Date; 4 | 5 | import org.codelibs.fess.crawler.CrawlerThread; 6 | import org.codelibs.fess.crawler.client.CrawlerClient; 7 | import org.codelibs.fess.crawler.client.EsClient; 8 | import org.codelibs.fess.crawler.entity.UrlQueue; 9 | import org.codelibs.riverweb.config.RiverConfig; 10 | import org.codelibs.riverweb.config.RiverConfigManager; 11 | import org.codelibs.riverweb.util.ConversionUtil; 12 | import org.elasticsearch.action.search.SearchResponse; 13 | import org.elasticsearch.index.query.QueryBuilders; 14 | import org.elasticsearch.search.SearchHitField; 15 | import org.elasticsearch.search.SearchHits; 16 | import org.elasticsearch.search.sort.SortOrder; 17 | import org.lastaflute.di.core.SingletonLaContainer; 18 | import org.slf4j.Logger; 19 | import org.slf4j.LoggerFactory; 20 | 21 | public class RwCrawlerThread extends CrawlerThread { 22 | private static final Logger logger = LoggerFactory.getLogger(RwCrawlerThread.class); 23 | 24 | @Override 25 | protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue urlQueue) { 26 | final RiverConfigManager riverConfigManager = SingletonLaContainer.getComponent(RiverConfigManager.class); 27 | final RiverConfig riverConfig = riverConfigManager.get(crawlerContext.getSessionId()); 28 | if (riverConfig.isIncremental()) { 29 | final EsClient esClient = SingletonLaContainer.getComponent(EsClient.class); 30 | try { 31 | final SearchResponse response = esClient.prepareSearch(riverConfig.getIndex()).setTypes(riverConfig.getType()) 32 | .setQuery(QueryBuilders.termQuery("url", urlQueue.getUrl())).addField("lastModified") 33 | .addSort("lastModified", 
SortOrder.DESC).execute().actionGet(); 34 | final SearchHits hits = response.getHits(); 35 | if (hits.getTotalHits() > 0) { 36 | final SearchHitField lastModifiedField = hits.getAt(0).getFields().get("lastModified"); 37 | if (lastModifiedField != null) { 38 | final Date lastModified = ConversionUtil.convert(lastModifiedField.getValue(), Date.class); 39 | if (lastModified != null) { 40 | urlQueue.setLastModified(lastModified.getTime()); 41 | } 42 | } 43 | } 44 | } catch (final Exception e) { 45 | logger.debug("Failed to retrieve lastModified.", e); 46 | } 47 | } 48 | return super.isContentUpdated(client, urlQueue); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/entity/ScrapingRule.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb.entity; 2 | 3 | import java.lang.reflect.Field; 4 | import java.util.Collections; 5 | import java.util.LinkedHashMap; 6 | import java.util.Map; 7 | import java.util.regex.Pattern; 8 | 9 | import org.codelibs.core.beans.BeanDesc; 10 | import org.codelibs.core.beans.factory.BeanDescFactory; 11 | import org.codelibs.core.lang.FieldUtil; 12 | import org.codelibs.fess.crawler.entity.ResponseData; 13 | import org.codelibs.riverweb.util.SettingsUtils; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | public class ScrapingRule { 18 | private static final Logger logger = LoggerFactory.getLogger(ScrapingRule.class); 19 | 20 | final Map patternMap = new LinkedHashMap(); 21 | 22 | final Map settingMap; 23 | 24 | final Map> ruleMap; 25 | 26 | public ScrapingRule(final Map settingMap, final Map paramPatternMap, 27 | final Map> ruleMap) { 28 | if (settingMap == null) { 29 | this.settingMap = Collections.emptyMap(); 30 | } else { 31 | this.settingMap = settingMap; 32 | } 33 | this.ruleMap = ruleMap; 34 | for (final Map.Entry entry : paramPatternMap.entrySet()) { 35 | final Object value = entry.getValue(); 36 | if (value instanceof String) { 37 | patternMap.put(entry.getKey(), Pattern.compile(value.toString())); 38 | } 39 | } 40 | if (logger.isDebugEnabled()) { 41 | logger.debug("patternMap: " + patternMap); 42 | } 43 | } 44 | 45 | public boolean matches(final ResponseData responseData) { 46 | if (patternMap.isEmpty()) { 47 | return false; 48 | } 49 | 50 | try { 51 | final BeanDesc beanDesc = BeanDescFactory.getBeanDesc(responseData.getClass()); 52 | for (final Map.Entry entry : patternMap.entrySet()) { 53 | final Field field = beanDesc.getFieldDesc(entry.getKey()).getField(); 54 | final Object value = FieldUtil.get(field, responseData); 55 | if (value == null || !entry.getValue().matcher(value.toString()).matches()) { 56 | return false; 57 | } 58 | } 59 | return true; 60 | } catch (final Exception e) { 61 | logger.warn("Invalid parameters: " + responseData, e); 62 | return false; 63 | } 64 | } 65 | 66 | public Map> getRuleMap() { 67 | return ruleMap; 68 | } 69 | 70 | public T getSetting(final String key, final T defaultValue) { 71 | return SettingsUtils.get(settingMap, key, defaultValue); 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/interval/WebRiverIntervalController.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb.interval; 2 | 3 | import org.codelibs.fess.crawler.interval.impl.DefaultIntervalController; 4 | 5 | public class 
WebRiverIntervalController extends DefaultIntervalController { 6 | public void setDelayMillisForWaitingNewUrl(final long delayMillisForWaitingNewUrl) { 7 | this.delayMillisForWaitingNewUrl = delayMillisForWaitingNewUrl; 8 | } 9 | 10 | public long getDelayMillisForWaitingNewUrl() { 11 | return delayMillisForWaitingNewUrl; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/transformer/ScrapingTransformer.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb.transformer; 2 | 3 | import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; 4 | 5 | import java.io.BufferedInputStream; 6 | import java.io.File; 7 | import java.io.FileInputStream; 8 | import java.io.IOException; 9 | import java.io.InputStream; 10 | import java.lang.reflect.Method; 11 | import java.util.ArrayList; 12 | import java.util.Collections; 13 | import java.util.Date; 14 | import java.util.HashMap; 15 | import java.util.HashSet; 16 | import java.util.LinkedHashMap; 17 | import java.util.List; 18 | import java.util.Map; 19 | import java.util.Set; 20 | import java.util.regex.Matcher; 21 | import java.util.regex.Pattern; 22 | import java.util.stream.Collectors; 23 | 24 | import javax.annotation.PostConstruct; 25 | 26 | import org.codelibs.core.beans.BeanDesc; 27 | import org.codelibs.core.beans.factory.BeanDescFactory; 28 | import org.codelibs.core.beans.util.BeanUtil; 29 | import org.codelibs.core.io.CopyUtil; 30 | import org.codelibs.core.io.FileUtil; 31 | import org.codelibs.core.lang.MethodUtil; 32 | import org.codelibs.core.lang.StringUtil; 33 | import org.codelibs.core.misc.Base64Util; 34 | import org.codelibs.fess.crawler.Constants; 35 | import org.codelibs.fess.crawler.builder.RequestDataBuilder; 36 | import org.codelibs.fess.crawler.client.EsClient; 37 | import org.codelibs.fess.crawler.entity.AccessResultData; 38 | import org.codelibs.fess.crawler.entity.RequestData; 39 | import org.codelibs.fess.crawler.entity.ResponseData; 40 | import org.codelibs.fess.crawler.entity.ResultData; 41 | import org.codelibs.fess.crawler.exception.CrawlingAccessException; 42 | import org.codelibs.fess.crawler.helper.EncodingHelper; 43 | import org.codelibs.fess.crawler.transformer.impl.HtmlTransformer; 44 | import org.codelibs.riverweb.WebRiverConstants; 45 | import org.codelibs.riverweb.app.service.ScriptService; 46 | import org.codelibs.riverweb.config.RiverConfig; 47 | import org.codelibs.riverweb.config.RiverConfigManager; 48 | import org.codelibs.riverweb.entity.ScrapingRule; 49 | import org.codelibs.riverweb.util.SettingsUtils; 50 | import org.elasticsearch.index.query.QueryBuilders; 51 | import org.elasticsearch.script.ScriptService.ScriptType; 52 | import org.jsoup.Jsoup; 53 | import org.jsoup.nodes.Element; 54 | import org.jsoup.select.Elements; 55 | import org.lastaflute.di.core.SingletonLaContainer; 56 | import org.lastaflute.di.core.factory.SingletonLaContainerFactory; 57 | import org.slf4j.Logger; 58 | import org.slf4j.LoggerFactory; 59 | 60 | public class ScrapingTransformer extends HtmlTransformer { 61 | 62 | private static final long DEFAULT_MAX_ATTACHMENT_SIZE = 1000 * 1000; // 1M 63 | 64 | private static final String VALUE_QUERY_TYPE = "value"; 65 | 66 | private static final String TYPE_QUERY_TYPE = "type"; 67 | 68 | private static final String SCRIPT_QUERY_TYPE = "script"; 69 | 70 | private static final String ARGS_QUERY_TYPE = "args"; 71 | 72 | private 
static final String IS_ARRAY_PROP_NAME = "is_array"; 73 | 74 | private static final String IS_DISTINCT_PROP_NAME = "is_distinct"; 75 | 76 | private static final String IS_CHILD_URL_PROP_NAME = "is_child"; 77 | 78 | private static final String TRIM_SPACES_PROP_NAME = "trim_spaces"; 79 | 80 | private static final String TIMESTAMP_FIELD = "@timestamp"; 81 | 82 | private static final String POSITION_FIELD = "position"; 83 | 84 | private static final String ARRAY_PROPERTY_PREFIX = "[]"; 85 | 86 | private static final Logger logger = LoggerFactory.getLogger(ScrapingTransformer.class); 87 | 88 | private static final String[] queryTypes = new String[] { "className", "data", "html", "id", "ownText", "tagName", "text", "val", 89 | "nodeName", "outerHtml", "attr", "baseUri", "absUrl" }; 90 | 91 | public String[] copiedResonseDataFields = new String[] { "url", "parentUrl", "httpStatusCode", "method", "charSet", "contentLength", 92 | "mimeType", "executionTime", "lastModified" }; 93 | 94 | private EsClient esClient; 95 | 96 | protected RiverConfigManager riverConfigManager; 97 | 98 | protected ThreadLocal> childUrlSetLocal = new ThreadLocal>(); 99 | 100 | protected ThreadLocal riverConfigLocal = new ThreadLocal<>(); 101 | 102 | 103 | @PostConstruct 104 | public void init() { 105 | esClient = SingletonLaContainer.getComponent(EsClient.class); 106 | riverConfigManager = SingletonLaContainer.getComponent(RiverConfigManager.class); 107 | } 108 | 109 | @Override 110 | public ResultData transform(final ResponseData responseData) { 111 | final RiverConfig riverConfig = riverConfigManager.get(responseData.getSessionId()); 112 | 113 | try { 114 | riverConfigLocal.set(riverConfig); 115 | return super.transform(responseData); 116 | } finally { 117 | riverConfigLocal.remove(); 118 | childUrlSetLocal.remove(); 119 | } 120 | } 121 | 122 | @Override 123 | protected void updateCharset(final ResponseData responseData) { 124 | int preloadSize = preloadSizeForCharset; 125 | final ScrapingRule scrapingRule = riverConfigLocal.get().getScrapingRule(responseData); 126 | if (scrapingRule != null) { 127 | final Integer s = scrapingRule.getSetting("preloadSizeForCharset", Integer.valueOf(0)); 128 | if (s.intValue() > 0) { 129 | preloadSize = s.intValue(); 130 | } 131 | } 132 | final String encoding = loadCharset(responseData.getResponseBody(), preloadSize); 133 | if (encoding == null) { 134 | if (defaultEncoding == null) { 135 | responseData.setCharSet(Constants.UTF_8); 136 | } else if (responseData.getCharSet() == null) { 137 | responseData.setCharSet(defaultEncoding); 138 | } 139 | } else { 140 | responseData.setCharSet(encoding.trim()); 141 | } 142 | 143 | if (!isSupportedCharset(responseData.getCharSet())) { 144 | responseData.setCharSet(Constants.UTF_8); 145 | } 146 | } 147 | 148 | protected String loadCharset(final InputStream inputStream, final int preloadSize) { 149 | BufferedInputStream bis = null; 150 | String encoding = null; 151 | try { 152 | bis = new BufferedInputStream(inputStream); 153 | final byte[] buffer = new byte[preloadSize]; 154 | final int size = bis.read(buffer); 155 | if (size != -1) { 156 | final String content = new String(buffer, 0, size); 157 | encoding = parseCharset(content); 158 | } 159 | } catch (final IOException e) { 160 | throw new CrawlingAccessException("Could not load a content.", e); 161 | } 162 | 163 | try { 164 | final EncodingHelper encodingHelper = SingletonLaContainer.getComponent(EncodingHelper.class); 165 | encoding = encodingHelper.normalize(encoding); 166 | } catch (final 
Exception e) { 167 | // NOP 168 | } 169 | 170 | return encoding; 171 | } 172 | 173 | @Override 174 | protected void storeData(final ResponseData responseData, final ResultData resultData) { 175 | File file = null; 176 | try { 177 | final ScrapingRule scrapingRule = riverConfigLocal.get().getScrapingRule(responseData); 178 | if (scrapingRule == null) { 179 | logger.info("Skip Scraping: " + responseData.getUrl()); 180 | return; 181 | } 182 | 183 | file = File.createTempFile("river-web-", ".tmp"); 184 | CopyUtil.copy(responseData.getResponseBody(), file); 185 | processData(scrapingRule, file, responseData, resultData); 186 | } catch (final IOException e) { 187 | throw new CrawlingAccessException("Failed to create a temp file.", e); 188 | } finally { 189 | if (file != null && !file.delete()) { 190 | logger.warn("Failed to delete " + file.getAbsolutePath()); 191 | } 192 | } 193 | } 194 | 195 | protected void processData(final ScrapingRule scrapingRule, final File file, final ResponseData responseData, 196 | final ResultData resultData) { 197 | final Map> scrapingRuleMap = scrapingRule.getRuleMap(); 198 | 199 | org.jsoup.nodes.Document document = null; 200 | String charsetName = responseData.getCharSet(); 201 | if (charsetName == null) { 202 | charsetName = Constants.UTF_8; 203 | } 204 | 205 | final Boolean isHtmlParsed = scrapingRule.getSetting("html", Boolean.TRUE); 206 | if (isHtmlParsed.booleanValue()) { 207 | try (InputStream is = new BufferedInputStream(new FileInputStream(file))) { 208 | document = Jsoup.parse(is, charsetName, responseData.getUrl()); 209 | } catch (final IOException e) { 210 | throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e); 211 | } 212 | } 213 | 214 | final Map dataMap = new LinkedHashMap(); 215 | BeanUtil.copyBeanToMap(responseData, dataMap, op -> { 216 | op.include(copiedResonseDataFields).excludeNull().excludeWhitespace(); 217 | }); 218 | if (logger.isDebugEnabled()) { 219 | logger.debug("ruleMap: " + scrapingRuleMap); 220 | logger.debug("dataMap: " + dataMap); 221 | } 222 | for (final Map.Entry> entry : scrapingRuleMap.entrySet()) { 223 | final String propName = entry.getKey(); 224 | final Map params = entry.getValue(); 225 | final boolean isTrimSpaces = SettingsUtils.get(params, TRIM_SPACES_PROP_NAME, Boolean.FALSE).booleanValue(); 226 | boolean isArray = SettingsUtils.get(params, IS_ARRAY_PROP_NAME, Boolean.FALSE).booleanValue(); 227 | boolean isChildUrl = SettingsUtils.get(params, IS_CHILD_URL_PROP_NAME, Boolean.FALSE).booleanValue(); 228 | boolean isDistinct = SettingsUtils.get(params, IS_DISTINCT_PROP_NAME, Boolean.FALSE).booleanValue(); 229 | 230 | final List strList = new ArrayList(); 231 | 232 | final Object value = SettingsUtils.get(params, VALUE_QUERY_TYPE, null); 233 | final String type = SettingsUtils.get(params, TYPE_QUERY_TYPE, null); 234 | if (value != null) { 235 | if (value instanceof String) { 236 | strList.add(trimSpaces(value.toString(), isTrimSpaces)); 237 | } else if (value instanceof List) { 238 | @SuppressWarnings("unchecked") 239 | final List list = (List) value; 240 | for (final Object obj : list) { 241 | strList.add(trimSpaces(obj.toString(), isTrimSpaces)); 242 | } 243 | } 244 | } else if ("data".equals(type) || "attachment".equals(type)) { 245 | final long maxFileSize = SettingsUtils.get(params, "maxFileSize", DEFAULT_MAX_ATTACHMENT_SIZE); 246 | final long fileSize = file.length(); 247 | if (fileSize <= maxFileSize) { 248 | strList.add(Base64Util.encode(FileUtil.readBytes(file))); 249 | isArray = false; 
250 | isChildUrl = false; 251 | isDistinct = false; 252 | } else { 253 | logger.info("The max file size(" + fileSize + "/" + maxFileSize + " is exceeded: " + responseData.getUrl()); 254 | } 255 | } else if ("source".equals(type)) { 256 | try { 257 | strList.add(trimSpaces(FileUtil.readText(file, charsetName), isTrimSpaces)); 258 | } catch (Exception e) { 259 | logger.warn("Failed to read type:source from " + responseData.getUrl(), e); 260 | } 261 | } else if (document != null) { 262 | processCssQuery(document, propName, params, isTrimSpaces, strList); 263 | } 264 | 265 | Object propertyValue; 266 | final ScriptInfo scriptInfo = getScriptValue(params); 267 | if (isDistinct) { 268 | final Set strSet = new HashSet<>(); 269 | final List distinctList = strList.stream().filter(s -> strSet.add(s) && (!isTrimSpaces || StringUtil.isNotBlank(s))) 270 | .collect(Collectors.toList()); 271 | strList.clear(); 272 | strList.addAll(distinctList); 273 | } 274 | if (scriptInfo == null) { 275 | propertyValue = isArray ? strList : String.join(" ", strList); 276 | } else { 277 | final Map vars = new HashMap(); 278 | vars.put("container", SingletonLaContainerFactory.getContainer()); 279 | vars.put("client", esClient); 280 | vars.put("data", responseData); 281 | vars.put("result", resultData); 282 | vars.put("property", propName); 283 | vars.put("parameters", params); 284 | vars.put("array", isArray); 285 | vars.put("list", strList); 286 | if (isArray) { 287 | final List list = new ArrayList(); 288 | for (int i = 0; i < strList.size(); i++) { 289 | final Map localVars = new HashMap(vars); 290 | localVars.put("index", i); 291 | localVars.put("value", String.join(" ", strList)); 292 | list.add(executeScript(scriptInfo.getLang(), scriptInfo.getScript(), scriptInfo.getScriptType(), localVars)); 293 | } 294 | propertyValue = list; 295 | } else { 296 | vars.put("value", String.join(" ", strList)); 297 | propertyValue = executeScript(scriptInfo.getLang(), scriptInfo.getScript(), scriptInfo.getScriptType(), vars); 298 | } 299 | } 300 | addPropertyData(dataMap, propName, propertyValue); 301 | if (isChildUrl) { 302 | Set childUrlSet = childUrlSetLocal.get(); 303 | if (childUrlSet == null) { 304 | childUrlSet = new HashSet(); 305 | childUrlSetLocal.set(childUrlSet); 306 | } 307 | if (propertyValue instanceof String) { 308 | final String str = (String) propertyValue; 309 | if (StringUtil.isNotBlank(str)) { 310 | childUrlSet.add(str); 311 | } 312 | } else if (propertyValue instanceof List) { 313 | @SuppressWarnings("unchecked") 314 | final List list = (List) propertyValue; 315 | for (final Object obj : list) { 316 | final String str = obj.toString(); 317 | if (StringUtil.isNotBlank(str)) { 318 | childUrlSet.add(str); 319 | } 320 | } 321 | } 322 | } 323 | } 324 | 325 | storeIndex(responseData, dataMap); 326 | } 327 | 328 | private Object executeScript(final String lang, final String script, final String scriptTypeValue, final Map vars) { 329 | ScriptType scriptType; 330 | if (ScriptType.FILE.toString().equalsIgnoreCase(scriptTypeValue)) { 331 | scriptType = ScriptType.FILE; 332 | } else if (ScriptType.INDEXED.toString().equalsIgnoreCase(scriptTypeValue)) { 333 | scriptType = ScriptType.INDEXED; 334 | } else { 335 | scriptType = ScriptType.INLINE; 336 | } 337 | vars.put("logger", logger); 338 | final ScriptService scriptService = SingletonLaContainer.getComponent(ScriptService.class); 339 | return scriptService.execute(lang, script, scriptType, vars); 340 | } 341 | 342 | protected ScriptInfo getScriptValue(final Map params) { 
343 | final Object value = SettingsUtils.get(params, SCRIPT_QUERY_TYPE, null); 344 | if (value == null) { 345 | return null; 346 | } else if (value instanceof String) { 347 | return new ScriptInfo(value.toString()); 348 | } else if (value instanceof List) { 349 | @SuppressWarnings("unchecked") 350 | final List list = (List) value; 351 | return new ScriptInfo(String.join("", list)); 352 | } else if (value instanceof Map) { 353 | @SuppressWarnings("unchecked") 354 | final Map scriptMap = (Map) value; 355 | final String script = SettingsUtils.get(scriptMap, SCRIPT_QUERY_TYPE); 356 | if (script == null) { 357 | return null; 358 | } 359 | return new ScriptInfo(script, SettingsUtils.get(scriptMap, "lang", WebRiverConstants.DEFAULT_SCRIPT_LANG), 360 | SettingsUtils.get(scriptMap, "script_type", "inline")); 361 | } 362 | return null; 363 | } 364 | 365 | private static class ScriptInfo { 366 | private final String script; 367 | 368 | private final String lang; 369 | 370 | private final String scriptType; 371 | 372 | ScriptInfo(final String script) { 373 | this(script, WebRiverConstants.DEFAULT_SCRIPT_LANG, "inline"); 374 | } 375 | 376 | ScriptInfo(final String script, final String lang, final String scriptType) { 377 | this.script = script; 378 | this.lang = lang; 379 | this.scriptType = scriptType; 380 | } 381 | 382 | public String getScript() { 383 | return script; 384 | } 385 | 386 | public String getLang() { 387 | return lang; 388 | } 389 | 390 | public String getScriptType() { 391 | return scriptType; 392 | } 393 | } 394 | 395 | protected void processCssQuery(final org.jsoup.nodes.Document document, final String propName, final Map params, 396 | final boolean isTrimSpaces, final List strList) { 397 | for (final String queryType : queryTypes) { 398 | final Object queryObj = SettingsUtils.get(params, queryType, null); 399 | Element[] elements = null; 400 | if (queryObj instanceof String) { 401 | elements = getElements(new Element[] { document }, queryObj.toString()); 402 | } else if (queryObj instanceof List) { 403 | @SuppressWarnings("unchecked") 404 | final List queryList = (List) queryObj; 405 | elements = getElements(new Element[] { document }, queryList, propName.startsWith(ARRAY_PROPERTY_PREFIX)); 406 | } 407 | if (elements != null) { 408 | for (final Element element : elements) { 409 | if (element == null) { 410 | strList.add(null); 411 | } else { 412 | final List argList = SettingsUtils.get(params, ARGS_QUERY_TYPE, Collections.emptyList()); 413 | try { 414 | final Method queryMethod = getQueryMethod(element, queryType, argList); 415 | strList.add(trimSpaces( 416 | (String) MethodUtil.invoke(queryMethod, element, argList.toArray(new Object[argList.size()])), 417 | isTrimSpaces)); 418 | } catch (final Exception e) { 419 | logger.warn("Could not invoke " + queryType + " on " + element, e); 420 | strList.add(null); 421 | } 422 | } 423 | } 424 | break; 425 | } 426 | } 427 | } 428 | 429 | protected Method getQueryMethod(final Element element, final String queryType, final List argList) { 430 | final BeanDesc elementDesc = BeanDescFactory.getBeanDesc(element.getClass()); 431 | if (argList == null || argList.isEmpty()) { 432 | return elementDesc.getMethodDesc(queryType).getMethod(); 433 | } else { 434 | final Class[] paramTypes = new Class[argList.size()]; 435 | for (int i = 0; i < paramTypes.length; i++) { 436 | paramTypes[i] = String.class; 437 | } 438 | return elementDesc.getMethodDesc(queryType, paramTypes).getMethod(); 439 | } 440 | } 441 | 442 | protected Element[] getElements(final 
Element[] elements, final List queries, final boolean isArrayProperty) { 443 | Element[] targets = elements; 444 | for (final String query : queries) { 445 | final List elementList = new ArrayList(); 446 | for (final Element element : targets) { 447 | if (element == null) { 448 | elementList.add(null); 449 | } else { 450 | final Element[] childElements = getElements(new Element[] { element }, query); 451 | if (childElements.length == 0 && isArrayProperty) { 452 | elementList.add(null); 453 | } else { 454 | for (final Element childElement : childElements) { 455 | elementList.add(childElement); 456 | } 457 | } 458 | } 459 | } 460 | targets = elementList.toArray(new Element[elementList.size()]); 461 | } 462 | return targets; 463 | } 464 | 465 | protected Element[] getElements(final Element[] elements, final String query) { 466 | Element[] targets = elements; 467 | final Pattern pattern = Pattern.compile(":eq\\(([0-9]+)\\)|:lt\\(([0-9]+)\\)|:gt\\(([0-9]+)\\)"); 468 | final Matcher matcher = pattern.matcher(query); 469 | final StringBuffer buf = new StringBuffer(); 470 | while (matcher.find()) { 471 | final String value = matcher.group(); 472 | matcher.appendReplacement(buf, ""); 473 | if (buf.charAt(buf.length() - 1) != ' ') { 474 | try { 475 | final int index = Integer.parseInt(matcher.group(1)); 476 | final List elementList = new ArrayList(); 477 | final String childQuery = buf.toString(); 478 | for (final Element element : targets) { 479 | final Elements childElements = element.select(childQuery); 480 | if (value.startsWith(":eq")) { 481 | if (index < childElements.size()) { 482 | elementList.add(childElements.get(index)); 483 | } 484 | } else if (value.startsWith(":lt")) { 485 | for (int i = 0; i < childElements.size() && i < index; i++) { 486 | elementList.add(childElements.get(i)); 487 | } 488 | } else if (value.startsWith(":gt")) { 489 | for (int i = index + 1; i < childElements.size(); i++) { 490 | elementList.add(childElements.get(i)); 491 | } 492 | } 493 | } 494 | targets = elementList.toArray(new Element[elementList.size()]); 495 | buf.setLength(0); 496 | } catch (final NumberFormatException e) { 497 | logger.warn("Invalid number: " + query, e); 498 | buf.append(value); 499 | } 500 | } else { 501 | buf.append(value); 502 | } 503 | } 504 | matcher.appendTail(buf); 505 | final String lastQuery = buf.toString(); 506 | if (StringUtil.isNotBlank(lastQuery)) { 507 | final List elementList = new ArrayList(); 508 | for (final Element element : targets) { 509 | if (element == null) { 510 | elementList.add(null); 511 | } else { 512 | final Elements childElements = element.select(lastQuery); 513 | for (int i = 0; i < childElements.size(); i++) { 514 | elementList.add(childElements.get(i)); 515 | } 516 | } 517 | } 518 | targets = elementList.toArray(new Element[elementList.size()]); 519 | } 520 | return targets; 521 | } 522 | 523 | protected String trimSpaces(final String value, final boolean trimSpaces) { 524 | if (value == null) { 525 | return null; 526 | } 527 | if (trimSpaces) { 528 | return value.replaceAll("\\s+", " ").trim(); 529 | } 530 | return value; 531 | } 532 | 533 | protected void addPropertyData(final Map dataMap, final String key, final Object value) { 534 | Map currentDataMap = dataMap; 535 | final String[] keys = key.split("\\."); 536 | for (int i = 0; i < keys.length - 1; i++) { 537 | final String currentKey = keys[i]; 538 | @SuppressWarnings("unchecked") 539 | Map map = (Map) currentDataMap.get(currentKey); 540 | if (map == null) { 541 | map = new LinkedHashMap(); 542 | 
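                    // addPropertyData walks dotted property names segment by segment, creating an
                    // intermediate LinkedHashMap (as here) whenever a segment is missing, so the
                    // property name "section1.title" ends up as {"section1": {"title": <value>}}
                    // in the data map.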
currentDataMap.put(currentKey, map); 543 | } 544 | currentDataMap = map; 545 | } 546 | currentDataMap.put(keys[keys.length - 1], value); 547 | } 548 | 549 | protected void storeIndex(final ResponseData responseData, final Map dataMap) { 550 | final String sessionId = responseData.getSessionId(); 551 | final RiverConfig riverConfig = riverConfigLocal.get(); 552 | final String indexName = riverConfig.getIndex(); 553 | final String typeName = riverConfig.getType(); 554 | final boolean overwrite = riverConfig.isOverwrite(); 555 | 556 | if (logger.isDebugEnabled()) { 557 | logger.debug("Index: " + indexName + ", sessionId: " + sessionId + ", Data: " + dataMap); 558 | } 559 | 560 | if (overwrite) { 561 | final int count = esClient.deleteByQuery(indexName, typeName, QueryBuilders.termQuery("url", responseData.getUrl())); 562 | if (count > 0) { 563 | esClient.admin().indices().prepareRefresh(indexName).execute().actionGet(); 564 | } 565 | } 566 | 567 | @SuppressWarnings("unchecked") 568 | final Map arrayDataMap = (Map) dataMap.remove(ARRAY_PROPERTY_PREFIX); 569 | if (arrayDataMap != null) { 570 | final Map flatArrayDataMap = new LinkedHashMap(); 571 | convertFlatMap("", arrayDataMap, flatArrayDataMap); 572 | int maxSize = 0; 573 | for (final Map.Entry entry : flatArrayDataMap.entrySet()) { 574 | final Object value = entry.getValue(); 575 | if (value instanceof List) { 576 | @SuppressWarnings("rawtypes") 577 | final int size = ((List) value).size(); 578 | if (size > maxSize) { 579 | maxSize = size; 580 | } 581 | } 582 | } 583 | for (int i = 0; i < maxSize; i++) { 584 | final Map newDataMap = new LinkedHashMap(); 585 | newDataMap.put(POSITION_FIELD, i); 586 | deepCopy(dataMap, newDataMap); 587 | for (final Map.Entry entry : flatArrayDataMap.entrySet()) { 588 | final Object value = entry.getValue(); 589 | if (value instanceof List) { 590 | @SuppressWarnings("unchecked") 591 | final List list = (List) value; 592 | if (i < list.size()) { 593 | addPropertyData(newDataMap, entry.getKey(), list.get(i)); 594 | } 595 | } else if (i == 0) { 596 | addPropertyData(newDataMap, entry.getKey(), value); 597 | } 598 | } 599 | storeIndex(indexName, typeName, newDataMap); 600 | } 601 | } else { 602 | storeIndex(indexName, typeName, dataMap); 603 | } 604 | } 605 | 606 | protected void storeIndex(final String indexName, final String typeName, final Map dataMap) { 607 | dataMap.put(TIMESTAMP_FIELD, new Date()); 608 | 609 | if (logger.isDebugEnabled()) { 610 | logger.debug(indexName + "/" + typeName + " : dataMap" + dataMap); 611 | } 612 | 613 | try { 614 | esClient.prepareIndex(indexName, typeName).setRefresh(true).setSource(jsonBuilder().value(dataMap)).execute().actionGet(); 615 | } catch (final Exception e) { 616 | logger.warn("Could not write a content into index.", e); 617 | } 618 | } 619 | 620 | protected void deepCopy(final Map oldMap, final Map newMap) { 621 | final Map flatMap = new LinkedHashMap(); 622 | convertFlatMap("", oldMap, flatMap); 623 | for (final Map.Entry entry : flatMap.entrySet()) { 624 | addPropertyData(newMap, entry.getKey(), entry.getValue()); 625 | } 626 | } 627 | 628 | @SuppressWarnings("unchecked") 629 | protected void convertFlatMap(final String prefix, final Map oldMap, final Map newMap) { 630 | for (final Map.Entry entry : oldMap.entrySet()) { 631 | final Object value = entry.getValue(); 632 | if (value instanceof Map) { 633 | convertFlatMap(prefix + entry.getKey() + ".", (Map) value, newMap); 634 | } else { 635 | newMap.put(prefix + entry.getKey(), value); 636 | } 637 | } 638 | } 639 | 
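    // Note on storeIndex(ResponseData, Map) above: convertFlatMap turns the nested maps
    // built by addPropertyData back into dotted keys, and storeIndex uses it to expand
    // the entry stored under ARRAY_PROPERTY_PREFIX into one document per array element,
    // copying the scalar fields into each document and numbering them with POSITION_FIELD
    // (storeIndex(String, String, Map) then adds a timestamp before indexing). A hedged
    // example, assuming ARRAY_PROPERTY_PREFIX is "array" and POSITION_FIELD is "position":
    //
    //   dataMap = {"title": "Fess", "array": {"menus": ["Home", "Install"]}}
    //   indexed -> {"position": 0, "title": "Fess", "menus": "Home"}
    //              {"position": 1, "title": "Fess", "menus": "Install"}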
640 | @Override 641 | protected void storeChildUrls(final ResponseData responseData, final ResultData resultData) { 642 | final Set childLinkSet = childUrlSetLocal.get(); 643 | if (childLinkSet != null) { 644 | List requestDataList = convertChildUrlList(childLinkSet.stream().filter(u -> StringUtil.isNotBlank(u)) 645 | .map(u -> RequestDataBuilder.newRequestData().get().url(u).build()).collect(Collectors.toList())); 646 | resultData.addAllUrl(requestDataList); 647 | 648 | final RequestData requestData = responseData.getRequestData(); 649 | resultData.removeUrl(requestData); 650 | resultData.removeUrl(getDuplicateUrl(requestData)); 651 | } else { 652 | super.storeChildUrls(responseData, resultData); 653 | } 654 | } 655 | 656 | /** 657 | * Returns data as XML content of String. 658 | * 659 | * @return XML content of String. 660 | */ 661 | @Override 662 | public Object getData(final AccessResultData accessResultData) { 663 | return null; 664 | } 665 | 666 | } 667 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/util/ConfigProperties.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb.util; 2 | 3 | import java.util.stream.Stream; 4 | 5 | import org.codelibs.core.lang.StringUtil; 6 | import org.codelibs.core.misc.DynamicProperties; 7 | 8 | public class ConfigProperties extends DynamicProperties { 9 | 10 | private static final long serialVersionUID = 1L; 11 | 12 | public ConfigProperties(String path) { 13 | super(path); 14 | } 15 | 16 | public String getElasticsearchClusterName(final String clusterName) { 17 | return clusterName == null ? getProperty("elasticsearch.cluster.name", "elasticsearch") : clusterName; 18 | } 19 | 20 | public String[] getElasticsearchHosts(String esHosts) { 21 | return Stream.of((esHosts == null ? 
getProperty("elasticsearch.hosts", "localhost") : esHosts).split(",")).map(host -> host.trim()) 22 | .toArray(n -> new String[n]); 23 | } 24 | 25 | public String getConfigIndex() { 26 | return getProperty("config.index"); 27 | } 28 | 29 | public String getConfigType() { 30 | return getProperty("config.type"); 31 | } 32 | 33 | public String getQueueType() { 34 | return getProperty("queue.type"); 35 | } 36 | 37 | public boolean isRobotsTxtEnabled() { 38 | return Boolean.valueOf(getProperty("robots.txt.enabled", Boolean.TRUE.toString())); 39 | } 40 | 41 | public boolean isRedirectsEnabled() { 42 | return Boolean.valueOf(getProperty("auto.redirect.enabled", Boolean.FALSE.toString())); 43 | } 44 | 45 | public Integer getConnectionTimeout() { 46 | String value = getProperty("timeout.connection"); 47 | if (StringUtil.isNotBlank(value)) { 48 | return Integer.valueOf(value); 49 | } 50 | return null; 51 | } 52 | 53 | public Integer getSoTimeout() { 54 | String value = getProperty("timeout.socket"); 55 | if (StringUtil.isNotBlank(value)) { 56 | return Integer.valueOf(value); 57 | } 58 | return null; 59 | } 60 | 61 | public int getQueueParsingSize() { 62 | String value = getProperty("queue.parsing.size"); 63 | if (StringUtil.isNotBlank(value)) { 64 | return Integer.parseInt(value); 65 | } 66 | return 20; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/util/ConversionUtil.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb.util; 2 | 3 | import java.text.ParseException; 4 | import java.text.SimpleDateFormat; 5 | import java.util.Date; 6 | import java.util.TimeZone; 7 | 8 | public class ConversionUtil { 9 | 10 | public static final String ISO_DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"; 11 | 12 | public static final TimeZone TIMEZONE_UTC = TimeZone.getTimeZone("UTC"); 13 | 14 | public static T convert(Object value, Class clazz) { 15 | if (value instanceof CharSequence) { 16 | final String text = value.toString(); 17 | if (clazz.isAssignableFrom(Integer.class)) { 18 | return (T) Integer.valueOf(text); 19 | } else if (clazz.isAssignableFrom(Long.class)) { 20 | return (T) Long.valueOf(text); 21 | } else if (clazz.isAssignableFrom(Date.class)) { 22 | return (T) parseDate(text); 23 | } else if (clazz.isAssignableFrom(String.class)) { 24 | return (T) text; 25 | } 26 | } else if (value instanceof Number) { 27 | final Number v = (Number) value; 28 | if (clazz.isAssignableFrom(String.class)) { 29 | return (T) v.toString(); 30 | } else if (clazz.isAssignableFrom(Integer.class)) { 31 | return (T) Integer.valueOf(v.intValue()); 32 | } else if (clazz.isAssignableFrom(Long.class)) { 33 | return (T) Long.valueOf(v.intValue()); 34 | } else if (clazz.isAssignableFrom(Date.class)) { 35 | return (T) new Date(v.longValue()); 36 | } 37 | } else if (value instanceof Date) { 38 | final Date d = (Date) value; 39 | if (clazz.isAssignableFrom(String.class)) { 40 | return (T) formatDate(d); 41 | } else if (clazz.isAssignableFrom(Integer.class)) { 42 | return (T) Integer.valueOf((int) d.getTime()); 43 | } else if (clazz.isAssignableFrom(Long.class)) { 44 | return (T) Long.valueOf(d.getTime()); 45 | } else if (clazz.isAssignableFrom(Date.class)) { 46 | return (T) d; 47 | } 48 | } 49 | return null; 50 | } 51 | 52 | public static String formatDate(final Date date) { 53 | if (date == null) { 54 | return null; 55 | } 56 | 57 | final SimpleDateFormat sdf = new 
SimpleDateFormat(ISO_DATETIME_FORMAT); 58 | sdf.setTimeZone(TIMEZONE_UTC); 59 | return sdf.format(date); 60 | } 61 | 62 | public static Date parseDate(final String value) { 63 | if (value == null) { 64 | return null; 65 | } 66 | try { 67 | final SimpleDateFormat sdf = new SimpleDateFormat(ISO_DATETIME_FORMAT); 68 | sdf.setTimeZone(TIMEZONE_UTC); 69 | return sdf.parse(value); 70 | } catch (final ParseException e) { 71 | return null; 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/util/ScriptUtils.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb.util; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | import java.util.function.Consumer; 6 | 7 | import org.codelibs.core.lang.StringUtil; 8 | import org.codelibs.riverweb.WebRiverConstants; 9 | import org.codelibs.riverweb.app.service.ScriptService; 10 | import org.elasticsearch.script.ScriptService.ScriptType; 11 | import org.lastaflute.di.core.SingletonLaContainer; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | public class ScriptUtils { 16 | public static final Logger logger = LoggerFactory.getLogger(ScriptUtils.class); 17 | 18 | private ScriptUtils() { 19 | // nothing 20 | } 21 | 22 | public static Object execute(final Map scriptSettings, final String target, final Consumer> vars) { 23 | final String script = SettingsUtils.get(scriptSettings, target); 24 | final String lang = SettingsUtils.get(scriptSettings, "lang", WebRiverConstants.DEFAULT_SCRIPT_LANG); 25 | final String scriptTypeValue = SettingsUtils.get(scriptSettings, "script_type", "inline"); 26 | ScriptType scriptType; 27 | if (ScriptType.FILE.toString().equalsIgnoreCase(scriptTypeValue)) { 28 | scriptType = ScriptType.FILE; 29 | } else if (ScriptType.INDEXED.toString().equalsIgnoreCase(scriptTypeValue)) { 30 | scriptType = ScriptType.INDEXED; 31 | } else { 32 | scriptType = ScriptType.INLINE; 33 | } 34 | if (StringUtil.isNotBlank(script)) { 35 | final Map localVars = new HashMap(); 36 | vars.accept(localVars); 37 | try { 38 | final ScriptService scriptService = SingletonLaContainer.getComponent(ScriptService.class); 39 | final Object result = scriptService.execute(lang, script, scriptType, localVars); 40 | if (logger.isDebugEnabled()) { 41 | logger.debug("[{}] \"{}\" => {}", target, script, result); 42 | } 43 | return result; 44 | } catch (final Exception e) { 45 | logger.warn("Failed to execute script: " + script, e); 46 | } 47 | } 48 | return null; 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/org/codelibs/riverweb/util/SettingsUtils.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb.util; 2 | 3 | import java.util.Map; 4 | 5 | public final class SettingsUtils { 6 | private SettingsUtils() { 7 | } 8 | 9 | public static T get(final Map settings, final String key) { 10 | return get(settings, key, null); 11 | } 12 | 13 | @SuppressWarnings("unchecked") 14 | public static T get(final Map settings, final String key, final T defaultValue) { 15 | if (settings != null) { 16 | final V value = settings.get(key); 17 | if (value instanceof Number) { 18 | if (defaultValue instanceof Integer) { 19 | return (T) Integer.valueOf(((Number) value).intValue()); 20 | } else if (defaultValue instanceof Long) { 21 | return (T) Long.valueOf(((Number) 
value).longValue()); 22 | } else if (defaultValue instanceof Float) { 23 | return (T) Float.valueOf(((Number) value).floatValue()); 24 | } else if (defaultValue instanceof Double) { 25 | return (T) Double.valueOf(((Number) value).doubleValue()); 26 | } else { 27 | return (T) value; 28 | } 29 | } else if (value != null) { 30 | return (T) value; 31 | } 32 | } 33 | return defaultValue; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/resources/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codelibs/elasticsearch-river-web/63e41a5b49a10f96629c40f9354e1dabf0144bdc/src/main/resources/.gitkeep -------------------------------------------------------------------------------- /src/main/resources/app.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; River Web/${project.version})" 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/main/resources/config.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | org.codelibs.core.io.ResourceUtil.getResourceAsFile("riverweb.properties").getAbsolutePath() 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/main/resources/crawler/interval+.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/main/resources/crawler/rule+.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 10 | 11 | sitemapsRule 12 | 13 | 14 | scrapingRule 15 | 16 | 17 | 18 | 19 | "sitemapsRule" 20 | 21 | 23 | 24 | 25 | 26 | "url" 27 | ".*sitemap.*" 28 | 29 | 30 | 31 | 32 | 33 | "scraping" 34 | true 35 | 36 | 38 | scrapingTransformer 39 | (int[])[200] 40 | (int[])[304] 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/main/resources/crawler/transformer+.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | "http://xml.org/sax/features/namespaces" 10 | "false" 11 | 12 | 13 | 14 | 17 | "scrapingTransformer" 18 | scrapingFeatureMap 19 | defaultPropertyMap 20 | allChildUrlRuleMap 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/main/resources/crawler_es+crawlerThread.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/main/resources/lasta_di.properties: -------------------------------------------------------------------------------- 1 | # _/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/ 2 | # Lasta Di properties, you can set container's options 3 | # _/_/_/_/_/_/_/_/_/_/ 4 | 5 | # location of smart-deploy mode e.g. maihama_env.properties: lasta_di.smart.deploy.mode 6 | smart.deploy.mode.location = riverweb.properties: lasta_di.smart.deploy.mode 7 | 8 | # package for smart deploy target e.g. 
org.docksidestage.app 9 | smart.package1 = org.codelibs.fess.crawler 10 | smart.package1 = org.codelibs.riverweb.app 11 | -------------------------------------------------------------------------------- /src/main/resources/riverweb.properties: -------------------------------------------------------------------------------- 1 | lasta_di.smart.deploy.mode = warm 2 | 3 | # Elasticsearch 4 | elasticsearch.hosts=localhost:9300 5 | 6 | # Config Index 7 | config.index=.river_web 8 | config.type=config 9 | queue.type=queue 10 | 11 | -------------------------------------------------------------------------------- /src/test/java/org/codelibs/riverweb/RiverWebTest.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb; 2 | 3 | import static org.codelibs.elasticsearch.runner.ElasticsearchClusterRunner.newConfigs; 4 | 5 | import java.util.UUID; 6 | import java.util.function.IntConsumer; 7 | 8 | import junit.framework.TestCase; 9 | 10 | import org.codelibs.elasticsearch.runner.ElasticsearchClusterRunner; 11 | import org.codelibs.riverweb.RiverWeb; 12 | import org.elasticsearch.action.index.IndexResponse; 13 | import org.elasticsearch.action.search.SearchResponse; 14 | import org.elasticsearch.common.settings.Settings.Builder; 15 | import org.elasticsearch.index.query.QueryBuilders; 16 | 17 | public class RiverWebTest extends TestCase { 18 | 19 | private ElasticsearchClusterRunner runner; 20 | 21 | private int numOfNode = 2; 22 | 23 | private String clusterName; 24 | 25 | @Override 26 | protected void setUp() throws Exception { 27 | // create runner instance 28 | clusterName = "es-river-web-" + UUID.randomUUID().toString(); 29 | runner = new ElasticsearchClusterRunner(); 30 | runner.onBuild(new ElasticsearchClusterRunner.Builder() { 31 | @Override 32 | public void build(final int number, final Builder settingsBuilder) { 33 | settingsBuilder.put("http.cors.enabled", true); 34 | settingsBuilder.put("http.cors.allow-origin", "*"); 35 | settingsBuilder.put("index.number_of_shards", 3); 36 | settingsBuilder.put("index.number_of_replicas", 0); 37 | settingsBuilder.putArray("discovery.zen.ping.unicast.hosts", "localhost:9301-9310"); 38 | settingsBuilder.put("index.unassigned.node_left.delayed_timeout", "0"); 39 | settingsBuilder.put("network.host", "0"); 40 | } 41 | }).build(newConfigs().clusterName(clusterName).numOfNode(numOfNode)); 42 | 43 | // wait for yellow status 44 | runner.ensureYellow(); 45 | } 46 | 47 | @Override 48 | protected void tearDown() throws Exception { 49 | // close runner 50 | runner.close(); 51 | // delete all files 52 | runner.clean(); 53 | } 54 | 55 | public void test_basic() throws Exception { 56 | 57 | RiverWeb.exitMethod = new IntConsumer() { 58 | @Override 59 | public void accept(final int value) { 60 | if (value != 0) { 61 | fail(); 62 | } 63 | } 64 | }; 65 | 66 | final String index = "webindex"; 67 | final String type = "my_web"; 68 | final String riverWebIndex = ".river_web"; 69 | final String riverWebType = "config"; 70 | 71 | // create an index 72 | runner.createIndex(index, null); 73 | runner.ensureYellow(index); 74 | 75 | // create a mapping 76 | final String mappingSource = 77 | 
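        // The escaped string below, unescaped for readability: dynamic templates that map
        // the url/method/charSet/mimeType fields to stored, not_analyzed strings.
        //
        //   {"my_web": {"dynamic_templates": [
        //     {"url":      {"match": "url",      "mapping": {"type": "string", "store": "yes", "index": "not_analyzed"}}},
        //     {"method":   {"match": "method",   "mapping": {"type": "string", "store": "yes", "index": "not_analyzed"}}},
        //     {"charSet":  {"match": "charSet",  "mapping": {"type": "string", "store": "yes", "index": "not_analyzed"}}},
        //     {"mimeType": {"match": "mimeType", "mapping": {"type": "string", "store": "yes", "index": "not_analyzed"}}}
        //   ]}}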
"{\"my_web\":{\"dynamic_templates\":[{\"url\":{\"match\":\"url\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}},{\"method\":{\"match\":\"method\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}},{\"charSet\":{\"match\":\"charSet\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}},{\"mimeType\":{\"match\":\"mimeType\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}}]}}"; 78 | runner.createMapping(index, type, mappingSource); 79 | 80 | if (!runner.indexExists(index)) { 81 | fail(); 82 | } 83 | 84 | String config = type; 85 | { 86 | final String riverWebSource = "{\"index\":\"" + index 87 | + "\",\"urls\":[\"http://www.codelibs.org/\",\"http://fess.codelibs.org/\"]" 88 | + ",\"include_urls\":[\"http://www.codelibs.org/.*\",\"http://fess.codelibs.org/.*\"]" 89 | + ",\"exclude_urls\":[\".*\\\\.txt\",\".*\\\\.png\",\".*\\\\.gif\",\".*\\\\.js\",\".*\\\\.css\"]" 90 | + ",\"max_depth\":5,\"max_access_count\":100,\"num_of_thread\":5,\"interval\":1000" 91 | + ",\"target\":[{\"pattern\":{\"url\":\"http://www.codelibs.org/.*\",\"mimeType\":\"text/html\"}" 92 | + ",\"properties\":{\"title\":{\"text\":\"title\"},\"body\":{\"text\":\"body\"},\"bodyAsHtml\":{\"html\":\"body\"},\"projects\":{\"text\":\"ul.nav-listlia\",\"is_array\":true}}}" 93 | + ",{\"pattern\":{\"url\":\"http://fess.codelibs.org/.*\",\"mimeType\":\"text/html\"}" 94 | + ",\"properties\":{\"title\":{\"text\":\"title\"},\"body\":{\"text\":\"body\",\"trim_spaces\":true},\"menus\":{\"text\":\"ul.nav-listlia\",\"is_array\":true}}}]}"; 95 | final IndexResponse response = runner.insert(riverWebIndex, riverWebType, config, riverWebSource); 96 | if (!response.isCreated()) { 97 | fail(); 98 | } 99 | } 100 | 101 | RiverWeb.main(new String[] { "--config-id", config, "--es-hosts", "localhost:" + runner.node().settings().get("transport.tcp.port"), 102 | "--cluster-name", clusterName, "--cleanup" }); 103 | 104 | assertTrue(runner.count(index, type).getHits().getTotalHits() + " >= 100", 105 | runner.count(index, type).getHits().getTotalHits() >= 100); 106 | 107 | runner.ensureYellow(); 108 | } 109 | 110 | public void test_overwrite() throws Exception { 111 | 112 | RiverWeb.exitMethod = new IntConsumer() { 113 | @Override 114 | public void accept(final int value) { 115 | if (value != 0) { 116 | fail(); 117 | } 118 | } 119 | }; 120 | 121 | final String index = "webindex"; 122 | final String type = "my_web"; 123 | final String riverWebIndex = ".river_web"; 124 | final String riverWebType = "config"; 125 | 126 | // create an index 127 | runner.createIndex(index, null); 128 | runner.ensureYellow(index); 129 | 130 | // create a mapping 131 | final String mappingSource = 132 | "{\"my_web\":{\"dynamic_templates\":[{\"url\":{\"match\":\"url\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}},{\"method\":{\"match\":\"method\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}},{\"charSet\":{\"match\":\"charSet\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}},{\"mimeType\":{\"match\":\"mimeType\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}}]}}"; 133 | runner.createMapping(index, type, mappingSource); 134 | 135 | if (!runner.indexExists(index)) { 136 | fail(); 137 | } 138 | 139 | String config = type; 140 | { 141 | final String riverWebSource = "{\"index\":\"" + index + "\",\"type\":\"" + type 142 | + 
"\",\"urls\":[\"http://fess.codelibs.org/\"],\"include_urls\":[\"http://fess.codelibs.org/.*\"],\"max_depth\":1,\"max_access_count\":1,\"num_of_thread\":1,\"interval\":1000,\"overwrite\":true,\"target\":[{\"pattern\":{\"url\":\"http://fess.codelibs.org/.*\",\"mimeType\":\"text/html\"},\"properties\":{\"title\":{\"text\":\"title\"},\"body\":{\"text\":\"body\",\"trim_spaces\":true}}}]}"; 143 | final IndexResponse response = runner.insert(riverWebIndex, riverWebType, config, riverWebSource); 144 | if (!response.isCreated()) { 145 | fail(); 146 | } 147 | } 148 | 149 | RiverWeb.main(new String[] { "--config-id", config, "--es-hosts", "localhost:" + runner.node().settings().get("transport.tcp.port"), 150 | "--cluster-name", clusterName, "--cleanup" }); 151 | assertEquals(1, runner.count(index, type).getHits().getTotalHits()); 152 | SearchResponse response1 = runner.search(index, type, QueryBuilders.termQuery("url", "http://fess.codelibs.org/"), null, 0, 1); 153 | 154 | RiverWeb.main(new String[] { "--config-id", config, "--es-hosts", "localhost:" + runner.node().settings().get("transport.tcp.port"), 155 | "--cluster-name", clusterName, "--cleanup" }); 156 | assertEquals(1, runner.count(index, type).getHits().getTotalHits()); 157 | SearchResponse response2 = runner.search(index, type, QueryBuilders.termQuery("url", "http://fess.codelibs.org/"), null, 0, 1); 158 | 159 | assertFalse(response1.getHits().getHits()[0].getSource().get("@timestamp") 160 | .equals(response2.getHits().getHits()[0].getSource().get("@timestamp"))); 161 | 162 | runner.ensureYellow(); 163 | } 164 | 165 | public void test_incremental() throws Exception { 166 | 167 | RiverWeb.exitMethod = new IntConsumer() { 168 | @Override 169 | public void accept(final int value) { 170 | if (value != 0) { 171 | fail(); 172 | } 173 | } 174 | }; 175 | 176 | final String index = "webindex"; 177 | final String type = "my_web"; 178 | final String riverWebIndex = ".river_web"; 179 | final String riverWebType = "config"; 180 | 181 | // create an index 182 | runner.createIndex(index, null); 183 | runner.ensureYellow(index); 184 | 185 | // create a mapping 186 | final String mappingSource = 187 | "{\"my_web\":{\"dynamic_templates\":[{\"url\":{\"match\":\"url\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}},{\"method\":{\"match\":\"method\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}},{\"charSet\":{\"match\":\"charSet\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}},{\"mimeType\":{\"match\":\"mimeType\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}}]}}"; 188 | runner.createMapping(index, type, mappingSource); 189 | 190 | if (!runner.indexExists(index)) { 191 | fail(); 192 | } 193 | 194 | String config = type; 195 | { 196 | final String riverWebSource = "{\"index\":\"" + index + "\",\"type\":\"" + type 197 | + "\",\"urls\":[\"http://fess.codelibs.org/\"],\"include_urls\":[\"http://fess.codelibs.org/.*\"],\"max_depth\":1,\"max_access_count\":1,\"num_of_thread\":1,\"interval\":1000,\"incremental\":true,\"target\":[{\"pattern\":{\"url\":\"http://fess.codelibs.org/.*\",\"mimeType\":\"text/html\"},\"properties\":{\"title\":{\"text\":\"title\"},\"body\":{\"text\":\"body\",\"trim_spaces\":true}}}]}"; 198 | final IndexResponse response = runner.insert(riverWebIndex, riverWebType, config, riverWebSource); 199 | if (!response.isCreated()) { 200 | fail(); 201 | } 202 | } 203 | 204 | RiverWeb.main(new String[] { "--config-id", 
config, "--es-hosts", "localhost:" + runner.node().settings().get("transport.tcp.port"), 205 | "--cluster-name", clusterName, "--cleanup" }); 206 | assertEquals(1, runner.count(index, type).getHits().getTotalHits()); 207 | SearchResponse response1 = runner.search(index, type, QueryBuilders.termQuery("url", "http://fess.codelibs.org/"), null, 0, 1); 208 | 209 | RiverWeb.main(new String[] { "--config-id", config, "--es-hosts", "localhost:" + runner.node().settings().get("transport.tcp.port"), 210 | "--cluster-name", clusterName, "--cleanup" }); 211 | assertEquals(1, runner.count(index, type).getHits().getTotalHits()); 212 | SearchResponse response2 = runner.search(index, type, QueryBuilders.termQuery("url", "http://fess.codelibs.org/"), null, 0, 1); 213 | 214 | assertEquals(response1.getHits().getHits()[0].getSource().get("@timestamp"), 215 | response2.getHits().getHits()[0].getSource().get("@timestamp")); 216 | 217 | runner.ensureYellow(); 218 | } 219 | 220 | public void test_default() throws Exception { 221 | 222 | RiverWeb.exitMethod = new IntConsumer() { 223 | @Override 224 | public void accept(final int value) { 225 | if (value != 0) { 226 | fail(); 227 | } 228 | } 229 | }; 230 | 231 | final String index = "webindex"; 232 | final String type = "my_web"; 233 | final String riverWebIndex = ".river_web"; 234 | final String riverWebType = "config"; 235 | 236 | // create an index 237 | runner.createIndex(index, null); 238 | runner.ensureYellow(index); 239 | 240 | // create a mapping 241 | final String mappingSource = 242 | "{\"my_web\":{\"dynamic_templates\":[{\"url\":{\"match\":\"url\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}},{\"method\":{\"match\":\"method\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}},{\"charSet\":{\"match\":\"charSet\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}},{\"mimeType\":{\"match\":\"mimeType\",\"mapping\":{\"type\":\"string\",\"store\":\"yes\",\"index\":\"not_analyzed\"}}}]}}"; 243 | runner.createMapping(index, type, mappingSource); 244 | 245 | if (!runner.indexExists(index)) { 246 | fail(); 247 | } 248 | 249 | String config = type; 250 | { 251 | final String riverWebSource = "{\"index\":\"" + index + "\",\"type\":\"" + type 252 | + "\",\"urls\":[\"http://fess.codelibs.org/\"],\"include_urls\":[\"http://fess.codelibs.org/.*\"],\"max_depth\":1,\"max_access_count\":1,\"num_of_thread\":1,\"interval\":1000,\"target\":[{\"pattern\":{\"url\":\"http://fess.codelibs.org/.*\",\"mimeType\":\"text/html\"},\"properties\":{\"title\":{\"text\":\"title\"},\"body\":{\"text\":\"body\",\"trim_spaces\":true}}}]}"; 253 | final IndexResponse response = runner.insert(riverWebIndex, riverWebType, config, riverWebSource); 254 | if (!response.isCreated()) { 255 | fail(); 256 | } 257 | } 258 | 259 | RiverWeb.main(new String[] { "--config-id", config, "--es-hosts", "localhost:" + runner.node().settings().get("transport.tcp.port"), 260 | "--cluster-name", clusterName, "--cleanup" }); 261 | assertEquals(1, runner.count(index, type).getHits().getTotalHits()); 262 | SearchResponse response1 = runner.search(index, type, QueryBuilders.termQuery("url", "http://fess.codelibs.org/"), null, 0, 1); 263 | 264 | RiverWeb.main(new String[] { "--config-id", config, "--es-hosts", "localhost:" + runner.node().settings().get("transport.tcp.port"), 265 | "--cluster-name", clusterName, "--cleanup" }); 266 | assertEquals(2, runner.count(index, type).getHits().getTotalHits()); 267 | SearchResponse 
response2 = runner.search(index, type, QueryBuilders.termQuery("url", "http://fess.codelibs.org/"), null, 0, 2); 268 | 269 | assertEquals(1, response1.getHits().getTotalHits()); 270 | assertEquals(2, response2.getHits().getTotalHits()); 271 | 272 | runner.ensureYellow(); 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /src/test/java/org/codelibs/riverweb/app/service/ScriptServiceTest.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb.app.service; 2 | 3 | import static org.codelibs.elasticsearch.runner.ElasticsearchClusterRunner.newConfigs; 4 | 5 | import java.io.File; 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | import java.util.UUID; 9 | 10 | import javax.annotation.Resource; 11 | 12 | import org.codelibs.core.io.FileUtil; 13 | import org.codelibs.elasticsearch.runner.ElasticsearchClusterRunner; 14 | import org.codelibs.fess.crawler.client.EsClient; 15 | import org.dbflute.utflute.lastadi.ContainerTestCase; 16 | import org.elasticsearch.common.settings.Settings.Builder; 17 | import org.elasticsearch.script.ScriptService.ScriptType; 18 | 19 | public class ScriptServiceTest extends ContainerTestCase { 20 | @Resource 21 | protected ScriptService scriptService; 22 | 23 | @Resource 24 | protected EsClient esClient; 25 | 26 | public void test_javascript_inline() throws Exception { 27 | String lang = "javascript"; 28 | String script = "'test';"; 29 | ScriptType scriptType = ScriptType.INLINE; 30 | Map localVars = new HashMap<>(); 31 | 32 | assertEquals("test", scriptService.execute(lang, script, scriptType, localVars)); 33 | 34 | script = "print('test');"; 35 | assertNull(scriptService.execute(lang, script, scriptType, localVars)); 36 | 37 | localVars.put("testVar", "aaa"); 38 | script = "testVar;"; 39 | assertEquals("aaa", scriptService.execute(lang, script, scriptType, localVars)); 40 | } 41 | 42 | public void test_javascript_file() throws Exception { 43 | File tempFile = File.createTempFile("temp", ".txt"); 44 | tempFile.deleteOnExit(); 45 | FileUtil.writeBytes(tempFile.getAbsolutePath(), "'test';".getBytes()); 46 | String lang = "javascript"; 47 | String script = tempFile.getAbsolutePath(); 48 | ScriptType scriptType = ScriptType.FILE; 49 | Map localVars = new HashMap<>(); 50 | 51 | assertEquals("test", scriptService.execute(lang, script, scriptType, localVars)); 52 | 53 | FileUtil.writeBytes(tempFile.getAbsolutePath(), "print('test');".getBytes()); 54 | assertNull(scriptService.execute(lang, script, scriptType, localVars)); 55 | 56 | localVars.put("testVar", "aaa"); 57 | FileUtil.writeBytes(tempFile.getAbsolutePath(), "testVar;".getBytes()); 58 | assertEquals("aaa", scriptService.execute(lang, script, scriptType, localVars)); 59 | } 60 | 61 | public void test_javascript_indexed() throws Exception { 62 | // create runner instance 63 | String clusterName = "es-river-web-" + UUID.randomUUID().toString(); 64 | ElasticsearchClusterRunner runner = new ElasticsearchClusterRunner(); 65 | runner.onBuild(new ElasticsearchClusterRunner.Builder() { 66 | @Override 67 | public void build(final int number, final Builder settingsBuilder) { 68 | settingsBuilder.put("http.cors.enabled", true); 69 | settingsBuilder.put("http.cors.allow-origin", "*"); 70 | settingsBuilder.put("index.number_of_shards", 3); 71 | settingsBuilder.put("index.number_of_replicas", 0); 72 | settingsBuilder.putArray("discovery.zen.ping.unicast.hosts", "localhost:9301-9310"); 73 | 
settingsBuilder.put("index.unassigned.node_left.delayed_timeout", "0"); 74 | } 75 | }).build(newConfigs().clusterName(clusterName).numOfNode(1)); 76 | // wait for yellow status 77 | runner.ensureYellow(); 78 | 79 | try { 80 | esClient.setClusterName(clusterName); 81 | esClient.setAddresses(new String[] { "localhost:" + runner.node().settings().get("transport.tcp.port") }); 82 | esClient.connect(); 83 | 84 | Map localVars = new HashMap<>(); 85 | 86 | String lang = "javascript"; 87 | ScriptType scriptType = ScriptType.INDEXED; 88 | String script = "script1"; 89 | runner.insert(ScriptService.SCRIPT_INDEX, lang, script, "{\"script\":\"'test';\"}"); 90 | 91 | assertEquals("test", scriptService.execute(lang, script, scriptType, localVars)); 92 | 93 | script = "script2"; 94 | runner.insert(ScriptService.SCRIPT_INDEX, lang, script, "{\"script\":\"print('test');\"}"); 95 | assertNull(scriptService.execute(lang, script, scriptType, localVars)); 96 | 97 | localVars.put("testVar", "aaa"); 98 | script = "script3"; 99 | runner.insert(ScriptService.SCRIPT_INDEX, lang, script, "{\"script\":\"testVar;\"}"); 100 | assertEquals("aaa", scriptService.execute(lang, script, scriptType, localVars)); 101 | } finally { 102 | // close runner 103 | runner.close(); 104 | // delete all files 105 | runner.clean(); 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/test/java/org/codelibs/riverweb/transformer/ScrapingTransformerTest.java: -------------------------------------------------------------------------------- 1 | package org.codelibs.riverweb.transformer; 2 | 3 | import static org.hamcrest.core.Is.is; 4 | import static org.junit.Assert.assertThat; 5 | 6 | import java.io.InputStream; 7 | import java.util.HashMap; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | import org.apache.commons.io.IOUtils; 12 | import org.codelibs.core.io.ResourceUtil; 13 | import org.codelibs.fess.crawler.entity.ResponseData; 14 | import org.codelibs.fess.crawler.entity.ResultData; 15 | import org.codelibs.riverweb.config.RiverConfig; 16 | import org.codelibs.riverweb.config.RiverConfigManager; 17 | import org.junit.Test; 18 | 19 | public class ScrapingTransformerTest { 20 | @Test 21 | public void fess_codelibs_org() { 22 | final RiverConfigManager riverConfigManager = new RiverConfigManager(); 23 | final ScrapingTransformer transformer = new ScrapingTransformer() { 24 | @SuppressWarnings("unchecked") 25 | @Override 26 | protected void storeIndex(final ResponseData responseData, final Map dataMap) { 27 | System.out.println(dataMap); 28 | assertThat(((List) ((Map) dataMap.get("nav")).get("sideMenus")).size(), is(27)); 29 | assertThat(((Map) dataMap.get("section1")).get("title").toString(), is("What is Fess?")); 30 | assertThat(((List) ((Map) dataMap.get("section1")).get("body")).size(), is(2)); 31 | assertThat(((Map) dataMap.get("section2")).get("title").toString(), is("Features")); 32 | assertThat(((List) ((Map) dataMap.get("section2")).get("body")).size(), is(12)); 33 | } 34 | }; 35 | transformer.riverConfigManager = riverConfigManager; 36 | 37 | final String sessionId = "test"; 38 | final String url = "http://fess.codelibs.org/"; 39 | final RiverConfig riverConfig = riverConfigManager.get(sessionId); 40 | transformer.riverConfigLocal.set(riverConfig); 41 | 42 | final Map> scrapingRuleMap = new HashMap>(); 43 | addScrapingRuleMap(scrapingRuleMap, "text", "nav.sideMenus", "div.sidebar-nav ul li", Boolean.TRUE, Boolean.TRUE); 44 | 
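        // Each addScrapingRuleMap(...) call here (helper defined at the bottom of this test)
        // adds one property definition of the form
        //   "nav.sideMenus" -> {"text": "div.sidebar-nav ul li", "is_array": true, "trim_spaces": true}
        // and ScrapingTransformer stores the result as nested maps, e.g.
        //   {"nav": {"sideMenus": [ ...27 menu labels... ]}}
        // which is what the storeIndex override above asserts on. Queries such as
        // "div.section:eq(1) h2" (used below) are resolved by ScrapingTransformer.getElements:
        // it selects "div.section", keeps the element at index 1, then selects "h2" inside it.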
addScrapingRuleMap(scrapingRuleMap, "text", "section1.title", "div.section:eq(0) h2", null, null); 45 | addScrapingRuleMap(scrapingRuleMap, "text", "section1.body", "div.section:eq(0) p", Boolean.TRUE, Boolean.TRUE); 46 | addScrapingRuleMap(scrapingRuleMap, "text", "section2.title", "div.section:eq(1) h2", null, null); 47 | addScrapingRuleMap(scrapingRuleMap, "text", "section2.body", "div.section:eq(1) ul li", Boolean.TRUE, Boolean.TRUE); 48 | final Map patternMap = new HashMap(); 49 | patternMap.put("url", url); 50 | riverConfig.addScrapingRule(null, patternMap, scrapingRuleMap); 51 | InputStream is = null; 52 | try { 53 | final ResponseData responseData = new ResponseData(); 54 | responseData.setSessionId(sessionId); 55 | responseData.setUrl(url); 56 | responseData.setResponseBody(ResourceUtil.getResourceAsFile("html/fess_codelibs_org.html"), false); 57 | responseData.setCharSet("UTF-8"); 58 | final ResultData resultData = new ResultData(); 59 | 60 | transformer.storeData(responseData, resultData); 61 | } finally { 62 | IOUtils.closeQuietly(is); 63 | } 64 | } 65 | 66 | private void addScrapingRuleMap(final Map> scrapingRuleMap, final String type, final String property, 67 | final String path, final Boolean isArray, final Boolean trimSpaces) { 68 | final Map valueMap = new HashMap(); 69 | valueMap.put(type, path); 70 | if (isArray != null) { 71 | valueMap.put("is_array", isArray); 72 | } 73 | if (trimSpaces != null) { 74 | valueMap.put("trim_spaces", trimSpaces); 75 | } 76 | scrapingRuleMap.put(property, valueMap); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/test/resources/html/fess_codelibs_org.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Full Text Search Server: Fess - 6 | Open Source Full Text Search Server - Fess 7 | 8 | 9 | 10 | 11 | 12 | 30 | 31 | 32 | 33 | 36 | 46 | 47 | 48 | 49 | 50 | 70 | 71 |
[fess_codelibs_org.html, continued: the page's HTML markup did not survive extraction. What remains is the text of the saved http://fess.codelibs.org/ page that ScrapingTransformerTest asserts against: the sidebar navigation, a two-paragraph "What is Fess?" section, a twelve-item "Features" list, "Demo Site" and "News" sections, and the copyright footer.]
-------------------------------------------------------------------------------- /src/test/resources/log4j.xml: --------------------------------------------------------------------------------
[log4j.xml: the XML content did not survive extraction; only empty numbered lines 1-15 remain.]
--------------------------------------------------------------------------------