├── .gitignore ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── dev-tools └── release.py ├── pom.xml └── src ├── main ├── assemblies │ └── plugin.xml ├── java │ └── org │ │ └── elasticsearch │ │ ├── plugin │ │ └── river │ │ │ └── wikipedia │ │ │ └── WikipediaRiverPlugin.java │ │ └── river │ │ └── wikipedia │ │ ├── WikipediaRiver.java │ │ ├── WikipediaRiverModule.java │ │ ├── bzip2 │ │ ├── BZip2Constants.java │ │ ├── CBZip2InputStream.java │ │ ├── CBZip2OutputStream.java │ │ └── CRC.java │ │ └── support │ │ ├── InfoBox.java │ │ ├── IteratorHandler.java │ │ ├── PageCallbackHandler.java │ │ ├── SAXPageCallbackHandler.java │ │ ├── WikiPage.java │ │ ├── WikiPageIterator.java │ │ ├── WikiTextParser.java │ │ ├── WikiXMLParser.java │ │ ├── WikiXMLParserFactory.java │ │ ├── WikiXMLSAXParser.java │ │ └── package-info.java └── resources │ └── es-plugin.properties └── test └── java └── org └── elasticsearch └── river └── wikipedia ├── WikipediaRiverTest.java └── helper ├── HttpClient.java └── HttpClientResponse.java /.gitignore: -------------------------------------------------------------------------------- 1 | /data 2 | /work 3 | /logs 4 | /.idea 5 | /target 6 | .DS_Store 7 | *.iml 8 | /.project 9 | /.settings 10 | /.classpath 11 | /plugin_tools 12 | /.local-execution-hints.log 13 | /.local-*-execution-hints.log 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing to elasticsearch 2 | ============================= 3 | 4 | Elasticsearch is an open source project and we love to receive contributions from our community — you! There are many ways to contribute, from writing tutorials or blog posts, improving the documentation, submitting bug reports and feature requests or writing code which can be incorporated into Elasticsearch itself. 5 | 6 | Bug reports 7 | ----------- 8 | 9 | If you think you have found a bug in Elasticsearch, first make sure that you are testing against the [latest version of Elasticsearch](http://www.elasticsearch.org/download/) - your issue may already have been fixed. If not, search our [issues list](https://github.com/elasticsearch/elasticsearch/issues) on GitHub in case a similar issue has already been opened. 10 | 11 | It is very helpful if you can prepare a reproduction of the bug. In other words, provide a small test case which we can run to confirm your bug. It makes it easier to find the problem and to fix it. Test cases should be provided as `curl` commands which we can copy and paste into a terminal to run it locally, for example: 12 | 13 | ```sh 14 | # delete the index 15 | curl -XDELETE localhost:9200/test 16 | 17 | # insert a document 18 | curl -XPUT localhost:9200/test/test/1 -d '{ 19 | "title": "test document" 20 | }' 21 | 22 | # this should return XXXX but instead returns YYY 23 | curl .... 24 | ``` 25 | 26 | Provide as much information as you can. You may think that the problem lies with your query, when actually it depends on how your data is indexed. The easier it is for us to recreate your problem, the faster it is likely to be fixed. 27 | 28 | Feature requests 29 | ---------------- 30 | 31 | If you find yourself wishing for a feature that doesn't exist in Elasticsearch, you are probably not alone. There are bound to be others out there with similar needs. Many of the features that Elasticsearch has today have been added because our users saw the need. 
32 | Open an issue on our [issues list](https://github.com/elasticsearch/elasticsearch/issues) on GitHub which describes the feature you would like to see, why you need it, and how it should work. 33 | 34 | Contributing code and documentation changes 35 | ------------------------------------------- 36 | 37 | If you have a bugfix or new feature that you would like to contribute to Elasticsearch, please find or open an issue about it first. Talk about what you would like to do. It may be that somebody is already working on it, or that there are particular issues that you should know about before implementing the change. 38 | 39 | We enjoy working with contributors to get their code accepted. There are many approaches to fixing a problem and it is important to find the best approach before writing too much code. 40 | 41 | The process for contributing to any of the [Elasticsearch repositories](https://github.com/elasticsearch/) is similar. Details for individual projects can be found below. 42 | 43 | ### Fork and clone the repository 44 | 45 | You will need to fork the main Elasticsearch code or documentation repository and clone it to your local machine. See 46 | [github help page](https://help.github.com/articles/fork-a-repo) for help. 47 | 48 | Further instructions for specific projects are given below. 49 | 50 | ### Submitting your changes 51 | 52 | Once your changes and tests are ready to submit for review: 53 | 54 | 1. Test your changes 55 | Run the test suite to make sure that nothing is broken. 56 | 57 | 2. Sign the Contributor License Agreement 58 | Please make sure you have signed our [Contributor License Agreement](http://www.elasticsearch.org/contributor-agreement/). We are not asking you to assign copyright to us, but to give us the right to distribute your code without restriction. We ask this of all contributors in order to assure our users of the origin and continuing existence of the code. You only need to sign the CLA once. 59 | 60 | 3. Rebase your changes 61 | Update your local repository with the most recent code from the main Elasticsearch repository, and rebase your branch on top of the latest master branch. We prefer your changes to be squashed into a single commit. 62 | 63 | 4. Submit a pull request 64 | Push your local changes to your forked copy of the repository and [submit a pull request](https://help.github.com/articles/using-pull-requests). In the pull request, describe what your changes do and mention the number of the issue where discussion has taken place, eg "Closes #123". 65 | 66 | Then sit back and wait. There will probably be discussion about the pull request and, if any changes are needed, we would love to work with you to get your pull request merged into Elasticsearch. 67 | 68 | 69 | Contributing to the Elasticsearch plugin 70 | ---------------------------------------- 71 | 72 | **Repository:** [https://github.com/elasticsearch/elasticsearch-river-wikipedia](https://github.com/elasticsearch/elasticsearch-river-wikipedia) 73 | 74 | Make sure you have [Maven](http://maven.apache.org) installed, as Elasticsearch uses it as its build system. Integration with IntelliJ and Eclipse should work out of the box. Eclipse users can automatically configure their IDE by running `mvn eclipse:eclipse` and then importing the project into their workspace: `File > Import > Existing project into workspace`. 
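For example, a typical first-time setup for this plugin (a sketch; substitute your own fork) might look like:

```sh
git clone git@github.com:<your-username>/elasticsearch-river-wikipedia.git
cd elasticsearch-river-wikipedia/
# generate the Eclipse project files, then import them via File > Import
mvn eclipse:eclipse
```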
75 | 76 | Please follow these formatting guidelines: 77 | 78 | * Java indent is 4 spaces 79 | * Line width is 140 characters 80 | * The rest is left to Java coding standards 81 | * Disable “auto-format on save” to prevent unnecessary format changes. This makes reviews much harder as it generates unnecessary formatting changes. If your IDE supports formatting only modified chunks that is fine to do. 82 | 83 | To create a distribution from the source, simply run: 84 | 85 | ```sh 86 | cd elasticsearch-river-wikipedia/ 87 | mvn clean package -DskipTests 88 | ``` 89 | 90 | You will find the newly built packages under: `./target/releases/`. 91 | 92 | Before submitting your changes, run the test suite to make sure that nothing is broken, with: 93 | 94 | ```sh 95 | mvn clean test 96 | ``` 97 | 98 | Source: [Contributing to elasticsearch](http://www.elasticsearch.org/contributing-to-elasticsearch/) 99 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **Important**: This project has been discontinued as of Elasticsearch 2.0. 2 | 3 | ---- 4 | 5 | Wikipedia River Plugin for Elasticsearch 6 | ================================== 7 | 8 | The Wikipedia River plugin allows indexing Wikipedia dumps into Elasticsearch.
9 | 10 | **Rivers are [deprecated](https://www.elastic.co/blog/deprecating_rivers) and will be removed in the future.** 11 | Have a look at [stream2es](https://github.com/elastic/stream2es#wikipedia) instead. 12 | 13 | To install the plugin, run: 14 | 15 | ```sh 16 | bin/plugin install elasticsearch/elasticsearch-river-wikipedia/2.6.0 17 | ``` 18 | 19 | You need to install a version matching your Elasticsearch version: 20 | 21 | | Elasticsearch | Wikipedia River Plugin | Docs | 22 | |------------------------|-------------------|------------------------------------------------------------------------------------------------------------------------------------| 23 | | master | Build from source | See below | 24 | | es-1.x | Build from source | [2.7.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-river-wikipedia/tree/es-1.x/#version-270-snapshot-for-elasticsearch-1x)| 25 | | es-1.6 | 2.6.0 | [2.6.0](https://github.com/elastic/elasticsearch-river-wikipedia/tree/v2.6.0/#version-260-for-elasticsearch-16) | 26 | | es-1.5 | 2.5.0 | [2.5.0](https://github.com/elastic/elasticsearch-river-wikipedia/tree/v2.5.0/#version-250-for-elasticsearch-15) | 27 | | es-1.4 | 2.4.1 | [2.4.1](https://github.com/elasticsearch/elasticsearch-river-wikipedia/tree/v2.4.1/#version-241-for-elasticsearch-14) | 28 | | es-1.3 | 2.3.0 | [2.3.0](https://github.com/elasticsearch/elasticsearch-river-wikipedia/tree/v2.3.0/#version-230-for-elasticsearch-13) | 29 | | es-1.2 | 2.2.0 | [2.2.0](https://github.com/elasticsearch/elasticsearch-river-wikipedia/tree/v2.2.0/#wikipedia-river-plugin-for-elasticsearch) | 30 | | es-1.0 | 2.0.0 | [2.0.0](https://github.com/elasticsearch/elasticsearch-river-wikipedia/tree/v2.0.0/#wikipedia-river-plugin-for-elasticsearch) | 31 | | es-0.90 | 1.3.0 | [1.3.0](https://github.com/elasticsearch/elasticsearch-river-wikipedia/tree/v1.3.0/#wikipedia-river-plugin-for-elasticsearch) | 32 | 33 | To build a `SNAPSHOT` version, build it with Maven: 34 | 35 | ```bash 36 | mvn clean install 37 | plugin --install river-wikipedia \ 38 | --url file:target/releases/elasticsearch-river-wikipedia-X.X.X-SNAPSHOT.zip 39 | ``` 40 | 41 | Create river 42 | ------------ 43 | 44 | A simple river that indexes [Wikipedia](http://en.wikipedia.org) (English pages). Create it using: 45 | 46 | ```sh 47 | curl -XPUT localhost:9200/_river/my_river/_meta -d ' 48 | { 49 | "type" : "wikipedia" 50 | } 51 | ' 52 | ``` 53 | 54 | The default download is the latest [wikipedia dump](http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2). It can be changed using: 55 | 56 | ```javascript 57 | { 58 | "type" : "wikipedia", 59 | "wikipedia" : { 60 | "url" : "url to link to wikipedia dump" 61 | } 62 | } 63 | ``` 64 | 65 | The index name defaults to the river name, and the type defaults to `page`. Both can be changed in the index section: 66 | 67 | ```javascript 68 | { 69 | "type" : "wikipedia", 70 | "index" : { 71 | "index" : "my_index", 72 | "type" : "my_type" 73 | } 74 | } 75 | ``` 76 | 77 | Since 1.3.0, the default `bulk` size is `100`, a bulk is flushed every `5s`, and the number of concurrent bulk requests allowed to execute is `1`.
78 | You can modify those settings within the index section: 79 | 80 | ```javascript 81 | { 82 | "type" : "wikipedia", 83 | "index" : { 84 | "index" : "my_index", 85 | "type" : "my_type", 86 | "bulk_size" : 1000, 87 | "flush_interval" : "1s", 88 | "max_concurrent_bulk" : 3 89 | } 90 | } 91 | ``` 92 | 93 | Mapping 94 | ------- 95 | 96 | By default, the wikipedia river will generate the following mapping: 97 | 98 | ```javascript 99 | { 100 | "page": { 101 | "properties": { 102 | "category": { 103 | "type": "string" 104 | }, 105 | "disambiguation": { 106 | "type": "boolean" 107 | }, 108 | "link": { 109 | "type": "string" 110 | }, 111 | "redirect": { 112 | "type": "boolean" 113 | }, 114 | "redirect_page": { 115 | "type": "string" 116 | }, 117 | "special": { 118 | "type": "boolean" 119 | }, 120 | "stub": { 121 | "type": "boolean" 122 | }, 123 | "text": { 124 | "type": "string" 125 | }, 126 | "title": { 127 | "type": "string" 128 | } 129 | } 130 | } 131 | } 132 | ``` 133 | 134 | 135 | License 136 | ------- 137 | 138 | This software is licensed under the Apache 2 license, quoted below. 139 | 140 | Copyright 2009-2014 Elasticsearch 141 | 142 | Licensed under the Apache License, Version 2.0 (the "License"); you may not 143 | use this file except in compliance with the License. You may obtain a copy of 144 | the License at 145 | 146 | http://www.apache.org/licenses/LICENSE-2.0 147 | 148 | Unless required by applicable law or agreed to in writing, software 149 | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 150 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 151 | License for the specific language governing permissions and limitations under 152 | the License. 153 | -------------------------------------------------------------------------------- /dev-tools/release.py: -------------------------------------------------------------------------------- 1 | # Licensed to Elasticsearch under one or more contributor 2 | # license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright 4 | # ownership. Elasticsearch licenses this file to you under 5 | # the Apache License, Version 2.0 (the "License"); you may 6 | # not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on 13 | # an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 14 | # either express or implied. See the License for the specific 15 | # language governing permissions and limitations under the License. 16 | 17 | import datetime 18 | import os 19 | import shutil 20 | import sys 21 | import time 22 | import urllib 23 | import urllib.request 24 | import zipfile 25 | 26 | from os.path import dirname, abspath 27 | 28 | """ 29 | This tool builds a release from a given elasticsearch plugin branch. 30 | 31 | It is basically a wrapper on top of launch_release.py which: 32 | 33 | - tries to get a more recent version of launch_release.py in ...
34 | - download it if needed 35 | - launch it passing all arguments to it, like: 36 | 37 | $ python3 dev-tools/release.py --branch master --publish --remote origin 38 | 39 | Important options: 40 | 41 | # Dry run 42 | $ python3 dev-tools/release.py 43 | 44 | # Dry run without tests 45 | $ python3 dev-tools/release.py --skiptests 46 | 47 | # Release, publish artifacts and announce 48 | $ python3 dev-tools/release.py --publish 49 | 50 | See full documentation in launch_release.py 51 | """ 52 | env = os.environ 53 | 54 | # Change this if the source repository for your scripts is at a different location 55 | SOURCE_REPO = 'elasticsearch/elasticsearch-plugins-script' 56 | # We download the script again once the local copy is more than 1 day old 57 | SCRIPT_OBSOLETE_DAYS = 1 58 | # We ignore the following files from the master.zip file 59 | IGNORED_FILES = ['.gitignore', 'README.md'] 60 | 61 | 62 | ROOT_DIR = abspath(os.path.join(abspath(dirname(__file__)), '../')) 63 | TARGET_TOOLS_DIR = ROOT_DIR + '/plugin_tools' 64 | DEV_TOOLS_DIR = ROOT_DIR + '/dev-tools' 65 | BUILD_RELEASE_FILENAME = 'release.zip' 66 | BUILD_RELEASE_FILE = TARGET_TOOLS_DIR + '/' + BUILD_RELEASE_FILENAME 67 | SOURCE_URL = 'https://github.com/%s/archive/master.zip' % SOURCE_REPO 68 | 69 | # Download a recent version of the release plugin tool 70 | try: 71 | os.mkdir(TARGET_TOOLS_DIR) 72 | print('directory %s created' % TARGET_TOOLS_DIR) 73 | except FileExistsError: 74 | pass 75 | 76 | 77 | try: 78 | # we check the latest update. If we ran an update recently, we 79 | # are not going to check it again 80 | download = True 81 | 82 | try: 83 | last_download_time = datetime.datetime.fromtimestamp(os.path.getmtime(BUILD_RELEASE_FILE)) 84 | if (datetime.datetime.now()-last_download_time).days < SCRIPT_OBSOLETE_DAYS: 85 | download = False 86 | except FileNotFoundError: 87 | pass 88 | 89 | if download: 90 | urllib.request.urlretrieve(SOURCE_URL, BUILD_RELEASE_FILE) 91 | with zipfile.ZipFile(BUILD_RELEASE_FILE) as myzip: 92 | for member in myzip.infolist(): 93 | filename = os.path.basename(member.filename) 94 | # skip directories 95 | if not filename: 96 | continue 97 | if filename in IGNORED_FILES: 98 | continue 99 | 100 | # copy file (taken from zipfile's extract) 101 | source = myzip.open(member.filename) 102 | target = open(os.path.join(TARGET_TOOLS_DIR, filename), "wb") 103 | with source, target: 104 | shutil.copyfileobj(source, target) 105 | # We keep the original date 106 | date_time = time.mktime(member.date_time + (0, 0, -1)) 107 | os.utime(os.path.join(TARGET_TOOLS_DIR, filename), (date_time, date_time)) 108 | print('plugin-tools updated from %s' % SOURCE_URL) 109 | except urllib.error.HTTPError: 110 | pass 111 | 112 | 113 | # Let's see if we need to update the release.py script itself 114 | source_time = os.path.getmtime(TARGET_TOOLS_DIR + '/release.py') 115 | repo_time = os.path.getmtime(DEV_TOOLS_DIR + '/release.py') 116 | if source_time > repo_time: 117 | input('release.py needs an update. 
Press a key to update it...') 118 | shutil.copyfile(TARGET_TOOLS_DIR + '/release.py', DEV_TOOLS_DIR + '/release.py') 119 | 120 | # We can launch the build process 121 | try: 122 | PYTHON = 'python' 123 | # make sure python3 is used if python3 is available 124 | # some systems use python 2 as default 125 | os.system('python3 --version > /dev/null 2>&1') 126 | PYTHON = 'python3' 127 | except RuntimeError: 128 | pass 129 | 130 | release_args = '' 131 | for x in range(1, len(sys.argv)): 132 | release_args += ' ' + sys.argv[x] 133 | 134 | os.system('%s %s/build_release.py %s' % (PYTHON, TARGET_TOOLS_DIR, release_args)) 135 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.elasticsearch 8 | elasticsearch-river-wikipedia 9 | 3.0.0-SNAPSHOT 10 | jar 11 | Elasticsearch Wikipedia River plugin 12 | The Wikipedia River plugin allows index wikipedia 13 | https://github.com/elastic/elasticsearch-river-wikipedia/ 14 | 2009 15 | 16 | 17 | The Apache Software License, Version 2.0 18 | http://www.apache.org/licenses/LICENSE-2.0.txt 19 | repo 20 | 21 | 22 | 23 | scm:git:git@github.com:elastic/elasticsearch-river-wikipedia.git 24 | scm:git:git@github.com:elastic/elasticsearch-river-wikipedia.git 25 | http://github.com/elastic/elasticsearch-river-wikipedia 26 | 27 | 28 | 29 | org.elasticsearch 30 | elasticsearch-plugin 31 | 2.0.0-SNAPSHOT 32 | 33 | 34 | 35 | 36 | 37 | warn 38 | 39 | 40 | 41 | 42 | 43 | org.apache.maven.plugins 44 | maven-assembly-plugin 45 | 46 | 47 | 48 | 49 | 50 | 51 | oss-snapshots 52 | Sonatype OSS Snapshots 53 | https://oss.sonatype.org/content/repositories/snapshots/ 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | plugin 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | / 11 | true 12 | true 13 | 14 | org.elasticsearch:elasticsearch 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/river/wikipedia/WikipediaRiverPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package org.elasticsearch.plugin.river.wikipedia; 21 | 22 | import org.elasticsearch.common.inject.Inject; 23 | import org.elasticsearch.plugins.AbstractPlugin; 24 | import org.elasticsearch.river.RiversModule; 25 | import org.elasticsearch.river.wikipedia.WikipediaRiverModule; 26 | 27 | /** 28 | * 29 | */ 30 | public class WikipediaRiverPlugin extends AbstractPlugin { 31 | 32 | @Inject 33 | public WikipediaRiverPlugin() { 34 | } 35 | 36 | @Override 37 | public String name() { 38 | return "river-wikipedia"; 39 | } 40 | 41 | @Override 42 | public String description() { 43 | return "River Wikipedia Plugin"; 44 | } 45 | 46 | public void onModule(RiversModule module) { 47 | module.registerRiver("wikipedia", WikipediaRiverModule.class); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/WikipediaRiver.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia; 21 | 22 | import org.elasticsearch.ExceptionsHelper; 23 | import org.elasticsearch.action.bulk.BulkItemResponse; 24 | import org.elasticsearch.action.bulk.BulkProcessor; 25 | import org.elasticsearch.action.bulk.BulkRequest; 26 | import org.elasticsearch.action.bulk.BulkResponse; 27 | import org.elasticsearch.action.index.IndexRequest; 28 | import org.elasticsearch.client.Client; 29 | import org.elasticsearch.cluster.block.ClusterBlockException; 30 | import org.elasticsearch.common.inject.Inject; 31 | import org.elasticsearch.common.unit.TimeValue; 32 | import org.elasticsearch.common.util.concurrent.EsExecutors; 33 | import org.elasticsearch.common.xcontent.XContentBuilder; 34 | import org.elasticsearch.common.xcontent.XContentFactory; 35 | import org.elasticsearch.common.xcontent.support.XContentMapValues; 36 | import org.elasticsearch.indices.IndexAlreadyExistsException; 37 | import org.elasticsearch.river.AbstractRiverComponent; 38 | import org.elasticsearch.river.River; 39 | import org.elasticsearch.river.RiverName; 40 | import org.elasticsearch.river.RiverSettings; 41 | import org.elasticsearch.river.wikipedia.support.PageCallbackHandler; 42 | import org.elasticsearch.river.wikipedia.support.WikiPage; 43 | import org.elasticsearch.river.wikipedia.support.WikiXMLParser; 44 | import org.elasticsearch.river.wikipedia.support.WikiXMLParserFactory; 45 | 46 | import java.io.IOException; 47 | import java.net.MalformedURLException; 48 | import java.net.URL; 49 | import java.util.Map; 50 | 51 | /** 52 | * 53 | */ 54 | public class WikipediaRiver extends AbstractRiverComponent implements River { 55 | 56 | private StringBuilder sb = new StringBuilder(); 57 | 58 | private final Client client; 59 | 60 | private final URL url; 61 | 62 | private final String indexName; 63 | 64 | private final String typeName; 65 | 66 | private final int bulkSize; 67 | 68 | private volatile Thread thread; 69 | 70 | private volatile boolean closed = false; 71 | 72 | private final TimeValue bulkFlushInterval; 73 | private volatile BulkProcessor bulkProcessor; 74 | private final int maxConcurrentBulk; 75 | private Parser parser; 76 | 77 | 78 | @SuppressWarnings({"unchecked"}) 79 | @Inject 80 | public WikipediaRiver(RiverName riverName, RiverSettings settings, Client client) throws MalformedURLException { 81 | super(riverName, settings); 82 | this.client = client; 83 | 84 | String url = "http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"; 85 | if (settings.settings().containsKey("wikipedia")) { 86 | Map wikipediaSettings = (Map) settings.settings().get("wikipedia"); 87 | url = XContentMapValues.nodeStringValue(wikipediaSettings.get("url"), url); 88 | } 89 | 90 | logger.info("creating wikipedia stream river for [{}]", url); 91 | this.url = new URL(url); 92 | 93 | if (settings.settings().containsKey("index")) { 94 | Map indexSettings = (Map) settings.settings().get("index"); 95 | this.indexName = XContentMapValues.nodeStringValue(indexSettings.get("index"), riverName.name()); 96 | this.typeName = XContentMapValues.nodeStringValue(indexSettings.get("type"), "page"); 97 | this.bulkSize = XContentMapValues.nodeIntegerValue(indexSettings.get("bulk_size"), 100); 98 | this.bulkFlushInterval = TimeValue.parseTimeValue(XContentMapValues.nodeStringValue( 99 | indexSettings.get("flush_interval"), "5s"), TimeValue.timeValueSeconds(5)); 100 | this.maxConcurrentBulk = XContentMapValues.nodeIntegerValue(indexSettings.get("max_concurrent_bulk"), 
1); 101 | } else { 102 | this.indexName = riverName.name(); 103 | this.typeName = "page"; 104 | this.bulkSize = 100; 105 | this.maxConcurrentBulk = 1; 106 | this.bulkFlushInterval = TimeValue.timeValueSeconds(5); 107 | } 108 | 109 | WikiXMLParser xmlParser = WikiXMLParserFactory.getSAXParser(this.url); 110 | try { 111 | xmlParser.setPageCallback(new PageCallback()); 112 | } catch (Exception e) { 113 | logger.error("failed to create xmlParser", e); 114 | return; 115 | } 116 | parser = new Parser(xmlParser); 117 | thread = EsExecutors.daemonThreadFactory(settings.globalSettings(), "wikipedia_slurper").newThread(parser); 118 | } 119 | 120 | @Override 121 | public void start() { 122 | logger.info("starting wikipedia stream"); 123 | try { 124 | client.admin().indices().prepareCreate(indexName).execute().actionGet(); 125 | } catch (Exception e) { 126 | if (ExceptionsHelper.unwrapCause(e) instanceof IndexAlreadyExistsException) { 127 | // that's fine 128 | } else if (ExceptionsHelper.unwrapCause(e) instanceof ClusterBlockException) { 129 | // ok, not recovered yet..., lets start indexing and hope we recover by the first bulk 130 | // TODO: a smarter logic can be to register for cluster event listener here, and only start sampling when the block is removed... 131 | } else { 132 | logger.warn("failed to create index [{}], disabling river...", e, indexName); 133 | return; 134 | } 135 | } 136 | 137 | // Creating bulk processor 138 | this.bulkProcessor = BulkProcessor.builder(client, new BulkProcessor.Listener() { 139 | @Override 140 | public void beforeBulk(long executionId, BulkRequest request) { 141 | logger.debug("Going to execute new bulk composed of {} actions", request.numberOfActions()); 142 | } 143 | 144 | @Override 145 | public void afterBulk(long executionId, BulkRequest request, BulkResponse response) { 146 | logger.debug("Executed bulk composed of {} actions", request.numberOfActions()); 147 | if (response.hasFailures()) { 148 | logger.warn("There was failures while executing bulk", response.buildFailureMessage()); 149 | if (logger.isDebugEnabled()) { 150 | for (BulkItemResponse item : response.getItems()) { 151 | if (item.isFailed()) { 152 | logger.debug("Error for {}/{}/{} for {} operation: {}", item.getIndex(), 153 | item.getType(), item.getId(), item.getOpType(), item.getFailureMessage()); 154 | } 155 | } 156 | } 157 | } 158 | } 159 | 160 | @Override 161 | public void afterBulk(long executionId, BulkRequest request, Throwable failure) { 162 | logger.warn("Error executing bulk", failure); 163 | } 164 | }) 165 | .setBulkActions(bulkSize) 166 | .setConcurrentRequests(maxConcurrentBulk) 167 | .setFlushInterval(bulkFlushInterval) 168 | .build(); 169 | 170 | // Start wikipedia slurper 171 | thread.start(); 172 | } 173 | 174 | @Override 175 | public void close() { 176 | logger.info("closing wikipedia river"); 177 | closed = true; 178 | if (thread != null) { 179 | thread.interrupt(); 180 | } 181 | if (parser != null) { 182 | parser.close(); 183 | } 184 | 185 | if (this.bulkProcessor != null) { 186 | this.bulkProcessor.close(); 187 | } 188 | } 189 | 190 | private class Parser implements Runnable { 191 | private final WikiXMLParser parser; 192 | 193 | private Parser(WikiXMLParser parser) { 194 | this.parser = parser; 195 | } 196 | 197 | @Override 198 | public void run() { 199 | try { 200 | parser.parse(); 201 | } catch (Exception e) { 202 | if (closed) { 203 | return; 204 | } 205 | logger.error("failed to parse stream", e); 206 | } 207 | } 208 | 209 | public void close() { 210 | if (parser != 
null) { 211 | try { 212 | parser.close(); 213 | } catch (IOException e) { 214 | logger.error("failed to close parser", e); 215 | } 216 | } 217 | } 218 | } 219 | 220 | private class PageCallback implements PageCallbackHandler { 221 | 222 | @Override 223 | public void process(WikiPage page) { 224 | if (closed) { 225 | return; 226 | } 227 | String title = stripTitle(page.getTitle()); 228 | if (logger.isTraceEnabled()) { 229 | logger.trace("page {} : {}", page.getID(), page.getTitle()); 230 | } 231 | try { 232 | XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); 233 | builder.field("title", title); 234 | builder.field("text", page.getText()); 235 | builder.field("redirect", page.isRedirect()); 236 | builder.field("redirect_page", page.getRedirectPage()); 237 | builder.field("special", page.isSpecialPage()); 238 | builder.field("stub", page.isStub()); 239 | builder.field("disambiguation", page.isDisambiguationPage()); 240 | 241 | builder.startArray("category"); 242 | for (String s : page.getCategories()) { 243 | builder.value(s); 244 | } 245 | builder.endArray(); 246 | 247 | builder.startArray("link"); 248 | for (String s : page.getLinks()) { 249 | builder.value(s); 250 | } 251 | builder.endArray(); 252 | 253 | builder.endObject(); 254 | 255 | if (closed) { 256 | logger.warn("river was closing while processing wikipedia page [{}]/[{}]. Operation skipped.", 257 | page.getID(), page.getTitle()); 258 | return; 259 | } 260 | 261 | bulkProcessor.add(new IndexRequest(indexName, typeName, page.getID()).source(builder)); 262 | } catch (Exception e) { 263 | logger.warn("failed to construct index request", e); 264 | } 265 | } 266 | } 267 | 268 | 269 | private String stripTitle(String title) { 270 | sb.setLength(0); 271 | sb.append(title); 272 | while (sb.length() > 0 && (sb.charAt(sb.length() - 1) == '\n' || (sb.charAt(sb.length() - 1) == ' '))) { 273 | sb.deleteCharAt(sb.length() - 1); 274 | } 275 | return sb.toString(); 276 | } 277 | } 278 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/WikipediaRiverModule.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia; 21 | 22 | import org.elasticsearch.common.inject.AbstractModule; 23 | import org.elasticsearch.river.River; 24 | 25 | /** 26 | * 27 | */ 28 | public class WikipediaRiverModule extends AbstractModule { 29 | 30 | @Override 31 | protected void configure() { 32 | bind(River.class).to(WikipediaRiver.class).asEagerSingleton(); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/bzip2/BZip2Constants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.bzip2; 21 | 22 | /** 23 | * Base class for both the compress and decompress classes. 24 | * Holds common arrays, and static data. 25 | *
26 | * This interface is public for historical purposes. 27 | * You should have no need to use it. 28 | *
29 | */ 30 | public interface BZip2Constants { 31 | 32 | int baseBlockSize = 100000; 33 | int MAX_ALPHA_SIZE = 258; 34 | int MAX_CODE_LEN = 23; 35 | int RUNA = 0; 36 | int RUNB = 1; 37 | int N_GROUPS = 6; 38 | int G_SIZE = 50; 39 | int N_ITERS = 4; 40 | int MAX_SELECTORS = (2 + (900000 / G_SIZE)); 41 | int NUM_OVERSHOOT_BYTES = 20; 42 | 43 | /** 44 | * This array really shouldn't be here. 45 | * Again, for historical purposes it is. 46 | *
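 * (Background: this is the fixed table of pseudo-random numbers that the
 * BZip2 format uses for its optional block randomisation; a decoder must
 * use the same table as the encoder.)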
47 | * FIXME: This array should be in a private or package private 48 | * location, since it could be modified by malicious code.
49 | */ 50 | int[] rNums = { 51 | 619, 720, 127, 481, 931, 816, 813, 233, 566, 247, 52 | 985, 724, 205, 454, 863, 491, 741, 242, 949, 214, 53 | 733, 859, 335, 708, 621, 574, 73, 654, 730, 472, 54 | 419, 436, 278, 496, 867, 210, 399, 680, 480, 51, 55 | 878, 465, 811, 169, 869, 675, 611, 697, 867, 561, 56 | 862, 687, 507, 283, 482, 129, 807, 591, 733, 623, 57 | 150, 238, 59, 379, 684, 877, 625, 169, 643, 105, 58 | 170, 607, 520, 932, 727, 476, 693, 425, 174, 647, 59 | 73, 122, 335, 530, 442, 853, 695, 249, 445, 515, 60 | 909, 545, 703, 919, 874, 474, 882, 500, 594, 612, 61 | 641, 801, 220, 162, 819, 984, 589, 513, 495, 799, 62 | 161, 604, 958, 533, 221, 400, 386, 867, 600, 782, 63 | 382, 596, 414, 171, 516, 375, 682, 485, 911, 276, 64 | 98, 553, 163, 354, 666, 933, 424, 341, 533, 870, 65 | 227, 730, 475, 186, 263, 647, 537, 686, 600, 224, 66 | 469, 68, 770, 919, 190, 373, 294, 822, 808, 206, 67 | 184, 943, 795, 384, 383, 461, 404, 758, 839, 887, 68 | 715, 67, 618, 276, 204, 918, 873, 777, 604, 560, 69 | 951, 160, 578, 722, 79, 804, 96, 409, 713, 940, 70 | 652, 934, 970, 447, 318, 353, 859, 672, 112, 785, 71 | 645, 863, 803, 350, 139, 93, 354, 99, 820, 908, 72 | 609, 772, 154, 274, 580, 184, 79, 626, 630, 742, 73 | 653, 282, 762, 623, 680, 81, 927, 626, 789, 125, 74 | 411, 521, 938, 300, 821, 78, 343, 175, 128, 250, 75 | 170, 774, 972, 275, 999, 639, 495, 78, 352, 126, 76 | 857, 956, 358, 619, 580, 124, 737, 594, 701, 612, 77 | 669, 112, 134, 694, 363, 992, 809, 743, 168, 974, 78 | 944, 375, 748, 52, 600, 747, 642, 182, 862, 81, 79 | 344, 805, 988, 739, 511, 655, 814, 334, 249, 515, 80 | 897, 955, 664, 981, 649, 113, 974, 459, 893, 228, 81 | 433, 837, 553, 268, 926, 240, 102, 654, 459, 51, 82 | 686, 754, 806, 760, 493, 403, 415, 394, 687, 700, 83 | 946, 670, 656, 610, 738, 392, 760, 799, 887, 653, 84 | 978, 321, 576, 617, 626, 502, 894, 679, 243, 440, 85 | 680, 879, 194, 572, 640, 724, 926, 56, 204, 700, 86 | 707, 151, 457, 449, 797, 195, 791, 558, 945, 679, 87 | 297, 59, 87, 824, 713, 663, 412, 693, 342, 606, 88 | 134, 108, 571, 364, 631, 212, 174, 643, 304, 329, 89 | 343, 97, 430, 751, 497, 314, 983, 374, 822, 928, 90 | 140, 206, 73, 263, 980, 736, 876, 478, 430, 305, 91 | 170, 514, 364, 692, 829, 82, 855, 953, 676, 246, 92 | 369, 970, 294, 750, 807, 827, 150, 790, 288, 923, 93 | 804, 378, 215, 828, 592, 281, 565, 555, 710, 82, 94 | 896, 831, 547, 261, 524, 462, 293, 465, 502, 56, 95 | 661, 821, 976, 991, 658, 869, 905, 758, 745, 193, 96 | 768, 550, 608, 933, 378, 286, 215, 979, 792, 961, 97 | 61, 688, 793, 644, 986, 403, 106, 366, 905, 644, 98 | 372, 567, 466, 434, 645, 210, 389, 550, 919, 135, 99 | 780, 773, 635, 389, 707, 100, 626, 958, 165, 504, 100 | 920, 176, 193, 713, 857, 265, 203, 50, 668, 108, 101 | 645, 990, 626, 197, 510, 357, 358, 850, 858, 364, 102 | 936, 638 103 | }; 104 | } 105 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/bzip2/CBZip2InputStream.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.bzip2; 21 | 22 | import java.io.IOException; 23 | import java.io.InputStream; 24 | 25 | /** 26 | * An input stream that decompresses from the BZip2 format (without the file 27 | * header chars) to be read as any other stream. 28 | *
29 | * The decompression requires large amounts of memory. Thus you 30 | * should call the {@link #close() close()} method as soon as 31 | * possible, to force CBZip2InputStream to release the 32 | * allocated memory. See {@link CBZip2OutputStream 33 | * CBZip2OutputStream} for information about memory usage.
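 * A minimal usage sketch (stream names assumed for illustration):
 * <pre>
 * InputStream bzIn = new CBZip2InputStream(in);
 * try {
 *     // ... read decompressed bytes from bzIn ...
 * } finally {
 *     bzIn.close(); // releases the large internal decode buffers
 * }
 * </pre>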
34 | * 35 | * CBZip2InputStream reads bytes from the compressed 36 | * source stream via the single byte {@link java.io.InputStream#read() 37 | * read()} method exclusively. Thus you should consider using a 38 | * buffered source stream.
39 | * 40 | * Instances of this class are not threadsafe.
41 | */ 42 | public class CBZip2InputStream extends InputStream implements BZip2Constants { 43 | 44 | private static void reportCRCError() throws IOException { 45 | // The clean way would be to throw an exception. 46 | throw new IOException("crc error"); 47 | } 48 | 49 | private void makeMaps() { 50 | final boolean[] inUse = this.data.inUse; 51 | final byte[] seqToUnseq = this.data.seqToUnseq; 52 | 53 | int nInUseShadow = 0; 54 | 55 | for (int i = 0; i < 256; i++) { 56 | if (inUse[i]) 57 | seqToUnseq[nInUseShadow++] = (byte) i; 58 | } 59 | 60 | this.nInUse = nInUseShadow; 61 | } 62 | 63 | /** 64 | * Index of the last char in the block, so the block size == last + 1. 65 | */ 66 | private int last; 67 | 68 | /** 69 | * Index in zptr[] of original string after sorting. 70 | */ 71 | private int origPtr; 72 | 73 | /** 74 | * always: in the range 0 .. 9. 75 | * The current block size is 100000 * this number. 76 | */ 77 | private int blockSize100k; 78 | 79 | private boolean blockRandomised; 80 | 81 | private int bsBuff; 82 | private int bsLive; 83 | private final CRC crc = new CRC(); 84 | 85 | private int nInUse; 86 | 87 | private InputStream in; 88 | 89 | private int currentChar = -1; 90 | 91 | private static final int EOF = 0; 92 | private static final int START_BLOCK_STATE = 1; 93 | private static final int RAND_PART_A_STATE = 2; 94 | private static final int RAND_PART_B_STATE = 3; 95 | private static final int RAND_PART_C_STATE = 4; 96 | private static final int NO_RAND_PART_A_STATE = 5; 97 | private static final int NO_RAND_PART_B_STATE = 6; 98 | private static final int NO_RAND_PART_C_STATE = 7; 99 | 100 | private int currentState = START_BLOCK_STATE; 101 | 102 | private int storedBlockCRC, storedCombinedCRC; 103 | private int computedBlockCRC, computedCombinedCRC; 104 | 105 | // Variables used by setup* methods exclusively 106 | 107 | private int su_count; 108 | private int su_ch2; 109 | private int su_chPrev; 110 | private int su_i2; 111 | private int su_j2; 112 | private int su_rNToGo; 113 | private int su_rTPos; 114 | private int su_tPos; 115 | private char su_z; 116 | 117 | /** 118 | * All memory intensive stuff. 119 | * This field is initialized by initBlock(). 120 | */ 121 | private Data data; 122 | 123 | /** 124 | * Constructs a new CBZip2InputStream which decompresses bytes read from 125 | * the specified stream. 126 | *
127 | * Although BZip2 headers are marked with the magic 128 | * "Bz" this constructor expects the next byte in the 129 | * stream to be the first one after the magic. Thus callers have 130 | * to skip the first two bytes. Otherwise this constructor will 131 | * throw an exception.
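 * A hypothetical caller (file name assumed) would therefore do:
 * <pre>
 * InputStream fin = new BufferedInputStream(new FileInputStream("dump.xml.bz2"));
 * fin.read(); // skip 'B'
 * fin.read(); // skip 'Z'
 * InputStream bzIn = new CBZip2InputStream(fin);
 * </pre>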
132 | * 133 | * @throws java.io.IOException if the stream content is malformed or an I/O error occurs. 134 | * @throws NullPointerException if in == null 135 | */ 136 | public CBZip2InputStream(final InputStream in) throws IOException { 137 | super(); 138 | 139 | this.in = in; 140 | init(); 141 | } 142 | 143 | public int read() throws IOException { 144 | if (this.in != null) { 145 | return read0(); 146 | } else { 147 | throw new IOException("stream closed"); 148 | } 149 | } 150 | 151 | public int read(final byte[] dest, final int offs, final int len) 152 | throws IOException { 153 | if (offs < 0) { 154 | throw new IndexOutOfBoundsException("offs(" + offs + ") < 0."); 155 | } 156 | if (len < 0) { 157 | throw new IndexOutOfBoundsException("len(" + len + ") < 0."); 158 | } 159 | if (offs + len > dest.length) { 160 | throw new IndexOutOfBoundsException("offs(" + offs + ") + len(" 161 | + len + ") > dest.length(" 162 | + dest.length + ")."); 163 | } 164 | if (this.in == null) { 165 | throw new IOException("stream closed"); 166 | } 167 | 168 | final int hi = offs + len; 169 | int destOffs = offs; 170 | for (int b; (destOffs < hi) && ((b = read0()) >= 0); ) { 171 | dest[destOffs++] = (byte) b; 172 | } 173 | 174 | return (destOffs == offs) ? -1 : (destOffs - offs); 175 | } 176 | 177 | private int read0() throws IOException { 178 | final int retChar = this.currentChar; 179 | 180 | switch (this.currentState) { 181 | case EOF: 182 | return -1; 183 | 184 | case START_BLOCK_STATE: 185 | throw new IllegalStateException(); 186 | 187 | case RAND_PART_A_STATE: 188 | throw new IllegalStateException(); 189 | 190 | case RAND_PART_B_STATE: 191 | setupRandPartB(); 192 | break; 193 | 194 | case RAND_PART_C_STATE: 195 | setupRandPartC(); 196 | break; 197 | 198 | case NO_RAND_PART_A_STATE: 199 | throw new IllegalStateException(); 200 | 201 | case NO_RAND_PART_B_STATE: 202 | setupNoRandPartB(); 203 | break; 204 | 205 | case NO_RAND_PART_C_STATE: 206 | setupNoRandPartC(); 207 | break; 208 | 209 | default: 210 | throw new IllegalStateException(); 211 | } 212 | 213 | return retChar; 214 | } 215 | 216 | private void init() throws IOException { 217 | if (null == in) { 218 | throw new IOException("No InputStream"); 219 | } 220 | if (in.available() == 0) { 221 | throw new IOException("Empty InputStream"); 222 | } 223 | int magic2 = this.in.read(); 224 | if (magic2 != 'h') { 225 | throw new IOException("Stream is not BZip2 formatted: expected 'h'" 226 | + " as first byte but got '" + (char) magic2 227 | + "'"); 228 | } 229 | 230 | int blockSize = this.in.read(); 231 | if ((blockSize < '1') || (blockSize > '9')) { 232 | throw new IOException("Stream is not BZip2 formatted: illegal " 233 | + "blocksize " + (char) blockSize); 234 | } 235 | 236 | this.blockSize100k = blockSize - '0'; 237 | 238 | initBlock(); 239 | setupBlock(); 240 | } 241 | 242 | private void initBlock() throws IOException { 243 | char magic0 = bsGetUByte(); 244 | char magic1 = bsGetUByte(); 245 | char magic2 = bsGetUByte(); 246 | char magic3 = bsGetUByte(); 247 | char magic4 = bsGetUByte(); 248 | char magic5 = bsGetUByte(); 249 | 250 | if (magic0 == 0x17 && 251 | magic1 == 0x72 && 252 | magic2 == 0x45 && 253 | magic3 == 0x38 && 254 | magic4 == 0x50 && 255 | magic5 == 0x90) { 256 | complete(); // end of file 257 | } else if (magic0 != 0x31 || // '1' 258 | magic1 != 0x41 || // ')' 259 | magic2 != 0x59 || // 'Y' 260 | magic3 != 0x26 || // '&' 261 | magic4 != 0x53 || // 'S' 262 | magic5 != 0x59 // 'Y' 263 | ) { 264 | this.currentState = EOF; 265 | throw new 
IOException("bad block header"); 266 | } else { 267 | this.storedBlockCRC = bsGetInt(); 268 | this.blockRandomised = bsR(1) == 1; 269 | 270 | /** 271 | * Allocate data here instead in constructor, so we do not 272 | * allocate it if the input file is empty. 273 | */ 274 | if (this.data == null) { 275 | this.data = new Data(this.blockSize100k); 276 | } 277 | 278 | // currBlockNo++; 279 | getAndMoveToFrontDecode(); 280 | 281 | this.crc.initialiseCRC(); 282 | this.currentState = START_BLOCK_STATE; 283 | } 284 | } 285 | 286 | private void endBlock() throws IOException { 287 | this.computedBlockCRC = this.crc.getFinalCRC(); 288 | 289 | // A bad CRC is considered a fatal error. 290 | if (this.storedBlockCRC != this.computedBlockCRC) { 291 | // make next blocks readable without error 292 | // (repair feature, not yet documented, not tested) 293 | this.computedCombinedCRC 294 | = (this.storedCombinedCRC << 1) 295 | | (this.storedCombinedCRC >>> 31); 296 | this.computedCombinedCRC ^= this.storedBlockCRC; 297 | 298 | reportCRCError(); 299 | } 300 | 301 | this.computedCombinedCRC 302 | = (this.computedCombinedCRC << 1) 303 | | (this.computedCombinedCRC >>> 31); 304 | this.computedCombinedCRC ^= this.computedBlockCRC; 305 | } 306 | 307 | private void complete() throws IOException { 308 | this.storedCombinedCRC = bsGetInt(); 309 | this.currentState = EOF; 310 | this.data = null; 311 | 312 | if (this.storedCombinedCRC != this.computedCombinedCRC) { 313 | reportCRCError(); 314 | } 315 | } 316 | 317 | public void close() throws IOException { 318 | InputStream inShadow = this.in; 319 | if (inShadow != null) { 320 | try { 321 | if (inShadow != System.in) { 322 | inShadow.close(); 323 | } 324 | } finally { 325 | this.data = null; 326 | this.in = null; 327 | } 328 | } 329 | } 330 | 331 | private int bsR(final int n) throws IOException { 332 | int bsLiveShadow = this.bsLive; 333 | int bsBuffShadow = this.bsBuff; 334 | 335 | if (bsLiveShadow < n) { 336 | final InputStream inShadow = this.in; 337 | do { 338 | int thech = inShadow.read(); 339 | 340 | if (thech < 0) { 341 | throw new IOException("unexpected end of stream"); 342 | } 343 | 344 | bsBuffShadow = (bsBuffShadow << 8) | thech; 345 | bsLiveShadow += 8; 346 | } while (bsLiveShadow < n); 347 | 348 | this.bsBuff = bsBuffShadow; 349 | } 350 | 351 | this.bsLive = bsLiveShadow - n; 352 | return (bsBuffShadow >> (bsLiveShadow - n)) & ((1 << n) - 1); 353 | } 354 | 355 | private boolean bsGetBit() throws IOException { 356 | int bsLiveShadow = this.bsLive; 357 | int bsBuffShadow = this.bsBuff; 358 | 359 | if (bsLiveShadow < 1) { 360 | int thech = this.in.read(); 361 | 362 | if (thech < 0) { 363 | throw new IOException("unexpected end of stream"); 364 | } 365 | 366 | bsBuffShadow = (bsBuffShadow << 8) | thech; 367 | bsLiveShadow += 8; 368 | this.bsBuff = bsBuffShadow; 369 | } 370 | 371 | this.bsLive = bsLiveShadow - 1; 372 | return ((bsBuffShadow >> (bsLiveShadow - 1)) & 1) != 0; 373 | } 374 | 375 | private char bsGetUByte() throws IOException { 376 | return (char) bsR(8); 377 | } 378 | 379 | private int bsGetInt() throws IOException { 380 | return (((((bsR(8) << 8) | bsR(8)) << 8) | bsR(8)) << 8) | bsR(8); 381 | } 382 | 383 | /** 384 | * Called by createHuffmanDecodingTables() exclusively. 
385 | */ 386 | private static void hbCreateDecodeTables(final int[] limit, 387 | final int[] base, 388 | final int[] perm, 389 | final char[] length, 390 | final int minLen, 391 | final int maxLen, 392 | final int alphaSize) { 393 | for (int i = minLen, pp = 0; i <= maxLen; i++) { 394 | for (int j = 0; j < alphaSize; j++) { 395 | if (length[j] == i) { 396 | perm[pp++] = j; 397 | } 398 | } 399 | } 400 | 401 | for (int i = MAX_CODE_LEN; --i > 0; ) { 402 | base[i] = 0; 403 | limit[i] = 0; 404 | } 405 | 406 | for (int i = 0; i < alphaSize; i++) { 407 | base[length[i] + 1]++; 408 | } 409 | 410 | for (int i = 1, b = base[0]; i < MAX_CODE_LEN; i++) { 411 | b += base[i]; 412 | base[i] = b; 413 | } 414 | 415 | for (int i = minLen, vec = 0, b = base[i]; i <= maxLen; i++) { 416 | final int nb = base[i + 1]; 417 | vec += nb - b; 418 | b = nb; 419 | limit[i] = vec - 1; 420 | vec <<= 1; 421 | } 422 | 423 | for (int i = minLen + 1; i <= maxLen; i++) { 424 | base[i] = ((limit[i - 1] + 1) << 1) - base[i]; 425 | } 426 | } 427 | 428 | private void recvDecodingTables() throws IOException { 429 | final Data dataShadow = this.data; 430 | final boolean[] inUse = dataShadow.inUse; 431 | final byte[] pos = dataShadow.recvDecodingTables_pos; 432 | final byte[] selector = dataShadow.selector; 433 | final byte[] selectorMtf = dataShadow.selectorMtf; 434 | 435 | int inUse16 = 0; 436 | 437 | /* Receive the mapping table */ 438 | for (int i = 0; i < 16; i++) { 439 | if (bsGetBit()) { 440 | inUse16 |= 1 << i; 441 | } 442 | } 443 | 444 | for (int i = 256; --i >= 0; ) { 445 | inUse[i] = false; 446 | } 447 | 448 | for (int i = 0; i < 16; i++) { 449 | if ((inUse16 & (1 << i)) != 0) { 450 | final int i16 = i << 4; 451 | for (int j = 0; j < 16; j++) { 452 | if (bsGetBit()) { 453 | inUse[i16 + j] = true; 454 | } 455 | } 456 | } 457 | } 458 | 459 | makeMaps(); 460 | final int alphaSize = this.nInUse + 2; 461 | 462 | /* Now the selectors */ 463 | final int nGroups = bsR(3); 464 | final int nSelectors = bsR(15); 465 | 466 | for (int i = 0; i < nSelectors; i++) { 467 | int j = 0; 468 | while (bsGetBit()) { 469 | j++; 470 | } 471 | selectorMtf[i] = (byte) j; 472 | } 473 | 474 | /* Undo the MTF values for the selectors. */ 475 | for (int v = nGroups; --v >= 0; ) { 476 | pos[v] = (byte) v; 477 | } 478 | 479 | for (int i = 0; i < nSelectors; i++) { 480 | int v = selectorMtf[i] & 0xff; 481 | final byte tmp = pos[v]; 482 | while (v > 0) { 483 | // nearly all times v is zero, 4 in most other cases 484 | pos[v] = pos[v - 1]; 485 | v--; 486 | } 487 | pos[0] = tmp; 488 | selector[i] = tmp; 489 | } 490 | 491 | final char[][] len = dataShadow.temp_charArray2d; 492 | 493 | /* Now the coding tables */ 494 | for (int t = 0; t < nGroups; t++) { 495 | int curr = bsR(5); 496 | final char[] len_t = len[t]; 497 | for (int i = 0; i < alphaSize; i++) { 498 | while (bsGetBit()) { 499 | curr += bsGetBit() ? -1 : 1; 500 | } 501 | len_t[i] = (char) curr; 502 | } 503 | } 504 | 505 | // finally create the Huffman tables 506 | createHuffmanDecodingTables(alphaSize, nGroups); 507 | } 508 | 509 | /** 510 | * Called by recvDecodingTables() exclusively. 
511 | */ 512 | private void createHuffmanDecodingTables(final int alphaSize, 513 | final int nGroups) { 514 | final Data dataShadow = this.data; 515 | final char[][] len = dataShadow.temp_charArray2d; 516 | final int[] minLens = dataShadow.minLens; 517 | final int[][] limit = dataShadow.limit; 518 | final int[][] base = dataShadow.base; 519 | final int[][] perm = dataShadow.perm; 520 | 521 | for (int t = 0; t < nGroups; t++) { 522 | int minLen = 32; 523 | int maxLen = 0; 524 | final char[] len_t = len[t]; 525 | for (int i = alphaSize; --i >= 0; ) { 526 | final char lent = len_t[i]; 527 | if (lent > maxLen) { 528 | maxLen = lent; 529 | } 530 | if (lent < minLen) { 531 | minLen = lent; 532 | } 533 | } 534 | hbCreateDecodeTables(limit[t], base[t], perm[t], len[t], minLen, 535 | maxLen, alphaSize); 536 | minLens[t] = minLen; 537 | } 538 | } 539 | 540 | private void getAndMoveToFrontDecode() throws IOException { 541 | this.origPtr = bsR(24); 542 | recvDecodingTables(); 543 | 544 | final InputStream inShadow = this.in; 545 | final Data dataShadow = this.data; 546 | final byte[] ll8 = dataShadow.ll8; 547 | final int[] unzftab = dataShadow.unzftab; 548 | final byte[] selector = dataShadow.selector; 549 | final byte[] seqToUnseq = dataShadow.seqToUnseq; 550 | final char[] yy = dataShadow.getAndMoveToFrontDecode_yy; 551 | final int[] minLens = dataShadow.minLens; 552 | final int[][] limit = dataShadow.limit; 553 | final int[][] base = dataShadow.base; 554 | final int[][] perm = dataShadow.perm; 555 | final int limitLast = this.blockSize100k * 100000; 556 | 557 | /* 558 | Setting up the unzftab entries here is not strictly 559 | necessary, but it does save having to do it later 560 | in a separate pass, and so saves a block's worth of 561 | cache misses. 562 | */ 563 | for (int i = 256; --i >= 0; ) { 564 | yy[i] = (char) i; 565 | unzftab[i] = 0; 566 | } 567 | 568 | int groupNo = 0; 569 | int groupPos = G_SIZE - 1; 570 | final int eob = this.nInUse + 1; 571 | int nextSym = getAndMoveToFrontDecode0(0); 572 | int bsBuffShadow = this.bsBuff; 573 | int bsLiveShadow = this.bsLive; 574 | int lastShadow = -1; 575 | int zt = selector[groupNo] & 0xff; 576 | int[] base_zt = base[zt]; 577 | int[] limit_zt = limit[zt]; 578 | int[] perm_zt = perm[zt]; 579 | int minLens_zt = minLens[zt]; 580 | 581 | while (nextSym != eob) { 582 | if ((nextSym == RUNA) || (nextSym == RUNB)) { 583 | int s = -1; 584 | 585 | for (int n = 1; true; n <<= 1) { 586 | if (nextSym == RUNA) { 587 | s += n; 588 | } else if (nextSym == RUNB) { 589 | s += n << 1; 590 | } else { 591 | break; 592 | } 593 | 594 | if (groupPos == 0) { 595 | groupPos = G_SIZE - 1; 596 | zt = selector[++groupNo] & 0xff; 597 | base_zt = base[zt]; 598 | limit_zt = limit[zt]; 599 | perm_zt = perm[zt]; 600 | minLens_zt = minLens[zt]; 601 | } else { 602 | groupPos--; 603 | } 604 | 605 | int zn = minLens_zt; 606 | 607 | // Inlined: 608 | // int zvec = bsR(zn); 609 | while (bsLiveShadow < zn) { 610 | final int thech = inShadow.read(); 611 | if (thech >= 0) { 612 | bsBuffShadow = (bsBuffShadow << 8) | thech; 613 | bsLiveShadow += 8; 614 | continue; 615 | } else { 616 | throw new IOException("unexpected end of stream"); 617 | } 618 | } 619 | int zvec = (bsBuffShadow >> (bsLiveShadow - zn)) & ((1 << zn) - 1); 620 | bsLiveShadow -= zn; 621 | 622 | while (zvec > limit_zt[zn]) { 623 | zn++; 624 | while (bsLiveShadow < 1) { 625 | final int thech = inShadow.read(); 626 | if (thech >= 0) { 627 | bsBuffShadow = (bsBuffShadow << 8) | thech; 628 | bsLiveShadow += 8; 629 | continue; 630 
| } else { 631 | throw new IOException("unexpected end of stream"); 632 | } 633 | } 634 | bsLiveShadow--; 635 | zvec = (zvec << 1) | ((bsBuffShadow >> bsLiveShadow) & 1); 636 | } 637 | nextSym = perm_zt[zvec - base_zt[zn]]; 638 | } 639 | 640 | final byte ch = seqToUnseq[yy[0]]; 641 | unzftab[ch & 0xff] += s + 1; 642 | 643 | while (s-- >= 0) { 644 | ll8[++lastShadow] = ch; 645 | } 646 | 647 | if (lastShadow >= limitLast) { 648 | throw new IOException("block overrun"); 649 | } 650 | } else { 651 | if (++lastShadow >= limitLast) { 652 | throw new IOException("block overrun"); 653 | } 654 | 655 | final char tmp = yy[nextSym - 1]; 656 | unzftab[seqToUnseq[tmp] & 0xff]++; 657 | ll8[lastShadow] = seqToUnseq[tmp]; 658 | 659 | /* 660 | This loop is hammered during decompression, 661 | hence avoid native method call overhead of 662 | System.arraycopy for very small ranges to copy. 663 | */ 664 | if (nextSym <= 16) { 665 | for (int j = nextSym - 1; j > 0; ) { 666 | yy[j] = yy[--j]; 667 | } 668 | } else { 669 | System.arraycopy(yy, 0, yy, 1, nextSym - 1); 670 | } 671 | 672 | yy[0] = tmp; 673 | 674 | if (groupPos == 0) { 675 | groupPos = G_SIZE - 1; 676 | zt = selector[++groupNo] & 0xff; 677 | base_zt = base[zt]; 678 | limit_zt = limit[zt]; 679 | perm_zt = perm[zt]; 680 | minLens_zt = minLens[zt]; 681 | } else { 682 | groupPos--; 683 | } 684 | 685 | int zn = minLens_zt; 686 | 687 | // Inlined: 688 | // int zvec = bsR(zn); 689 | while (bsLiveShadow < zn) { 690 | final int thech = inShadow.read(); 691 | if (thech >= 0) { 692 | bsBuffShadow = (bsBuffShadow << 8) | thech; 693 | bsLiveShadow += 8; 694 | continue; 695 | } else { 696 | throw new IOException("unexpected end of stream"); 697 | } 698 | } 699 | int zvec = (bsBuffShadow >> (bsLiveShadow - zn)) & ((1 << zn) - 1); 700 | bsLiveShadow -= zn; 701 | 702 | while (zvec > limit_zt[zn]) { 703 | zn++; 704 | while (bsLiveShadow < 1) { 705 | final int thech = inShadow.read(); 706 | if (thech >= 0) { 707 | bsBuffShadow = (bsBuffShadow << 8) | thech; 708 | bsLiveShadow += 8; 709 | continue; 710 | } else { 711 | throw new IOException("unexpected end of stream"); 712 | } 713 | } 714 | bsLiveShadow--; 715 | zvec = (zvec << 1) | ((bsBuffShadow >> bsLiveShadow) & 1); 716 | } 717 | nextSym = perm_zt[zvec - base_zt[zn]]; 718 | } 719 | } 720 | 721 | this.last = lastShadow; 722 | this.bsLive = bsLiveShadow; 723 | this.bsBuff = bsBuffShadow; 724 | } 725 | 726 | private int getAndMoveToFrontDecode0(final int groupNo) 727 | throws IOException { 728 | final InputStream inShadow = this.in; 729 | final Data dataShadow = this.data; 730 | final int zt = dataShadow.selector[groupNo] & 0xff; 731 | final int[] limit_zt = dataShadow.limit[zt]; 732 | int zn = dataShadow.minLens[zt]; 733 | int zvec = bsR(zn); 734 | int bsLiveShadow = this.bsLive; 735 | int bsBuffShadow = this.bsBuff; 736 | 737 | while (zvec > limit_zt[zn]) { 738 | zn++; 739 | while (bsLiveShadow < 1) { 740 | final int thech = inShadow.read(); 741 | 742 | if (thech >= 0) { 743 | bsBuffShadow = (bsBuffShadow << 8) | thech; 744 | bsLiveShadow += 8; 745 | continue; 746 | } else { 747 | throw new IOException("unexpected end of stream"); 748 | } 749 | } 750 | bsLiveShadow--; 751 | zvec = (zvec << 1) | ((bsBuffShadow >> bsLiveShadow) & 1); 752 | } 753 | 754 | this.bsLive = bsLiveShadow; 755 | this.bsBuff = bsBuffShadow; 756 | 757 | return dataShadow.perm[zt][zvec - dataShadow.base[zt][zn]]; 758 | } 759 | 760 | private void setupBlock() throws IOException { 761 | if (this.data == null) { 762 | return; 763 | } 764 | 765 | 
final int[] cftab = this.data.cftab; 766 | final int[] tt = this.data.initTT(this.last + 1); 767 | final byte[] ll8 = this.data.ll8; 768 | cftab[0] = 0; 769 | System.arraycopy(this.data.unzftab, 0, cftab, 1, 256); 770 | 771 | for (int i = 1, c = cftab[0]; i <= 256; i++) { 772 | c += cftab[i]; 773 | cftab[i] = c; 774 | } 775 | 776 | for (int i = 0, lastShadow = this.last; i <= lastShadow; i++) { 777 | tt[cftab[ll8[i] & 0xff]++] = i; 778 | } 779 | 780 | if ((this.origPtr < 0) || (this.origPtr >= tt.length)) { 781 | throw new IOException("stream corrupted"); 782 | } 783 | 784 | this.su_tPos = tt[this.origPtr]; 785 | this.su_count = 0; 786 | this.su_i2 = 0; 787 | this.su_ch2 = 256; /* not a char and not EOF */ 788 | 789 | if (this.blockRandomised) { 790 | this.su_rNToGo = 0; 791 | this.su_rTPos = 0; 792 | setupRandPartA(); 793 | } else { 794 | setupNoRandPartA(); 795 | } 796 | } 797 | 798 | private void setupRandPartA() throws IOException { 799 | if (this.su_i2 <= this.last) { 800 | this.su_chPrev = this.su_ch2; 801 | int su_ch2Shadow = this.data.ll8[this.su_tPos] & 0xff; 802 | this.su_tPos = this.data.tt[this.su_tPos]; 803 | if (this.su_rNToGo == 0) { 804 | this.su_rNToGo = BZip2Constants.rNums[this.su_rTPos] - 1; 805 | if (++this.su_rTPos == 512) { 806 | this.su_rTPos = 0; 807 | } 808 | } else { 809 | this.su_rNToGo--; 810 | } 811 | this.su_ch2 = su_ch2Shadow ^= (this.su_rNToGo == 1) ? 1 : 0; 812 | this.su_i2++; 813 | this.currentChar = su_ch2Shadow; 814 | this.currentState = RAND_PART_B_STATE; 815 | this.crc.updateCRC(su_ch2Shadow); 816 | } else { 817 | endBlock(); 818 | initBlock(); 819 | setupBlock(); 820 | } 821 | } 822 | 823 | private void setupNoRandPartA() throws IOException { 824 | if (this.su_i2 <= this.last) { 825 | this.su_chPrev = this.su_ch2; 826 | int su_ch2Shadow = this.data.ll8[this.su_tPos] & 0xff; 827 | this.su_ch2 = su_ch2Shadow; 828 | this.su_tPos = this.data.tt[this.su_tPos]; 829 | this.su_i2++; 830 | this.currentChar = su_ch2Shadow; 831 | this.currentState = NO_RAND_PART_B_STATE; 832 | this.crc.updateCRC(su_ch2Shadow); 833 | } else { 834 | this.currentState = NO_RAND_PART_A_STATE; 835 | endBlock(); 836 | initBlock(); 837 | setupBlock(); 838 | } 839 | } 840 | 841 | private void setupRandPartB() throws IOException { 842 | if (this.su_ch2 != this.su_chPrev) { 843 | this.currentState = RAND_PART_A_STATE; 844 | this.su_count = 1; 845 | setupRandPartA(); 846 | } else if (++this.su_count >= 4) { 847 | this.su_z = (char) (this.data.ll8[this.su_tPos] & 0xff); 848 | this.su_tPos = this.data.tt[this.su_tPos]; 849 | if (this.su_rNToGo == 0) { 850 | this.su_rNToGo = BZip2Constants.rNums[this.su_rTPos] - 1; 851 | if (++this.su_rTPos == 512) { 852 | this.su_rTPos = 0; 853 | } 854 | } else { 855 | this.su_rNToGo--; 856 | } 857 | this.su_j2 = 0; 858 | this.currentState = RAND_PART_C_STATE; 859 | if (this.su_rNToGo == 1) { 860 | this.su_z ^= 1; 861 | } 862 | setupRandPartC(); 863 | } else { 864 | this.currentState = RAND_PART_A_STATE; 865 | setupRandPartA(); 866 | } 867 | } 868 | 869 | private void setupRandPartC() throws IOException { 870 | if (this.su_j2 < this.su_z) { 871 | this.currentChar = this.su_ch2; 872 | this.crc.updateCRC(this.su_ch2); 873 | this.su_j2++; 874 | } else { 875 | this.currentState = RAND_PART_A_STATE; 876 | this.su_i2++; 877 | this.su_count = 0; 878 | setupRandPartA(); 879 | } 880 | } 881 | 882 | private void setupNoRandPartB() throws IOException { 883 | if (this.su_ch2 != this.su_chPrev) { 884 | this.su_count = 1; 885 | setupNoRandPartA(); 886 | } else if 
(++this.su_count >= 4) { 887 | this.su_z = (char) (this.data.ll8[this.su_tPos] & 0xff); 888 | this.su_tPos = this.data.tt[this.su_tPos]; 889 | this.su_j2 = 0; 890 | setupNoRandPartC(); 891 | } else { 892 | setupNoRandPartA(); 893 | } 894 | } 895 | 896 | private void setupNoRandPartC() throws IOException { 897 | if (this.su_j2 < this.su_z) { 898 | int su_ch2Shadow = this.su_ch2; 899 | this.currentChar = su_ch2Shadow; 900 | this.crc.updateCRC(su_ch2Shadow); 901 | this.su_j2++; 902 | this.currentState = NO_RAND_PART_C_STATE; 903 | } else { 904 | this.su_i2++; 905 | this.su_count = 0; 906 | setupNoRandPartA(); 907 | } 908 | } 909 | 910 | private static final class Data extends Object { 911 | 912 | // (with blockSize 900k) 913 | final boolean[] inUse = new boolean[256]; // 256 byte 914 | 915 | final byte[] seqToUnseq = new byte[256]; // 256 byte 916 | final byte[] selector = new byte[MAX_SELECTORS]; // 18002 byte 917 | final byte[] selectorMtf = new byte[MAX_SELECTORS]; // 18002 byte 918 | 919 | /** 920 | * Freq table collected to save a pass over the data during 921 | * decompression. 922 | */ 923 | final int[] unzftab = new int[256]; // 1024 byte 924 | 925 | final int[][] limit = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 byte 926 | final int[][] base = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 byte 927 | final int[][] perm = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 byte 928 | final int[] minLens = new int[N_GROUPS]; // 24 byte 929 | 930 | final int[] cftab = new int[257]; // 1028 byte 931 | final char[] getAndMoveToFrontDecode_yy = new char[256]; // 512 byte 932 | final char[][] temp_charArray2d = new char[N_GROUPS][MAX_ALPHA_SIZE]; // 3096 byte 933 | final byte[] recvDecodingTables_pos = new byte[N_GROUPS]; // 6 byte 934 | //--------------- 935 | // 60798 byte 936 | 937 | int[] tt; // 3600000 byte 938 | byte[] ll8; // 900000 byte 939 | //--------------- 940 | // 4560782 byte 941 | //=============== 942 | 943 | Data(int blockSize100k) { 944 | super(); 945 | 946 | this.ll8 = new byte[blockSize100k * BZip2Constants.baseBlockSize]; 947 | } 948 | 949 | /** 950 | * Initializes the {@link #tt} array. 951 | *

952 | * This method is called when the required length of the array
953 | * is known. I don't initialize it at construction time to
954 | * avoid unnecessary memory allocation when compressing small
955 | * files.
956 | */
957 | final int[] initTT(int length) {
958 | int[] ttShadow = this.tt;
959 |
960 | // tt.length should always be >= length, but theoretically
961 | // it can happen, if the compressor mixed small and large
962 | // blocks. Normally only the last block will be smaller
963 | // than others.
964 | if ((ttShadow == null) || (ttShadow.length < length)) {
965 | this.tt = ttShadow = new int[length];
966 | }
967 |
968 | return ttShadow;
969 | }
970 |
971 | }
972 | }
973 |
974 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/river/wikipedia/bzip2/CBZip2OutputStream.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Elasticsearch under one or more contributor
3 | * license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright
5 | * ownership. Elasticsearch licenses this file to you under
6 | * the Apache License, Version 2.0 (the "License"); you may
7 | * not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing,
13 | * software distributed under the License is distributed on an
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | * KIND, either express or implied. See the License for the
16 | * specific language governing permissions and limitations
17 | * under the License.
18 | */
19 |
20 | package org.elasticsearch.river.wikipedia.bzip2;
21 |
22 | import java.io.IOException;
23 | import java.io.OutputStream;
24 |
25 | /**
26 | * An output stream that compresses data into the BZip2 format (without the
27 | * file header chars) and writes it to another stream.
28 | *

29 | *

30 | * The compression requires large amounts of memory. Thus you should call the 31 | * {@link #close() close()} method as soon as possible, to force 32 | * CBZip2OutputStream to release the allocated memory. 33 | *

34 | *

35 | *

You can shrink the amount of allocated memory and maybe raise
36 | * the compression speed by choosing a lower blocksize, which in turn
37 | * may cause a lower compression ratio. You can avoid unnecessary
38 | * memory allocation by not using a blocksize bigger
39 | * than the size of the input.

40 | *

41 | *

You can compute the memory usage for compressing by the 42 | * following formula:

43 | *

44 | *

  45 |  * <code>400k + (9 * blocksize)</code>.
  46 |  * 
47 | *

48 | *

To get the memory required for decompression by {@link 49 | * CBZip2InputStream CBZip2InputStream} use

50 | *

51 | *

  52 |  * <code>65k + (5 * blocksize)</code>.
  53 |  * 
54 | *

 * Memory usage by blocksize:
 *
 *     Blocksize    Compression memory usage    Decompression memory usage
 *     100k         1300k                       565k
 *     200k         2200k                       1065k
 *     300k         3100k                       1565k
 *     400k         4000k                       2065k
 *     500k         4900k                       2565k
 *     600k         5800k                       3065k
 *     700k         6700k                       3565k
 *     800k         7600k                       4065k
 *     900k         8500k                       4565k
112 | *
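As a sanity check, the two formulas above reproduce every row of this table. The following illustrative helper (the class and method names are mine, not part of this package) prints the same numbers:

```java
// Illustrative only: checks the javadoc formulas against the table above.
// All values are in "k", and blocksize = blockSize100k * 100 (100k..900k).
public class Bzip2MemoryTable {
    static int compressionK(int blockSize100k) {
        return 400 + 9 * (100 * blockSize100k);  // 400k + (9 * blocksize)
    }

    static int decompressionK(int blockSize100k) {
        return 65 + 5 * (100 * blockSize100k);   // 65k + (5 * blocksize)
    }

    public static void main(String[] args) {
        for (int b = 1; b <= 9; b++) {
            // e.g. b = 1: "100k -> 1300k / 565k", matching the first table row
            System.out.println((100 * b) + "k -> " + compressionK(b)
                    + "k / " + decompressionK(b) + "k");
        }
    }
}
```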

113 | *

114 | * For decompression, CBZip2InputStream allocates less memory if the
115 | * bzipped input is smaller than one block.
116 | *

117 | *

118 | *

119 | * Instances of this class are not thread-safe.
120 | *

121 | *

122 | *

123 | * TODO: Update to BZip2 1.0.1 124 | *

125 | */ 126 | public class CBZip2OutputStream extends OutputStream 127 | implements BZip2Constants { 128 | 129 | /** 130 | * The minimum supported blocksize == 1. 131 | */ 132 | public static final int MIN_BLOCKSIZE = 1; 133 | 134 | /** 135 | * The maximum supported blocksize == 9. 136 | */ 137 | public static final int MAX_BLOCKSIZE = 9; 138 | 139 | /** 140 | * This constant is accessible by subclasses for historical 141 | * purposes. If you don't know what it means then you don't need 142 | * it. 143 | */ 144 | protected static final int SETMASK = (1 << 21); 145 | 146 | /** 147 | * This constant is accessible by subclasses for historical 148 | * purposes. If you don't know what it means then you don't need 149 | * it. 150 | */ 151 | protected static final int CLEARMASK = (~SETMASK); 152 | 153 | /** 154 | * This constant is accessible by subclasses for historical 155 | * purposes. If you don't know what it means then you don't need 156 | * it. 157 | */ 158 | protected static final int GREATER_ICOST = 15; 159 | 160 | /** 161 | * This constant is accessible by subclasses for historical 162 | * purposes. If you don't know what it means then you don't need 163 | * it. 164 | */ 165 | protected static final int LESSER_ICOST = 0; 166 | 167 | /** 168 | * This constant is accessible by subclasses for historical 169 | * purposes. If you don't know what it means then you don't need 170 | * it. 171 | */ 172 | protected static final int SMALL_THRESH = 20; 173 | 174 | /** 175 | * This constant is accessible by subclasses for historical 176 | * purposes. If you don't know what it means then you don't need 177 | * it. 178 | */ 179 | protected static final int DEPTH_THRESH = 10; 180 | 181 | /** 182 | * This constant is accessible by subclasses for historical 183 | * purposes. If you don't know what it means then you don't need 184 | * it. 185 | */ 186 | protected static final int WORK_FACTOR = 30; 187 | 188 | /** 189 | * This constant is accessible by subclasses for historical 190 | * purposes. If you don't know what it means then you don't need 191 | * it. 192 | *

If you are ever unlucky/improbable enough to get a stack 193 | * overflow whilst sorting, increase the following constant and 194 | * try again. In practice I have never seen the stack go above 27 195 | * elems, so the following limit seems very generous.

196 | */ 197 | protected static final int QSORT_STACK_SIZE = 1000; 198 | 199 | /** 200 | * Knuth's increments seem to work better than Incerpi-Sedgewick here. 201 | * Possibly because the number of elems to sort is usually small, typically 202 | * <= 20. 203 | */ 204 | private static final int[] INCS = {1, 4, 13, 40, 121, 364, 1093, 3280, 205 | 9841, 29524, 88573, 265720, 797161, 206 | 2391484}; 207 | 208 | /** 209 | * This method is accessible by subclasses for historical 210 | * purposes. If you don't know what it does then you don't need 211 | * it. 212 | */ 213 | protected static void hbMakeCodeLengths(char[] len, int[] freq, 214 | int alphaSize, int maxLen) { 215 | /* 216 | * Nodes and heap entries run from 1. Entry 0 for both the heap and 217 | * nodes is a sentinel. 218 | */ 219 | final int[] heap = new int[MAX_ALPHA_SIZE * 2]; 220 | final int[] weight = new int[MAX_ALPHA_SIZE * 2]; 221 | final int[] parent = new int[MAX_ALPHA_SIZE * 2]; 222 | 223 | for (int i = alphaSize; --i >= 0; ) { 224 | weight[i + 1] = (freq[i] == 0 ? 1 : freq[i]) << 8; 225 | } 226 | 227 | for (boolean tooLong = true; tooLong; ) { 228 | tooLong = false; 229 | 230 | int nNodes = alphaSize; 231 | int nHeap = 0; 232 | heap[0] = 0; 233 | weight[0] = 0; 234 | parent[0] = -2; 235 | 236 | for (int i = 1; i <= alphaSize; i++) { 237 | parent[i] = -1; 238 | nHeap++; 239 | heap[nHeap] = i; 240 | 241 | int zz = nHeap; 242 | int tmp = heap[zz]; 243 | while (weight[tmp] < weight[heap[zz >> 1]]) { 244 | heap[zz] = heap[zz >> 1]; 245 | zz >>= 1; 246 | } 247 | heap[zz] = tmp; 248 | } 249 | 250 | // assert (nHeap < (MAX_ALPHA_SIZE + 2)) : nHeap; 251 | 252 | while (nHeap > 1) { 253 | int n1 = heap[1]; 254 | heap[1] = heap[nHeap]; 255 | nHeap--; 256 | 257 | int yy = 0; 258 | int zz = 1; 259 | int tmp = heap[1]; 260 | 261 | while (true) { 262 | yy = zz << 1; 263 | 264 | if (yy > nHeap) { 265 | break; 266 | } 267 | 268 | if ((yy < nHeap) 269 | && (weight[heap[yy + 1]] < weight[heap[yy]])) { 270 | yy++; 271 | } 272 | 273 | if (weight[tmp] < weight[heap[yy]]) { 274 | break; 275 | } 276 | 277 | heap[zz] = heap[yy]; 278 | zz = yy; 279 | } 280 | 281 | heap[zz] = tmp; 282 | 283 | int n2 = heap[1]; 284 | heap[1] = heap[nHeap]; 285 | nHeap--; 286 | 287 | yy = 0; 288 | zz = 1; 289 | tmp = heap[1]; 290 | 291 | while (true) { 292 | yy = zz << 1; 293 | 294 | if (yy > nHeap) { 295 | break; 296 | } 297 | 298 | if ((yy < nHeap) 299 | && (weight[heap[yy + 1]] < weight[heap[yy]])) { 300 | yy++; 301 | } 302 | 303 | if (weight[tmp] < weight[heap[yy]]) { 304 | break; 305 | } 306 | 307 | heap[zz] = heap[yy]; 308 | zz = yy; 309 | } 310 | 311 | heap[zz] = tmp; 312 | nNodes++; 313 | parent[n1] = parent[n2] = nNodes; 314 | 315 | final int weight_n1 = weight[n1]; 316 | final int weight_n2 = weight[n2]; 317 | weight[nNodes] = (((weight_n1 & 0xffffff00) 318 | + (weight_n2 & 0xffffff00)) 319 | | 320 | (1 + (((weight_n1 & 0x000000ff) 321 | > (weight_n2 & 0x000000ff)) 322 | ? 
(weight_n1 & 0x000000ff) 323 | : (weight_n2 & 0x000000ff)) 324 | )); 325 | 326 | parent[nNodes] = -1; 327 | nHeap++; 328 | heap[nHeap] = nNodes; 329 | 330 | tmp = 0; 331 | zz = nHeap; 332 | tmp = heap[zz]; 333 | final int weight_tmp = weight[tmp]; 334 | while (weight_tmp < weight[heap[zz >> 1]]) { 335 | heap[zz] = heap[zz >> 1]; 336 | zz >>= 1; 337 | } 338 | heap[zz] = tmp; 339 | 340 | } 341 | 342 | // assert (nNodes < (MAX_ALPHA_SIZE * 2)) : nNodes; 343 | 344 | for (int i = 1; i <= alphaSize; i++) { 345 | int j = 0; 346 | int k = i; 347 | 348 | for (int parent_k; (parent_k = parent[k]) >= 0; ) { 349 | k = parent_k; 350 | j++; 351 | } 352 | 353 | len[i - 1] = (char) j; 354 | if (j > maxLen) { 355 | tooLong = true; 356 | } 357 | } 358 | 359 | if (tooLong) { 360 | for (int i = 1; i < alphaSize; i++) { 361 | int j = weight[i] >> 8; 362 | j = 1 + (j >> 1); 363 | weight[i] = j << 8; 364 | } 365 | } 366 | } 367 | } 368 | 369 | private static void hbMakeCodeLengths(final byte[] len, final int[] freq, 370 | final Data dat, final int alphaSize, 371 | final int maxLen) { 372 | /* 373 | * Nodes and heap entries run from 1. Entry 0 for both the heap and 374 | * nodes is a sentinel. 375 | */ 376 | final int[] heap = dat.heap; 377 | final int[] weight = dat.weight; 378 | final int[] parent = dat.parent; 379 | 380 | for (int i = alphaSize; --i >= 0; ) { 381 | weight[i + 1] = (freq[i] == 0 ? 1 : freq[i]) << 8; 382 | } 383 | 384 | for (boolean tooLong = true; tooLong; ) { 385 | tooLong = false; 386 | 387 | int nNodes = alphaSize; 388 | int nHeap = 0; 389 | heap[0] = 0; 390 | weight[0] = 0; 391 | parent[0] = -2; 392 | 393 | for (int i = 1; i <= alphaSize; i++) { 394 | parent[i] = -1; 395 | nHeap++; 396 | heap[nHeap] = i; 397 | 398 | int zz = nHeap; 399 | int tmp = heap[zz]; 400 | while (weight[tmp] < weight[heap[zz >> 1]]) { 401 | heap[zz] = heap[zz >> 1]; 402 | zz >>= 1; 403 | } 404 | heap[zz] = tmp; 405 | } 406 | 407 | while (nHeap > 1) { 408 | int n1 = heap[1]; 409 | heap[1] = heap[nHeap]; 410 | nHeap--; 411 | 412 | int yy = 0; 413 | int zz = 1; 414 | int tmp = heap[1]; 415 | 416 | while (true) { 417 | yy = zz << 1; 418 | 419 | if (yy > nHeap) { 420 | break; 421 | } 422 | 423 | if ((yy < nHeap) 424 | && (weight[heap[yy + 1]] < weight[heap[yy]])) { 425 | yy++; 426 | } 427 | 428 | if (weight[tmp] < weight[heap[yy]]) { 429 | break; 430 | } 431 | 432 | heap[zz] = heap[yy]; 433 | zz = yy; 434 | } 435 | 436 | heap[zz] = tmp; 437 | 438 | int n2 = heap[1]; 439 | heap[1] = heap[nHeap]; 440 | nHeap--; 441 | 442 | yy = 0; 443 | zz = 1; 444 | tmp = heap[1]; 445 | 446 | while (true) { 447 | yy = zz << 1; 448 | 449 | if (yy > nHeap) { 450 | break; 451 | } 452 | 453 | if ((yy < nHeap) 454 | && (weight[heap[yy + 1]] < weight[heap[yy]])) { 455 | yy++; 456 | } 457 | 458 | if (weight[tmp] < weight[heap[yy]]) { 459 | break; 460 | } 461 | 462 | heap[zz] = heap[yy]; 463 | zz = yy; 464 | } 465 | 466 | heap[zz] = tmp; 467 | nNodes++; 468 | parent[n1] = parent[n2] = nNodes; 469 | 470 | final int weight_n1 = weight[n1]; 471 | final int weight_n2 = weight[n2]; 472 | weight[nNodes] = ((weight_n1 & 0xffffff00) 473 | + (weight_n2 & 0xffffff00)) 474 | | (1 + (((weight_n1 & 0x000000ff) 475 | > (weight_n2 & 0x000000ff)) 476 | ? 
(weight_n1 & 0x000000ff) 477 | : (weight_n2 & 0x000000ff))); 478 | 479 | parent[nNodes] = -1; 480 | nHeap++; 481 | heap[nHeap] = nNodes; 482 | 483 | tmp = 0; 484 | zz = nHeap; 485 | tmp = heap[zz]; 486 | final int weight_tmp = weight[tmp]; 487 | while (weight_tmp < weight[heap[zz >> 1]]) { 488 | heap[zz] = heap[zz >> 1]; 489 | zz >>= 1; 490 | } 491 | heap[zz] = tmp; 492 | 493 | } 494 | 495 | for (int i = 1; i <= alphaSize; i++) { 496 | int j = 0; 497 | int k = i; 498 | 499 | for (int parent_k; (parent_k = parent[k]) >= 0; ) { 500 | k = parent_k; 501 | j++; 502 | } 503 | 504 | len[i - 1] = (byte) j; 505 | if (j > maxLen) { 506 | tooLong = true; 507 | } 508 | } 509 | 510 | if (tooLong) { 511 | for (int i = 1; i < alphaSize; i++) { 512 | int j = weight[i] >> 8; 513 | j = 1 + (j >> 1); 514 | weight[i] = j << 8; 515 | } 516 | } 517 | } 518 | } 519 | 520 | /** 521 | * Index of the last char in the block, so the block size == last + 1. 522 | */ 523 | private int last; 524 | 525 | /** 526 | * Index in fmap[] of original string after sorting. 527 | */ 528 | private int origPtr; 529 | 530 | /** 531 | * Always: in the range 0 .. 9. The current block size is 100000 * this 532 | * number. 533 | */ 534 | private final int blockSize100k; 535 | 536 | private boolean blockRandomised; 537 | 538 | private int bsBuff; 539 | private int bsLive; 540 | private final CRC crc = new CRC(); 541 | 542 | private int nInUse; 543 | 544 | private int nMTF; 545 | 546 | /* 547 | * Used when sorting. If too many long comparisons happen, we stop sorting, 548 | * randomise the block slightly, and try again. 549 | */ 550 | private int workDone; 551 | private int workLimit; 552 | private boolean firstAttempt; 553 | 554 | private int currentChar = -1; 555 | private int runLength = 0; 556 | 557 | private int blockCRC; 558 | private int combinedCRC; 559 | private int allowableBlockSize; 560 | 561 | /** 562 | * All memory intensive stuff. 563 | */ 564 | private Data data; 565 | 566 | private OutputStream out; 567 | 568 | /** 569 | * Chooses a blocksize based on the given length of the data to compress. 570 | * 571 | * @param inputLength The length of the data which will be compressed by 572 | * CBZip2OutputStream. 573 | * @return The blocksize, between {@link #MIN_BLOCKSIZE} and 574 | * {@link #MAX_BLOCKSIZE} both inclusive. For a negative 575 | * inputLength this method returns MAX_BLOCKSIZE 576 | * always. 577 | */ 578 | public static int chooseBlockSize(long inputLength) { 579 | return (inputLength > 0) ? (int) Math 580 | .min((inputLength / 132000) + 1, 9) : MAX_BLOCKSIZE; 581 | } 582 | 583 | /** 584 | * Constructs a new CBZip2OutputStream with a blocksize of 900k. 585 | *

586 | *

587 | * Attention: The caller is responsible for writing the two BZip2 magic
588 | * bytes "BZ" to the specified stream prior to calling this
589 | * constructor.
590 | *
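Mirroring the reader example earlier, a minimal compression sketch under that contract (the class name and output path are illustrative; chooseBlockSize is the static helper defined earlier in this class):

```java
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.elasticsearch.river.wikipedia.bzip2.CBZip2OutputStream;

public class Bzip2WriteExample {
    public static void main(String[] args) throws IOException {
        byte[] payload = "hello bzip2".getBytes("UTF-8");
        OutputStream file = new BufferedOutputStream(new FileOutputStream(args[0]));
        // Write the two magic bytes ourselves; the stream emits everything
        // from the 'h' format byte onwards.
        file.write('B');
        file.write('Z');
        CBZip2OutputStream bz = new CBZip2OutputStream(
                file, CBZip2OutputStream.chooseBlockSize(payload.length));
        try {
            bz.write(payload, 0, payload.length);
        } finally {
            bz.close(); // finishes the last block and closes the underlying stream
        }
    }
}
```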

591 | *
592 | * @param out
593 | * the destination stream.
594 | * @throws java.io.IOException if an I/O error occurs in the specified stream.
595 | * @throws NullPointerException if out == null.
596 | */
597 | public CBZip2OutputStream(final OutputStream out) throws IOException {
598 | this(out, MAX_BLOCKSIZE);
599 | }
600 |
601 | /**
602 | * Constructs a new CBZip2OutputStream with the specified blocksize.
603 | *

604 | *

605 | * Attention: The caller is responsible for writing the two BZip2 magic
606 | * bytes "BZ" to the specified stream prior to calling this
607 | * constructor.
608 | *

609 | *
610 | * @param out the destination stream.
611 | * @param blockSize the blockSize in 100k units.
612 | * @throws java.io.IOException if an I/O error occurs in the specified stream.
613 | * @throws IllegalArgumentException if (blockSize < 1) || (blockSize > 9).
614 | * @throws NullPointerException if out == null.
615 | * @see #MIN_BLOCKSIZE
616 | * @see #MAX_BLOCKSIZE
617 | */
618 | public CBZip2OutputStream(final OutputStream out, final int blockSize)
619 | throws IOException {
620 | super();
621 |
622 | if (blockSize < 1) {
623 | throw new IllegalArgumentException("blockSize(" + blockSize
624 | + ") < 1");
625 | }
626 | if (blockSize > 9) {
627 | throw new IllegalArgumentException("blockSize(" + blockSize
628 | + ") > 9");
629 | }
630 |
631 | this.blockSize100k = blockSize;
632 | this.out = out;
633 | init();
634 | }
635 |
636 | public void write(final int b) throws IOException {
637 | if (this.out != null) {
638 | write0(b);
639 | } else {
640 | throw new IOException("closed");
641 | }
642 | }
643 |
644 | private void writeRun() throws IOException {
645 | final int lastShadow = this.last;
646 |
647 | if (lastShadow < this.allowableBlockSize) {
648 | final int currentCharShadow = this.currentChar;
649 | final Data dataShadow = this.data;
650 | dataShadow.inUse[currentCharShadow] = true;
651 | final byte ch = (byte) currentCharShadow;
652 |
653 | int runLengthShadow = this.runLength;
654 | this.crc.updateCRC(currentCharShadow, runLengthShadow);
655 |
656 | switch (runLengthShadow) {
657 | case 1:
658 | dataShadow.block[lastShadow + 2] = ch;
659 | this.last = lastShadow + 1;
660 | break;
661 |
662 | case 2:
663 | dataShadow.block[lastShadow + 2] = ch;
664 | dataShadow.block[lastShadow + 3] = ch;
665 | this.last = lastShadow + 2;
666 | break;
667 |
668 | case 3: {
669 | final byte[] block = dataShadow.block;
670 | block[lastShadow + 2] = ch;
671 | block[lastShadow + 3] = ch;
672 | block[lastShadow + 4] = ch;
673 | this.last = lastShadow + 3;
674 | }
675 | break;
676 |
677 | default: {
678 | runLengthShadow -= 4;
679 | dataShadow.inUse[runLengthShadow] = true;
680 | final byte[] block = dataShadow.block;
681 | block[lastShadow + 2] = ch;
682 | block[lastShadow + 3] = ch;
683 | block[lastShadow + 4] = ch;
684 | block[lastShadow + 5] = ch;
685 | block[lastShadow + 6] = (byte) runLengthShadow;
686 | this.last = lastShadow + 5;
687 | }
688 | break;
689 |
690 | }
691 | } else {
692 | endBlock();
693 | initBlock();
694 | writeRun();
695 | }
696 | }
697 |
698 | /**
699 | * Overridden to close the stream.
700 | */
701 | protected void finalize() throws Throwable {
702 | finish();
703 | super.finalize();
704 | }
705 |
706 |
707 | public void finish() throws IOException {
708 | if (out != null) {
709 | try {
710 | if (this.runLength > 0) {
711 | writeRun();
712 | }
713 | this.currentChar = -1;
714 | endBlock();
715 | endCompression();
716 | } finally {
717 | this.out = null;
718 | this.data = null;
719 | }
720 | }
721 | }
722 |
723 | public void close() throws IOException {
724 | if (out != null) {
725 | OutputStream outShadow = this.out;
726 | finish();
727 | outShadow.close();
728 | }
729 | }
730 |
731 | public void flush() throws IOException {
732 | OutputStream outShadow = this.out;
733 | if (outShadow != null) {
734 | outShadow.flush();
735 | }
736 | }
737 |
738 | private void init() throws IOException {
739 | // write magic: done by caller who created this stream
740 | // this.out.write('B');
741 | // this.out.write('Z');
742 |
743 | this.data = new Data(this.blockSize100k);
744 |
745 | /*
746 | * Write `magic' bytes h indicating file-format == huffmanised, followed
747 | * by a digit indicating blockSize100k.
748 | */
749 | bsPutUByte('h');
750 | bsPutUByte('0' + this.blockSize100k);
751 |
752 | this.combinedCRC = 0;
753 | initBlock();
754 | }
755 |
756 | private void initBlock() {
757 | // blockNo++;
758 | this.crc.initialiseCRC();
759 | this.last = -1;
760 | // ch = 0;
761 |
762 | boolean[] inUse = this.data.inUse;
763 | for (int i = 256; --i >= 0; ) {
764 | inUse[i] = false;
765 | }
766 |
767 | /* 20 is just a paranoia constant */
768 | this.allowableBlockSize = (this.blockSize100k * BZip2Constants.baseBlockSize) - 20;
769 | }
770 |
771 | private void endBlock() throws IOException {
772 | this.blockCRC = this.crc.getFinalCRC();
773 | this.combinedCRC = (this.combinedCRC << 1) | (this.combinedCRC >>> 31);
774 | this.combinedCRC ^= this.blockCRC;
775 |
776 | // empty block at end of file
777 | if (this.last == -1) {
778 | return;
779 | }
780 |
781 | /* sort the block and establish posn of original string */
782 | blockSort();
783 |
784 | /*
785 | * A 6-byte block header, the value chosen arbitrarily as 0x314159265359
786 | * :-). A 32 bit value does not really give a strong enough guarantee
787 | * that the value will not appear by chance in the compressed
788 | * datastream. Worst-case probability of this event, for a 900k block,
789 | * is about 2.0e-3 for 32 bits, 1.0e-5 for 40 bits and 4.0e-8 for 48
790 | * bits. For a compressed file of size 100Gb -- about 100000 blocks --
791 | * only a 48-bit marker will do. NB: normal compression/decompression
792 | * does not rely on these statistical properties. They are only important
793 | * when trying to recover blocks from damaged files.
794 | */
795 | bsPutUByte(0x31);
796 | bsPutUByte(0x41);
797 | bsPutUByte(0x59);
798 | bsPutUByte(0x26);
799 | bsPutUByte(0x53);
800 | bsPutUByte(0x59);
801 |
802 | /* Now the block's CRC, so it is in a known place. */
803 | bsPutInt(this.blockCRC);
804 |
805 | /* Now a single bit indicating randomisation. */
806 | if (this.blockRandomised) {
807 | bsW(1, 1);
808 | } else {
809 | bsW(1, 0);
810 | }
811 |
812 | /* Finally, block's contents proper. */
813 | moveToFrontCodeAndSend();
814 | }
815 |
816 | private void endCompression() throws IOException {
817 | /*
818 | * Now another magic 48-bit number, 0x177245385090, to indicate the end
819 | * of the last block. (sqrt(pi), if you want to know.
I did want to use 820 | * e, but it contains too much repetition -- 27 18 28 18 28 46 -- for me 821 | * to feel statistically comfortable. Call me paranoid.) 822 | */ 823 | bsPutUByte(0x17); 824 | bsPutUByte(0x72); 825 | bsPutUByte(0x45); 826 | bsPutUByte(0x38); 827 | bsPutUByte(0x50); 828 | bsPutUByte(0x90); 829 | 830 | bsPutInt(this.combinedCRC); 831 | bsFinishedWithStream(); 832 | } 833 | 834 | /** 835 | * Returns the blocksize parameter specified at construction time. 836 | */ 837 | public final int getBlockSize() { 838 | return this.blockSize100k; 839 | } 840 | 841 | public void write(final byte[] buf, int offs, final int len) 842 | throws IOException { 843 | if (offs < 0) { 844 | throw new IndexOutOfBoundsException("offs(" + offs + ") < 0."); 845 | } 846 | if (len < 0) { 847 | throw new IndexOutOfBoundsException("len(" + len + ") < 0."); 848 | } 849 | if (offs + len > buf.length) { 850 | throw new IndexOutOfBoundsException("offs(" + offs + ") + len(" 851 | + len + ") > buf.length(" 852 | + buf.length + ")."); 853 | } 854 | if (this.out == null) { 855 | throw new IOException("stream closed"); 856 | } 857 | 858 | for (int hi = offs + len; offs < hi; ) { 859 | write0(buf[offs++]); 860 | } 861 | } 862 | 863 | private void write0(int b) throws IOException { 864 | if (this.currentChar != -1) { 865 | b &= 0xff; 866 | if (this.currentChar == b) { 867 | if (++this.runLength > 254) { 868 | writeRun(); 869 | this.currentChar = -1; 870 | this.runLength = 0; 871 | } 872 | // else nothing to do 873 | } else { 874 | writeRun(); 875 | this.runLength = 1; 876 | this.currentChar = b; 877 | } 878 | } else { 879 | this.currentChar = b & 0xff; 880 | this.runLength++; 881 | } 882 | } 883 | 884 | private static void hbAssignCodes(final int[] code, final byte[] length, 885 | final int minLen, final int maxLen, 886 | final int alphaSize) { 887 | int vec = 0; 888 | for (int n = minLen; n <= maxLen; n++) { 889 | for (int i = 0; i < alphaSize; i++) { 890 | if ((length[i] & 0xff) == n) { 891 | code[i] = vec; 892 | vec++; 893 | } 894 | } 895 | vec <<= 1; 896 | } 897 | } 898 | 899 | private void bsFinishedWithStream() throws IOException { 900 | while (this.bsLive > 0) { 901 | int ch = this.bsBuff >> 24; 902 | this.out.write(ch); // write 8-bit 903 | this.bsBuff <<= 8; 904 | this.bsLive -= 8; 905 | } 906 | } 907 | 908 | private void bsW(final int n, final int v) throws IOException { 909 | final OutputStream outShadow = this.out; 910 | int bsLiveShadow = this.bsLive; 911 | int bsBuffShadow = this.bsBuff; 912 | 913 | while (bsLiveShadow >= 8) { 914 | outShadow.write(bsBuffShadow >> 24); // write 8-bit 915 | bsBuffShadow <<= 8; 916 | bsLiveShadow -= 8; 917 | } 918 | 919 | this.bsBuff = bsBuffShadow | (v << (32 - bsLiveShadow - n)); 920 | this.bsLive = bsLiveShadow + n; 921 | } 922 | 923 | private void bsPutUByte(final int c) throws IOException { 924 | bsW(8, c); 925 | } 926 | 927 | private void bsPutInt(final int u) throws IOException { 928 | bsW(8, (u >> 24) & 0xff); 929 | bsW(8, (u >> 16) & 0xff); 930 | bsW(8, (u >> 8) & 0xff); 931 | bsW(8, u & 0xff); 932 | } 933 | 934 | private void sendMTFValues() throws IOException { 935 | final byte[][] len = this.data.sendMTFValues_len; 936 | final int alphaSize = this.nInUse + 2; 937 | 938 | for (int t = N_GROUPS; --t >= 0; ) { 939 | byte[] len_t = len[t]; 940 | for (int v = alphaSize; --v >= 0; ) { 941 | len_t[v] = GREATER_ICOST; 942 | } 943 | } 944 | 945 | /* Decide how many coding tables to use */ 946 | // assert (this.nMTF > 0) : this.nMTF; 947 | final int nGroups = 
(this.nMTF < 200) ? 2 : (this.nMTF < 600) ? 3 948 | : (this.nMTF < 1200) ? 4 : (this.nMTF < 2400) ? 5 : 6; 949 | 950 | /* Generate an initial set of coding tables */ 951 | sendMTFValues0(nGroups, alphaSize); 952 | 953 | /* 954 | * Iterate up to N_ITERS times to improve the tables. 955 | */ 956 | final int nSelectors = sendMTFValues1(nGroups, alphaSize); 957 | 958 | /* Compute MTF values for the selectors. */ 959 | sendMTFValues2(nGroups, nSelectors); 960 | 961 | /* Assign actual codes for the tables. */ 962 | sendMTFValues3(nGroups, alphaSize); 963 | 964 | /* Transmit the mapping table. */ 965 | sendMTFValues4(); 966 | 967 | /* Now the selectors. */ 968 | sendMTFValues5(nGroups, nSelectors); 969 | 970 | /* Now the coding tables. */ 971 | sendMTFValues6(nGroups, alphaSize); 972 | 973 | /* And finally, the block data proper */ 974 | sendMTFValues7(nSelectors); 975 | } 976 | 977 | private void sendMTFValues0(final int nGroups, final int alphaSize) { 978 | final byte[][] len = this.data.sendMTFValues_len; 979 | final int[] mtfFreq = this.data.mtfFreq; 980 | 981 | int remF = this.nMTF; 982 | int gs = 0; 983 | 984 | for (int nPart = nGroups; nPart > 0; nPart--) { 985 | final int tFreq = remF / nPart; 986 | int ge = gs - 1; 987 | int aFreq = 0; 988 | 989 | for (final int a = alphaSize - 1; (aFreq < tFreq) && (ge < a); ) { 990 | aFreq += mtfFreq[++ge]; 991 | } 992 | 993 | if ((ge > gs) && (nPart != nGroups) && (nPart != 1) 994 | && (((nGroups - nPart) & 1) != 0)) { 995 | aFreq -= mtfFreq[ge--]; 996 | } 997 | 998 | final byte[] len_np = len[nPart - 1]; 999 | for (int v = alphaSize; --v >= 0; ) { 1000 | if ((v >= gs) && (v <= ge)) { 1001 | len_np[v] = LESSER_ICOST; 1002 | } else { 1003 | len_np[v] = GREATER_ICOST; 1004 | } 1005 | } 1006 | 1007 | gs = ge + 1; 1008 | remF -= aFreq; 1009 | } 1010 | } 1011 | 1012 | private int sendMTFValues1(final int nGroups, final int alphaSize) { 1013 | final Data dataShadow = this.data; 1014 | final int[][] rfreq = dataShadow.sendMTFValues_rfreq; 1015 | final int[] fave = dataShadow.sendMTFValues_fave; 1016 | final short[] cost = dataShadow.sendMTFValues_cost; 1017 | final char[] sfmap = dataShadow.sfmap; 1018 | final byte[] selector = dataShadow.selector; 1019 | final byte[][] len = dataShadow.sendMTFValues_len; 1020 | final byte[] len_0 = len[0]; 1021 | final byte[] len_1 = len[1]; 1022 | final byte[] len_2 = len[2]; 1023 | final byte[] len_3 = len[3]; 1024 | final byte[] len_4 = len[4]; 1025 | final byte[] len_5 = len[5]; 1026 | final int nMTFShadow = this.nMTF; 1027 | 1028 | int nSelectors = 0; 1029 | 1030 | for (int iter = 0; iter < N_ITERS; iter++) { 1031 | for (int t = nGroups; --t >= 0; ) { 1032 | fave[t] = 0; 1033 | int[] rfreqt = rfreq[t]; 1034 | for (int i = alphaSize; --i >= 0; ) { 1035 | rfreqt[i] = 0; 1036 | } 1037 | } 1038 | 1039 | nSelectors = 0; 1040 | 1041 | for (int gs = 0; gs < this.nMTF; ) { 1042 | /* Set group start & end marks. */ 1043 | 1044 | /* 1045 | * Calculate the cost of this group as coded by each of the 1046 | * coding tables. 
1047 | */ 1048 | 1049 | final int ge = Math.min(gs + G_SIZE - 1, nMTFShadow - 1); 1050 | 1051 | if (nGroups == N_GROUPS) { 1052 | // unrolled version of the else-block 1053 | 1054 | short cost0 = 0; 1055 | short cost1 = 0; 1056 | short cost2 = 0; 1057 | short cost3 = 0; 1058 | short cost4 = 0; 1059 | short cost5 = 0; 1060 | 1061 | for (int i = gs; i <= ge; i++) { 1062 | final int icv = sfmap[i]; 1063 | cost0 += len_0[icv] & 0xff; 1064 | cost1 += len_1[icv] & 0xff; 1065 | cost2 += len_2[icv] & 0xff; 1066 | cost3 += len_3[icv] & 0xff; 1067 | cost4 += len_4[icv] & 0xff; 1068 | cost5 += len_5[icv] & 0xff; 1069 | } 1070 | 1071 | cost[0] = cost0; 1072 | cost[1] = cost1; 1073 | cost[2] = cost2; 1074 | cost[3] = cost3; 1075 | cost[4] = cost4; 1076 | cost[5] = cost5; 1077 | 1078 | } else { 1079 | for (int t = nGroups; --t >= 0; ) { 1080 | cost[t] = 0; 1081 | } 1082 | 1083 | for (int i = gs; i <= ge; i++) { 1084 | final int icv = sfmap[i]; 1085 | for (int t = nGroups; --t >= 0; ) { 1086 | cost[t] += len[t][icv] & 0xff; 1087 | } 1088 | } 1089 | } 1090 | 1091 | /* 1092 | * Find the coding table which is best for this group, and 1093 | * record its identity in the selector table. 1094 | */ 1095 | int bt = -1; 1096 | for (int t = nGroups, bc = 999999999; --t >= 0; ) { 1097 | final int cost_t = cost[t]; 1098 | if (cost_t < bc) { 1099 | bc = cost_t; 1100 | bt = t; 1101 | } 1102 | } 1103 | 1104 | fave[bt]++; 1105 | selector[nSelectors] = (byte) bt; 1106 | nSelectors++; 1107 | 1108 | /* 1109 | * Increment the symbol frequencies for the selected table. 1110 | */ 1111 | final int[] rfreq_bt = rfreq[bt]; 1112 | for (int i = gs; i <= ge; i++) { 1113 | rfreq_bt[sfmap[i]]++; 1114 | } 1115 | 1116 | gs = ge + 1; 1117 | } 1118 | 1119 | /* 1120 | * Recompute the tables based on the accumulated frequencies. 
1121 | */ 1122 | for (int t = 0; t < nGroups; t++) { 1123 | hbMakeCodeLengths(len[t], rfreq[t], this.data, alphaSize, 20); 1124 | } 1125 | } 1126 | 1127 | return nSelectors; 1128 | } 1129 | 1130 | private void sendMTFValues2(final int nGroups, final int nSelectors) { 1131 | // assert (nGroups < 8) : nGroups; 1132 | 1133 | final Data dataShadow = this.data; 1134 | byte[] pos = dataShadow.sendMTFValues2_pos; 1135 | 1136 | for (int i = nGroups; --i >= 0; ) { 1137 | pos[i] = (byte) i; 1138 | } 1139 | 1140 | for (int i = 0; i < nSelectors; i++) { 1141 | final byte ll_i = dataShadow.selector[i]; 1142 | byte tmp = pos[0]; 1143 | int j = 0; 1144 | 1145 | while (ll_i != tmp) { 1146 | j++; 1147 | byte tmp2 = tmp; 1148 | tmp = pos[j]; 1149 | pos[j] = tmp2; 1150 | } 1151 | 1152 | pos[0] = tmp; 1153 | dataShadow.selectorMtf[i] = (byte) j; 1154 | } 1155 | } 1156 | 1157 | private void sendMTFValues3(final int nGroups, final int alphaSize) { 1158 | int[][] code = this.data.sendMTFValues_code; 1159 | byte[][] len = this.data.sendMTFValues_len; 1160 | 1161 | for (int t = 0; t < nGroups; t++) { 1162 | int minLen = 32; 1163 | int maxLen = 0; 1164 | final byte[] len_t = len[t]; 1165 | for (int i = alphaSize; --i >= 0; ) { 1166 | final int l = len_t[i] & 0xff; 1167 | if (l > maxLen) { 1168 | maxLen = l; 1169 | } 1170 | if (l < minLen) { 1171 | minLen = l; 1172 | } 1173 | } 1174 | 1175 | // assert (maxLen <= 20) : maxLen; 1176 | // assert (minLen >= 1) : minLen; 1177 | 1178 | hbAssignCodes(code[t], len[t], minLen, maxLen, alphaSize); 1179 | } 1180 | } 1181 | 1182 | private void sendMTFValues4() throws IOException { 1183 | final boolean[] inUse = this.data.inUse; 1184 | final boolean[] inUse16 = this.data.sentMTFValues4_inUse16; 1185 | 1186 | for (int i = 16; --i >= 0; ) { 1187 | inUse16[i] = false; 1188 | final int i16 = i * 16; 1189 | for (int j = 16; --j >= 0; ) { 1190 | if (inUse[i16 + j]) { 1191 | inUse16[i] = true; 1192 | } 1193 | } 1194 | } 1195 | 1196 | for (int i = 0; i < 16; i++) { 1197 | bsW(1, inUse16[i] ? 1 : 0); 1198 | } 1199 | 1200 | final OutputStream outShadow = this.out; 1201 | int bsLiveShadow = this.bsLive; 1202 | int bsBuffShadow = this.bsBuff; 1203 | 1204 | for (int i = 0; i < 16; i++) { 1205 | if (inUse16[i]) { 1206 | final int i16 = i * 16; 1207 | for (int j = 0; j < 16; j++) { 1208 | // inlined: bsW(1, inUse[i16 + j] ? 
1 : 0); 1209 | while (bsLiveShadow >= 8) { 1210 | outShadow.write(bsBuffShadow >> 24); // write 8-bit 1211 | bsBuffShadow <<= 8; 1212 | bsLiveShadow -= 8; 1213 | } 1214 | if (inUse[i16 + j]) { 1215 | bsBuffShadow |= 1 << (32 - bsLiveShadow - 1); 1216 | } 1217 | bsLiveShadow++; 1218 | } 1219 | } 1220 | } 1221 | 1222 | this.bsBuff = bsBuffShadow; 1223 | this.bsLive = bsLiveShadow; 1224 | } 1225 | 1226 | private void sendMTFValues5(final int nGroups, final int nSelectors) 1227 | throws IOException { 1228 | bsW(3, nGroups); 1229 | bsW(15, nSelectors); 1230 | 1231 | final OutputStream outShadow = this.out; 1232 | final byte[] selectorMtf = this.data.selectorMtf; 1233 | 1234 | int bsLiveShadow = this.bsLive; 1235 | int bsBuffShadow = this.bsBuff; 1236 | 1237 | for (int i = 0; i < nSelectors; i++) { 1238 | for (int j = 0, hj = selectorMtf[i] & 0xff; j < hj; j++) { 1239 | // inlined: bsW(1, 1); 1240 | while (bsLiveShadow >= 8) { 1241 | outShadow.write(bsBuffShadow >> 24); 1242 | bsBuffShadow <<= 8; 1243 | bsLiveShadow -= 8; 1244 | } 1245 | bsBuffShadow |= 1 << (32 - bsLiveShadow - 1); 1246 | bsLiveShadow++; 1247 | } 1248 | 1249 | // inlined: bsW(1, 0); 1250 | while (bsLiveShadow >= 8) { 1251 | outShadow.write(bsBuffShadow >> 24); 1252 | bsBuffShadow <<= 8; 1253 | bsLiveShadow -= 8; 1254 | } 1255 | // bsBuffShadow |= 0 << (32 - bsLiveShadow - 1); 1256 | bsLiveShadow++; 1257 | } 1258 | 1259 | this.bsBuff = bsBuffShadow; 1260 | this.bsLive = bsLiveShadow; 1261 | } 1262 | 1263 | private void sendMTFValues6(final int nGroups, final int alphaSize) 1264 | throws IOException { 1265 | final byte[][] len = this.data.sendMTFValues_len; 1266 | final OutputStream outShadow = this.out; 1267 | 1268 | int bsLiveShadow = this.bsLive; 1269 | int bsBuffShadow = this.bsBuff; 1270 | 1271 | for (int t = 0; t < nGroups; t++) { 1272 | byte[] len_t = len[t]; 1273 | int curr = len_t[0] & 0xff; 1274 | 1275 | // inlined: bsW(5, curr); 1276 | while (bsLiveShadow >= 8) { 1277 | outShadow.write(bsBuffShadow >> 24); // write 8-bit 1278 | bsBuffShadow <<= 8; 1279 | bsLiveShadow -= 8; 1280 | } 1281 | bsBuffShadow |= curr << (32 - bsLiveShadow - 5); 1282 | bsLiveShadow += 5; 1283 | 1284 | for (int i = 0; i < alphaSize; i++) { 1285 | int lti = len_t[i] & 0xff; 1286 | while (curr < lti) { 1287 | // inlined: bsW(2, 2); 1288 | while (bsLiveShadow >= 8) { 1289 | outShadow.write(bsBuffShadow >> 24); // write 8-bit 1290 | bsBuffShadow <<= 8; 1291 | bsLiveShadow -= 8; 1292 | } 1293 | bsBuffShadow |= 2 << (32 - bsLiveShadow - 2); 1294 | bsLiveShadow += 2; 1295 | 1296 | curr++; /* 10 */ 1297 | } 1298 | 1299 | while (curr > lti) { 1300 | // inlined: bsW(2, 3); 1301 | while (bsLiveShadow >= 8) { 1302 | outShadow.write(bsBuffShadow >> 24); // write 8-bit 1303 | bsBuffShadow <<= 8; 1304 | bsLiveShadow -= 8; 1305 | } 1306 | bsBuffShadow |= 3 << (32 - bsLiveShadow - 2); 1307 | bsLiveShadow += 2; 1308 | 1309 | curr--; /* 11 */ 1310 | } 1311 | 1312 | // inlined: bsW(1, 0); 1313 | while (bsLiveShadow >= 8) { 1314 | outShadow.write(bsBuffShadow >> 24); // write 8-bit 1315 | bsBuffShadow <<= 8; 1316 | bsLiveShadow -= 8; 1317 | } 1318 | // bsBuffShadow |= 0 << (32 - bsLiveShadow - 1); 1319 | bsLiveShadow++; 1320 | } 1321 | } 1322 | 1323 | this.bsBuff = bsBuffShadow; 1324 | this.bsLive = bsLiveShadow; 1325 | } 1326 | 1327 | private void sendMTFValues7(final int nSelectors) throws IOException { 1328 | final Data dataShadow = this.data; 1329 | final byte[][] len = dataShadow.sendMTFValues_len; 1330 | final int[][] code = dataShadow.sendMTFValues_code; 1331 
| final OutputStream outShadow = this.out; 1332 | final byte[] selector = dataShadow.selector; 1333 | final char[] sfmap = dataShadow.sfmap; 1334 | final int nMTFShadow = this.nMTF; 1335 | 1336 | int selCtr = 0; 1337 | 1338 | int bsLiveShadow = this.bsLive; 1339 | int bsBuffShadow = this.bsBuff; 1340 | 1341 | for (int gs = 0; gs < nMTFShadow; ) { 1342 | final int ge = Math.min(gs + G_SIZE - 1, nMTFShadow - 1); 1343 | final int selector_selCtr = selector[selCtr] & 0xff; 1344 | final int[] code_selCtr = code[selector_selCtr]; 1345 | final byte[] len_selCtr = len[selector_selCtr]; 1346 | 1347 | while (gs <= ge) { 1348 | final int sfmap_i = sfmap[gs]; 1349 | 1350 | // 1351 | // inlined: bsW(len_selCtr[sfmap_i] & 0xff, 1352 | // code_selCtr[sfmap_i]); 1353 | // 1354 | while (bsLiveShadow >= 8) { 1355 | outShadow.write(bsBuffShadow >> 24); 1356 | bsBuffShadow <<= 8; 1357 | bsLiveShadow -= 8; 1358 | } 1359 | final int n = len_selCtr[sfmap_i] & 0xFF; 1360 | bsBuffShadow |= code_selCtr[sfmap_i] << (32 - bsLiveShadow - n); 1361 | bsLiveShadow += n; 1362 | 1363 | gs++; 1364 | } 1365 | 1366 | gs = ge + 1; 1367 | selCtr++; 1368 | } 1369 | 1370 | this.bsBuff = bsBuffShadow; 1371 | this.bsLive = bsLiveShadow; 1372 | } 1373 | 1374 | private void moveToFrontCodeAndSend() throws IOException { 1375 | bsW(24, this.origPtr); 1376 | generateMTFValues(); 1377 | sendMTFValues(); 1378 | } 1379 | 1380 | /** 1381 | * This is the most hammered method of this class. 1382 | *
<p/> 1383 | * <p/> 1384 | * This is the version using unrolled loops. Normally I never use such constructs 1385 | * in Java code. The unrolling has shown a noticeable performance improvement 1386 | * on JRE 1.4.2 (Linux i586 / HotSpot Client). Of course it depends on the 1387 | * JIT compiler of the VM. 1388 | * <p/>
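 * A rolled-up sketch of the comparison loop that the body below unrolls
 * (this form appears verbatim as a comment inside the method; it is shown
 * here for readability, not as a drop-in replacement):
 * <pre>
 * for (int a;
 *      (j > mj) && mainGtU((a = fmap[j - h]) + d, vd, block, quadrant, lastShadow);
 *      j -= h) {
 *     fmap[j] = a;
 * }
 * </pre>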
1389 | */ 1390 | private boolean mainSimpleSort(final Data dataShadow, final int lo, 1391 | final int hi, final int d) { 1392 | final int bigN = hi - lo + 1; 1393 | if (bigN < 2) { 1394 | return this.firstAttempt && (this.workDone > this.workLimit); 1395 | } 1396 | 1397 | int hp = 0; 1398 | while (INCS[hp] < bigN) { 1399 | hp++; 1400 | } 1401 | 1402 | final int[] fmap = dataShadow.fmap; 1403 | final char[] quadrant = dataShadow.quadrant; 1404 | final byte[] block = dataShadow.block; 1405 | final int lastShadow = this.last; 1406 | final int lastPlus1 = lastShadow + 1; 1407 | final boolean firstAttemptShadow = this.firstAttempt; 1408 | final int workLimitShadow = this.workLimit; 1409 | int workDoneShadow = this.workDone; 1410 | 1411 | // Following block contains unrolled code which could be shortened by 1412 | // coding it in additional loops. 1413 | 1414 | HP: 1415 | while (--hp >= 0) { 1416 | final int h = INCS[hp]; 1417 | final int mj = lo + h - 1; 1418 | 1419 | for (int i = lo + h; i <= hi; ) { 1420 | // copy 1421 | for (int k = 3; (i <= hi) && (--k >= 0); i++) { 1422 | final int v = fmap[i]; 1423 | final int vd = v + d; 1424 | int j = i; 1425 | 1426 | // for (int a; 1427 | // (j > mj) && mainGtU((a = fmap[j - h]) + d, vd, 1428 | // block, quadrant, lastShadow); 1429 | // j -= h) { 1430 | // fmap[j] = a; 1431 | // } 1432 | // 1433 | // unrolled version: 1434 | 1435 | // start inline mainGTU 1436 | boolean onceRunned = false; 1437 | int a = 0; 1438 | 1439 | HAMMER: 1440 | while (true) { 1441 | if (onceRunned) { 1442 | fmap[j] = a; 1443 | if ((j -= h) <= mj) { 1444 | break HAMMER; 1445 | } 1446 | } else { 1447 | onceRunned = true; 1448 | } 1449 | 1450 | a = fmap[j - h]; 1451 | int i1 = a + d; 1452 | int i2 = vd; 1453 | 1454 | // following could be done in a loop, but 1455 | // unrolled it for performance: 1456 | if (block[i1 + 1] == block[i2 + 1]) { 1457 | if (block[i1 + 2] == block[i2 + 2]) { 1458 | if (block[i1 + 3] == block[i2 + 3]) { 1459 | if (block[i1 + 4] == block[i2 + 4]) { 1460 | if (block[i1 + 5] == block[i2 + 5]) { 1461 | if (block[(i1 += 6)] == block[(i2 += 6)]) { 1462 | int x = lastShadow; 1463 | X: 1464 | while (x > 0) { 1465 | x -= 4; 1466 | 1467 | if (block[i1 + 1] == block[i2 + 1]) { 1468 | if (quadrant[i1] == quadrant[i2]) { 1469 | if (block[i1 + 2] == block[i2 + 2]) { 1470 | if (quadrant[i1 + 1] == quadrant[i2 + 1]) { 1471 | if (block[i1 + 3] == block[i2 + 3]) { 1472 | if (quadrant[i1 + 2] == quadrant[i2 + 2]) { 1473 | if (block[i1 + 4] == block[i2 + 4]) { 1474 | if (quadrant[i1 + 3] == quadrant[i2 + 3]) { 1475 | if ((i1 += 4) >= lastPlus1) { 1476 | i1 -= lastPlus1; 1477 | } 1478 | if ((i2 += 4) >= lastPlus1) { 1479 | i2 -= lastPlus1; 1480 | } 1481 | workDoneShadow++; 1482 | continue X; 1483 | } else if ((quadrant[i1 + 3] > quadrant[i2 + 3])) { 1484 | continue HAMMER; 1485 | } else { 1486 | break HAMMER; 1487 | } 1488 | } else if ((block[i1 + 4] & 0xff) > (block[i2 + 4] & 0xff)) { 1489 | continue HAMMER; 1490 | } else { 1491 | break HAMMER; 1492 | } 1493 | } else if ((quadrant[i1 + 2] > quadrant[i2 + 2])) { 1494 | continue HAMMER; 1495 | } else { 1496 | break HAMMER; 1497 | } 1498 | } else if ((block[i1 + 3] & 0xff) > (block[i2 + 3] & 0xff)) { 1499 | continue HAMMER; 1500 | } else { 1501 | break HAMMER; 1502 | } 1503 | } else if ((quadrant[i1 + 1] > quadrant[i2 + 1])) { 1504 | continue HAMMER; 1505 | } else { 1506 | break HAMMER; 1507 | } 1508 | } else if ((block[i1 + 2] & 0xff) > (block[i2 + 2] & 0xff)) { 1509 | continue HAMMER; 1510 | } else { 1511 | break HAMMER; 1512 
| } 1513 | } else if ((quadrant[i1] > quadrant[i2])) { 1514 | continue HAMMER; 1515 | } else { 1516 | break HAMMER; 1517 | } 1518 | } else if ((block[i1 + 1] & 0xff) > (block[i2 + 1] & 0xff)) { 1519 | continue HAMMER; 1520 | } else { 1521 | break HAMMER; 1522 | } 1523 | 1524 | } 1525 | break HAMMER; 1526 | } // while x > 0 1527 | else { 1528 | if ((block[i1] & 0xff) > (block[i2] & 0xff)) { 1529 | continue HAMMER; 1530 | } else { 1531 | break HAMMER; 1532 | } 1533 | } 1534 | } else if ((block[i1 + 5] & 0xff) > (block[i2 + 5] & 0xff)) { 1535 | continue HAMMER; 1536 | } else { 1537 | break HAMMER; 1538 | } 1539 | } else if ((block[i1 + 4] & 0xff) > (block[i2 + 4] & 0xff)) { 1540 | continue HAMMER; 1541 | } else { 1542 | break HAMMER; 1543 | } 1544 | } else if ((block[i1 + 3] & 0xff) > (block[i2 + 3] & 0xff)) { 1545 | continue HAMMER; 1546 | } else { 1547 | break HAMMER; 1548 | } 1549 | } else if ((block[i1 + 2] & 0xff) > (block[i2 + 2] & 0xff)) { 1550 | continue HAMMER; 1551 | } else { 1552 | break HAMMER; 1553 | } 1554 | } else if ((block[i1 + 1] & 0xff) > (block[i2 + 1] & 0xff)) { 1555 | continue HAMMER; 1556 | } else { 1557 | break HAMMER; 1558 | } 1559 | 1560 | } 1561 | // HAMMER 1562 | // end inline mainGTU 1563 | 1564 | fmap[j] = v; 1565 | } 1566 | 1567 | if (firstAttemptShadow && (i <= hi) 1568 | && (workDoneShadow > workLimitShadow)) { 1569 | break HP; 1570 | } 1571 | } 1572 | } 1573 | 1574 | this.workDone = workDoneShadow; 1575 | return firstAttemptShadow && (workDoneShadow > workLimitShadow); 1576 | } 1577 | 1578 | private static void vswap(int[] fmap, int p1, int p2, int n) { 1579 | n += p1; 1580 | while (p1 < n) { 1581 | int t = fmap[p1]; 1582 | fmap[p1++] = fmap[p2]; 1583 | fmap[p2++] = t; 1584 | } 1585 | } 1586 | 1587 | private static byte med3(byte a, byte b, byte c) { 1588 | return (a < b) ? (b < c ? b : a < c ? c : a) : (b > c ? b : a > c ? 
c 1589 | : a); 1590 | } 1591 | 1592 | private void blockSort() { 1593 | this.workLimit = WORK_FACTOR * this.last; 1594 | this.workDone = 0; 1595 | this.blockRandomised = false; 1596 | this.firstAttempt = true; 1597 | mainSort(); 1598 | 1599 | if (this.firstAttempt && (this.workDone > this.workLimit)) { 1600 | randomiseBlock(); 1601 | this.workLimit = this.workDone = 0; 1602 | this.firstAttempt = false; 1603 | mainSort(); 1604 | } 1605 | 1606 | int[] fmap = this.data.fmap; 1607 | this.origPtr = -1; 1608 | for (int i = 0, lastShadow = this.last; i <= lastShadow; i++) { 1609 | if (fmap[i] == 0) { 1610 | this.origPtr = i; 1611 | break; 1612 | } 1613 | } 1614 | 1615 | // assert (this.origPtr != -1) : this.origPtr; 1616 | } 1617 | 1618 | /** 1619 | * Method "mainQSort3", file "blocksort.c", BZip2 1.0.2 1620 | */ 1621 | private void mainQSort3(final Data dataShadow, final int loSt, 1622 | final int hiSt, final int dSt) { 1623 | final int[] stack_ll = dataShadow.stack_ll; 1624 | final int[] stack_hh = dataShadow.stack_hh; 1625 | final int[] stack_dd = dataShadow.stack_dd; 1626 | final int[] fmap = dataShadow.fmap; 1627 | final byte[] block = dataShadow.block; 1628 | 1629 | stack_ll[0] = loSt; 1630 | stack_hh[0] = hiSt; 1631 | stack_dd[0] = dSt; 1632 | 1633 | for (int sp = 1; --sp >= 0; ) { 1634 | final int lo = stack_ll[sp]; 1635 | final int hi = stack_hh[sp]; 1636 | final int d = stack_dd[sp]; 1637 | 1638 | if ((hi - lo < SMALL_THRESH) || (d > DEPTH_THRESH)) { 1639 | if (mainSimpleSort(dataShadow, lo, hi, d)) { 1640 | return; 1641 | } 1642 | } else { 1643 | final int d1 = d + 1; 1644 | final int med = med3(block[fmap[lo] + d1], 1645 | block[fmap[hi] + d1], block[fmap[(lo + hi) >>> 1] + d1]) & 0xff; 1646 | 1647 | int unLo = lo; 1648 | int unHi = hi; 1649 | int ltLo = lo; 1650 | int gtHi = hi; 1651 | 1652 | while (true) { 1653 | while (unLo <= unHi) { 1654 | final int n = ((int) block[fmap[unLo] + d1] & 0xff) 1655 | - med; 1656 | if (n == 0) { 1657 | final int temp = fmap[unLo]; 1658 | fmap[unLo++] = fmap[ltLo]; 1659 | fmap[ltLo++] = temp; 1660 | } else if (n < 0) { 1661 | unLo++; 1662 | } else { 1663 | break; 1664 | } 1665 | } 1666 | 1667 | while (unLo <= unHi) { 1668 | final int n = ((int) block[fmap[unHi] + d1] & 0xff) 1669 | - med; 1670 | if (n == 0) { 1671 | final int temp = fmap[unHi]; 1672 | fmap[unHi--] = fmap[gtHi]; 1673 | fmap[gtHi--] = temp; 1674 | } else if (n > 0) { 1675 | unHi--; 1676 | } else { 1677 | break; 1678 | } 1679 | } 1680 | 1681 | if (unLo <= unHi) { 1682 | final int temp = fmap[unLo]; 1683 | fmap[unLo++] = fmap[unHi]; 1684 | fmap[unHi--] = temp; 1685 | } else { 1686 | break; 1687 | } 1688 | } 1689 | 1690 | if (gtHi < ltLo) { 1691 | stack_ll[sp] = lo; 1692 | stack_hh[sp] = hi; 1693 | stack_dd[sp] = d1; 1694 | sp++; 1695 | } else { 1696 | int n = ((ltLo - lo) < (unLo - ltLo)) ? (ltLo - lo) 1697 | : (unLo - ltLo); 1698 | vswap(fmap, lo, unLo - n, n); 1699 | int m = ((hi - gtHi) < (gtHi - unHi)) ? 
(hi - gtHi) 1700 | : (gtHi - unHi); 1701 | vswap(fmap, unLo, hi - m + 1, m); 1702 | 1703 | n = lo + unLo - ltLo - 1; 1704 | m = hi - (gtHi - unHi) + 1; 1705 | 1706 | stack_ll[sp] = lo; 1707 | stack_hh[sp] = n; 1708 | stack_dd[sp] = d; 1709 | sp++; 1710 | 1711 | stack_ll[sp] = n + 1; 1712 | stack_hh[sp] = m - 1; 1713 | stack_dd[sp] = d1; 1714 | sp++; 1715 | 1716 | stack_ll[sp] = m; 1717 | stack_hh[sp] = hi; 1718 | stack_dd[sp] = d; 1719 | sp++; 1720 | } 1721 | } 1722 | } 1723 | } 1724 | 1725 | private void mainSort() { 1726 | final Data dataShadow = this.data; 1727 | final int[] runningOrder = dataShadow.mainSort_runningOrder; 1728 | final int[] copy = dataShadow.mainSort_copy; 1729 | final boolean[] bigDone = dataShadow.mainSort_bigDone; 1730 | final int[] ftab = dataShadow.ftab; 1731 | final byte[] block = dataShadow.block; 1732 | final int[] fmap = dataShadow.fmap; 1733 | final char[] quadrant = dataShadow.quadrant; 1734 | final int lastShadow = this.last; 1735 | final int workLimitShadow = this.workLimit; 1736 | final boolean firstAttemptShadow = this.firstAttempt; 1737 | 1738 | // Set up the 2-byte frequency table 1739 | for (int i = 65537; --i >= 0; ) { 1740 | ftab[i] = 0; 1741 | } 1742 | 1743 | /* 1744 | * In the various block-sized structures, live data runs from 0 to 1745 | * last+NUM_OVERSHOOT_BYTES inclusive. First, set up the overshoot area 1746 | * for block. 1747 | */ 1748 | for (int i = 0; i < NUM_OVERSHOOT_BYTES; i++) { 1749 | block[lastShadow + i + 2] = block[(i % (lastShadow + 1)) + 1]; 1750 | } 1751 | for (int i = lastShadow + NUM_OVERSHOOT_BYTES + 1; --i >= 0; ) { 1752 | quadrant[i] = 0; 1753 | } 1754 | block[0] = block[lastShadow + 1]; 1755 | 1756 | // Complete the initial radix sort: 1757 | 1758 | int c1 = block[0] & 0xff; 1759 | for (int i = 0; i <= lastShadow; i++) { 1760 | final int c2 = block[i + 1] & 0xff; 1761 | ftab[(c1 << 8) + c2]++; 1762 | c1 = c2; 1763 | } 1764 | 1765 | for (int i = 1; i <= 65536; i++) 1766 | ftab[i] += ftab[i - 1]; 1767 | 1768 | c1 = block[1] & 0xff; 1769 | for (int i = 0; i < lastShadow; i++) { 1770 | final int c2 = block[i + 2] & 0xff; 1771 | fmap[--ftab[(c1 << 8) + c2]] = i; 1772 | c1 = c2; 1773 | } 1774 | 1775 | fmap[--ftab[((block[lastShadow + 1] & 0xff) << 8) + (block[1] & 0xff)]] = lastShadow; 1776 | 1777 | /* 1778 | * Now ftab contains the first loc of every small bucket. Calculate the 1779 | * running order, from smallest to largest big bucket. 1780 | */ 1781 | for (int i = 256; --i >= 0; ) { 1782 | bigDone[i] = false; 1783 | runningOrder[i] = i; 1784 | } 1785 | 1786 | for (int h = 364; h != 1; ) { 1787 | h /= 3; 1788 | for (int i = h; i <= 255; i++) { 1789 | final int vv = runningOrder[i]; 1790 | final int a = ftab[(vv + 1) << 8] - ftab[vv << 8]; 1791 | final int b = h - 1; 1792 | int j = i; 1793 | for (int ro = runningOrder[j - h]; (ftab[(ro + 1) << 8] - ftab[ro << 8]) > a; ro = runningOrder[j 1794 | - h]) { 1795 | runningOrder[j] = ro; 1796 | j -= h; 1797 | if (j <= b) { 1798 | break; 1799 | } 1800 | } 1801 | runningOrder[j] = vv; 1802 | } 1803 | } 1804 | 1805 | /* 1806 | * The main sorting loop. 1807 | */ 1808 | for (int i = 0; i <= 255; i++) { 1809 | /* 1810 | * Process big buckets, starting with the least full. 1811 | */ 1812 | final int ss = runningOrder[i]; 1813 | 1814 | // Step 1: 1815 | /* 1816 | * Complete the big bucket [ss] by quicksorting any unsorted small 1817 | * buckets [ss, j]. 
Hopefully previous pointer-scanning phases have 1818 | * already completed many of the small buckets [ss, j], so we don't 1819 | * have to sort them at all. 1820 | */ 1821 | for (int j = 0; j <= 255; j++) { 1822 | final int sb = (ss << 8) + j; 1823 | final int ftab_sb = ftab[sb]; 1824 | if ((ftab_sb & SETMASK) != SETMASK) { 1825 | final int lo = ftab_sb & CLEARMASK; 1826 | final int hi = (ftab[sb + 1] & CLEARMASK) - 1; 1827 | if (hi > lo) { 1828 | mainQSort3(dataShadow, lo, hi, 2); 1829 | if (firstAttemptShadow 1830 | && (this.workDone > workLimitShadow)) { 1831 | return; 1832 | } 1833 | } 1834 | ftab[sb] = ftab_sb | SETMASK; 1835 | } 1836 | } 1837 | 1838 | // Step 2: 1839 | // Now scan this big bucket so as to synthesise the 1840 | // sorted order for small buckets [t, ss] for all t != ss. 1841 | 1842 | for (int j = 0; j <= 255; j++) { 1843 | copy[j] = ftab[(j << 8) + ss] & CLEARMASK; 1844 | } 1845 | 1846 | for (int j = ftab[ss << 8] & CLEARMASK, hj = (ftab[(ss + 1) << 8] & CLEARMASK); j < hj; j++) { 1847 | final int fmap_j = fmap[j]; 1848 | c1 = block[fmap_j] & 0xff; 1849 | if (!bigDone[c1]) { 1850 | fmap[copy[c1]] = (fmap_j == 0) ? lastShadow : (fmap_j - 1); 1851 | copy[c1]++; 1852 | } 1853 | } 1854 | 1855 | for (int j = 256; --j >= 0; ) 1856 | ftab[(j << 8) + ss] |= SETMASK; 1857 | 1858 | // Step 3: 1859 | /* 1860 | * The ss big bucket is now done. Record this fact, and update the 1861 | * quadrant descriptors. Remember to update quadrants in the 1862 | * overshoot area too, if necessary. The "if (i < 255)" test merely 1863 | * skips this updating for the last bucket processed, since updating 1864 | * for the last bucket is pointless. 1865 | */ 1866 | bigDone[ss] = true; 1867 | 1868 | if (i < 255) { 1869 | final int bbStart = ftab[ss << 8] & CLEARMASK; 1870 | final int bbSize = (ftab[(ss + 1) << 8] & CLEARMASK) - bbStart; 1871 | int shifts = 0; 1872 | 1873 | while ((bbSize >> shifts) > 65534) { 1874 | shifts++; 1875 | } 1876 | 1877 | for (int j = 0; j < bbSize; j++) { 1878 | final int a2update = fmap[bbStart + j]; 1879 | final char qVal = (char) (j >> shifts); 1880 | quadrant[a2update] = qVal; 1881 | if (a2update < NUM_OVERSHOOT_BYTES) { 1882 | quadrant[a2update + lastShadow + 1] = qVal; 1883 | } 1884 | } 1885 | } 1886 | 1887 | } 1888 | } 1889 | 1890 | private void randomiseBlock() { 1891 | final boolean[] inUse = this.data.inUse; 1892 | final byte[] block = this.data.block; 1893 | final int lastShadow = this.last; 1894 | 1895 | for (int i = 256; --i >= 0; ) 1896 | inUse[i] = false; 1897 | 1898 | int rNToGo = 0; 1899 | int rTPos = 0; 1900 | for (int i = 0, j = 1; i <= lastShadow; i = j, j++) { 1901 | if (rNToGo == 0) { 1902 | rNToGo = (char) BZip2Constants.rNums[rTPos]; 1903 | if (++rTPos == 512) { 1904 | rTPos = 0; 1905 | } 1906 | } 1907 | 1908 | rNToGo--; 1909 | block[j] ^= ((rNToGo == 1) ? 
1 : 0); 1910 | 1911 | // handle 16 bit signed numbers 1912 | inUse[block[j] & 0xff] = true; 1913 | } 1914 | 1915 | this.blockRandomised = true; 1916 | } 1917 | 1918 | private void generateMTFValues() { 1919 | final int lastShadow = this.last; 1920 | final Data dataShadow = this.data; 1921 | final boolean[] inUse = dataShadow.inUse; 1922 | final byte[] block = dataShadow.block; 1923 | final int[] fmap = dataShadow.fmap; 1924 | final char[] sfmap = dataShadow.sfmap; 1925 | final int[] mtfFreq = dataShadow.mtfFreq; 1926 | final byte[] unseqToSeq = dataShadow.unseqToSeq; 1927 | final byte[] yy = dataShadow.generateMTFValues_yy; 1928 | 1929 | // make maps 1930 | int nInUseShadow = 0; 1931 | for (int i = 0; i < 256; i++) { 1932 | if (inUse[i]) { 1933 | unseqToSeq[i] = (byte) nInUseShadow; 1934 | nInUseShadow++; 1935 | } 1936 | } 1937 | this.nInUse = nInUseShadow; 1938 | 1939 | final int eob = nInUseShadow + 1; 1940 | 1941 | for (int i = eob; i >= 0; i--) { 1942 | mtfFreq[i] = 0; 1943 | } 1944 | 1945 | for (int i = nInUseShadow; --i >= 0; ) { 1946 | yy[i] = (byte) i; 1947 | } 1948 | 1949 | int wr = 0; 1950 | int zPend = 0; 1951 | 1952 | for (int i = 0; i <= lastShadow; i++) { 1953 | final byte ll_i = unseqToSeq[block[fmap[i]] & 0xff]; 1954 | byte tmp = yy[0]; 1955 | int j = 0; 1956 | 1957 | while (ll_i != tmp) { 1958 | j++; 1959 | byte tmp2 = tmp; 1960 | tmp = yy[j]; 1961 | yy[j] = tmp2; 1962 | } 1963 | yy[0] = tmp; 1964 | 1965 | if (j == 0) { 1966 | zPend++; 1967 | } else { 1968 | if (zPend > 0) { 1969 | zPend--; 1970 | while (true) { 1971 | if ((zPend & 1) == 0) { 1972 | sfmap[wr] = RUNA; 1973 | wr++; 1974 | mtfFreq[RUNA]++; 1975 | } else { 1976 | sfmap[wr] = RUNB; 1977 | wr++; 1978 | mtfFreq[RUNB]++; 1979 | } 1980 | 1981 | if (zPend >= 2) { 1982 | zPend = (zPend - 2) >> 1; 1983 | } else { 1984 | break; 1985 | } 1986 | } 1987 | zPend = 0; 1988 | } 1989 | sfmap[wr] = (char) (j + 1); 1990 | wr++; 1991 | mtfFreq[j + 1]++; 1992 | } 1993 | } 1994 | 1995 | if (zPend > 0) { 1996 | zPend--; 1997 | while (true) { 1998 | if ((zPend & 1) == 0) { 1999 | sfmap[wr] = RUNA; 2000 | wr++; 2001 | mtfFreq[RUNA]++; 2002 | } else { 2003 | sfmap[wr] = RUNB; 2004 | wr++; 2005 | mtfFreq[RUNB]++; 2006 | } 2007 | 2008 | if (zPend >= 2) { 2009 | zPend = (zPend - 2) >> 1; 2010 | } else { 2011 | break; 2012 | } 2013 | } 2014 | } 2015 | 2016 | sfmap[wr] = (char) eob; 2017 | mtfFreq[eob]++; 2018 | this.nMTF = wr + 1; 2019 | } 2020 | 2021 | private static final class Data extends Object { 2022 | 2023 | // with blockSize 900k 2024 | final boolean[] inUse = new boolean[256]; // 256 byte 2025 | final byte[] unseqToSeq = new byte[256]; // 256 byte 2026 | final int[] mtfFreq = new int[MAX_ALPHA_SIZE]; // 1032 byte 2027 | final byte[] selector = new byte[MAX_SELECTORS]; // 18002 byte 2028 | final byte[] selectorMtf = new byte[MAX_SELECTORS]; // 18002 byte 2029 | 2030 | final byte[] generateMTFValues_yy = new byte[256]; // 256 byte 2031 | final byte[][] sendMTFValues_len = new byte[N_GROUPS][MAX_ALPHA_SIZE]; // 1548 2032 | // byte 2033 | final int[][] sendMTFValues_rfreq = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 2034 | // byte 2035 | final int[] sendMTFValues_fave = new int[N_GROUPS]; // 24 byte 2036 | final short[] sendMTFValues_cost = new short[N_GROUPS]; // 12 byte 2037 | final int[][] sendMTFValues_code = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 2038 | // byte 2039 | final byte[] sendMTFValues2_pos = new byte[N_GROUPS]; // 6 byte 2040 | final boolean[] sentMTFValues4_inUse16 = new boolean[16]; // 16 byte 2041 | 2042 | final 
int[] stack_ll = new int[QSORT_STACK_SIZE]; // 4000 byte 2043 | final int[] stack_hh = new int[QSORT_STACK_SIZE]; // 4000 byte 2044 | final int[] stack_dd = new int[QSORT_STACK_SIZE]; // 4000 byte 2045 | 2046 | final int[] mainSort_runningOrder = new int[256]; // 1024 byte 2047 | final int[] mainSort_copy = new int[256]; // 1024 byte 2048 | final boolean[] mainSort_bigDone = new boolean[256]; // 256 byte 2049 | 2050 | final int[] heap = new int[MAX_ALPHA_SIZE + 2]; // 1040 byte 2051 | final int[] weight = new int[MAX_ALPHA_SIZE * 2]; // 2064 byte 2052 | final int[] parent = new int[MAX_ALPHA_SIZE * 2]; // 2064 byte 2053 | 2054 | final int[] ftab = new int[65537]; // 262148 byte 2055 | // ------------ 2056 | // 333408 byte 2057 | 2058 | final byte[] block; // 900021 byte 2059 | final int[] fmap; // 3600000 byte 2060 | final char[] sfmap; // 3600000 byte 2061 | // ------------ 2062 | // 8433529 byte 2063 | // ============ 2064 | 2065 | /** 2066 | * Array instance identical to sfmap, both are used only 2067 | * temporarily and independently, so we do not need to allocate 2068 | * additional memory. 2069 | */ 2070 | final char[] quadrant; 2071 | 2072 | Data(int blockSize100k) { 2073 | super(); 2074 | 2075 | final int n = blockSize100k * BZip2Constants.baseBlockSize; 2076 | this.block = new byte[(n + 1 + NUM_OVERSHOOT_BYTES)]; 2077 | this.fmap = new int[n]; 2078 | this.sfmap = new char[2 * n]; 2079 | this.quadrant = this.sfmap; 2080 | } 2081 | 2082 | } 2083 | 2084 | } 2085 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/bzip2/CRC.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.bzip2; 21 | 22 | /** 23 | * A simple class to hold and calculate the CRC for sanity checking 24 | * of the data.
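 * <p/>
 * Typical usage within this package (an illustrative sketch, not part of
 * the original file; "data" stands for an arbitrary byte[] being checked):
 * <pre>
 * CRC crc = new CRC();
 * for (int i = 0; i < data.length; i++) {
 *     crc.updateCRC(data[i] & 0xff);  // feed one byte at a time, as an unsigned int
 * }
 * int checksum = crc.getFinalCRC();   // final, bit-inverted CRC value
 * </pre>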
25 | */ 26 | final class CRC { 27 | static final int crc32Table[] = { 28 | 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 29 | 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005, 30 | 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 31 | 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd, 32 | 0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9, 33 | 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75, 34 | 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 35 | 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd, 36 | 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039, 37 | 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 38 | 0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81, 39 | 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d, 40 | 0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 41 | 0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95, 42 | 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1, 43 | 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 44 | 0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae, 45 | 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072, 46 | 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 47 | 0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca, 48 | 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, 49 | 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02, 50 | 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066, 51 | 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba, 52 | 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 53 | 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692, 54 | 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, 55 | 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, 56 | 0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e, 57 | 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, 58 | 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 59 | 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a, 60 | 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637, 61 | 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 62 | 0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f, 63 | 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53, 64 | 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 65 | 0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b, 66 | 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, 67 | 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, 68 | 0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7, 69 | 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b, 70 | 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 71 | 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3, 72 | 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, 73 | 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 74 | 0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f, 75 | 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3, 76 | 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 77 | 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c, 78 | 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, 79 | 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, 80 | 0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30, 81 | 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec, 82 | 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 83 | 0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654, 84 | 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, 85 | 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, 86 | 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18, 87 | 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4, 88 | 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 89 | 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c, 90 | 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, 91 | 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4 92 | }; 93 | 94 | CRC() { 95 | initialiseCRC(); 96 | } 97 | 98 | void initialiseCRC() { 99 | 
globalCrc = 0xffffffff; 100 | } 101 | 102 | int getFinalCRC() { 103 | return ~globalCrc; 104 | } 105 | 106 | int getGlobalCRC() { 107 | return globalCrc; 108 | } 109 | 110 | void setGlobalCRC(int newCrc) { 111 | globalCrc = newCrc; 112 | } 113 | 114 | void updateCRC(int inCh) { 115 | int temp = (globalCrc >> 24) ^ inCh; 116 | if (temp < 0) { 117 | temp = 256 + temp; 118 | } 119 | globalCrc = (globalCrc << 8) ^ CRC.crc32Table[temp]; 120 | } 121 | 122 | void updateCRC(int inCh, int repeat) { 123 | int globalCrcShadow = this.globalCrc; 124 | while (repeat-- > 0) { 125 | int temp = (globalCrcShadow >> 24) ^ inCh; 126 | globalCrcShadow = (globalCrcShadow << 8) ^ crc32Table[(temp >= 0) 127 | ? temp 128 | : (temp + 256)]; 129 | } 130 | this.globalCrc = globalCrcShadow; 131 | } 132 | 133 | int globalCrc; 134 | } 135 | 136 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/support/InfoBox.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.support; 21 | 22 | /** 23 | * A class abstracting Wiki infobox 24 | * 25 | * @author Delip Rao 26 | */ 27 | public class InfoBox { 28 | String infoBoxWikiText = null; 29 | 30 | InfoBox(String infoBoxWikiText) { 31 | this.infoBoxWikiText = infoBoxWikiText; 32 | } 33 | 34 | public String dumpRaw() { 35 | return infoBoxWikiText; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/support/IteratorHandler.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.support; 21 | 22 | public class IteratorHandler implements PageCallbackHandler { 23 | 24 | private WikiXMLParser parser = null; 25 | 26 | public IteratorHandler(WikiXMLParser myParser) { 27 | parser = myParser; 28 | } 29 | 30 | public void process(WikiPage page) { 31 | parser.notifyPage(page); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/support/PageCallbackHandler.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.support; 21 | 22 | /** 23 | * Interface to allow streamed processing of pages. 24 | * This allows a SAX style processing of Wikipedia XML files. 25 | * The registered callback is executed on each page 26 | * element in the XML file. 27 | *
<p/>
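 * A minimal handler (an illustrative sketch; "dumpUrl" is any URL to a
 * wikipedia XML dump):
 * <pre>
 * PageCallbackHandler handler = new PageCallbackHandler() {
 *     public void process(WikiPage page) {
 *         System.out.println(page.getTitle());
 *     }
 * };
 * WikiXMLSAXParser.parseWikipediaDump(dumpUrl, handler);
 * </pre>
 * <p/>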
28 | * Using callbacks will consume less memory, a useful feature for large 29 | * dumps like English and German. 30 | * 31 | * @author Delip Rao 32 | * @see WikiXMLDOMParser 33 | * @see WikiPage 34 | */ 35 | 36 | public interface PageCallbackHandler { 37 | /** 38 | * This is the callback method that should be implemented before 39 | * registering with WikiXMLDOMParser 40 | * 41 | * @param page a wikipedia page object 42 | * @see WikiPage 43 | */ 44 | public void process(WikiPage page); 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/support/SAXPageCallbackHandler.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.support; 21 | 22 | import org.xml.sax.Attributes; 23 | import org.xml.sax.helpers.DefaultHandler; 24 | 25 | /** 26 | * A wrapper class for the PageCallbackHandler 27 | * 28 | * @author Jason Smith 29 | */ 30 | public class SAXPageCallbackHandler extends DefaultHandler { 31 | 32 | private PageCallbackHandler pageHandler; 33 | private WikiPage currentPage; 34 | private String currentTag; 35 | 36 | private String currentWikitext; 37 | private String currentTitle; 38 | private String currentID; 39 | 40 | public SAXPageCallbackHandler(PageCallbackHandler ph) { 41 | pageHandler = ph; 42 | } 43 | 44 | public void startElement(String uri, String name, String qName, Attributes attr) { 45 | currentTag = qName; 46 | if (qName.equals("page")) { 47 | currentPage = new WikiPage(); 48 | currentWikitext = ""; 49 | currentTitle = ""; 50 | currentID = ""; 51 | } 52 | } 53 | 54 | public void endElement(String uri, String name, String qName) { 55 | if (qName.equals("page")) { 56 | currentPage.setTitle(currentTitle); 57 | currentPage.setID(currentID); 58 | currentPage.setWikiText(currentWikitext); 59 | pageHandler.process(currentPage); 60 | } 61 | if (qName.equals("mediawiki")) { 62 | // TODO hasMoreElements() should now return false 63 | } 64 | } 65 | 66 | public void characters(char ch[], int start, int length) { 67 | if (currentTag.equals("title")) { 68 | currentTitle = currentTitle.concat(new String(ch, start, length)); 69 | } 70 | // TODO: To avoid looking at the revision ID, only the first ID is taken. 71 | // I'm not sure how big the block size is in each call to characters(), 72 | // so this may be unsafe.
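        // In general, SAX is free to split an element's text across several
        // characters() calls, which is why "title" and "text" are accumulated
        // with concat() above; the id branch below keeps only the first chunk
        // of the first <id> element and assumes the whole page id fits in it.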
73 | else if ((currentTag.equals("id")) && (currentID.length() == 0)) { 74 | currentID = new String(ch, start, length); 75 | } else if (currentTag.equals("text")) { 76 | currentWikitext = currentWikitext.concat(new String(ch, start, length)); 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/support/WikiPage.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.support; 21 | 22 | import java.util.List; 23 | 24 | /** 25 | * Data structures for a wikipedia page. 26 | * 27 | * @author Delip Rao 28 | */ 29 | public class WikiPage { 30 | 31 | private String title = null; 32 | private WikiTextParser wikiTextParser = null; 33 | private String id = null; 34 | 35 | /** 36 | * Set the page title. This is not intended for direct use. 37 | * 38 | * @param title 39 | */ 40 | public void setTitle(String title) { 41 | this.title = title; 42 | } 43 | 44 | /** 45 | * Set the wiki text associated with this page. 46 | * This setter also introduces side effects. This is not intended for direct use. 47 | * 48 | * @param wtext wiki-formatted text 49 | */ 50 | public void setWikiText(String wtext) { 51 | wikiTextParser = new WikiTextParser(wtext); 52 | } 53 | 54 | /** 55 | * @return a string containing the page title. 56 | */ 57 | public String getTitle() { 58 | return title; 59 | } 60 | 61 | /** 62 | * @param languageCode 63 | * @return a string containing the title translated 64 | * in the given languageCode. 65 | */ 66 | public String getTranslatedTitle(String languageCode) { 67 | return wikiTextParser.getTranslatedTitle(languageCode); 68 | } 69 | 70 | /** 71 | * @return true if this a disambiguation page. 72 | */ 73 | public boolean isDisambiguationPage() { 74 | if (title.contains("(disambiguation)") || 75 | wikiTextParser.isDisambiguationPage()) 76 | return true; 77 | else return false; 78 | } 79 | 80 | /** 81 | * @return true for "special pages" -- like Category:, Wikipedia:, etc 82 | */ 83 | public boolean isSpecialPage() { 84 | return title.contains(":"); 85 | } 86 | 87 | /** 88 | * Use this method to get the wiki text associated with this page. 89 | * Useful for custom processing the wiki text. 90 | * 91 | * @return a string containing the wiki text. 
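     * <p/>
     * For example (an illustrative sketch; "page" is a WikiPage produced by
     * one of the parsers in this package):
     * <pre>
     * if (!page.isRedirect() && !page.isSpecialPage()) {
     *     String markup = page.getWikiText();  // raw wiki markup
     *     String plain = page.getText();       // markup stripped
     * }
     * </pre>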
92 | */ 93 | public String getWikiText() { 94 | return wikiTextParser.getText(); 95 | } 96 | 97 | /** 98 | * @return true if this is a redirection page 99 | */ 100 | public boolean isRedirect() { 101 | return wikiTextParser.isRedirect(); 102 | } 103 | 104 | /** 105 | * @return true if this is a stub page 106 | */ 107 | public boolean isStub() { 108 | return wikiTextParser.isStub(); 109 | } 110 | 111 | /** 112 | * @return the title of the page being redirected to. 113 | */ 114 | public String getRedirectPage() { 115 | return wikiTextParser.getRedirectText(); 116 | } 117 | 118 | /** 119 | * @return plain text stripped of all wiki formatting. 120 | */ 121 | public String getText() { 122 | return wikiTextParser.getPlainText(); 123 | } 124 | 125 | /** 126 | * @return a list of categories the page belongs to, null if this a redirection/disambiguation page 127 | */ 128 | public List getCategories() { 129 | return wikiTextParser.getCategories(); 130 | } 131 | 132 | /** 133 | * @return a list of links contained in the page 134 | */ 135 | public List getLinks() { 136 | return wikiTextParser.getLinks(); 137 | } 138 | 139 | public void setID(String id) { 140 | this.id = id; 141 | } 142 | 143 | public InfoBox getInfoBox() { 144 | return wikiTextParser.getInfoBox(); 145 | } 146 | 147 | public String getID() { 148 | return id; 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/support/WikiPageIterator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.support; 21 | 22 | import java.util.Vector; 23 | 24 | /** 25 | * A class to iterate the pages after the wikipedia XML file has been parsed with {@link WikiXMLDOMParser}. 26 | * 27 | * @author Delip Rao 28 | * @see WikiXMLDOMParser 29 | */ 30 | public class WikiPageIterator { 31 | 32 | private int currentPage = 0; 33 | private int lastPage = 0; 34 | Vector pageList = null; 35 | 36 | public WikiPageIterator(Vector list) { 37 | pageList = list; 38 | if (pageList != null) 39 | lastPage = pageList.size(); 40 | } 41 | 42 | /** 43 | * @return true if there are more pages to be read 44 | */ 45 | public boolean hasMorePages() { 46 | return (currentPage < lastPage); 47 | } 48 | 49 | /** 50 | * Reset the iterator. 51 | */ 52 | public void reset() { 53 | currentPage = 0; 54 | } 55 | 56 | /** 57 | * Advances the iterator by one position. 
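     * <p/>
     * The intended loop (an illustrative sketch):
     * <pre>
     * while (iterator.hasMorePages()) {
     *     WikiPage page = iterator.nextPage();
     *     // ... process page ...
     * }
     * </pre>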
58 | * 59 | * @return a {@link WikiPage} 60 | */ 61 | public WikiPage nextPage() { 62 | if (hasMorePages()) 63 | return pageList.elementAt(currentPage++); 64 | return null; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/support/WikiTextParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.support; 21 | 22 | import java.util.ArrayList; 23 | import java.util.regex.Matcher; 24 | import java.util.regex.Pattern; 25 | 26 | /** 27 | * For internal use only -- Used by the {@link WikiPage} class. 28 | * Can also be used as a stand alone class to parse wiki formatted text. 29 | * 30 | * @author Delip Rao 31 | */ 32 | public class WikiTextParser { 33 | 34 | private String wikiText = null; 35 | private ArrayList pageCats = null; 36 | private ArrayList pageLinks = null; 37 | private boolean redirect = false; 38 | private String redirectString = null; 39 | private static Pattern redirectPattern = 40 | Pattern.compile("#REDIRECT\\s+\\[\\[(.*?)\\]\\]", Pattern.CASE_INSENSITIVE); 41 | private boolean stub = false; 42 | private boolean disambiguation = false; 43 | private static Pattern stubPattern = Pattern.compile("\\-stub\\}\\}"); 44 | // the first letter of pages is case-insensitive 45 | private static Pattern disambCatPattern = 46 | Pattern.compile("\\{\\{[Dd]isambig(uation)?\\}\\}"); 47 | private InfoBox infoBox = null; 48 | 49 | public WikiTextParser(String wtext) { 50 | wikiText = wtext; 51 | Matcher matcher = redirectPattern.matcher(wikiText); 52 | if (matcher.find()) { 53 | redirect = true; 54 | if (matcher.groupCount() == 1) 55 | redirectString = matcher.group(1); 56 | } 57 | matcher = stubPattern.matcher(wikiText); 58 | stub = matcher.find(); 59 | matcher = disambCatPattern.matcher(wikiText); 60 | disambiguation = matcher.find(); 61 | } 62 | 63 | public boolean isRedirect() { 64 | return redirect; 65 | } 66 | 67 | public boolean isStub() { 68 | return stub; 69 | } 70 | 71 | public String getRedirectText() { 72 | return redirectString; 73 | } 74 | 75 | public String getText() { 76 | return wikiText; 77 | } 78 | 79 | public ArrayList getCategories() { 80 | if (pageCats == null) parseCategories(); 81 | return pageCats; 82 | } 83 | 84 | public ArrayList getLinks() { 85 | if (pageLinks == null) parseLinks(); 86 | return pageLinks; 87 | } 88 | 89 | private void parseCategories() { 90 | pageCats = new ArrayList(); 91 | Pattern catPattern = Pattern.compile("\\[\\[[Cc]ategory:(.*?)\\]\\]", Pattern.MULTILINE); 92 | Matcher matcher = catPattern.matcher(wikiText); 93 | while 
(matcher.find()) { 94 | String[] temp = matcher.group(1).split("\\|"); 95 | pageCats.add(temp[0]); 96 | } 97 | } 98 | 99 | private void parseLinks() { 100 | pageLinks = new ArrayList(); 101 | 102 | Pattern catPattern = Pattern.compile("\\[\\[(.*?)\\]\\]", Pattern.MULTILINE); 103 | Matcher matcher = catPattern.matcher(wikiText); 104 | while (matcher.find()) { 105 | String[] temp = matcher.group(1).split("\\|"); 106 | if (temp == null || temp.length == 0) continue; 107 | String link = temp[0]; 108 | if (link.contains(":") == false) { 109 | pageLinks.add(link); 110 | } 111 | } 112 | } 113 | 114 | public String getPlainText() { 115 | String text = wikiText.replaceAll(">", ">"); 116 | text = text.replaceAll("<", "<"); 117 | text = text.replaceAll(".*?", " "); 118 | text = text.replaceAll("", " "); 119 | text = text.replaceAll("\\{\\{.*?\\}\\}", " "); 120 | text = text.replaceAll("\\[\\[.*?:.*?\\]\\]", " "); 121 | text = text.replaceAll("\\[\\[(.*?)\\]\\]", "$1"); 122 | text = text.replaceAll("\\s(.*?)\\|(\\w+\\s)", " $2"); 123 | text = text.replaceAll("\\[.*?\\]", " "); 124 | text = text.replaceAll("\\'+", ""); 125 | return text; 126 | } 127 | 128 | public InfoBox getInfoBox() { 129 | //parseInfoBox is expensive. Doing it only once like other parse* methods 130 | if (infoBox == null) 131 | infoBox = parseInfoBox(); 132 | return infoBox; 133 | } 134 | 135 | private InfoBox parseInfoBox() { 136 | String INFOBOX_CONST_STR = "{{Infobox"; 137 | int startPos = wikiText.indexOf(INFOBOX_CONST_STR); 138 | if (startPos < 0) return null; 139 | int bracketCount = 2; 140 | int endPos = startPos + INFOBOX_CONST_STR.length(); 141 | for (; endPos < wikiText.length(); endPos++) { 142 | switch (wikiText.charAt(endPos)) { 143 | case '}': 144 | bracketCount--; 145 | break; 146 | case '{': 147 | bracketCount++; 148 | break; 149 | default: 150 | } 151 | if (bracketCount == 0) break; 152 | } 153 | String infoBoxText = wikiText.substring(startPos, endPos + 1); 154 | infoBoxText = stripCite(infoBoxText); // strip clumsy {{cite}} tags 155 | // strip any html formatting 156 | infoBoxText = infoBoxText.replaceAll(">", ">"); 157 | infoBoxText = infoBoxText.replaceAll("<", "<"); 158 | infoBoxText = infoBoxText.replaceAll(".*?", " "); 159 | infoBoxText = infoBoxText.replaceAll("", " "); 160 | return new InfoBox(infoBoxText); 161 | } 162 | 163 | private String stripCite(String text) { 164 | String CITE_CONST_STR = "{{cite"; 165 | int startPos = text.indexOf(CITE_CONST_STR); 166 | if (startPos < 0) return text; 167 | int bracketCount = 2; 168 | int endPos = startPos + CITE_CONST_STR.length(); 169 | for (; endPos < text.length(); endPos++) { 170 | switch (text.charAt(endPos)) { 171 | case '}': 172 | bracketCount--; 173 | break; 174 | case '{': 175 | bracketCount++; 176 | break; 177 | default: 178 | } 179 | if (bracketCount == 0) break; 180 | } 181 | text = text.substring(0, startPos - 1) + text.substring(endPos); 182 | return stripCite(text); 183 | } 184 | 185 | public boolean isDisambiguationPage() { 186 | return disambiguation; 187 | } 188 | 189 | public String getTranslatedTitle(String languageCode) { 190 | Pattern pattern = Pattern.compile("^\\[\\[" + languageCode + ":(.*?)\\]\\]$", Pattern.MULTILINE); 191 | Matcher matcher = pattern.matcher(wikiText); 192 | if (matcher.find()) { 193 | return matcher.group(1); 194 | } 195 | return null; 196 | } 197 | 198 | } 199 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/support/WikiXMLParser.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.support; 21 | 22 | import org.elasticsearch.river.wikipedia.bzip2.CBZip2InputStream; 23 | import org.xml.sax.InputSource; 24 | 25 | import java.io.BufferedReader; 26 | import java.io.IOException; 27 | import java.io.InputStream; 28 | import java.io.InputStreamReader; 29 | import java.net.URL; 30 | import java.util.zip.GZIPInputStream; 31 | 32 | /** 33 | * @author Delip Rao 34 | * @author Jason Smith 35 | */ 36 | public abstract class WikiXMLParser { 37 | 38 | private URL wikiXMLFile = null; 39 | protected WikiPage currentPage = null; 40 | private BufferedReader br; 41 | 42 | public WikiXMLParser(URL fileName) { 43 | wikiXMLFile = fileName; 44 | } 45 | 46 | /** 47 | * Set a callback handler. The callback is executed every time a 48 | * page instance is detected in the stream. Custom handlers are 49 | * implementations of {@link PageCallbackHandler} 50 | * 51 | * @param handler 52 | * @throws Exception 53 | */ 54 | public abstract void setPageCallback(PageCallbackHandler handler) throws Exception; 55 | 56 | /** 57 | * The main parse method. 
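     * <p/>
     * End-to-end use of the parser hierarchy (an illustrative sketch;
     * "dumpUrl" is any URL to a plain, .gz or .bz2 MediaWiki XML dump):
     * <pre>
     * WikiXMLParser parser = WikiXMLParserFactory.getSAXParser(dumpUrl);
     * parser.setPageCallback(new PageCallbackHandler() {
     *     public void process(WikiPage page) {
     *         // handle each page as it is parsed
     *     }
     * });
     * parser.parse();
     * parser.close();
     * </pre>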
58 | * 59 | * @throws Exception 60 | */ 61 | public abstract void parse() throws Exception; 62 | 63 | /** 64 | * @return an iterator to the list of pages 65 | * @throws Exception 66 | */ 67 | public abstract WikiPageIterator getIterator() throws Exception; 68 | 69 | /** 70 | * @return An InputSource created from wikiXMLFile 71 | * @throws Exception 72 | */ 73 | protected InputSource getInputSource() throws Exception { 74 | if (wikiXMLFile.toExternalForm().endsWith(".gz")) { 75 | br = new BufferedReader(new InputStreamReader(new GZIPInputStream(wikiXMLFile.openStream()), "UTF-8")); 76 | } else if (wikiXMLFile.toExternalForm().endsWith(".bz2")) { 77 | InputStream fis = wikiXMLFile.openStream(); 78 | byte[] ignoreBytes = new byte[2]; 79 | fis.read(ignoreBytes); //"B", "Z" bytes from commandline tools 80 | CBZip2InputStream cbZip2InputStream = new CBZip2InputStream(fis); 81 | br = new BufferedReader(new InputStreamReader(cbZip2InputStream, "UTF-8")); 82 | } else { 83 | br = new BufferedReader(new InputStreamReader(wikiXMLFile.openStream(), "UTF-8")); 84 | } 85 | 86 | return new InputSource(br); 87 | } 88 | 89 | protected void notifyPage(WikiPage page) { 90 | currentPage = page; 91 | } 92 | 93 | public void close() throws IOException { 94 | if (br != null) { 95 | br.close(); 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/support/WikiXMLParserFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.support; 21 | 22 | import java.net.URL; 23 | 24 | /** 25 | * @author Delip Rao 26 | */ 27 | public class WikiXMLParserFactory { 28 | 29 | public static WikiXMLParser getSAXParser(URL fileName) { 30 | return new WikiXMLSAXParser(fileName); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/support/WikiXMLSAXParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.river.wikipedia.support; 21 | 22 | import org.xml.sax.SAXException; 23 | import org.xml.sax.XMLReader; 24 | import org.xml.sax.helpers.XMLReaderFactory; 25 | 26 | import java.net.URL; 27 | 28 | /** 29 | * A SAX Parser for Wikipedia XML dumps. 30 | * 31 | * @author Jason Smith 32 | */ 33 | public class WikiXMLSAXParser extends WikiXMLParser { 34 | 35 | private XMLReader xmlReader; 36 | private PageCallbackHandler pageHandler = null; 37 | 38 | public WikiXMLSAXParser(URL fileName) { 39 | super(fileName); 40 | try { 41 | xmlReader = XMLReaderFactory.createXMLReader(); 42 | pageHandler = new IteratorHandler(this); 43 | } catch (SAXException e) { 44 | throw new RuntimeException(e); 45 | } 46 | } 47 | 48 | /** 49 | * Set a callback handler. The callback is executed every time a 50 | * page instance is detected in the stream. Custom handlers are 51 | * implementations of {@link PageCallbackHandler} 52 | * 53 | * @param handler 54 | * @throws Exception 55 | */ 56 | public void setPageCallback(PageCallbackHandler handler) throws Exception { 57 | pageHandler = handler; 58 | } 59 | 60 | /** 61 | * The main parse method. 62 | * 63 | * @throws Exception 64 | */ 65 | public void parse() throws Exception { 66 | xmlReader.setContentHandler(new SAXPageCallbackHandler(pageHandler)); 67 | xmlReader.parse(getInputSource()); 68 | } 69 | 70 | /** 71 | * This parser is event driven, so it 72 | * can't provide a page iterator. 73 | */ 74 | @Override 75 | public WikiPageIterator getIterator() throws Exception { 76 | if (!(pageHandler instanceof IteratorHandler)) { 77 | throw new Exception("Custom page callback found. Will not iterate."); 78 | } 79 | throw new UnsupportedOperationException(); 80 | } 81 | 82 | /** 83 | * A convenience method for the Wikipedia SAX interface 84 | * 85 | * @param dumpFile - path to the Wikipedia dump 86 | * @param handler - callback handler used for parsing 87 | * @throws Exception 88 | */ 89 | public static void parseWikipediaDump(URL dumpFile, 90 | PageCallbackHandler handler) throws Exception { 91 | WikiXMLParser wxsp = WikiXMLParserFactory.getSAXParser(dumpFile); 92 | wxsp.setPageCallback(handler); 93 | wxsp.parse(); 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/river/wikipedia/support/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | /** 21 | * Copied from wikixmlj on 2010-10-03. 22 | * 23 | * Changed from File handling to URL handling, and removed Dom parser. 24 | */ 25 | package org.elasticsearch.river.wikipedia.support; 26 | -------------------------------------------------------------------------------- /src/main/resources/es-plugin.properties: -------------------------------------------------------------------------------- 1 | plugin=org.elasticsearch.plugin.river.wikipedia.WikipediaRiverPlugin 2 | version=${project.version} 3 | 4 | -------------------------------------------------------------------------------- /src/test/java/org/elasticsearch/river/wikipedia/WikipediaRiverTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 |  */
19 | 
20 | package org.elasticsearch.river.wikipedia;
21 | 
22 | import org.elasticsearch.action.count.CountResponse;
23 | import org.elasticsearch.common.base.Predicate;
24 | import org.elasticsearch.common.settings.Settings;
25 | import org.elasticsearch.indices.IndexMissingException;
26 | import org.elasticsearch.plugins.PluginsService;
27 | import org.elasticsearch.river.wikipedia.helper.HttpClient;
28 | import org.elasticsearch.river.wikipedia.helper.HttpClientResponse;
29 | import org.elasticsearch.test.ElasticsearchIntegrationTest;
30 | import org.elasticsearch.test.junit.annotations.Network;
31 | import org.junit.After;
32 | import org.junit.Before;
33 | import org.junit.Test;
34 | 
35 | import java.io.IOException;
36 | import java.util.concurrent.TimeUnit;
37 | 
38 | import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
39 | import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
40 | import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
41 | import static org.hamcrest.CoreMatchers.equalTo;
42 | 
43 | /**
44 |  * This test requires an internet connection.
45 |  * To run it, use -Dtests.network=true
46 |  */
47 | @ElasticsearchIntegrationTest.ClusterScope(
48 |         scope = ElasticsearchIntegrationTest.Scope.SUITE, transportClientRatio = 0.0)
49 | @Network
50 | public class WikipediaRiverTest extends ElasticsearchIntegrationTest {
51 | 
52 |     @Override
53 |     protected Settings nodeSettings(int nodeOrdinal) {
54 |         return Settings.builder()
55 |                 .put(super.nodeSettings(nodeOrdinal))
56 |                 .put("plugins." + PluginsService.LOAD_PLUGIN_FROM_CLASSPATH, true)
57 |                 .build();
58 |     }
59 | 
60 |     @Before
61 |     public void createEmptyRiverIndex() {
62 |         // We want to force the _river index to use 1 shard and 0 replicas
63 |         client().admin().indices().prepareCreate("_river").setSettings(Settings.builder()
64 |                 .put(SETTING_NUMBER_OF_SHARDS, 1)
65 |                 .put(SETTING_NUMBER_OF_REPLICAS, 0)).get();
66 |     }
67 | 
68 |     @After
69 |     public void deleteRiverAndWait() throws InterruptedException {
70 |         logger.info("  --> remove all wikipedia rivers");
71 |         client().admin().indices().prepareDelete("_river").get();
72 |         // Wait a bit to make sure that all bulk requests have been processed
73 |         awaitBusy(new Predicate<Object>() {
74 |             @Override
75 |             public boolean apply(Object o) {
76 |                 return false;
77 |             }
78 |         }, 2, TimeUnit.SECONDS);
79 |     }
80 | 
81 |     private boolean isUrlAccessible(String server, String url) {
82 |         HttpClientResponse response = new HttpClient(server, 80).request("HEAD", url);
83 |         if (response.errorCode() == 200) {
84 |             logger.info("   -> Internet working for [{}{}]", server, url);
85 |             return true;
86 |         } else {
87 |             logger.info("   -> Internet not working for [{}{}]: {}", server, url, response.errorCode());
88 |             return false;
89 |         }
90 |     }
91 | 
92 |     @Test
93 |     public void testWikipediaRiver() throws IOException, InterruptedException {
94 |         if (isUrlAccessible("download.wikimedia.org", "/enwiki/latest/enwiki-latest-pages-articles.xml.bz2")) {
95 |             logger.info("  --> create wikipedia river");
96 |             index("_river", "wikipedia", "_meta", jsonBuilder()
97 |                     .startObject()
98 |                     .field("type", "wikipedia")
99 |                     .startObject("index")
100 |                     .field("bulk_size", 100)
101 |                     .field("flush_interval", "100ms")
102 |                     .endObject()
103 |                     .endObject());
104 | 
105 |             logger.info("  --> waiting for some documents");
106 |             // Check that docs are indexed by the river
107 |             assertThat(awaitBusy(new Predicate<Object>() {
108 |                 public boolean apply(Object obj) {
109 |                     try {
110 |                         refresh();
111 |                         CountResponse response = client().prepareCount("wikipedia").get();
112 |                         logger.info("  -> got {} docs in {} index", response.getCount(), "wikipedia");
113 |                         return response.getCount() > 0;
114 |                     } catch (IndexMissingException e) {
115 |                         return false;
116 |                     }
117 |                 }
118 |             }, 1, TimeUnit.MINUTES), equalTo(true));
119 |         }
120 |     }
121 | 
122 |     /**
123 |      * Tests another Wikipedia dump source:
124 |      * http://dumps.wikimedia.org/frwiki/latest/frwiki-latest-pages-articles.xml.bz2
125 |      */
126 |     @Test
127 |     public void testWikipediaRiverFrench() throws IOException, InterruptedException {
128 |         if (isUrlAccessible("dumps.wikimedia.org", "/frwiki/latest/frwiki-latest-pages-articles.xml.bz2")) {
129 |             logger.info("  --> create wikipedia river");
130 |             index("_river", "wikipedia", "_meta", jsonBuilder()
131 |                     .startObject()
132 |                     .field("type", "wikipedia")
133 |                     .startObject("wikipedia")
134 |                     .field("url", "http://dumps.wikimedia.org/frwiki/latest/frwiki-latest-pages-articles.xml.bz2")
135 |                     .endObject()
136 |                     .startObject("index")
137 |                     .field("bulk_size", 100)
138 |                     .field("flush_interval", "1s")
139 |                     .endObject()
140 |                     .endObject());
141 | 
142 |             logger.info("  --> waiting for some documents");
143 |             // Check that docs are indexed by the river
144 |             assertThat(awaitBusy(new Predicate<Object>() {
145 |                 public boolean apply(Object obj) {
146 |                     try {
147 |                         refresh();
148 |                         CountResponse response = client().prepareCount("wikipedia").get();
149 |                         logger.info("  -> got {} docs in {} index", response.getCount(), "wikipedia");
150 |                         return response.getCount() > 0;
151 |                     } catch (IndexMissingException e) {
152 |                         return false;
153 |                     }
154 |                 }
155 |             }, 1, TimeUnit.MINUTES), equalTo(true));
156 |         }
157 |     }
158 | }
159 | 
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/river/wikipedia/helper/HttpClient.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to Elasticsearch under one or more contributor
3 |  * license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright
5 |  * ownership. Elasticsearch licenses this file to you under
6 |  * the Apache License, Version 2.0 (the "License"); you may
7 |  * not use this file except in compliance with the License.
8 |  * You may obtain a copy of the License at
9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing,
13 |  * software distributed under the License is distributed on an
14 |  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 |  * KIND, either express or implied. See the License for the
16 |  * specific language governing permissions and limitations
17 |  * under the License.
18 |  */
19 | package org.elasticsearch.river.wikipedia.helper;
20 | 
21 | import org.elasticsearch.ElasticsearchException;
22 | import org.elasticsearch.common.base.Charsets;
23 | import org.elasticsearch.common.io.Streams;
24 | 
25 | import java.io.IOException;
26 | import java.io.InputStream;
27 | import java.io.InputStreamReader;
28 | import java.io.OutputStreamWriter;
29 | import java.net.HttpURLConnection;
30 | import java.net.MalformedURLException;
31 | import java.net.URL;
32 | import java.nio.charset.StandardCharsets;
33 | import java.util.List;
34 | import java.util.Map;
35 | 
36 | public class HttpClient {
37 | 
38 |     private final URL baseUrl;
39 | 
40 |     public HttpClient(String hostname, Integer port) {
41 |         try {
42 |             baseUrl = new URL("http", hostname, port, "/");
43 |         } catch (MalformedURLException e) {
44 |             throw new ElasticsearchException("Cannot build base url for [" + hostname + ":" + port + "]", e);
45 |         }
46 |     }
47 | 
48 |     public HttpClientResponse request(String path) {
49 |         return request("GET", path, null, null);
50 |     }
51 | 
52 |     public HttpClientResponse request(String method, String path) {
53 |         return request(method, path, null, null);
54 |     }
55 | 
56 |     public HttpClientResponse request(String method, String path, String payload) {
57 |         return request(method, path, null, payload);
58 |     }
59 | 
60 |     public HttpClientResponse request(String method, String path, Map<String, String> headers, String payload) {
61 |         URL url;
62 |         try {
63 |             url = new URL(baseUrl, path);
64 |         } catch (MalformedURLException e) {
65 |             throw new ElasticsearchException("Cannot parse " + path, e);
66 |         }
67 | 
68 |         HttpURLConnection urlConnection;
69 |         try {
70 |             urlConnection = (HttpURLConnection) url.openConnection();
71 |             urlConnection.setRequestMethod(method);
72 |             if (headers != null) {
73 |                 for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
74 |                     urlConnection.setRequestProperty(headerEntry.getKey(), headerEntry.getValue());
75 |                 }
76 |             }
77 | 
78 |             if (payload != null) {
79 |                 urlConnection.setDoOutput(true);
80 |                 urlConnection.setRequestProperty("Content-Type", "application/json");
81 |                 urlConnection.setRequestProperty("Accept", "application/json");
82 |                 OutputStreamWriter osw = new OutputStreamWriter(urlConnection.getOutputStream(), StandardCharsets.UTF_8);
83 |                 osw.write(payload);
84 |                 osw.flush();
85 |                 osw.close();
86 |             }
87 | 
88 |             urlConnection.connect();
89 |         } catch (IOException e) {
90 |             throw new ElasticsearchException("Cannot connect to [" + url + "]", e);
91 |         }
92 | 
93 |         int errorCode = -1;
94 |         Map<String, List<String>> respHeaders = null;
95 |         try {
96 |             errorCode = urlConnection.getResponseCode();
97 |             respHeaders = urlConnection.getHeaderFields();
98 |             InputStream inputStream = urlConnection.getInputStream();
99 |             String body = null;
100 |             try {
101 |                 body = Streams.copyToString(new InputStreamReader(inputStream, Charsets.UTF_8));
102 |             } catch (IOException e1) {
103 |                 throw new ElasticsearchException("problem reading response stream", e1);
104 |             }
105 |             return new HttpClientResponse(body, errorCode, respHeaders, null);
106 |         } catch (IOException e) {
107 |             InputStream errStream = urlConnection.getErrorStream();
108 |             String body = null;
109 |             if (errStream != null) {
110 |                 try {
111 |                     body = Streams.copyToString(new InputStreamReader(errStream, Charsets.UTF_8));
112 |                 } catch (IOException e1) {
113 |                     throw new ElasticsearchException("problem reading error stream", e1);
114 |                 }
115 |             }
116 |             return new HttpClientResponse(body, errorCode, respHeaders, e);
117 |         } finally {
118 |             urlConnection.disconnect();
119 |         }
120 |     }
121 | }
122 | 
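123 | // Usage sketch: issue a HEAD request and check the response code, the
124 | // way WikipediaRiverTest#isUrlAccessible uses this helper; the host and
125 | // path below are illustrative only.
126 | //
127 | //   HttpClient client = new HttpClient("dumps.wikimedia.org", 80);
128 | //   HttpClientResponse response = client.request("HEAD",
129 | //           "/frwiki/latest/frwiki-latest-pages-articles.xml.bz2");
130 | //   if (response.errorCode() == 200) {
131 | //       // the dump is reachable over plain HTTP
132 | //   }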
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/river/wikipedia/helper/HttpClientResponse.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to Elasticsearch under one or more contributor
3 |  * license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright
5 |  * ownership. Elasticsearch licenses this file to you under
6 |  * the Apache License, Version 2.0 (the "License"); you may
7 |  * not use this file except in compliance with the License.
8 |  * You may obtain a copy of the License at
9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing,
13 |  * software distributed under the License is distributed on an
14 |  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 |  * KIND, either express or implied. See the License for the
16 |  * specific language governing permissions and limitations
17 |  * under the License.
18 |  */
19 | package org.elasticsearch.river.wikipedia.helper;
20 | 
21 | import java.util.List;
22 | import java.util.Map;
23 | 
24 | public class HttpClientResponse {
25 |     private final String response;
26 |     private final int errorCode;
27 |     private Map<String, List<String>> headers;
28 |     private final Throwable e;
29 | 
30 |     public HttpClientResponse(String response, int errorCode, Map<String, List<String>> headers, Throwable e) {
31 |         this.response = response;
32 |         this.errorCode = errorCode;
33 |         this.headers = headers;
34 |         this.e = e;
35 |     }
36 | 
37 |     public String response() {
38 |         return response;
39 |     }
40 | 
41 |     public int errorCode() {
42 |         return errorCode;
43 |     }
44 | 
45 |     public Throwable cause() {
46 |         return e;
47 |     }
48 | 
49 |     public Map<String, List<String>> getHeaders() {
50 |         return headers;
51 |     }
52 | 
53 |     public String getHeader(String name) {
54 |         if (headers == null) {
55 |             return null;
56 |         }
57 |         List<String> vals = headers.get(name);
58 |         if (vals == null || vals.size() == 0) {
59 |             return null;
60 |         }
61 |         return vals.iterator().next();
62 |     }
63 | }
64 | 
--------------------------------------------------------------------------------