├── .gitattributes ├── .github ├── CODEOWNERS ├── dependabot.yml └── workflows │ └── tests.yml ├── .gitignore ├── CONTRIBUTORS.md ├── LICENSE ├── README.md ├── pom.xml └── src ├── main └── java │ └── com │ └── sigpwned │ └── chardet4j │ ├── ByteOrderMark.java │ ├── Chardet.java │ ├── com │ └── ibm │ │ └── icu │ │ └── text │ │ ├── CharsetDetector.java │ │ ├── CharsetMatch.java │ │ ├── CharsetRecog_2022.java │ │ ├── CharsetRecog_UTF8.java │ │ ├── CharsetRecog_Unicode.java │ │ ├── CharsetRecog_mbcs.java │ │ ├── CharsetRecog_sbcs.java │ │ ├── CharsetRecognizer.java │ │ └── package-info.java │ ├── io │ ├── BomAwareInputStream.java │ └── DecodedInputStreamReader.java │ └── util │ ├── ByteStreams.java │ └── CharStreams.java └── test ├── java └── com │ └── sigpwned │ └── chardet4j │ └── ChardetTest.java └── resources └── webpage.html /.gitattributes: -------------------------------------------------------------------------------- 1 | **/*.html -linguist-detectable 2 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @sigpwned -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "maven" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | - package-ecosystem: "github-actions" 8 | directory: "/" 9 | schedule: 10 | interval: "daily" 11 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | types: 9 | - opened 10 | - synchronize 11 | - reopened 12 | branches: 13 | - main 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | with: 21 | fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis 22 | - name: Set up JDK 8 23 | uses: actions/setup-java@v4 24 | with: 25 | java-version: 8 26 | distribution: temurin 27 | cache: maven 28 | - name: Cache Maven packages 29 | uses: actions/cache@v4 30 | with: 31 | path: ~/.m2 32 | key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} 33 | restore-keys: ${{ runner.os }}-m2 34 | - name: Build and analyze 35 | run: | 36 | mvn \ 37 | -B \ 38 | clean \ 39 | verify \ 40 | --file pom.xml 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # From https://github.com/github/gitignore/blob/main/Java.gitignore 2 | 3 | # Compiled class file 4 | *.class 5 | 6 | # Log file 7 | *.log 8 | 9 | # BlueJ files 10 | *.ctxt 11 | 12 | # Mobile Tools for Java (J2ME) 13 | .mtj.tmp/ 14 | 15 | # Package Files # 16 | *.jar 17 | *.war 18 | *.nar 19 | *.ear 20 | *.zip 21 | *.tar.gz 22 | *.rar 23 | 24 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 25 | hs_err_pid* 26 | replay_pid* 27 | 28 | # From https://github.com/github/gitignore/blob/main/Maven.gitignore 29 | 30 | target/ 31 | pom.xml.tag 32 | pom.xml.releaseBackup 33 | pom.xml.versionsBackup 34 | pom.xml.next 35 | release.properties 36 | dependency-reduced-pom.xml 37 | buildNumber.properties 38 | .mvn/timing.properties 39 | # 
https://github.com/takari/maven-wrapper#usage-without-binary-jar 40 | .mvn/wrapper/maven-wrapper.jar 41 | 42 | # Eclipse m2e generated files 43 | # Eclipse Core 44 | .project 45 | # JDT-specific (Eclipse Java Development Tools) 46 | .classpath 47 | 48 | # Other 49 | 50 | # Emacs temporary files 51 | *~ 52 | 53 | # Eclipse m2e 54 | .settings 55 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | ## Special thanks for all the people who have contributed code to this project so far: 4 | 5 | * [sigpwned](https://github.com/sigpwned) (Founder) 6 | 7 | ## Thanks to everyone who has reported issues to this project so far: 8 | 9 | * [chrisbrookes](https://github.com/chrisbrookes) 10 | 11 | ## I would like to join this list. How can I help the project? 12 | 13 | Outstanding! We're currently looking for contributions for the following: 14 | 15 | - [ ] Bug fixes 16 | - [ ] More tests 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CHARDET4J [![tests](https://github.com/sigpwned/chardet4j/actions/workflows/tests.yml/badge.svg)](https://github.com/sigpwned/chardet4j/actions/workflows/tests.yml) [![Maven Central](https://img.shields.io/maven-central/v/com.sigpwned/chardet4j)](https://central.sonatype.com/search?q=g%3Acom.sigpwned%20a%3Achardet4j) [![javadoc](https://javadoc.io/badge2/com.sigpwned/chardet4j/javadoc.svg)](https://javadoc.io/doc/com.sigpwned/chardet4j) 2 | 3 | ## Introduction 4 | 5 | The state-of-the-art character set detection library for Java is 6 | [icu4j](https://github.com/unicode-org/icu). However, the icu4j JAR 7 | file is about 13MB. This is a hefty price to pay for programs that 8 | only require charset detection! There should be a smaller option of 9 | the same quality. 10 | 11 | The chardet4j library pulls the `CharsetDetector` feature from icu4j 12 | and repackages it as this standalone library. This allows programs to 13 | make good use of this important feature without bloating their 14 | JARs. At the time of this writing, the chardet4j JAR comes in around 15 | 85KB. There are no dependencies. 16 | 17 | This library also implements some other important components of 18 | character set detection and decoding, namely byte order mark handling. 19 | 20 | ## Features 21 | 22 | The library assists the user with detecting character set encodings for byte 23 | streams and decoding them into character streams. It offers dedicated 24 | abstractions for byte order marks (BOMs) and methods for identifying 25 | and decoding character encodings for both byte arrays and input streams. 26 | 27 | The library uses the following algorithm to determine the character encoding of 28 | binary data: 29 | 30 | 1. Check for a BOM. If one is present, then trust it, and use the corresponding 31 | charset to decode the data. 32 | 2. Use a battery of bespoke character set detectors to guess which charset is 33 | most likely. Users may provide a declared encoding, which provides a boost 34 | to the given charset in this estimation process. If a charset is identified 35 | with sufficient confidence, then use it to decode the data. 36 | 3. Otherwise, use the default charset, if one is given, to decode the data. 37 | 38 | ## Installation 39 | 40 | The library can be found in Maven Central with the following coordinates: 41 | 42 | <dependency> 43 | <groupId>com.sigpwned</groupId> 44 | <artifactId>chardet4j</artifactId> 45 | <version>75.1.2</version> 46 | </dependency> 47 | 48 | It is compatible with Java versions 8 and later. chardet4j has no dependencies. 49 | 50 | The `$major.$minor.$patch` version of the library is determined by the underlying 51 | icu4j version and the local release version. The `$major` and `$minor` are taken 52 | from the icu4j version, and `$patch` is the release number of this library for 53 | the icu4j version, starting with 0. 54 | 55 | ## Getting Started 56 | 57 | To decode an `InputStream` to a `Reader` by detecting its character set: 58 | 59 | try (Reader chars=Chardet.decode(bytes, StandardCharsets.UTF_8)) { 60 | // Process chars here 61 | } 62 | 63 | Charset detection is important when dealing with content of unknown provenance, 64 | like content downloaded from the internet or text files uploaded by users. In 65 | such cases, users often have a declared encoding, typically from a content type.
66 | The name of the declared encoding can be provided as a hint to charset 67 | detection: 68 | 69 | try (Reader chars=Chardet.decode(bytes, declaredEncoding, StandardCharsets.UTF_8)) { 70 | // Process chars here 71 | } 72 | 73 | Byte arrays can be converted directly to Strings as well: 74 | 75 | String chars=Chardet.decode(bytes, declaredEncoding, StandardCharsets.UTF_8); 76 | 77 | Users only interested in detection can detect the charset directly, or by name 78 | in case the detected charset is not supported by the JVM: 79 | 80 | // Throws an UnsupportedCharsetException if the charset is not supported by the JVM 81 | Optional<Charset> maybeCharset = Chardet.detectCharset(bytes, declaredEncoding); 82 | 83 | // Never throws 84 | Optional<String> maybeCharsetName = Chardet.detectCharsetName(bytes, declaredEncoding); 85 | 86 | ## Advanced Usage 87 | 88 | The following are more sophisticated use cases and edge cases that most users 89 | will not need to worry about. 90 | 91 | ### Working with BOMs Directly 92 | 93 | The easiest way to work with byte order marks directly is with the 94 | `BomAwareInputStream` class: 95 | 96 | try (BomAwareInputStream bomed=BomAwareInputStream.detect(in)) { 97 | if(bomed.bom().isPresent()) { 98 | // A BOM was detected in this byte stream, and can be accessed using 99 | // bomed.bom() 100 | } else { 101 | // No BOM was detected in this byte stream. 102 | } 103 | } 104 | 105 | It is not typically required to work with BOMs directly, but it can be useful 106 | when creating a custom decode pipeline. 107 | 108 | ### Accessing the Character Encoding 109 | 110 | The easiest way to determine which character encoding is in use is with the 111 | `DecodedInputStreamReader` class: 112 | 113 | try (DecodedInputStreamReader chars=Chardet.decode(bytes, StandardCharsets.UTF_8)) { 114 | // The charset that was detected and is being used to decode the given byte 115 | // stream can be accessed using chars.charset() 116 | Charset charset = chars.charset(); 117 | } 118 | 119 | ### Handling Unsupported Charsets 120 | 121 | The Java Standard only requires that distributions support the 122 | [standard charsets](https://docs.oracle.com/javase/8/docs/api/index.html?java/nio/charset/StandardCharsets.html) 123 | ISO-8859-1, US-ASCII, UTF-8, UTF-16, UTF-16BE, and UTF-16LE. This library detects those 124 | charsets and many more besides, so there is a possibility that the detected 125 | charset is not supported by the current JVM. 126 | 127 | Users are unlikely to hit this situation in the wild, since (a) Java generally 128 | supports almost all of the charsets this library detects, and (b) the 129 | unsupported charsets are scarce in the wild, and growing scarcer every year. 130 | 131 | Regardless, there are a couple of ways to manage this situation. 132 | 133 | #### Catch UnsupportedCharsetException 134 | 135 | The library throws an `UnsupportedCharsetException` when the detected charset is not 136 | supported by the current JVM. Users are free to catch this exception and handle it 137 | as desired. 138 | 139 | try (Reader chars=Chardet.decode(bytes, StandardCharsets.UTF_8)) { 140 | // Process chars here 141 | } catch(UnsupportedCharsetException e) { 142 | // The charset was detected, but is not supported by the current JVM.
There are a 143 | // few ways this is typically handled: 144 | // 145 | // - Propagate as an IOException, since the content cannot be decoded properly 146 | // - Ignore the error and use a default charset 147 | } 148 | 149 | #### Detect Charset Names 150 | 151 | Rather than working with charsets, work with charset names instead. This will 152 | never throw an exception. 153 | 154 | Optional<String> maybeCharsetName = Chardet.detectCharsetName(bytes); 155 | if(maybeCharsetName.isPresent()) { 156 | // The charset was detected successfully, and the name can be accessed using 157 | // maybeCharsetName.get() 158 | } else { 159 | // The charset could not be detected 160 | } 161 | 162 | ### Using Custom Charsets 163 | 164 | Users who wish to add new charsets to the JVM should follow the instructions 165 | in the 166 | [CharsetProvider](https://docs.oracle.com/javase/8/docs/api/java/nio/charset/spi/CharsetProvider.html) 167 | class documentation. The library will automatically pick up any such new charsets. 168 | 169 | ## Configuration 170 | 171 | The following configuration variables are available to customize the behavior of 172 | the library. 173 | 174 | ### System Property chardet4j.detect.bufsize 175 | 176 | One way the library detects character encodings is by analyzing the leading 177 | bytes of the input. The more data the library analyzes, the more accurate 178 | the estimate will be, but the longer detection will take. By default, this buffer is 179 | 8192 bytes, or 8KiB. Users can change this value by setting the 180 | `chardet4j.detect.bufsize` system property. For example, to set this value to 181 | 16KiB, use: 182 | 183 | java -Dchardet4j.detect.bufsize=16384 ... 184 | 185 | Adjusting the buffer size can be useful when dealing with particularly large 186 | files where detection accuracy or performance might be a concern.
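The property can also be set programmatically, as long as it is set before the `Chardet` class is first loaded, since the buffer size is captured once in a static field. Below is a minimal sketch; the class name and the `data.bin` input path are placeholders for illustration:

    import java.io.InputStream;
    import java.io.Reader;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import com.sigpwned.chardet4j.Chardet;

    public class DetectBufsizeExample {
        public static void main(String[] args) throws Exception {
            // Must run before Chardet is first used; the value is read once into a static field
            System.setProperty("chardet4j.detect.bufsize", "16384");

            try (InputStream in = Files.newInputStream(Paths.get("data.bin"));
                    Reader chars = Chardet.decode(in, StandardCharsets.UTF_8)) {
                // Up to the first 16KiB of the stream is now used for charset detection
                char[] buf = new char[4096];
                for (int n = chars.read(buf); n != -1; n = chars.read(buf)) {
                    // Process chars here
                }
            }
        }
    }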
187 | 188 | ## Supported Character Encodings 189 | 190 | The chardet4j library and Java in general supports the following character 191 | encodings at the following levels: 192 | 193 | | Name | Standard | ICU4J | BOM | Laptop | 194 | |:------------:|:--------:|:-----:|:---:|:------:| 195 | | Big5 | | ✔ | | ✔ | 196 | | EUC-JP | | ✔ | | ✔ | 197 | | EUC-KR | | ✔ | | ✔ | 198 | | GB18030 | | ✔ | ✔ | ✔ | 199 | | ISO-2022-CN | | ✔ | | ✔ | 200 | | ISO-2022-JP | | ✔ | | ✔ | 201 | | ISO-2022-KR | | ✔ | | ✔ | 202 | | ISO-8859-1 | | ✔ | | ✔ | 203 | | ISO-8859-2 | | ✔ | | ✔ | 204 | | ISO-8859-5 | | ✔ | | ✔ | 205 | | ISO-8859-6 | | ✔ | | ✔ | 206 | | ISO-8859-7 | | ✔ | | ✔ | 207 | | ISO-8859-8 | | ✔ | | ✔ | 208 | | ISO-8859-8-I | | ✔ | | | 209 | | ISO-8859-9 | | ✔ | | ✔ | 210 | | KOI8-R | | ✔ | | ✔ | 211 | | Shift_JIS | | ✔ | | ✔ | 212 | | US-ASCII | ✔ | ✔* | | ✔ | 213 | | UTF-1 | | | ✔ | | 214 | | UTF-16BE | ✔ | ✔ | ✔ | ✔ | 215 | | UTF-16LE | ✔ | ✔ | ✔ | ✔ | 216 | | UTF-32BE | | ✔ | ✔ | ✔ | 217 | | UTF-32LE | | ✔ | ✔ | ✔ | 218 | | UTF-8 | ✔ | ✔ | ✔ | ✔ | 219 | | UTF-EBCDIC | | | ✔ | | 220 | | windows-1250 | | ✔ | | ✔ | 221 | | windows-1251 | | ✔ | | ✔ | 222 | | windows-1252 | | ✔ | | ✔ | 223 | | windows-1253 | | ✔ | | ✔ | 224 | | windows-1254 | | ✔ | | ✔ | 225 | | windows-1255 | | ✔ | | ✔ | 226 | | windows-1256 | | ✔ | | ✔ | 227 | 228 | Notes: 229 | `*`: ICU4J detects US-ASCII as ISO-8859-1, a superset of US-ASCII 230 | 231 | The support levels have the following meanings: 232 | 233 | * `Standard` -- The Java Standard requires that all JVMs support this 234 | character encoding 235 | * `ICU4J` -- The ICU4J project has a bespoke charset recognizer for this 236 | character encoding 237 | * `BOM` -- The character encoding can be detected by Byte Order Mark 238 | * `Laptop` -- The character sets supported by `java version "1.8.0_321"` on my 239 | laptop (Obviously, this test is completely unscientific. If you have a 240 | better suggestion, please open an issue!) 241 | 242 | ## Licensing 243 | 244 | The icu library is released under the ICU license. The chardet4j library is 245 | released under the Apache license. For more details, see the LICENSE file. 
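As a footnote to the `Laptop` column in the table above: which encodings are actually available depends on the running JVM, and this can be checked directly with `java.nio.charset.Charset`. The following sketch is not specific to chardet4j; the charset names are simply copied from the table:

    import java.nio.charset.Charset;

    public class SupportedEncodingsCheck {
        public static void main(String[] args) {
            // Charset names from the table above
            String[] names = {"Big5", "EUC-JP", "EUC-KR", "GB18030", "ISO-2022-CN", "ISO-2022-JP",
                    "ISO-2022-KR", "ISO-8859-1", "ISO-8859-2", "ISO-8859-5", "ISO-8859-6",
                    "ISO-8859-7", "ISO-8859-8", "ISO-8859-8-I", "ISO-8859-9", "KOI8-R", "Shift_JIS",
                    "US-ASCII", "UTF-1", "UTF-16BE", "UTF-16LE", "UTF-32BE", "UTF-32LE", "UTF-8",
                    "UTF-EBCDIC", "windows-1250", "windows-1251", "windows-1252", "windows-1253",
                    "windows-1254", "windows-1255", "windows-1256"};
            for (String name : names) {
                boolean supported;
                try {
                    supported = Charset.isSupported(name);
                } catch (IllegalArgumentException e) {
                    // Covers IllegalCharsetNameException for names this JVM considers malformed
                    supported = false;
                }
                System.out.println(name + ": " + (supported ? "supported" : "not supported"));
            }
        }
    }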
246 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | com.sigpwned 5 | chardet4j 6 | 77.1.1-SNAPSHOT 7 | chardet4j 8 | 2022 9 | Simple, compact charset detection for Java 8+ 10 | https://github.com/sigpwned/chardet4j 11 | jar 12 | 13 | 14 | Andy Boothe 15 | https://www.sigpwned.com/ 16 | 17 | 18 | 19 | scm:git:ssh://git@github.com/sigpwned/chardet4j.git 20 | scm:git:ssh://git@github.com/sigpwned/chardet4j.git 21 | https://github.com/sigpwned/chardet4j/tree/main 22 | v70.1.0 23 | 24 | 25 | 26 | 27 | Apache License, Version 2.0 28 | http://www.apache.org/licenses/LICENSE-2.0.txt 29 | 30 | 31 | 32 | 33 | 34 | Andy Boothe 35 | andy.boothe@gmail.com 36 | 37 | 38 | 39 | 40 | 41 | ossrh 42 | Sonatype Nexus Snapshots 43 | https://oss.sonatype.org/content/repositories/snapshots/ 44 | 45 | 46 | ossrh 47 | Nexus Release Repository 48 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 49 | 50 | 51 | 52 | 53 | GitHub 54 | https://github.com/sigpwned/chardet4j 55 | 56 | 57 | 58 | UTF-8 59 | 1.8 60 | 1.8 61 | 77.1 62 | 33.4.5-jre 63 | 2.0.16 64 | 4.13.2 65 | 1.3 66 | 67 | 68 | 69 | 70 | 71 | org.codehaus.mojo 72 | license-maven-plugin 73 | 2.5.0 74 | 75 | false 76 | 77 | 78 | 79 | update-file-header 80 | 81 | update-file-header 82 | 83 | process-sources 84 | 85 | false 86 | =================================LICENSE_START================================== 87 | ==================================LICENSE_END=================================== 88 | ====================================SECTION===================================== 89 | apache_v2 90 | 91 | src/main/java 92 | src/test/java 93 | 94 | 95 | **/com/ibm/icu/** 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | com.ibm.icu 110 | icu4j 111 | ${icu4j.version} 112 | 113 | 114 | 115 | 116 | 117 | 118 | com.google.guava 119 | guava 120 | ${guava.version} 121 | test 122 | 123 | 124 | junit 125 | junit 126 | ${junit.version} 127 | test 128 | 129 | 130 | org.hamcrest 131 | hamcrest-all 132 | ${hamcrest.version} 133 | test 134 | 135 | 136 | 137 | 138 | 139 | 140 | release 141 | 142 | 143 | 144 | org.apache.maven.plugins 145 | maven-source-plugin 146 | 147 | 148 | attach-sources 149 | 150 | jar-no-fork 151 | 152 | 153 | 154 | 155 | 156 | org.apache.maven.plugins 157 | maven-javadoc-plugin 158 | 159 | 160 | attach-javadocs 161 | 162 | jar 163 | 164 | 165 | false 166 | 167 | 168 | 169 | 170 | 171 | org.apache.maven.plugins 172 | maven-gpg-plugin 173 | 174 | 175 | sign-artifacts 176 | verify 177 | 178 | sign 179 | 180 | 181 | 182 | 183 | 184 | org.apache.maven.plugins 185 | maven-release-plugin 186 | 187 | v@{project.version} 188 | true 189 | false 190 | release 191 | deploy 192 | 193 | 194 | 195 | org.sonatype.plugins 196 | nexus-staging-maven-plugin 197 | true 198 | 199 | ossrh 200 | https://oss.sonatype.org/ 201 | true 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/ByteOrderMark.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 Andy Boothe 6 | * 
====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j; 21 | 22 | import static java.util.Objects.requireNonNull; 23 | import java.io.IOException; 24 | import java.io.InputStream; 25 | import java.nio.charset.Charset; 26 | import java.nio.charset.IllegalCharsetNameException; 27 | import java.nio.charset.StandardCharsets; 28 | import java.nio.charset.UnsupportedCharsetException; 29 | import java.nio.charset.spi.CharsetProvider; 30 | import java.util.Arrays; 31 | import java.util.Comparator; 32 | import java.util.Optional; 33 | import java.util.concurrent.atomic.AtomicReference; 34 | import com.sigpwned.chardet4j.io.BomAwareInputStream; 35 | 36 | /** 37 | * A byte order mark (BOM) that hard-codes charset into an input stream. At this time, this 38 | * implementation only supports BOMs for the character sets the JVM supports, namely UTF-8, 39 | * UTF-16LE, and UTF-16BE. 40 | * 41 | * @see https://en.wikipedia.org/wiki/Byte_order_mark 43 | */ 44 | public enum ByteOrderMark { 45 | /** 46 | * The BOM for a UTF-8 stream 47 | */ 48 | UTF_8(new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}, StandardCharsets.UTF_8, "UTF-8"), 49 | 50 | /** 51 | * The BOM for a UTF-16 big endian stream 52 | */ 53 | UTF_16BE(new byte[] {(byte) 0xFE, (byte) 0xFF}, StandardCharsets.UTF_16BE, "UTF-16BE"), 54 | 55 | /** 56 | * The BOM for a UTF-16 little endian stream 57 | */ 58 | UTF_16LE(new byte[] {(byte) 0xFF, (byte) 0xFE}, StandardCharsets.UTF_16LE, "UTF-16LE"), 59 | 60 | /** 61 | * The BOM for a UTF-32 big endian stream 62 | */ 63 | UTF_32BE(new byte[] {(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF}, null, "UTF-32BE"), 64 | 65 | /** 66 | * The BOM for a UTF-32 little endian stream 67 | */ 68 | UTF_32LE(new byte[] {(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00}, null, "UTF-32LE"), 69 | 70 | /** 71 | * The BOM for a UTF-1 stream 72 | */ 73 | UTF_1(new byte[] {(byte) 0xF7, (byte) 0x64, (byte) 0x4C}, null, "UTF-1"), 74 | 75 | /** 76 | * The BOM for a UTF-EBCDIC 77 | */ 78 | UTF_EBCDIC(new byte[] {(byte) 0xDD, (byte) 0x73, (byte) 0x66, (byte) 0x73}, null, "UTF-EBCDIC"), 79 | 80 | /** 81 | * The BOM for a GB-18030 stream 82 | */ 83 | GB_18030(new byte[] {(byte) 0x84, (byte) 0x31, (byte) 0x95, (byte) 0x33}, null, "GB-18030"); 84 | 85 | // While BOMs for UTF-7, SCSU, and BOCU-1 exist, they are not deterministic and may not observe 86 | // byte boundaries. Also, the JVM generally does not support these charsets out of the box. So, 87 | // to keep things simple, these BOMs are not supported here. 88 | 89 | public static final int MAX_BYTE_LENGTH = 90 | Arrays.stream(values()).mapToInt(bom -> bom.getBytes().length).max().getAsInt(); 91 | 92 | /** 93 | * The values of the enum, sorted by the length of the BOM bytes, with the longest BOMs first. 
94 | */ 95 | private static final ByteOrderMark[] VALUES = Arrays.copyOf(values(), values().length); 96 | static { 97 | Arrays.sort(VALUES, Comparator.comparingInt(bom -> bom.getBytes().length) 98 | .reversed().thenComparing(ByteOrderMark::getCharsetName)); 99 | } 100 | 101 | /** 102 | * Detects the BOM in the given input stream, if any, and returns a {@link BomAwareInputStream} 103 | * that wraps the stream. 104 | * 105 | * @param in the input stream 106 | * @return the {@link BomAwareInputStream} 107 | * @throws IOException if an I/O error 108 | */ 109 | public static BomAwareInputStream detect(InputStream in) throws IOException { 110 | return BomAwareInputStream.detect(in); 111 | } 112 | 113 | /** 114 | * Returns the BOM for the given data, if it is supported. Searches the whole array. 115 | * 116 | * @param data the data to check 117 | * @return the BOM, if found, otherwise empty 118 | * 119 | * @throws NullPointerException if {@code data} is {@code null} 120 | * 121 | * @see #detect(byte[], int) 122 | */ 123 | public static Optional detect(byte[] data) { 124 | if (data == null) 125 | throw new NullPointerException(); 126 | return detect(data, data.length); 127 | } 128 | 129 | /** 130 | * Detects the BOM in the given data, starting at 0, up to the given length. 131 | * 132 | * @param data the data to check 133 | * @param len the length of the data to check 134 | * @return the BOM, if found, otherwise empty 135 | * 136 | * @throws NullPointerException if {@code data} is {@code null} 137 | * @throws IllegalArgumentException if {@code len < 0} 138 | * @throws ArrayIndexOutOfBoundsException if {@code len > data.length} 139 | * 140 | * @see #detect(byte[], int, int) 141 | */ 142 | public static Optional detect(byte[] data, int len) { 143 | return detect(data, 0, len); 144 | } 145 | 146 | /** 147 | * Detects the BOM in the given data, starting at the given offset and continuing for the given 148 | * length. 149 | * 150 | * @param data the data to check 151 | * @param off the offset in the data to start checking 152 | * @param len the length of the data to check 153 | * @return the BOM, if found, otherwise empty 154 | * 155 | * @throws NullPointerException if {@code data} is {@code null} 156 | * @throws IllegalArgumentException if {@code len < 0} 157 | * @throws ArrayIndexOutOfBoundsException if {@code off < 0} or {@code off + len > data.length} 158 | */ 159 | public static Optional detect(byte[] data, int off, int len) { 160 | if (data == null) 161 | throw new NullPointerException(); 162 | if (len < 0) 163 | throw new IllegalArgumentException("len < 0"); 164 | if (off < 0) 165 | throw new ArrayIndexOutOfBoundsException(off); 166 | if (off + len > data.length) 167 | throw new ArrayIndexOutOfBoundsException(off + len); 168 | 169 | for (ByteOrderMark value : VALUES) { 170 | byte[] bom = value.getBytes(); 171 | int bomlen = value.getBytes().length; 172 | if (off + bomlen <= len && equals(data, off, off + bomlen, bom, 0, bomlen)) { 173 | return Optional.of(value); 174 | } 175 | } 176 | 177 | return Optional.empty(); 178 | } 179 | 180 | /** 181 | * Returns true if the two specified arrays of bytes, over the specified ranges, are equal 182 | * to one another. 183 | * 184 | *

185 | * Two arrays are considered equal if the number of elements covered by each range is the same, 186 | * and all corresponding pairs of elements over the specified ranges in the two arrays are equal. 187 | * In other words, two arrays are equal if they contain, over the specified ranges, the same 188 | * elements in the same order. 189 | * 190 | * @param a the first array to be tested for equality 191 | * @param aFromIndex the index (inclusive) of the first element in the first array to be tested 192 | * @param aToIndex the index (exclusive) of the last element in the first array to be tested 193 | * @param b the second array to be tested for equality 194 | * @param bFromIndex the index (inclusive) of the first element in the second array to be tested 195 | * @param bToIndex the index (exclusive) of the last element in the second array to be tested 196 | * @return {@code true} if the two arrays, over the specified ranges, are equal 197 | * @throws IllegalArgumentException if {@code aFromIndex > aToIndex} or if 198 | * {@code bFromIndex > bToIndex} 199 | * @throws ArrayIndexOutOfBoundsException if {@code aFromIndex < 0 or aToIndex > a.length} or if 200 | * {@code bFromIndex < 0 or bToIndex > b.length} 201 | * @throws NullPointerException if either array is {@code null} 202 | */ 203 | private static boolean equals(byte[] a, int aFromIndex, int aToIndex, byte[] b, int bFromIndex, 204 | int bToIndex) { 205 | rangeCheck(a.length, aFromIndex, aToIndex); 206 | rangeCheck(b.length, bFromIndex, bToIndex); 207 | 208 | int aLength = aToIndex - aFromIndex; 209 | int bLength = bToIndex - bFromIndex; 210 | if (aLength != bLength) 211 | return false; 212 | int length = aLength; 213 | 214 | for (int i = 0; i < length; i++) { 215 | if (a[aFromIndex + i] != b[bFromIndex + i]) { 216 | return false; 217 | } 218 | } 219 | 220 | return true; 221 | } 222 | 223 | /** 224 | * Checks that {@code fromIndex} and {@code toIndex} are in the range and throws an exception if 225 | * they aren't. 226 | */ 227 | private static void rangeCheck(int arrayLength, int fromIndex, int toIndex) { 228 | if (fromIndex > toIndex) { 229 | throw new IllegalArgumentException("fromIndex(" + fromIndex + ") > toIndex(" + toIndex + ")"); 230 | } 231 | if (fromIndex < 0) { 232 | throw new ArrayIndexOutOfBoundsException(fromIndex); 233 | } 234 | if (toIndex > arrayLength) { 235 | throw new ArrayIndexOutOfBoundsException(toIndex); 236 | } 237 | } 238 | 239 | private final byte[] bytes; 240 | private final Charset standardCharset; 241 | private final String charsetName; 242 | private volatile AtomicReference charset; 243 | 244 | private ByteOrderMark(byte[] bytes, Charset standardCharset, String charsetName) { 245 | this.bytes = requireNonNull(bytes); 246 | this.standardCharset = standardCharset; 247 | this.charsetName = requireNonNull(charsetName); 248 | if (standardCharset != null) 249 | this.charset = new AtomicReference<>(standardCharset); 250 | } 251 | 252 | /** 253 | * @return the bytes 254 | */ 255 | /* default */ byte[] getBytes() { 256 | return bytes; 257 | } 258 | 259 | public int length() { 260 | return bytes.length; 261 | } 262 | 263 | /** 264 | * Returns the charset for this BOM. Checks for standard charsets first, then attempts to load the 265 | * charset using {@link Charset#forName(String)}. If the charset is not supported, then throws an 266 | * {@link UnsupportedCharsetException}. 
267 | * 268 | * @return the charset 269 | * @throws UnsupportedCharsetException if the charset is not supported, e.g., UTF-32BE 270 | * 271 | * @see #getCharsetIfSupported() 272 | * @see CharsetProvider 273 | */ 274 | public Charset getCharset() { 275 | return getCharsetIfSupported().orElseThrow(() -> new UnsupportedCharsetException(charsetName)); 276 | } 277 | 278 | /** 279 | * Returns the charset for this BOM. Checks for standard charsets first, then attempts to load the 280 | * charset using {@link Charset#forName}. If the charset is not supported, then returns empty. 281 | * 282 | * @return the charset, if supported, otherwise empty 283 | * @see #getCharset() 284 | */ 285 | public Optional getCharsetIfSupported() { 286 | // If it's a standard charset, return it 287 | if (standardCharset != null) 288 | return Optional.of(standardCharset); 289 | 290 | // If it's not a standard charset, then attempt to load it and cache the result. 291 | if (charset == null) { 292 | Charset c; 293 | try { 294 | c = Charset.forName(charsetName); 295 | } catch (IllegalCharsetNameException e) { 296 | // Odd. None of these charset names should be invalid. Just treat it like it's not supported 297 | // and set the cached charset to null. 298 | c = null; 299 | } catch (UnsupportedCharsetException e) { 300 | // If the charset is not supported, then set the cached charset to null. 301 | c = null; 302 | } 303 | charset = new AtomicReference<>(c); 304 | } 305 | 306 | // If the cached charset is null, then return empty. Otherwise, return. 307 | return Optional.ofNullable(charset.get()); 308 | } 309 | 310 | /** 311 | * @return the charset name 312 | */ 313 | public String getCharsetName() { 314 | return charsetName; 315 | } 316 | } 317 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/Chardet.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 Andy Boothe 6 | * ====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j; 21 | 22 | import static java.util.stream.Collectors.toList; 23 | import java.io.ByteArrayInputStream; 24 | import java.io.IOException; 25 | import java.io.InputStream; 26 | import java.io.Reader; 27 | import java.io.SequenceInputStream; 28 | import java.io.StringWriter; 29 | import java.io.UncheckedIOException; 30 | import java.io.Writer; 31 | import java.nio.charset.Charset; 32 | import java.nio.charset.UnsupportedCharsetException; 33 | import java.util.Arrays; 34 | import java.util.Comparator; 35 | import java.util.List; 36 | import java.util.Objects; 37 | import java.util.Optional; 38 | import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetDetector; 39 | import com.sigpwned.chardet4j.io.BomAwareInputStream; 40 | import com.sigpwned.chardet4j.io.DecodedInputStreamReader; 41 | import com.sigpwned.chardet4j.util.ByteStreams; 42 | import com.sigpwned.chardet4j.util.CharStreams; 43 | 44 | /** 45 | * Simple interface to charset detection. 46 | */ 47 | public final class Chardet { 48 | private Chardet() {} 49 | 50 | private static final int MIN_CONFIDENCE = 0; 51 | private static final int MAX_CONFIDENCE = 100; 52 | 53 | private static final int DECLARED_ENCODING_BUMP = Optional 54 | .ofNullable(System.getProperty("chardet4j.detect.bump")).map(Integer::parseInt).orElse(10); 55 | 56 | /** 57 | * We have to do this because the ICU detector ignores the declared encoding, but the CharsetMatch 58 | * values are immutable and the constructor isn't visible. 59 | */ 60 | private static class ChardetMatch implements Comparable { 61 | public static ChardetMatch of(String name, int confidence) { 62 | return new ChardetMatch(name, confidence); 63 | } 64 | 65 | private final String name; 66 | private final int confidence; 67 | 68 | public ChardetMatch(String name, int confidence) { 69 | if (name == null) 70 | throw new NullPointerException(); 71 | if (confidence < MIN_CONFIDENCE || confidence > MAX_CONFIDENCE) 72 | throw new IllegalArgumentException("confidence out of range " + confidence); 73 | this.name = name; 74 | this.confidence = confidence; 75 | } 76 | 77 | /** 78 | * @return the name 79 | */ 80 | public String getName() { 81 | return name; 82 | } 83 | 84 | /** 85 | * @return the confidence 86 | */ 87 | public int getConfidence() { 88 | return confidence; 89 | } 90 | 91 | @Override 92 | public int hashCode() { 93 | return Objects.hash(confidence, name); 94 | } 95 | 96 | @Override 97 | public boolean equals(Object obj) { 98 | if (this == obj) 99 | return true; 100 | if (obj == null) 101 | return false; 102 | if (getClass() != obj.getClass()) 103 | return false; 104 | ChardetMatch other = (ChardetMatch) obj; 105 | return confidence == other.confidence && Objects.equals(name, other.name); 106 | } 107 | 108 | @Override 109 | public String toString() { 110 | return "PossibleMatch [name=" + name + ", confidence=" + confidence + "]"; 111 | } 112 | 113 | @Override 114 | public int compareTo(ChardetMatch o) { 115 | return Integer.compare(getConfidence(), o.getConfidence()); 116 | } 117 | } 118 | 119 | // detectCharset ///////////////////////////////////////////////////////////////////////////////// 120 | 121 | /** 122 | * Detect the charset of the given byte data. Input includes the entire array. If the character 123 | * encoding is detected, but not supported, then an {@link UnsupportedCharsetException} is thrown. 
124 | * 125 | * @throws NullPointerException if data is null 126 | * @throws UnsupportedOperationException If the charset can be detected, but is not supported. 127 | */ 128 | public static Optional detectCharset(byte[] data) { 129 | return detectCharset(data, null); 130 | } 131 | 132 | /** 133 | * Detect the charset of the given byte data with the given encoding as a hint. Input includes the 134 | * entire array. If the character encoding is detected, but not supported, then an 135 | * {@link UnsupportedCharsetException} is thrown. 136 | * 137 | * @param data the byte data 138 | * @param declaredEncoding the declared encoding, treated as a hint 139 | * @return the charset, if one can be detected 140 | * 141 | * @throws NullPointerException if data is null 142 | * @throws UnsupportedOperationException If the charset can be detected, but is not supported. 143 | */ 144 | public static Optional detectCharset(byte[] data, String declaredEncoding) { 145 | if (data == null) 146 | throw new NullPointerException(); 147 | return detectCharset(data, data.length, declaredEncoding); 148 | } 149 | 150 | /** 151 | * Detect the charset encoding of the given byte data in the first len bytes of the given array. 152 | * If the character encoding is detected, but not supported, then an 153 | * {@link UnsupportedCharsetException} is thrown. 154 | * 155 | * @param data the byte data 156 | * @param len the number of bytes to consider, starting from 0 157 | * @param declaredEncoding the optional declared encoding, which is treated as a hint 158 | * @return the charset encoding, if one can be detected 159 | * 160 | * @throws NullPointerException if data is null 161 | * @throws IllegalArgumentException if len < 0 162 | * @throws ArrayIndexOutOfBoundsException if len > data.length 163 | * @throws UnsupportedOperationException If the charset can be detected, but is not supported. 164 | */ 165 | public static Optional detectCharset(byte[] data, int len, String declaredEncoding) { 166 | return detectCharset(data, 0, len, declaredEncoding); 167 | } 168 | 169 | /** 170 | * Detect the charset encoding of the given byte data in the given range of the given array. If 171 | * the character encoding is detected, but not supported, then an 172 | * {@link UnsupportedCharsetException} is thrown. 173 | * 174 | * @param data the byte data 175 | * @param off the offset into the byte data 176 | * @param len the number of bytes to consider 177 | * @param declaredEncoding the optional declared encoding, which is treated as a hint 178 | * @return the charset encoding, if one can be detected 179 | * 180 | * @throws NullPointerException if data is null 181 | * @throws IllegalArgumentException if len < 0 182 | * @throws ArrayIndexOutOfBoundsException if off < 0 or off + len > data.length 183 | * @throws UnsupportedOperationException If the charset can be detected, but is not supported. To 184 | * get the charset name whether it is supported or not, use 185 | * {@link #detectCharsetName(byte[], int, String)}. 186 | */ 187 | public static Optional detectCharset(byte[] data, int off, int len, 188 | String declaredEncoding) { 189 | return detectCharsetName(data, off, len, declaredEncoding).map(Charset::forName); 190 | } 191 | 192 | // detectCharsetName ///////////////////////////////////////////////////////////////////////////// 193 | 194 | /** 195 | * Detect the charset of the given byte data. Input includes the entire array. 
196 | * 197 | * @throws NullPointerException if data is null 198 | */ 199 | public static Optional detectCharsetName(byte[] data) { 200 | return detectCharsetName(data, null); 201 | } 202 | 203 | /** 204 | * Detect the charset of the given byte data. Input includes the entire array. 205 | * 206 | * @param data the byte data 207 | * @param declaredEncoding the declared encoding, treated as a hint 208 | * @return the charset name, if one is detected 209 | * 210 | * @throws NullPointerException if data is null 211 | */ 212 | public static Optional detectCharsetName(byte[] data, String declaredEncoding) { 213 | if (data == null) 214 | throw new NullPointerException(); 215 | return detectCharsetName(data, data.length, declaredEncoding); 216 | } 217 | 218 | /** 219 | * Detect the name of the charset encoding of the given byte data in the first len bytes of the 220 | * given array. 221 | * 222 | * @param data the byte data 223 | * @param len the number of bytes to consider, starting from 0 224 | * @param declaredEncoding the optional declared encoding, which is treated as a hint 225 | * @return the charset encoding, if one can be detected 226 | * 227 | * @throws NullPointerException if data is null 228 | * @throws IllegalArgumentException if len < 0 229 | * @throws ArrayIndexOutOfBoundsException if len > data.length 230 | */ 231 | public static Optional detectCharsetName(byte[] data, int len, String declaredEncoding) { 232 | return detectCharsetName(data, 0, len, declaredEncoding); 233 | } 234 | 235 | /** 236 | * Detect the name of the charset encoding of the given range of the given array. 237 | * 238 | * @param data the byte data 239 | * @param len the number of bytes to consider, starting from 0 240 | * @param declaredEncoding the optional declared encoding, which is treated as a hint 241 | * @return the charset encoding, if one can be detected 242 | * 243 | * @throws NullPointerException if data is null 244 | * @throws IllegalArgumentException if len < 0 245 | * @throws ArrayIndexOutOfBoundsException if off < 0 or off + len > data.length 246 | * @throws UncheckedIOException if an I/O error occurs, which should not happen because all I/O 247 | * operations are performed in-memory 248 | */ 249 | public static Optional detectCharsetName(byte[] data, int off, int len, 250 | String declaredEncoding) { 251 | if (data == null) 252 | throw new NullPointerException(); 253 | if (len < 0) 254 | throw new IllegalArgumentException("len < 0"); 255 | if (off < 0) 256 | throw new ArrayIndexOutOfBoundsException(off); 257 | if (off + len > data.length) 258 | throw new ArrayIndexOutOfBoundsException(off + len); 259 | 260 | Optional maybeBom = ByteOrderMark.detect(data, off, len); 261 | if (maybeBom.isPresent()) { 262 | return maybeBom.map(ByteOrderMark::getCharsetName); 263 | } 264 | 265 | CharsetDetector chardet = new CharsetDetector(); 266 | 267 | if (off == 0 && len == data.length) { 268 | // Let's avoid a byte copy if we can 269 | chardet.setText(data); 270 | } else { 271 | try { 272 | chardet.setText(new ByteArrayInputStream(data, off, len)); 273 | } catch (IOException e) { 274 | // This should never happen in a ByteArrayInputStream 275 | throw new UncheckedIOException("unexpected exception when reading from byte array", e); 276 | } 277 | } 278 | 279 | // Ideally, we'd just use this methods from the CharsetDetector class, but the declared encoding 280 | // is ignored. So we have to do it ourselves. 
281 | // if (declaredEncoding != null) 282 | // chardet.setDeclaredEncoding(declaredEncoding); 283 | 284 | List matches = Arrays.stream(chardet.detectAll()).map(mi -> { 285 | String name = mi.getName(); 286 | 287 | int confidence = mi.getConfidence(); 288 | if (declaredEncoding != null && name.equalsIgnoreCase(declaredEncoding)) 289 | confidence = Math.min(confidence + DECLARED_ENCODING_BUMP, MAX_CONFIDENCE); 290 | 291 | return ChardetMatch.of(name, confidence); 292 | }).sorted(Comparator.reverseOrder()).collect(toList()); 293 | 294 | if (matches.isEmpty()) { 295 | return Optional.empty(); 296 | } 297 | 298 | return Optional.of(matches.get(0).getName()); 299 | } 300 | 301 | // decode //////////////////////////////////////////////////////////////////////////////////////// 302 | 303 | /** 304 | * The default is chosen based on a reading of the CharsetDetector source code, which sets buffer 305 | * size for byte frequency analysis at 8000. (Ample) extra space is left for BOMs. 306 | */ 307 | public static final int DECODE_DETECT_BUFSIZE = 308 | Optional.ofNullable(System.getProperty("chardet4j.detect.bufsize")).map(Integer::parseInt) 309 | .orElse(8192); 310 | 311 | /** 312 | * Returns a character-decoded version of the given byte stream. Any leading BOMs are discarded. 313 | * If no character set can be detected, then the given default is used. 314 | * 315 | * @param input the input stream 316 | * @param defaultCharset the default charset to use if no other can be detected 317 | * 318 | * @throws NullPointerException if input is null 319 | * @throws NullPointerException if defaultCharset is null 320 | * @throws IOException if an I/O error occurs 321 | * @throws UnsupportedCharsetException if the detected charset is not supported 322 | */ 323 | public static DecodedInputStreamReader decode(InputStream input, Charset defaultCharset) 324 | throws IOException { 325 | return decode(input, null, defaultCharset); 326 | } 327 | 328 | /** 329 | * Returns a character-decoded version of the given byte stream. The declared encoding is treated 330 | * as a hint. Any leading BOMs are discarded. If no character set can be detected, then the given 331 | * default is used. If the character set is detected, but not supported, then an 332 | * {@link UnsupportedCharsetException} is thrown. 333 | * 334 | * @param input the input stream 335 | * @param declaredEncoding the declared encoding, treated as a hint 336 | * @param defaultCharset the default charset to use if no other can be detected 337 | * @return the character-decoded stream 338 | * 339 | * @throws NullPointerException if input is null 340 | * @throws NullPointerException if defaultCharset is null 341 | * @throws IOException if an I/O error occurs 342 | * @throws UnsupportedCharsetException if the detected charset is not supported 343 | */ 344 | public static DecodedInputStreamReader decode(InputStream input, String declaredEncoding, 345 | Charset defaultCharset) throws IOException { 346 | if (input == null) 347 | throw new NullPointerException(); 348 | if (defaultCharset == null) 349 | throw new NullPointerException(); 350 | 351 | // Detect the BOM, if any. If there is one, then trust it and use the corresponding charset. 352 | final BomAwareInputStream bomed = BomAwareInputStream.detect(input); 353 | if (bomed.bom().isPresent()) 354 | return new DecodedInputStreamReader(bomed, bomed.bom().get().getCharset()); 355 | 356 | // If there is no BOM, then read some bytes to detect the charset. 
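// The sniffed bytes are not discarded: they are stitched back in front of the remaining stream via a SequenceInputStream below, so the returned reader still sees the full input.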
357 | final byte[] buf = ByteStreams.readNBytes(bomed, DECODE_DETECT_BUFSIZE); 358 | 359 | // Note that charset cannot be null, since we check defaultCharset above. 360 | Charset charset = detectCharset(buf, declaredEncoding).orElse(defaultCharset); 361 | 362 | return new DecodedInputStreamReader( 363 | new SequenceInputStream(new ByteArrayInputStream(buf), bomed), charset); 364 | } 365 | 366 | /** 367 | * Returns a character-decoded String version of the given bytes. The declared encoding is treated 368 | * as a hint. Any leading BOMs are discarded. If no character set can be detected, then the given 369 | * default is used. If the character set is detected, but not supported, then an 370 | * {@link UnsupportedCharsetException} is thrown. 371 | * 372 | * @param data the byte data 373 | * @param declaredEncoding the declared encoding, treated as a hint 374 | * @param defaultCharset the default charset to use if no other can be detected 375 | * @return the character-decoded string 376 | * 377 | * @throws NullPointerException if data is null 378 | * @throws UnsupportedCharsetException if the detected charset is not supported 379 | * @throws UncheckedIOException if an I/O error occurs, which should not happen because all I/O 380 | * operations are performed in-memory 381 | */ 382 | public static String decode(byte[] data, Charset defaultCharset) { 383 | return decode(data, null, defaultCharset); 384 | } 385 | 386 | /** 387 | * Returns a character-decoded String version of the given bytes. The declared encoding is treated 388 | * as a hint. Any leading BOMs are discarded. If no character set can be detected, then the given 389 | * default is used. If the character set is detected, but not supported, then an 390 | * {@link UnsupportedCharsetException} is thrown. 391 | * 392 | * @param data the byte data 393 | * @param declaredEncoding the declared encoding, treated as a hint 394 | * @param defaultCharset the default charset to use if no other can be detected 395 | * @return the character-decoded string 396 | * 397 | * @throws NullPointerException if data is null 398 | * @throws UnsupportedCharsetException if the detected charset is not supported 399 | * @throws UncheckedIOException if an I/O error occurs, which should not happen because all I/O 400 | * operations are performed in-memory 401 | */ 402 | public static String decode(byte[] data, String declaredEncoding, Charset defaultCharset) { 403 | if (data == null) 404 | throw new NullPointerException(); 405 | return decode(data, data.length, declaredEncoding, defaultCharset); 406 | } 407 | 408 | /** 409 | * Returns a character-decoded String version of the given bytes. The declared encoding is treated 410 | * as a hint. Any leading BOMs are discarded. If no character set can be detected, then the given 411 | * default is used. If the character set is detected, but not supported, then an 412 | * {@link UnsupportedCharsetException} is thrown. 
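* An illustrative sketch, assuming a webpage.html file on disk and an ISO-8859-1 hint taken from
* an HTTP header:
*
*   byte[] data = Files.readAllBytes(Paths.get("webpage.html"));
*   String text = Chardet.decode(data, "ISO-8859-1", StandardCharsets.UTF_8);
*   String head = Chardet.decode(data, Math.min(data.length, 1024), "ISO-8859-1", StandardCharsets.UTF_8);
*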
413 | * 414 | * @param data the byte data 415 | * @param len the number of bytes to consider, starting from 0 416 | * @param declaredEncoding the declared encoding, treated as a hint 417 | * @param defaultCharset the default charset to use if no other can be detected 418 | * @return the character-decoded string 419 | * 420 | * @throws NullPointerException if data is null 421 | * @throws IllegalArgumentException if len < 0 422 | * @throws ArrayIndexOutOfBoundsException if len > data.length 423 | * @throws UnsupportedCharsetException if the detected charset is detected, but not supported 424 | * @throws UncheckedIOException if an I/O error occurs, which should not happen because all I/O 425 | * operations are performed in-memory 426 | */ 427 | public static String decode(byte[] data, int len, String declaredEncoding, 428 | Charset defaultCharset) { 429 | return decode(data, 0, len, declaredEncoding, defaultCharset); 430 | } 431 | 432 | /** 433 | * Returns a character-decoded String version of the given bytes. The declared encoding is treated 434 | * as a hint. Any leading BOMs are discarded. If no character set can be detected, then the given 435 | * default is used. If the character set is detected, but not supported, then an 436 | * {@link UnsupportedCharsetException} is thrown. 437 | * 438 | * @param data the byte data 439 | * @param off the offset into the byte data 440 | * @param len the number of bytes to consider, starting at off 441 | * @param declaredEncoding the declared encoding, treated as a hint 442 | * @param defaultCharset the default charset to use if no other can be detected 443 | * @return the character-decoded string 444 | * 445 | * @throws NullPointerException if data is null 446 | * @throws NullPointerException if defaultCharset is null 447 | * @throws IllegalArgumentException if len < 0 448 | * @throws ArrayIndexOutOfBoundsException if off < 0 or off + len > data.length 449 | * @throws UnsupportedCharsetException if the detected charset is detected, but not supported 450 | * @throws UncheckedIOException if an I/O error occurs, which should not happen because all I/O 451 | * operations are performed in-memory 452 | */ 453 | public static String decode(byte[] data, int off, int len, String declaredEncoding, 454 | Charset defaultCharset) { 455 | if (data == null) 456 | throw new NullPointerException(); 457 | if (defaultCharset == null) 458 | throw new NullPointerException(); 459 | if (len < 0) 460 | throw new IllegalArgumentException("len < 0"); 461 | if (off < 0) 462 | throw new ArrayIndexOutOfBoundsException(off); 463 | if (off + len > data.length) 464 | throw new ArrayIndexOutOfBoundsException(off + len); 465 | 466 | try (InputStream in = new ByteArrayInputStream(data, off, len); 467 | Reader r = decode(in, declaredEncoding, defaultCharset); 468 | Writer w = new StringWriter()) { 469 | CharStreams.transferTo(r, w); 470 | return w.toString(); 471 | } catch (IOException e) { 472 | throw new UncheckedIOException(e); 473 | } 474 | } 475 | } 476 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetDetector.java: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 
2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | /** 4 | ******************************************************************************* 5 | * Copyright (C) 2005-2016, International Business Machines Corporation and * 6 | * others. All Rights Reserved. * 7 | ******************************************************************************* 8 | */ 9 | package com.sigpwned.chardet4j.com.ibm.icu.text; 10 | 11 | import java.io.IOException; 12 | import java.io.InputStream; 13 | import java.io.Reader; 14 | import java.util.ArrayList; 15 | import java.util.Arrays; 16 | import java.util.Collections; 17 | import java.util.List; 18 | 19 | 20 | /** 21 | * CharsetDetector provides a facility for detecting the 22 | * charset or encoding of character data in an unknown format. 23 | * The input data can either be from an input stream or an array of bytes. 24 | * The result of the detection operation is a list of possibly matching 25 | * charsets, or, for simple use, you can just ask for a Java Reader that 26 | * will will work over the input data. 27 | *
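* A minimal usage sketch, assuming the caller already holds the input in a byte array named bytes:
*
*   CharsetDetector detector = new CharsetDetector();
*   detector.setText(bytes);                    // or setText(InputStream) for streaming input
*   CharsetMatch best = detector.detect();      // best single match, or null if none
*   CharsetMatch[] all = detector.detectAll();  // every plausible match, best quality first
*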

28 | * Character set detection is at best an imprecise operation. The detection 29 | * process will attempt to identify the charset that best matches the characteristics 30 | * of the byte data, but the process is partly statistical in nature, and 31 | * the results can not be guaranteed to always be correct. 32 | *

33 | * For best accuracy in charset detection, the input data should be primarily 34 | * in a single language, and a minimum of a few hundred bytes worth of plain text 35 | * in the language is needed. The detection process will attempt to 36 | * ignore html or xml style markup that could otherwise obscure the content. 37 | *
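* Note that markup stripping is applied only when input filtering has been enabled; a sketch,
* assuming a hypothetical htmlBytes array of raw HTML:
*
*   CharsetDetector detector = new CharsetDetector();
*   detector.enableInputFilter(true); // strip angle-bracket markup before analysis
*   detector.setText(htmlBytes);
*   CharsetMatch match = detector.detect();
*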

38 | * @stable ICU 3.4 39 | */ 40 | public class CharsetDetector { 41 | 42 | // Question: Should we have getters corresponding to the setters for input text 43 | // and declared encoding? 44 | 45 | // A thought: If we were to create our own type of Java Reader, we could defer 46 | // figuring out an actual charset for data that starts out with too much English 47 | // only ASCII until the user actually read through to something that didn't look 48 | // like 7 bit English. If nothing else ever appeared, we would never need to 49 | // actually choose the "real" charset. All assuming that the application just 50 | // wants the data, and doesn't care about a char set name. 51 | 52 | /** 53 | * Constructor 54 | * 55 | * @stable ICU 3.4 56 | */ 57 | public CharsetDetector() { 58 | } 59 | 60 | /** 61 | * Set the declared encoding for charset detection. 62 | * The declared encoding of an input text is an encoding obtained 63 | * from an http header or xml declaration or similar source that 64 | * can be provided as additional information to the charset detector. 65 | * A match between a declared encoding and a possible detected encoding 66 | * will raise the quality of that detected encoding by a small delta, 67 | * and will also appear as a "reason" for the match. 68 | *
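* For example, a charset name taken from an HTTP Content-Type header can be supplied as a hint
* (an illustrative sketch, assuming a bytes array from the caller):
*
*   CharsetDetector detector = new CharsetDetector();
*   detector.setDeclaredEncoding("ISO-8859-1"); // e.g. from "Content-Type: text/html; charset=ISO-8859-1"
*   detector.setText(bytes);
*   CharsetMatch match = detector.detect();
*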

69 | * A declared encoding that is incompatible with the input data being 70 | * analyzed will not be added to the list of possible encodings. 71 | * 72 | * @param encoding The declared encoding 73 | * 74 | * @stable ICU 3.4 75 | */ 76 | public CharsetDetector setDeclaredEncoding(String encoding) { 77 | fDeclaredEncoding = encoding; 78 | return this; 79 | } 80 | 81 | /** 82 | * Set the input text (byte) data whose charset is to be detected. 83 | * 84 | * @param in the input text of unknown encoding 85 | * 86 | * @return This CharsetDetector 87 | * 88 | * @stable ICU 3.4 89 | */ 90 | public CharsetDetector setText(byte [] in) { 91 | fRawInput = in; 92 | fRawLength = in.length; 93 | 94 | return this; 95 | } 96 | 97 | private static final int kBufSize = 8000; 98 | 99 | /** 100 | * Set the input text (byte) data whose charset is to be detected. 101 | *
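* An illustrative sketch, assuming content fetched from a URL; the stream is buffered so that the
* mark/reset requirement described below is met:
*
*   try (InputStream in = new BufferedInputStream(new URL("https://example.com/").openStream())) {
*     CharsetMatch match = new CharsetDetector().setText(in).detect();
*   }
*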

102 | * The input stream that supplies the character data must have markSupported() 103 | * == true; the charset detection process will read a small amount of data, 104 | * then return the stream to its original position via 105 | * the InputStream.reset() operation. The exact amount that will 106 | * be read depends on the characteristics of the data itself. 107 | * 108 | * @param in the input text of unknown encoding 109 | * 110 | * @return This CharsetDetector 111 | * 112 | * @stable ICU 3.4 113 | */ 114 | 115 | public CharsetDetector setText(InputStream in) throws IOException { 116 | fInputStream = in; 117 | fInputStream.mark(kBufSize); 118 | fRawInput = new byte[kBufSize]; // Always make a new buffer because the 119 | // previous one may have come from the caller, 120 | // in which case we can't touch it. 121 | fRawLength = 0; 122 | int remainingLength = kBufSize; 123 | while (remainingLength > 0 ) { 124 | // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop. 125 | int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength); 126 | if (bytesRead <= 0) { 127 | break; 128 | } 129 | fRawLength += bytesRead; 130 | remainingLength -= bytesRead; 131 | } 132 | fInputStream.reset(); 133 | 134 | return this; 135 | } 136 | 137 | 138 | /** 139 | * Return the charset that best matches the supplied input data. 140 | * 141 | * Note though, that because the detection 142 | * only looks at the start of the input data, 143 | * there is a possibility that the returned charset will fail to handle 144 | * the full set of input data. 145 | *

146 | * Raise an exception if
147 | *
148 | *   • no charset appears to match the data.
149 | *   • no input text has been provided
150 | *
151 | * 152 | * @return a CharsetMatch object representing the best matching charset, or 153 | * null if there are no matches. 154 | * 155 | * @stable ICU 3.4 156 | */ 157 | public CharsetMatch detect() { 158 | // TODO: A better implementation would be to copy the detect loop from 159 | // detectAll(), and cut it short as soon as a match with a high confidence 160 | // is found. This is something to be done later, after things are otherwise 161 | // working. 162 | CharsetMatch matches[] = detectAll(); 163 | 164 | if (matches == null || matches.length == 0) { 165 | return null; 166 | } 167 | 168 | return matches[0]; 169 | } 170 | 171 | /** 172 | * Return an array of all charsets that appear to be plausible 173 | * matches with the input data. The array is ordered with the 174 | * best quality match first. 175 | *

176 | * Raise an exception if
177 | *
178 | *   • no charsets appear to match the input data.
179 | *   • no input text has been provided
180 | *
181 | * 182 | * @return An array of CharsetMatch objects representing possibly matching charsets. 183 | * 184 | * @stable ICU 3.4 185 | */ 186 | public CharsetMatch[] detectAll() { 187 | ArrayList matches = new ArrayList(); 188 | 189 | MungeInput(); // Strip html markup, collect byte stats. 190 | 191 | // Iterate over all possible charsets, remember all that 192 | // give a match quality > 0. 193 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 194 | CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); 195 | boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled; 196 | if (active) { 197 | CharsetMatch m = rcinfo.recognizer.match(this); 198 | if (m != null) { 199 | matches.add(m); 200 | } 201 | } 202 | } 203 | Collections.sort(matches); // CharsetMatch compares on confidence 204 | Collections.reverse(matches); // Put best match first. 205 | CharsetMatch [] resultArray = new CharsetMatch[matches.size()]; 206 | resultArray = matches.toArray(resultArray); 207 | return resultArray; 208 | } 209 | 210 | 211 | /** 212 | * Autodetect the charset of an inputStream, and return a Java Reader 213 | * to access the converted input data. 214 | *
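* An illustrative sketch, assuming a webpage.html file on disk; the stream is wrapped in a
* BufferedInputStream so that the mark/reset requirement described below is satisfied:
*
*   try (InputStream in = new BufferedInputStream(new FileInputStream("webpage.html"))) {
*     Reader reader = new CharsetDetector().getReader(in, null);
*     if (reader == null) {
*       // no plausible charset was detected
*     }
*   }
*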

215 | * This is a convenience method that is equivalent to 216 | * this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader(); 217 | *

218 | * For the input stream that supplies the character data, markSupported() 219 | * must be true; the charset detection will read a small amount of data, 220 | * then return the stream to its original position via 221 | * the InputStream.reset() operation. The exact amount that will 222 | * be read depends on the characteristics of the data itself. 223 | *

224 | * Raise an exception if no charsets appear to match the input data. 225 | * 226 | * @param in The source of the byte data in the unknown charset. 227 | * 228 | * @param declaredEncoding A declared encoding for the data, if available, 229 | * or null or an empty string if none is available. 230 | * 231 | * @stable ICU 3.4 232 | */ 233 | public Reader getReader(InputStream in, String declaredEncoding) { 234 | fDeclaredEncoding = declaredEncoding; 235 | 236 | try { 237 | setText(in); 238 | 239 | CharsetMatch match = detect(); 240 | 241 | if (match == null) { 242 | return null; 243 | } 244 | 245 | return match.getReader(); 246 | } catch (IOException e) { 247 | return null; 248 | } 249 | } 250 | 251 | /** 252 | * Autodetect the charset of an inputStream, and return a String 253 | * containing the converted input data. 254 | *
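* An illustrative sketch, assuming a webpage.html file on disk; note that this overload accepts a
* byte array rather than a stream:
*
*   byte[] data = Files.readAllBytes(Paths.get("webpage.html"));
*   String text = new CharsetDetector().getString(data, null);
*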

255 | * This is a convenience method that is equivalent to 256 | * this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString(); 257 | *

258 | * Raise an exception if no charsets appear to match the input data. 259 | * 260 | * @param in The source of the byte data in the unknown charset. 261 | * 262 | * @param declaredEncoding A declared encoding for the data, if available, 263 | * or null or an empty string if none is available. 264 | * 265 | * @stable ICU 3.4 266 | */ 267 | public String getString(byte[] in, String declaredEncoding) 268 | { 269 | fDeclaredEncoding = declaredEncoding; 270 | 271 | try { 272 | setText(in); 273 | 274 | CharsetMatch match = detect(); 275 | 276 | if (match == null) { 277 | return null; 278 | } 279 | 280 | return match.getString(-1); 281 | } catch (IOException e) { 282 | return null; 283 | } 284 | } 285 | 286 | 287 | /** 288 | * Get the names of all charsets supported by CharsetDetector class. 289 | *
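* For example, the supported names can simply be listed (an illustrative sketch):
*
*   for (String name : CharsetDetector.getAllDetectableCharsets()) {
*     System.out.println(name); // e.g. "UTF-8", "Shift_JIS", "ISO-8859-1"
*   }
*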

290 | * Note: Multiple different charset encodings in a same family may use 291 | * a single shared name in this implementation. For example, this method returns 292 | * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" 293 | * (Windows Latin 1). However, actual detection result could be "windows-1252" 294 | * when the input data matches Latin 1 code points with any points only available 295 | * in "windows-1252". 296 | * 297 | * @return an array of the names of all charsets supported by 298 | * CharsetDetector class. 299 | * 300 | * @stable ICU 3.4 301 | */ 302 | public static String[] getAllDetectableCharsets() { 303 | String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()]; 304 | for (int i = 0; i < allCharsetNames.length; i++) { 305 | allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName(); 306 | } 307 | return allCharsetNames; 308 | } 309 | 310 | /** 311 | * Test whether or not input filtering is enabled. 312 | * 313 | * @return true if input text will be filtered. 314 | * 315 | * @see #enableInputFilter 316 | * 317 | * @stable ICU 3.4 318 | */ 319 | public boolean inputFilterEnabled() 320 | { 321 | return fStripTags; 322 | } 323 | 324 | /** 325 | * Enable filtering of input text. If filtering is enabled, 326 | * text within angle brackets ("<" and ">") will be removed 327 | * before detection. 328 | * 329 | * @param filter true to enable input text filtering. 330 | * 331 | * @return The previous setting. 332 | * 333 | * @stable ICU 3.4 334 | */ 335 | public boolean enableInputFilter(boolean filter) 336 | { 337 | boolean previous = fStripTags; 338 | 339 | fStripTags = filter; 340 | 341 | return previous; 342 | } 343 | 344 | /* 345 | * MungeInput - after getting a set of raw input data to be analyzed, preprocess 346 | * it by removing what appears to be html markup. 347 | */ 348 | private void MungeInput() { 349 | int srci = 0; 350 | int dsti = 0; 351 | byte b; 352 | boolean inMarkup = false; 353 | int openTags = 0; 354 | int badTags = 0; 355 | 356 | // 357 | // html / xml markup stripping. 358 | // quick and dirty, not 100% accurate, but hopefully good enough, statistically. 359 | // discard everything within < brackets > 360 | // Count how many total '<' and illegal (nested) '<' occur, so we can make some 361 | // guess as to whether the input was actually marked up at all. 362 | if (fStripTags) { 363 | for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) { 364 | b = fRawInput[srci]; 365 | if (b == (byte)'<') { 366 | if (inMarkup) { 367 | badTags++; 368 | } 369 | inMarkup = true; 370 | openTags++; 371 | } 372 | 373 | if (! inMarkup) { 374 | fInputBytes[dsti++] = b; 375 | } 376 | 377 | if (b == (byte)'>') { 378 | inMarkup = false; 379 | } 380 | } 381 | 382 | fInputLen = dsti; 383 | } 384 | 385 | // 386 | // If it looks like this input wasn't marked up, or if it looks like it's 387 | // essentially nothing but markup abandon the markup stripping. 388 | // Detection will have to work on the unstripped input. 
389 | // 390 | if (openTags<5 || openTags/5 < badTags || 391 | (fInputLen < 100 && fRawLength>600)) { 392 | int limit = fRawLength; 393 | 394 | if (limit > kBufSize) { 395 | limit = kBufSize; 396 | } 397 | 398 | for (srci=0; srci ALL_CS_RECOGNIZERS; 476 | 477 | static { 478 | List list = new ArrayList(); 479 | 480 | list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true)); 481 | list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true)); 482 | list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true)); 483 | list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true)); 484 | list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true)); 485 | 486 | list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true)); 487 | list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true)); 488 | list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true)); 489 | list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true)); 490 | list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true)); 491 | list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true)); 492 | list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true)); 493 | list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true)); 494 | 495 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true)); 496 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true)); 497 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true)); 498 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true)); 499 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true)); 500 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true)); 501 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true)); 502 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true)); 503 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true)); 504 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true)); 505 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true)); 506 | 507 | // IBM 420/424 recognizers are disabled by default 508 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false)); 509 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false)); 510 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false)); 511 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false)); 512 | 513 | ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list); 514 | } 515 | 516 | /** 517 | * Get the names of charsets that can be recognized by this CharsetDetector instance. 518 | * 519 | * @return an array of the names of charsets that can be recognized by this CharsetDetector 520 | * instance. 521 | * 522 | * @internal 523 | * @deprecated This API is ICU internal only. 
524 | */ 525 | @Deprecated 526 | public String[] getDetectableCharsets() { 527 | List csnames = new ArrayList(ALL_CS_RECOGNIZERS.size()); 528 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 529 | CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); 530 | boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i]; 531 | if (active) { 532 | csnames.add(rcinfo.recognizer.getName()); 533 | } 534 | } 535 | return csnames.toArray(new String[csnames.size()]); 536 | } 537 | 538 | /** 539 | * Enable or disable individual charset encoding. 540 | * A name of charset encoding must be included in the names returned by 541 | * {@link #getAllDetectableCharsets()}. 542 | * 543 | * @param encoding the name of charset encoding. 544 | * @param enabled true to enable, or false to disable the 545 | * charset encoding. 546 | * @return A reference to this CharsetDetector. 547 | * @throws IllegalArgumentException when the name of charset encoding is 548 | * not supported. 549 | * 550 | * @internal 551 | * @deprecated This API is ICU internal only. 552 | */ 553 | @Deprecated 554 | public CharsetDetector setDetectableCharset(String encoding, boolean enabled) { 555 | int modIdx = -1; 556 | boolean isDefaultVal = false; 557 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 558 | CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i); 559 | if (csrinfo.recognizer.getName().equals(encoding)) { 560 | modIdx = i; 561 | isDefaultVal = (csrinfo.isDefaultEnabled == enabled); 562 | break; 563 | } 564 | } 565 | if (modIdx < 0) { 566 | // No matching encoding found 567 | throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\""); 568 | } 569 | 570 | if (fEnabledRecognizers == null && !isDefaultVal) { 571 | // Create an array storing the non default setting 572 | fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()]; 573 | 574 | // Initialize the array with default info 575 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 576 | fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled; 577 | } 578 | } 579 | 580 | if (fEnabledRecognizers != null) { 581 | fEnabledRecognizers[modIdx] = enabled; 582 | } 583 | 584 | return this; 585 | } 586 | } 587 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetMatch.java: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | /** 4 | ******************************************************************************* 5 | * Copyright (C) 2005-2016, International Business Machines Corporation and * 6 | * others. All Rights Reserved. * 7 | ******************************************************************************* 8 | */ 9 | package com.sigpwned.chardet4j.com.ibm.icu.text; 10 | 11 | import java.io.ByteArrayInputStream; 12 | import java.io.IOException; 13 | import java.io.InputStream; 14 | import java.io.InputStreamReader; 15 | import java.io.Reader; 16 | 17 | 18 | /** 19 | * This class represents a charset that has been identified by a CharsetDetector 20 | * as a possible encoding for a set of input data. From an instance of this 21 | * class, you can ask for a confidence level in the charset identification, 22 | * or for Java Reader or String to access the original byte data in Unicode form. 23 | *
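* A typical consumption pattern, sketched under the assumption that detector is a CharsetDetector
* whose input text has already been set:
*
*   for (CharsetMatch match : detector.detectAll()) {
*     System.out.println(match.getName() + " " + match.getConfidence() + " " + match.getLanguage());
*   }
*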

24 | * Instances of this class are created only by CharsetDetectors. 25 | *

26 | * Note: this class has a natural ordering that is inconsistent with equals. 27 | * The natural ordering is based on the match confidence value. 28 | * 29 | * @stable ICU 3.4 30 | */ 31 | public class CharsetMatch implements Comparable { 32 | 33 | 34 | /** 35 | * Create a java.io.Reader for reading the Unicode character data corresponding 36 | * to the original byte data supplied to the Charset detect operation. 37 | *
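* An illustrative sketch, assuming match is a non-null result of CharsetDetector.detect():
*
*   Reader reader = match.getReader(); // may be null if the input cannot be reset
*   if (reader != null) {
*     char[] buf = new char[1024];
*     for (int n = reader.read(buf); n >= 0; n = reader.read(buf)) {
*       // process buf[0..n)
*     }
*     reader.close();
*   }
*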

38 | * CAUTION: if the source of the byte data was an InputStream, a Reader 39 | * can be created for only one matching char set using this method. If more 40 | * than one charset needs to be tried, the caller will need to reset 41 | * the InputStream and create InputStreamReaders itself, based on the charset name. 42 | * 43 | * @return the Reader for the Unicode character data. 44 | * 45 | * @stable ICU 3.4 46 | */ 47 | public Reader getReader() { 48 | InputStream inputStream = fInputStream; 49 | 50 | if (inputStream == null) { 51 | inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength); 52 | } 53 | 54 | try { 55 | inputStream.reset(); 56 | return new InputStreamReader(inputStream, getName()); 57 | } catch (IOException e) { 58 | return null; 59 | } 60 | } 61 | 62 | /** 63 | * Create a Java String from Unicode character data corresponding 64 | * to the original byte data supplied to the Charset detect operation. 65 | * 66 | * @return a String created from the converted input data. 67 | * 68 | * @stable ICU 3.4 69 | */ 70 | public String getString() throws java.io.IOException { 71 | return getString(-1); 72 | 73 | } 74 | 75 | /** 76 | * Create a Java String from Unicode character data corresponding 77 | * to the original byte data supplied to the Charset detect operation. 78 | * The length of the returned string is limited to the specified size; 79 | * the string will be trunctated to this length if necessary. A limit value of 80 | * zero or less is ignored, and treated as no limit. 81 | * 82 | * @param maxLength The maximum length of the String to be created when the 83 | * source of the data is an input stream, or -1 for 84 | * unlimited length. 85 | * @return a String created from the converted input data. 86 | * 87 | * @stable ICU 3.4 88 | */ 89 | public String getString(int maxLength) throws java.io.IOException { 90 | String result = null; 91 | if (fInputStream != null) { 92 | StringBuilder sb = new StringBuilder(); 93 | char[] buffer = new char[1024]; 94 | Reader reader = getReader(); 95 | int max = maxLength < 0? Integer.MAX_VALUE : maxLength; 96 | int bytesRead = 0; 97 | 98 | while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) { 99 | sb.append(buffer, 0, bytesRead); 100 | max -= bytesRead; 101 | } 102 | 103 | reader.close(); 104 | 105 | return sb.toString(); 106 | } else { 107 | String name = getName(); 108 | /* 109 | * getName() may return a name with a suffix 'rtl' or 'ltr'. This cannot 110 | * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or 'ltr' 111 | * should be stripped off before creating the string. 112 | */ 113 | int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl"); 114 | if (startSuffix > 0) { 115 | name = name.substring(0, startSuffix); 116 | } 117 | result = new String(fRawInput, name); 118 | } 119 | return result; 120 | 121 | } 122 | 123 | /** 124 | * Get an indication of the confidence in the charset detected. 125 | * Confidence values range from 0-100, with larger numbers indicating 126 | * a better match of the input data to the characteristics of the 127 | * charset. 128 | * 129 | * @return the confidence in the charset match 130 | * 131 | * @stable ICU 3.4 132 | */ 133 | public int getConfidence() { 134 | return fConfidence; 135 | } 136 | 137 | /** 138 | * Get the name of the detected charset. 139 | * The name will be one that can be used with other APIs on the 140 | * platform that accept charset names. 
It is the "Canonical name" 141 | * as defined by the class java.nio.charset.Charset; for 142 | * charsets that are registered with the IANA charset registry, 143 | * this is the MIME-preferred registerd name. 144 | * 145 | * @see java.nio.charset.Charset 146 | * @see java.io.InputStreamReader 147 | * 148 | * @return The name of the charset. 149 | * 150 | * @stable ICU 3.4 151 | */ 152 | public String getName() { 153 | return fCharsetName; 154 | } 155 | 156 | /** 157 | * Get the ISO code for the language of the detected charset. 158 | * 159 | * @return The ISO code for the language or null if the language cannot be determined. 160 | * 161 | * @stable ICU 3.4 162 | */ 163 | public String getLanguage() { 164 | return fLang; 165 | } 166 | 167 | /** 168 | * Compare to other CharsetMatch objects. 169 | * Comparison is based on the match confidence value, which 170 | * allows CharsetDetector.detectAll() to order its results. 171 | * 172 | * @param other the CharsetMatch object to compare against. 173 | * @return a negative integer, zero, or a positive integer as the 174 | * confidence level of this CharsetMatch 175 | * is less than, equal to, or greater than that of 176 | * the argument. 177 | * @throws ClassCastException if the argument is not a CharsetMatch. 178 | * @stable ICU 4.4 179 | */ 180 | @Override 181 | public int compareTo (CharsetMatch other) { 182 | int compareResult = 0; 183 | if (this.fConfidence > other.fConfidence) { 184 | compareResult = 1; 185 | } else if (this.fConfidence < other.fConfidence) { 186 | compareResult = -1; 187 | } 188 | return compareResult; 189 | } 190 | 191 | /* 192 | * Constructor. Implementation internal 193 | */ 194 | CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) { 195 | fConfidence = conf; 196 | 197 | // The references to the original application input data must be copied out 198 | // of the charset recognizer to here, in case the application resets the 199 | // recognizer before using this CharsetMatch. 200 | if (det.fInputStream == null) { 201 | // We only want the existing input byte data if it came straight from the user, 202 | // not if is just the head of a stream. 203 | fRawInput = det.fRawInput; 204 | fRawLength = det.fRawLength; 205 | } 206 | fInputStream = det.fInputStream; 207 | fCharsetName = rec.getName(); 208 | fLang = rec.getLanguage(); 209 | } 210 | 211 | /* 212 | * Constructor. Implementation internal 213 | */ 214 | CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) { 215 | fConfidence = conf; 216 | 217 | // The references to the original application input data must be copied out 218 | // of the charset recognizer to here, in case the application resets the 219 | // recognizer before using this CharsetMatch. 220 | if (det.fInputStream == null) { 221 | // We only want the existing input byte data if it came straight from the user, 222 | // not if is just the head of a stream. 223 | fRawInput = det.fRawInput; 224 | fRawLength = det.fRawLength; 225 | } 226 | fInputStream = det.fInputStream; 227 | fCharsetName = csName; 228 | fLang = lang; 229 | } 230 | 231 | 232 | // 233 | // Private Data 234 | // 235 | private int fConfidence; 236 | private byte[] fRawInput = null; // Original, untouched input bytes. 237 | // If user gave us a byte array, this is it. 238 | private int fRawLength; // Length of data in fRawInput array. 239 | 240 | private InputStream fInputStream = null; // User's input stream, or null if the user 241 | // gave us a byte array. 
242 | 243 | private String fCharsetName; // The name of the charset this CharsetMatch 244 | // represents. Filled in by the recognizer. 245 | private String fLang; // The language, if one was determined by 246 | // the recognizer during the detect operation. 247 | } 248 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetRecog_2022.java: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | /* 4 | ******************************************************************************* 5 | * Copyright (C) 2005 - 2012, International Business Machines Corporation and * 6 | * others. All Rights Reserved. * 7 | ******************************************************************************* 8 | */ 9 | package com.sigpwned.chardet4j.com.ibm.icu.text; 10 | 11 | /** 12 | * class CharsetRecog_2022 part of the ICU charset detection implementation. 13 | * This is a superclass for the individual detectors for 14 | * each of the detectable members of the ISO 2022 family 15 | * of encodings. 16 | * 17 | * The separate classes are nested within this class. 18 | */ 19 | abstract class CharsetRecog_2022 extends CharsetRecognizer { 20 | 21 | 22 | /** 23 | * Matching function shared among the 2022 detectors JP, CN and KR 24 | * Counts up the number of legal an unrecognized escape sequences in 25 | * the sample of text, and computes a score based on the total number & 26 | * the proportion that fit the encoding. 27 | * 28 | * 29 | * @param text the byte buffer containing text to analyse 30 | * @param textLen the size of the text in the byte. 31 | * @param escapeSequences the byte escape sequences to test for. 32 | * @return match quality, in the range of 0-100. 33 | */ 34 | int match(byte [] text, int textLen, byte [][] escapeSequences) { 35 | int i, j; 36 | int escN; 37 | int hits = 0; 38 | int misses = 0; 39 | int shifts = 0; 40 | int quality; 41 | scanInput: 42 | for (i=0; i= 3 && 35 | (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) { 36 | hasBOM = true; 37 | } 38 | 39 | // Scan for multi-byte sequences 40 | for (i=0; i=det.fRawLength) { 62 | break; 63 | } 64 | b = input[i]; 65 | if ((b & 0xc0) != 0x080) { 66 | numInvalid++; 67 | break; 68 | } 69 | if (--trailBytes == 0) { 70 | numValid++; 71 | break; 72 | } 73 | } 74 | } 75 | 76 | // Cook up some sort of confidence score, based on presence of a BOM 77 | // and the existence of valid and/or invalid multi-byte sequences. 78 | confidence = 0; 79 | if (hasBOM && numInvalid==0) { 80 | confidence = 100; 81 | } else if (hasBOM && numValid > numInvalid*10) { 82 | confidence = 80; 83 | } else if (numValid > 3 && numInvalid == 0) { 84 | confidence = 100; 85 | } else if (numValid > 0 && numInvalid == 0) { 86 | confidence = 80; 87 | } else if (numValid == 0 && numInvalid == 0) { 88 | // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which 89 | // accepts ASCII with confidence = 10. 90 | // TODO: add plain ASCII as an explicitly detected type. 91 | confidence = 15; 92 | } else if (numValid > numInvalid*10) { 93 | // Probably corrupt utf-8 data. Valid sequences aren't likely by chance. 94 | confidence = 25; 95 | } 96 | return confidence == 0 ? 
null : new CharsetMatch(det, this, confidence); 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetRecog_Unicode.java: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | /* 4 | ******************************************************************************* 5 | * Copyright (C) 1996-2013, International Business Machines Corporation and * 6 | * others. All Rights Reserved. * 7 | ******************************************************************************* 8 | * 9 | */ 10 | 11 | package com.sigpwned.chardet4j.com.ibm.icu.text; 12 | 13 | /** 14 | * This class matches UTF-16 and UTF-32, both big- and little-endian. The 15 | * BOM will be used if it is present. 16 | */ 17 | abstract class CharsetRecog_Unicode extends CharsetRecognizer { 18 | 19 | /* (non-Javadoc) 20 | * @see com.ibm.icu.text.CharsetRecognizer#getName() 21 | */ 22 | @Override 23 | abstract String getName(); 24 | 25 | /* (non-Javadoc) 26 | * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector) 27 | */ 28 | @Override 29 | abstract CharsetMatch match(CharsetDetector det); 30 | 31 | static int codeUnit16FromBytes(byte hi, byte lo) { 32 | return ((hi & 0xff) << 8) | (lo & 0xff); 33 | } 34 | 35 | // UTF-16 confidence calculation. Very simple minded, but better than nothing. 36 | // Any 8 bit non-control characters bump the confidence up. These have a zero high byte, 37 | // and are very likely to be UTF-16, although they could also be part of a UTF-32 code. 38 | // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32. 39 | // NULs should be rare in actual text. 
40 | static int adjustConfidence(int codeUnit, int confidence) { 41 | if (codeUnit == 0) { 42 | confidence -= 10; 43 | } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { 44 | confidence += 10; 45 | } 46 | if (confidence < 0) { 47 | confidence = 0; 48 | } else if (confidence > 100) { 49 | confidence = 100; 50 | } 51 | return confidence; 52 | } 53 | 54 | static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode 55 | { 56 | @Override 57 | String getName() 58 | { 59 | return "UTF-16BE"; 60 | } 61 | 62 | @Override 63 | CharsetMatch match(CharsetDetector det) 64 | { 65 | byte[] input = det.fRawInput; 66 | int confidence = 10; 67 | 68 | int bytesToCheck = Math.min(input.length, 30); 69 | for (int charIndex=0; charIndex 0) { 84 | return new CharsetMatch(det, this, confidence); 85 | } 86 | return null; 87 | } 88 | } 89 | 90 | static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode 91 | { 92 | @Override 93 | String getName() 94 | { 95 | return "UTF-16LE"; 96 | } 97 | 98 | @Override 99 | CharsetMatch match(CharsetDetector det) 100 | { 101 | byte[] input = det.fRawInput; 102 | int confidence = 10; 103 | 104 | int bytesToCheck = Math.min(input.length, 30); 105 | for (int charIndex=0; charIndex 0) { 120 | return new CharsetMatch(det, this, confidence); 121 | } 122 | return null; 123 | } 124 | } 125 | 126 | static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode 127 | { 128 | abstract int getChar(byte[] input, int index); 129 | 130 | @Override 131 | abstract String getName(); 132 | 133 | @Override 134 | CharsetMatch match(CharsetDetector det) 135 | { 136 | byte[] input = det.fRawInput; 137 | int limit = (det.fRawLength / 4) * 4; 138 | int numValid = 0; 139 | int numInvalid = 0; 140 | boolean hasBOM = false; 141 | int confidence = 0; 142 | 143 | if (limit==0) { 144 | return null; 145 | } 146 | if (getChar(input, 0) == 0x0000FEFF) { 147 | hasBOM = true; 148 | } 149 | 150 | for(int i = 0; i < limit; i += 4) { 151 | int ch = getChar(input, i); 152 | 153 | if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { 154 | numInvalid += 1; 155 | } else { 156 | numValid += 1; 157 | } 158 | } 159 | 160 | 161 | // Cook up some sort of confidence score, based on presence of a BOM 162 | // and the existence of valid and/or invalid multi-byte sequences. 163 | if (hasBOM && numInvalid==0) { 164 | confidence = 100; 165 | } else if (hasBOM && numValid > numInvalid*10) { 166 | confidence = 80; 167 | } else if (numValid > 3 && numInvalid == 0) { 168 | confidence = 100; 169 | } else if (numValid > 0 && numInvalid == 0) { 170 | confidence = 80; 171 | } else if (numValid > numInvalid*10) { 172 | // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance. 173 | confidence = 25; 174 | } 175 | 176 | return confidence == 0 ? 
null : new CharsetMatch(det, this, confidence); 177 | } 178 | } 179 | 180 | static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 181 | { 182 | @Override 183 | int getChar(byte[] input, int index) 184 | { 185 | return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 | 186 | (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF); 187 | } 188 | 189 | @Override 190 | String getName() 191 | { 192 | return "UTF-32BE"; 193 | } 194 | } 195 | 196 | 197 | static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 198 | { 199 | @Override 200 | int getChar(byte[] input, int index) 201 | { 202 | return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 | 203 | (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF); 204 | } 205 | 206 | @Override 207 | String getName() 208 | { 209 | return "UTF-32LE"; 210 | } 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetRecog_mbcs.java: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | /* 4 | **************************************************************************** 5 | * Copyright (C) 2005-2012, International Business Machines Corporation and * 6 | * others. All Rights Reserved. * 7 | **************************************************************************** 8 | * 9 | */ 10 | package com.sigpwned.chardet4j.com.ibm.icu.text; 11 | 12 | import java.util.Arrays; 13 | 14 | /** 15 | * CharsetRecognizer implementation for Asian - double or multi-byte - charsets. 16 | * Match is determined mostly by the input data adhering to the 17 | * encoding scheme for the charset, and, optionally, 18 | * frequency-of-occurrence of characters. 19 | *

20 | * Instances of this class are singletons, one per encoding 21 | * being recognized. They are created in the main 22 | * CharsetDetector class and kept in the global list of available 23 | * encodings to be checked. The specific encoding being recognized 24 | * is determined by subclass. 25 | */ 26 | abstract class CharsetRecog_mbcs extends CharsetRecognizer { 27 | 28 | /** 29 | * Get the IANA name of this charset. 30 | * @return the charset name. 31 | */ 32 | @Override 33 | abstract String getName() ; 34 | 35 | 36 | /** 37 | * Test the match of this charset with the input text data 38 | * which is obtained via the CharsetDetector object. 39 | * 40 | * @param det The CharsetDetector, which contains the input text 41 | * to be checked for being in this charset. 42 | * @return Two values packed into one int (Damn java, anyhow) 43 | *
44 | * bits 0-7: the match confidence, ranging from 0-100 45 | *
46 | * bits 8-15: The match reason, an enum-like value. 47 | */ 48 | int match(CharsetDetector det, int [] commonChars) { 49 | @SuppressWarnings("unused") 50 | int singleByteCharCount = 0; //TODO Do we really need this? 51 | int doubleByteCharCount = 0; 52 | int commonCharCount = 0; 53 | int badCharCount = 0; 54 | int totalCharCount = 0; 55 | int confidence = 0; 56 | iteratedChar iter = new iteratedChar(); 57 | 58 | detectBlock: { 59 | for (iter.reset(); nextChar(iter, det);) { 60 | totalCharCount++; 61 | if (iter.error) { 62 | badCharCount++; 63 | } else { 64 | long cv = iter.charValue & 0xFFFFFFFFL; 65 | 66 | if (cv <= 0xff) { 67 | singleByteCharCount++; 68 | } else { 69 | doubleByteCharCount++; 70 | if (commonChars != null) { 71 | // NOTE: This assumes that there are no 4-byte common chars. 72 | if (Arrays.binarySearch(commonChars, (int) cv) >= 0) { 73 | commonCharCount++; 74 | } 75 | } 76 | } 77 | } 78 | if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { 79 | // Bail out early if the byte data is not matching the encoding scheme. 80 | break detectBlock; 81 | } 82 | } 83 | 84 | if (doubleByteCharCount <= 10 && badCharCount== 0) { 85 | // Not many multi-byte chars. 86 | if (doubleByteCharCount == 0 && totalCharCount < 10) { 87 | // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. 88 | // We don't have enough data to have any confidence. 89 | // Statistical analysis of single byte non-ASCII characters would probably help here. 90 | confidence = 0; 91 | } 92 | else { 93 | // ASCII or ISO file? It's probably not our encoding, 94 | // but is not incompatible with our encoding, so don't give it a zero. 95 | confidence = 10; 96 | } 97 | 98 | break detectBlock; 99 | } 100 | 101 | // 102 | // No match if there are too many characters that don't fit the encoding scheme. 103 | // (should we have zero tolerance for these?) 104 | // 105 | if (doubleByteCharCount < 20*badCharCount) { 106 | confidence = 0; 107 | break detectBlock; 108 | } 109 | 110 | if (commonChars == null) { 111 | // We have no statistics on frequently occurring characters. 112 | // Assess confidence purely on having a reasonable number of 113 | // multi-byte characters (the more the better 114 | confidence = 30 + doubleByteCharCount - 20*badCharCount; 115 | if (confidence > 100) { 116 | confidence = 100; 117 | } 118 | }else { 119 | // 120 | // Frequency of occurrence statistics exist. 121 | // 122 | double maxVal = Math.log((float)doubleByteCharCount / 4); 123 | double scaleFactor = 90.0 / maxVal; 124 | confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10); 125 | confidence = Math.min(confidence, 100); 126 | } 127 | } // end of detectBlock: 128 | 129 | return confidence; 130 | } 131 | 132 | // "Character" iterated character class. 133 | // Recognizers for specific mbcs encodings make their "characters" available 134 | // by providing a nextChar() function that fills in an instance of iteratedChar 135 | // with the next char from the input. 136 | // The returned characters are not converted to Unicode, but remain as the raw 137 | // bytes (concatenated into an int) from the codepage data. 138 | // 139 | // For Asian charsets, use the raw input rather than the input that has been 140 | // stripped of markup. Detection only considers multi-byte chars, effectively 141 | // stripping markup anyway, and double byte chars do occur in markup too. 
142 | // 143 | static class iteratedChar { 144 | int charValue = 0; // 1-4 bytes from the raw input data 145 | int nextIndex = 0; 146 | boolean error = false; 147 | boolean done = false; 148 | 149 | void reset() { 150 | charValue = 0; 151 | nextIndex = 0; 152 | error = false; 153 | done = false; 154 | } 155 | 156 | int nextByte(CharsetDetector det) { 157 | if (nextIndex >= det.fRawLength) { 158 | done = true; 159 | return -1; 160 | } 161 | int byteValue = det.fRawInput[nextIndex++] & 0x00ff; 162 | return byteValue; 163 | } 164 | } 165 | 166 | /** 167 | * Get the next character (however many bytes it is) from the input data 168 | * Subclasses for specific charset encodings must implement this function 169 | * to get characters according to the rules of their encoding scheme. 170 | * 171 | * This function is not a method of class iteratedChar only because 172 | * that would require a lot of extra derived classes, which is awkward. 173 | * @param it The iteratedChar "struct" into which the returned char is placed. 174 | * @param det The charset detector, which is needed to get at the input byte data 175 | * being iterated over. 176 | * @return True if a character was returned, false at end of input. 177 | */ 178 | abstract boolean nextChar(iteratedChar it, CharsetDetector det); 179 | 180 | 181 | 182 | 183 | 184 | /** 185 | * Shift-JIS charset recognizer. 186 | * 187 | */ 188 | static class CharsetRecog_sjis extends CharsetRecog_mbcs { 189 | static int [] commonChars = 190 | // TODO: This set of data comes from the character frequency- 191 | // of-occurrence analysis tool. The data needs to be moved 192 | // into a resource and loaded from there. 193 | {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 194 | 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 195 | 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 196 | 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 197 | 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 198 | 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; 199 | 200 | @Override 201 | boolean nextChar(iteratedChar it, CharsetDetector det) { 202 | it.error = false; 203 | int firstByte; 204 | firstByte = it.charValue = it.nextByte(det); 205 | if (firstByte < 0) { 206 | return false; 207 | } 208 | 209 | if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) { 210 | return true; 211 | } 212 | 213 | int secondByte = it.nextByte(det); 214 | if (secondByte < 0) { 215 | return false; 216 | } 217 | it.charValue = (firstByte << 8) | secondByte; 218 | if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) { 219 | // Illegal second byte value. 220 | it.error = true; 221 | } 222 | return true; 223 | } 224 | 225 | @Override 226 | CharsetMatch match(CharsetDetector det) { 227 | int confidence = match(det, commonChars); 228 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 229 | } 230 | 231 | @Override 232 | String getName() { 233 | return "Shift_JIS"; 234 | } 235 | 236 | @Override 237 | public String getLanguage() 238 | { 239 | return "ja"; 240 | } 241 | 242 | 243 | } 244 | 245 | 246 | /** 247 | * Big5 charset recognizer. 248 | * 249 | */ 250 | static class CharsetRecog_big5 extends CharsetRecog_mbcs { 251 | static int [] commonChars = 252 | // TODO: This set of data comes from the character frequency- 253 | // of-occurrence analysis tool. 
The data needs to be moved 254 | // into a resource and loaded from there. 255 | {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 256 | 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 257 | 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 258 | 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 259 | 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 260 | 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 261 | 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 262 | 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 263 | 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 264 | 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; 265 | 266 | @Override 267 | boolean nextChar(iteratedChar it, CharsetDetector det) { 268 | it.error = false; 269 | int firstByte; 270 | firstByte = it.charValue = it.nextByte(det); 271 | if (firstByte < 0) { 272 | return false; 273 | } 274 | 275 | if (firstByte <= 0x7f || firstByte==0xff) { 276 | // single byte character. 277 | return true; 278 | } 279 | 280 | int secondByte = it.nextByte(det); 281 | if (secondByte < 0) { 282 | return false; 283 | } 284 | it.charValue = (it.charValue << 8) | secondByte; 285 | 286 | if (secondByte < 0x40 || 287 | secondByte ==0x7f || 288 | secondByte == 0xff) { 289 | it.error = true; 290 | } 291 | return true; 292 | } 293 | 294 | @Override 295 | CharsetMatch match(CharsetDetector det) { 296 | int confidence = match(det, commonChars); 297 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 298 | } 299 | 300 | @Override 301 | String getName() { 302 | return "Big5"; 303 | } 304 | 305 | 306 | @Override 307 | public String getLanguage() 308 | { 309 | return "zh"; 310 | } 311 | } 312 | 313 | 314 | /** 315 | * EUC charset recognizers. One abstract class that provides the common function 316 | * for getting the next character according to the EUC encoding scheme, 317 | * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. 318 | * 319 | */ 320 | abstract static class CharsetRecog_euc extends CharsetRecog_mbcs { 321 | 322 | /* 323 | * (non-Javadoc) 324 | * Get the next character value for EUC based encodings. 325 | * Character "value" is simply the raw bytes that make up the character 326 | * packed into an int. 327 | */ 328 | @Override 329 | boolean nextChar(iteratedChar it, CharsetDetector det) { 330 | it.error = false; 331 | int firstByte = 0; 332 | int secondByte = 0; 333 | int thirdByte = 0; 334 | //int fourthByte = 0; 335 | 336 | buildChar: { 337 | firstByte = it.charValue = it.nextByte(det); 338 | if (firstByte < 0) { 339 | // Ran off the end of the input data 340 | it.done = true; 341 | break buildChar; 342 | } 343 | if (firstByte <= 0x8d) { 344 | // single byte char 345 | break buildChar; 346 | } 347 | 348 | secondByte = it.nextByte(det); 349 | it.charValue = (it.charValue << 8) | secondByte; 350 | 351 | if (firstByte >= 0xA1 && firstByte <= 0xfe) { 352 | // Two byte Char 353 | if (secondByte < 0xa1) { 354 | it.error = true; 355 | } 356 | break buildChar; 357 | } 358 | if (firstByte == 0x8e) { 359 | // Code Set 2. 360 | // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 361 | // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 362 | // We don't know which we've got. 
363 | // Treat it like EUC-JP. If the data really was EUC-TW, the following two 364 | // bytes will look like a well formed 2 byte char. 365 | if (secondByte < 0xa1) { 366 | it.error = true; 367 | } 368 | break buildChar; 369 | } 370 | 371 | if (firstByte == 0x8f) { 372 | // Code set 3. 373 | // Three byte total char size, two bytes of actual char value. 374 | thirdByte = it.nextByte(det); 375 | it.charValue = (it.charValue << 8) | thirdByte; 376 | if (thirdByte < 0xa1) { 377 | it.error = true; 378 | } 379 | } 380 | } 381 | 382 | return (it.done == false); 383 | } 384 | 385 | /** 386 | * The charset recognize for EUC-JP. A singleton instance of this class 387 | * is created and kept by the public CharsetDetector class 388 | */ 389 | static class CharsetRecog_euc_jp extends CharsetRecog_euc { 390 | static int [] commonChars = 391 | // TODO: This set of data comes from the character frequency- 392 | // of-occurrence analysis tool. The data needs to be moved 393 | // into a resource and loaded from there. 394 | {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 395 | 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 396 | 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 397 | 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 398 | 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 399 | 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 400 | 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 401 | 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 402 | 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 403 | 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; 404 | @Override 405 | String getName() { 406 | return "EUC-JP"; 407 | } 408 | 409 | @Override 410 | CharsetMatch match(CharsetDetector det) { 411 | int confidence = match(det, commonChars); 412 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 413 | } 414 | 415 | @Override 416 | public String getLanguage() 417 | { 418 | return "ja"; 419 | } 420 | } 421 | 422 | /** 423 | * The charset recognize for EUC-KR. A singleton instance of this class 424 | * is created and kept by the public CharsetDetector class 425 | */ 426 | static class CharsetRecog_euc_kr extends CharsetRecog_euc { 427 | static int [] commonChars = 428 | // TODO: This set of data comes from the character frequency- 429 | // of-occurrence analysis tool. The data needs to be moved 430 | // into a resource and loaded from there. 
431 | {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 432 | 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 433 | 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 434 | 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 435 | 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 436 | 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 437 | 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 438 | 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 439 | 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 440 | 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; 441 | 442 | @Override 443 | String getName() { 444 | return "EUC-KR"; 445 | } 446 | 447 | @Override 448 | CharsetMatch match(CharsetDetector det) { 449 | int confidence = match(det, commonChars); 450 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 451 | } 452 | 453 | @Override 454 | public String getLanguage() 455 | { 456 | return "ko"; 457 | } 458 | } 459 | } 460 | 461 | /** 462 | * 463 | * GB-18030 recognizer. Uses simplified Chinese statistics. 464 | * 465 | */ 466 | static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs { 467 | 468 | /* 469 | * (non-Javadoc) 470 | * Get the next character value for EUC based encodings. 471 | * Character "value" is simply the raw bytes that make up the character 472 | * packed into an int. 473 | */ 474 | @Override 475 | boolean nextChar(iteratedChar it, CharsetDetector det) { 476 | it.error = false; 477 | int firstByte = 0; 478 | int secondByte = 0; 479 | int thirdByte = 0; 480 | int fourthByte = 0; 481 | 482 | buildChar: { 483 | firstByte = it.charValue = it.nextByte(det); 484 | 485 | if (firstByte < 0) { 486 | // Ran off the end of the input data 487 | it.done = true; 488 | break buildChar; 489 | } 490 | 491 | if (firstByte <= 0x80) { 492 | // single byte char 493 | break buildChar; 494 | } 495 | 496 | secondByte = it.nextByte(det); 497 | it.charValue = (it.charValue << 8) | secondByte; 498 | 499 | if (firstByte >= 0x81 && firstByte <= 0xFE) { 500 | // Two byte Char 501 | if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) { 502 | break buildChar; 503 | } 504 | 505 | // Four byte char 506 | if (secondByte >= 0x30 && secondByte <= 0x39) { 507 | thirdByte = it.nextByte(det); 508 | 509 | if (thirdByte >= 0x81 && thirdByte <= 0xFE) { 510 | fourthByte = it.nextByte(det); 511 | 512 | if (fourthByte >= 0x30 && fourthByte <= 0x39) { 513 | it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte; 514 | break buildChar; 515 | } 516 | } 517 | } 518 | 519 | it.error = true; 520 | break buildChar; 521 | } 522 | } 523 | 524 | return (it.done == false); 525 | } 526 | 527 | static int [] commonChars = 528 | // TODO: This set of data comes from the character frequency- 529 | // of-occurrence analysis tool. The data needs to be moved 530 | // into a resource and loaded from there. 
531 | {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 532 | 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 533 | 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 534 | 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 535 | 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 536 | 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 537 | 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 538 | 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 539 | 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 540 | 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; 541 | 542 | 543 | @Override 544 | String getName() { 545 | return "GB18030"; 546 | } 547 | 548 | @Override 549 | CharsetMatch match(CharsetDetector det) { 550 | int confidence = match(det, commonChars); 551 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 552 | } 553 | 554 | @Override 555 | public String getLanguage() 556 | { 557 | return "zh"; 558 | } 559 | } 560 | 561 | 562 | } 563 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetRecognizer.java: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | /** 4 | ******************************************************************************* 5 | * Copyright (C) 2005-2012, International Business Machines Corporation and * 6 | * others. All Rights Reserved. * 7 | ******************************************************************************* 8 | */ 9 | package com.sigpwned.chardet4j.com.ibm.icu.text; 10 | 11 | /** 12 | * Abstract class for recognizing a single charset. 13 | * Part of the implementation of ICU's CharsetDetector. 14 | * 15 | * Each specific charset that can be recognized will have an instance 16 | * of some subclass of this class. All interaction between the overall 17 | * CharsetDetector and the stuff specific to an individual charset happens 18 | * via the interface provided here. 19 | * 20 | * Instances of CharsetDetector DO NOT have or maintain 21 | * state pertaining to a specific match or detect operation. 22 | * The WILL be shared by multiple instances of CharsetDetector. 23 | * They encapsulate const charset-specific information. 24 | */ 25 | abstract class CharsetRecognizer { 26 | /** 27 | * Get the IANA name of this charset. 28 | * @return the charset name. 29 | */ 30 | abstract String getName(); 31 | 32 | /** 33 | * Get the ISO language code for this charset. 34 | * @return the language code, or null if the language cannot be determined. 35 | */ 36 | public String getLanguage() 37 | { 38 | return null; 39 | } 40 | 41 | /** 42 | * Test the match of this charset with the input text data 43 | * which is obtained via the CharsetDetector object. 44 | * 45 | * @param det The CharsetDetector, which contains the input text 46 | * to be checked for being in this charset. 47 | * @return A CharsetMatch object containing details of match 48 | * with this charset, or null if there was no match. 
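 *         (Illustrative note: the concrete multi-byte recognizers in this package compute this
 *         as {@code int confidence = match(det, commonChars);} and return {@code null} when the
 *         confidence is zero.)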
49 | */ 50 | abstract CharsetMatch match(CharsetDetector det); 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This package contains code from the icu4j project. This was originally released under the ICU 3 | * license. This project is released under the Apache 2 license. 4 | * 5 | * @see https://github.com/unicode-org/icu 6 | */ 7 | package com.sigpwned.chardet4j.com.ibm.icu.text; 8 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/io/BomAwareInputStream.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 - 2024 Andy Boothe 6 | * ====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j.io; 21 | 22 | import java.io.ByteArrayInputStream; 23 | import java.io.FilterInputStream; 24 | import java.io.IOException; 25 | import java.io.InputStream; 26 | import java.io.SequenceInputStream; 27 | import java.util.Optional; 28 | import com.sigpwned.chardet4j.ByteOrderMark; 29 | import com.sigpwned.chardet4j.util.ByteStreams; 30 | 31 | /** 32 | * A wrapper {@link InputStream} that remembers the {@link ByteOrderMark} that was detected at the 33 | * beginning of the stream. 34 | */ 35 | public final class BomAwareInputStream extends FilterInputStream { 36 | /** 37 | * Detect the {@link ByteOrderMark} at the beginning of the stream, if any, and return a 38 | * {@link BomAwareInputStream} that wraps the stream. 
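 * <p>
 * A minimal illustrative sketch ({@code bytes} is a hypothetical input array):
 *
 * <pre>{@code
 * try (BomAwareInputStream in = BomAwareInputStream.detect(new ByteArrayInputStream(bytes))) {
 *   Optional<ByteOrderMark> bom = in.bom();
 *   // the wrapped stream now starts after any BOM that was detected
 * }
 * }</pre>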
39 | * 40 | * @param in the input stream 41 | * @return the {@link BomAwareInputStream} 42 | * @throws IOException if an I/O error occurs 43 | */ 44 | public static BomAwareInputStream detect(InputStream in) throws IOException { 45 | final byte[] buf = ByteStreams.readNBytes(in, ByteOrderMark.MAX_BYTE_LENGTH); 46 | 47 | ByteOrderMark bom = ByteOrderMark.detect(buf).orElse(null); 48 | 49 | // If there is no BOM, then return all the bytes read so far, followed by the rest of the stream 50 | if (bom == null) 51 | return new BomAwareInputStream(new SequenceInputStream(new ByteArrayInputStream(buf), in), 52 | null); 53 | 54 | final int bomlen = bom.length(); 55 | 56 | // If there is a BOM and it is the same length as the bytes read so far, then return the rest of 57 | // the stream 58 | if (bomlen == buf.length) 59 | return new BomAwareInputStream(in, bom); 60 | 61 | // If there is a BOM and it is shorter than the bytes read so far, then return the BOM followed 62 | // by the rest of the bytes read so far, followed by the rest of the stream 63 | return new BomAwareInputStream( 64 | new SequenceInputStream(new ByteArrayInputStream(buf, bomlen, buf.length - bomlen), in), 65 | bom); 66 | } 67 | 68 | private final ByteOrderMark bom; 69 | 70 | private BomAwareInputStream(InputStream delegate, ByteOrderMark bom) { 71 | super(delegate); 72 | this.bom = bom; 73 | } 74 | 75 | /** 76 | * The {@link ByteOrderMark} that was detected at the beginning of the stream, if any, or else 77 | * empty. 78 | * 79 | * @return the {@link ByteOrderMark} 80 | */ 81 | public Optional bom() { 82 | return Optional.ofNullable(bom); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/io/DecodedInputStreamReader.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 - 2024 Andy Boothe 6 | * ====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j.io; 21 | 22 | import static java.util.Objects.requireNonNull; 23 | import java.io.InputStream; 24 | import java.io.InputStreamReader; 25 | import java.nio.charset.Charset; 26 | 27 | /** 28 | * A simple wrapper around an InputStreamReader that remembers the charset that was used to decode 29 | * the input stream. 
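 * <p>
 * A minimal illustrative sketch ({@code in} is a hypothetical input stream):
 *
 * <pre>{@code
 * try (DecodedInputStreamReader reader = new DecodedInputStreamReader(in, StandardCharsets.UTF_8)) {
 *   Charset charset = reader.charset(); // UTF-8
 *   // read characters as usual via the Reader API
 * }
 * }</pre>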
30 | */ 31 | public final class DecodedInputStreamReader extends InputStreamReader { 32 | private final Charset charset; 33 | 34 | public DecodedInputStreamReader(InputStream in, Charset charset) { 35 | super(in, charset); 36 | this.charset = requireNonNull(charset); 37 | } 38 | 39 | /** 40 | * The charset that was used to decode the input stream. 41 | * 42 | * @return the charset 43 | */ 44 | public Charset charset() { 45 | return charset; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/util/ByteStreams.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 - 2024 Andy Boothe 6 | * ====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j.util; 21 | 22 | import java.io.IOException; 23 | import java.io.InputStream; 24 | import java.util.Arrays; 25 | 26 | /** 27 | * Utility methods for working with {@link InputStream byte streams}. 28 | */ 29 | public final class ByteStreams { 30 | private ByteStreams() {} 31 | 32 | /** 33 | * Read as many bytes as possible from the the given {@link InputStream}, up to count, and return 34 | * them as a byte array. If the stream ends before count bytes can be read, then the returned 35 | * array will be shorter than count. Equivalent to the Java 9+ {@code InputStream} method of the 36 | * same name. 
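 * <p>
 * A minimal illustrative sketch of the end-of-stream behavior:
 *
 * <pre>{@code
 * byte[] bytes = ByteStreams.readNBytes(new ByteArrayInputStream(new byte[] {1, 2, 3}), 4);
 * // bytes.length == 3: the stream ended before 4 bytes could be read
 * }</pre>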
37 | * 38 | * @param in the input stream 39 | * @param count the maximum number of bytes to read 40 | * @return the bytes read 41 | * @throws NullPointerException if in is null 42 | * @throws IllegalArgumentException if count is negative 43 | * @throws IOException if an I/O error occurs 44 | */ 45 | public static byte[] readNBytes(InputStream in, int count) throws IOException { 46 | if (in == null) 47 | throw new NullPointerException(); 48 | if (count < 0) 49 | throw new IllegalArgumentException("count must not be negative"); 50 | 51 | final byte[] buf = new byte[count]; 52 | if (count == 0) 53 | return buf; 54 | 55 | int len = 0; 56 | for (int nread = in.read(buf); nread != -1; nread = in.read(buf, len, buf.length - len)) { 57 | len = len + nread; 58 | if (len == buf.length) 59 | break; 60 | } 61 | 62 | if (len == buf.length) 63 | return buf; 64 | 65 | return Arrays.copyOf(buf, len); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/util/CharStreams.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 - 2024 Andy Boothe 6 | * ====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j.util; 21 | 22 | import java.io.IOException; 23 | import java.io.Reader; 24 | import java.io.Writer; 25 | 26 | public final class CharStreams { 27 | private CharStreams() {} 28 | 29 | /** 30 | * Copy all characters from the given {@link Reader} to the given {@link Writer} and return the 31 | * total number of characters copied. Equivalent to the Java 9+ {@code Reader} method of the same 32 | * name. 
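 * <p>
 * A minimal illustrative sketch:
 *
 * <pre>{@code
 * StringWriter out = new StringWriter();
 * long copied = CharStreams.transferTo(new StringReader("Hello, world!"), out);
 * // copied == 13 and out.toString() equals "Hello, world!"
 * }</pre>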
33 | * 34 | * @param in the input reader 35 | * @param out the output writer 36 | * @return the total number of characters copied 37 | * @throws NullPointerException if in or out is null 38 | * @throws IOException if an I/O error occurs 39 | */ 40 | public static long transferTo(Reader in, Writer out) throws IOException { 41 | if (in == null) 42 | throw new NullPointerException(); 43 | if (out == null) 44 | throw new NullPointerException(); 45 | 46 | long total = 0; 47 | 48 | final char[] buf = new char[8192]; 49 | for (int nread = in.read(buf); nread != -1; nread = in.read(buf)) { 50 | out.write(buf, 0, nread); 51 | total = total + nread; 52 | } 53 | 54 | return total; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/test/java/com/sigpwned/chardet4j/ChardetTest.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 Andy Boothe 6 | * ====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j; 21 | 22 | import static java.util.Arrays.asList; 23 | import static java.util.Objects.requireNonNull; 24 | import static org.hamcrest.CoreMatchers.anyOf; 25 | import static org.hamcrest.CoreMatchers.is; 26 | import static org.hamcrest.MatcherAssert.assertThat; 27 | import java.io.ByteArrayInputStream; 28 | import java.io.ByteArrayOutputStream; 29 | import java.io.IOException; 30 | import java.io.Reader; 31 | import java.io.SequenceInputStream; 32 | import java.io.StringWriter; 33 | import java.nio.charset.Charset; 34 | import java.nio.charset.StandardCharsets; 35 | import java.nio.charset.UnsupportedCharsetException; 36 | import java.util.List; 37 | import java.util.Optional; 38 | import org.junit.Test; 39 | import com.google.common.io.CharStreams; 40 | import com.google.common.io.Resources; 41 | import com.sigpwned.chardet4j.io.DecodedInputStreamReader; 42 | 43 | public class ChardetTest { 44 | @Test 45 | public void iso8859Test() { 46 | Charset charset = 47 | Chardet.detectCharset("Hello, world!".getBytes(StandardCharsets.ISO_8859_1)).get(); 48 | 49 | assertThat(charset, is(StandardCharsets.ISO_8859_1)); 50 | } 51 | 52 | @Test 53 | public void iso8859Utf8Test() { 54 | Charset charset = 55 | Chardet.detectCharset("Hello, world!".getBytes(StandardCharsets.UTF_8), "utf-8").get(); 56 | 57 | assertThat(charset, is(StandardCharsets.UTF_8)); 58 | } 59 | 60 | @Test 61 | public void utf8Test() { 62 | Charset charset = Chardet.detectCharset("Hellö, world!".getBytes(StandardCharsets.UTF_8)).get(); 63 | 64 | assertThat(charset, is(StandardCharsets.UTF_8)); 65 | } 66 | 67 | @Test 68 | public void utf16BeTest() { 69 | Charset charset = 70 | Chardet.detectCharset("Hellö, world!".getBytes(StandardCharsets.UTF_16BE)).get(); 71 | 72 | assertThat(charset, is(StandardCharsets.UTF_16BE)); 73 | } 74 | 75 | @Test 76 | public void utf16LeTest() { 77 | Charset charset = 78 | Chardet.detectCharset("Hellö, world!".getBytes(StandardCharsets.UTF_16LE)).get(); 79 | 80 | assertThat(charset, is(StandardCharsets.UTF_16LE)); 81 | } 82 | 83 | @Test 84 | public void utf8BomTest() throws IOException { 85 | ByteArrayOutputStream buf = new ByteArrayOutputStream(); 86 | buf.write(ByteOrderMark.UTF_8.getBytes()); 87 | buf.write("Hello, world!".getBytes(StandardCharsets.UTF_8)); 88 | 89 | Charset charset = Chardet.detectCharset(buf.toByteArray()).get(); 90 | 91 | assertThat(charset, is(StandardCharsets.UTF_8)); 92 | } 93 | 94 | @Test 95 | public void utf16BeBomTest() throws IOException { 96 | ByteArrayOutputStream buf = new ByteArrayOutputStream(); 97 | buf.write(ByteOrderMark.UTF_16BE.getBytes()); 98 | buf.write("Hello, world!".getBytes(StandardCharsets.UTF_16BE)); 99 | 100 | Charset charset = Chardet.detectCharset(buf.toByteArray()).get(); 101 | 102 | assertThat(charset, is(StandardCharsets.UTF_16BE)); 103 | } 104 | 105 | @Test 106 | public void utf16LeBomTest() throws IOException { 107 | ByteArrayOutputStream buf = new ByteArrayOutputStream(); 108 | buf.write(ByteOrderMark.UTF_16LE.getBytes()); 109 | buf.write("Hello, world!".getBytes(StandardCharsets.UTF_16LE)); 110 | 111 | Charset charset = Chardet.detectCharset(buf.toByteArray()).get(); 112 | 113 | assertThat(charset, is(StandardCharsets.UTF_16LE)); 114 | } 115 | 116 | /** 117 | * We should detect the correct charset if the declared hint is wrong 118 | */ 119 | @Test 120 | public void mismatchedDeclaredEncodingTest() { 121 | Charset charset = 
122 | Chardet.detectCharset("Hellö, world!".getBytes(StandardCharsets.UTF_8), "UTF-16").get(); 123 | 124 | assertThat(charset, is(StandardCharsets.UTF_8)); 125 | } 126 | 127 | /** 128 | * We should detect the correct charset if the declared hint is not a valid charset 129 | */ 130 | @Test 131 | public void invalidDeclaredEncodingTest() { 132 | Charset charset = 133 | Chardet.detectCharset("Hellö, world!".getBytes(StandardCharsets.UTF_8), "FOOBAR").get(); 134 | 135 | assertThat(charset, is(StandardCharsets.UTF_8)); 136 | } 137 | 138 | /** 139 | * We should ignore the BOM 140 | */ 141 | @Test 142 | public void decodeStreamTest() throws IOException { 143 | ByteArrayOutputStream buf = new ByteArrayOutputStream(); 144 | buf.write(ByteOrderMark.UTF_8.getBytes()); 145 | buf.write("Hello, world!".getBytes(StandardCharsets.UTF_8)); 146 | 147 | String decoded; 148 | try (Reader r = Chardet.decode(new ByteArrayInputStream(buf.toByteArray()), "utf-8", 149 | StandardCharsets.UTF_8)) { 150 | decoded = CharStreams.toString(r); 151 | } 152 | 153 | assertThat(decoded, is("Hello, world!")); 154 | } 155 | 156 | /** 157 | * We should ignore the BOM 158 | */ 159 | @Test 160 | public void decodeArrayTest() throws IOException { 161 | ByteArrayOutputStream buf = new ByteArrayOutputStream(); 162 | buf.write(ByteOrderMark.UTF_8.getBytes()); 163 | buf.write("Hello, world!".getBytes(StandardCharsets.UTF_8)); 164 | 165 | String decoded = Chardet.decode(buf.toByteArray(), "utf-8", StandardCharsets.UTF_8); 166 | 167 | assertThat(decoded, is("Hello, world!")); 168 | } 169 | 170 | /** 171 | * We should ignore the BOM 172 | */ 173 | @Test 174 | public void longTest() throws IOException { 175 | byte[] data = Resources.toByteArray(Resources.getResource("webpage.html")); 176 | 177 | Charset charset = Chardet.detectCharset(data, "utf-8").get(); 178 | 179 | assertThat(charset, is(StandardCharsets.UTF_8)); 180 | } 181 | 182 | public static class TestableCharset { 183 | public final boolean standard; 184 | public final String charsetName; 185 | public final ByteOrderMark bom; 186 | 187 | public TestableCharset(boolean standard, String charsetName, ByteOrderMark bom) { 188 | this.standard = standard; 189 | this.charsetName = requireNonNull(charsetName); 190 | this.bom = requireNonNull(bom); 191 | } 192 | 193 | public Optional getCharset() { 194 | try { 195 | return Optional.of(Charset.forName(charsetName)); 196 | } catch (UnsupportedCharsetException e) { 197 | return Optional.empty(); 198 | } 199 | } 200 | } 201 | 202 | public static byte[] concat(byte[] xs, byte[] ys) { 203 | byte[] zs = new byte[xs.length + ys.length]; 204 | System.arraycopy(xs, 0, zs, 0, xs.length); 205 | System.arraycopy(ys, 0, zs, xs.length, ys.length); 206 | return zs; 207 | } 208 | 209 | /** 210 | * These are the charsets we'll test decoding with. We'll test decoding with/out a BOM. 
211 | */ 212 | public static final List DETECT_CHARSET_TEST_CHARSETS = 213 | asList(new TestableCharset(true, "UTF-16BE", ByteOrderMark.UTF_16BE), 214 | new TestableCharset(true, "UTF-16LE", ByteOrderMark.UTF_16LE), 215 | new TestableCharset(true, "UTF-8", ByteOrderMark.UTF_8), 216 | new TestableCharset(false, "UTF-32BE", ByteOrderMark.UTF_32BE), 217 | new TestableCharset(false, "UTF-32LE", ByteOrderMark.UTF_32LE), 218 | new TestableCharset(false, "UTF-1", ByteOrderMark.UTF_1), 219 | new TestableCharset(false, "UTF-EBCDIC", ByteOrderMark.UTF_EBCDIC)); 220 | 221 | /** 222 | * Test a variety of charsets using a known text and detect them 223 | */ 224 | @Test 225 | public void detectCharsetTest() throws IOException { 226 | // Stopping by Woods on a Snowy Evening, by Robert Frost 227 | // We'll encode this in various charsets and decode them 228 | // We use a text without diacritics to avoid any issues with encoding. We're not here to test 229 | // the correctness of charset implementations, only correct application of same. 230 | // Note: The poem is public domain. 231 | final String originalText = "Whose woods these are I think I know. \n" 232 | + "His house is in the village though; \n" + "He will not see me stopping here \n" 233 | + "To watch his woods fill up with snow. \n" + "\n" 234 | + "My little horse must think it queer \n" + "To stop without a farmhouse near \n" 235 | + "Between the woods and frozen lake \n" + "The darkest evening of the year. \n" + "\n" 236 | + "He gives his harness bells a shake \n" + "To ask if there is some mistake. \n" 237 | + "The only other sound’s the sweep \n" + "Of easy wind and downy flake. \n" + "\n" 238 | + "The woods are lovely, dark and deep, \n" + "But I have promises to keep, \n" 239 | + "And miles to go before I sleep, \n" + "And miles to go before I sleep."; 240 | 241 | // These are all the charsets that Java is required to support 242 | for (TestableCharset testableCharset : DETECT_CHARSET_TEST_CHARSETS) { 243 | if (!testableCharset.getCharset().isPresent()) { 244 | if (testableCharset.standard) 245 | throw new AssertionError( 246 | "JVM does not support standard charset " + testableCharset.charsetName); 247 | continue; 248 | } 249 | 250 | final Charset charset = testableCharset.getCharset().get(); 251 | 252 | 253 | // Make sure we get the right charset when we decode WITHOUT a BOM 254 | final byte[] plainEncodedText = originalText.getBytes(charset); 255 | final Charset plainDetectedCharset = Chardet.detectCharset(plainEncodedText).get(); 256 | if (testableCharset.charsetName.equals("UTF-8")) { 257 | // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text. 258 | assertThat(plainDetectedCharset, 259 | anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8))); 260 | } else { 261 | assertThat(plainDetectedCharset, is(charset)); 262 | } 263 | 264 | // Make sure we get the right charset when we decode WITHOUT a BOM 265 | final byte[] bomEncodedText = 266 | concat(testableCharset.bom.getBytes(), originalText.getBytes(charset)); 267 | final Charset bomDetectedCharset = Chardet.detectCharset(bomEncodedText).get(); 268 | if (testableCharset.charsetName.equals("UTF-8")) { 269 | // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text. 270 | assertThat(bomDetectedCharset, 271 | anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8))); 272 | } else { 273 | assertThat(bomDetectedCharset, is(charset)); 274 | } 275 | } 276 | } 277 | 278 | /** 279 | * These are the charsets we'll test decoding with. 
We'll test decoding with/out a BOM. 280 | */ 281 | public static final List DECODE_TEST_CHARSETS = 282 | asList(new TestableCharset(true, "UTF-16BE", ByteOrderMark.UTF_16BE), 283 | new TestableCharset(true, "UTF-16LE", ByteOrderMark.UTF_16LE), 284 | new TestableCharset(true, "UTF-8", ByteOrderMark.UTF_8), 285 | new TestableCharset(false, "UTF-32BE", ByteOrderMark.UTF_32BE), 286 | new TestableCharset(false, "UTF-32LE", ByteOrderMark.UTF_32LE), 287 | new TestableCharset(false, "UTF-1", ByteOrderMark.UTF_1), 288 | new TestableCharset(false, "UTF-EBCDIC", ByteOrderMark.UTF_EBCDIC)); 289 | 290 | /** 291 | * Test the ability to decode an InputStream 292 | * 293 | * @see Chardet#decode(byte[], Charset) 294 | */ 295 | @Test 296 | public void decodeTest() throws IOException { 297 | // Stopping by Woods on a Snowy Evening, by Robert Frost 298 | // We'll encode this in various charsets and decode them 299 | // We use a text without diacritics to avoid any issues with encoding. We're not here to test 300 | // the correctness of charset implementations, only correct application of same. 301 | // Note: The poem is public domain. 302 | final String originalText = "Whose woods these are I think I know. \n" 303 | + "His house is in the village though; \n" + "He will not see me stopping here \n" 304 | + "To watch his woods fill up with snow. \n" + "\n" 305 | + "My little horse must think it queer \n" + "To stop without a farmhouse near \n" 306 | + "Between the woods and frozen lake \n" + "The darkest evening of the year. \n" + "\n" 307 | + "He gives his harness bells a shake \n" + "To ask if there is some mistake. \n" 308 | + "The only other sound’s the sweep \n" + "Of easy wind and downy flake. \n" + "\n" 309 | + "The woods are lovely, dark and deep, \n" + "But I have promises to keep, \n" 310 | + "And miles to go before I sleep, \n" + "And miles to go before I sleep."; 311 | 312 | for (TestableCharset testableCharset : DECODE_TEST_CHARSETS) { 313 | if (!testableCharset.getCharset().isPresent()) { 314 | if (testableCharset.standard) 315 | throw new AssertionError( 316 | "JVM does not support standard charset " + testableCharset.charsetName); 317 | continue; 318 | } 319 | 320 | final Charset charset = testableCharset.getCharset().get(); 321 | 322 | final byte[] encodedText = originalText.getBytes(charset); 323 | 324 | // Make sure we get the right charset when we decode WITHOUT a BOM 325 | final StringWriter plainWriter = new StringWriter(); 326 | try (DecodedInputStreamReader plainReader = 327 | Chardet.decode(new ByteArrayInputStream(encodedText), charset)) { 328 | final Charset detectedCharset = plainReader.charset(); 329 | if (testableCharset.charsetName.equals("UTF-8")) { 330 | // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text. 
331 | assertThat(detectedCharset, 332 | anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8))); 333 | } else { 334 | assertThat(detectedCharset, is(charset)); 335 | } 336 | CharStreams.copy(plainReader, plainWriter); 337 | } 338 | assertThat(plainWriter.toString(), is(originalText)); 339 | 340 | // Make sure we get the right charset when we decode WITH a BOM 341 | final StringWriter bomWriter = new StringWriter(); 342 | try (DecodedInputStreamReader bomReader = Chardet 343 | .decode(new SequenceInputStream(new ByteArrayInputStream(testableCharset.bom.getBytes()), 344 | new ByteArrayInputStream(encodedText)), charset)) { 345 | final Charset detectedCharset = bomReader.charset(); 346 | if (testableCharset.charsetName.equals("UTF-8")) { 347 | // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text. 348 | assertThat(detectedCharset, 349 | anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8))); 350 | } else { 351 | assertThat(detectedCharset, is(charset)); 352 | } 353 | CharStreams.copy(bomReader, bomWriter); 354 | } 355 | assertThat(bomWriter.toString(), is(originalText)); 356 | } 357 | } 358 | } 359 | --------------------------------------------------------------------------------