├── .gitattributes ├── .github ├── CODEOWNERS ├── dependabot.yml └── workflows │ └── tests.yml ├── .gitignore ├── CONTRIBUTORS.md ├── LICENSE ├── README.md ├── pom.xml └── src ├── main └── java │ └── com │ └── sigpwned │ └── chardet4j │ ├── ByteOrderMark.java │ ├── Chardet.java │ ├── com │ └── ibm │ │ └── icu │ │ └── text │ │ ├── CharsetDetector.java │ │ ├── CharsetMatch.java │ │ ├── CharsetRecog_2022.java │ │ ├── CharsetRecog_UTF8.java │ │ ├── CharsetRecog_Unicode.java │ │ ├── CharsetRecog_mbcs.java │ │ ├── CharsetRecog_sbcs.java │ │ ├── CharsetRecognizer.java │ │ └── package-info.java │ ├── io │ ├── BomAwareInputStream.java │ └── DecodedInputStreamReader.java │ └── util │ ├── ByteStreams.java │ └── CharStreams.java └── test ├── java └── com │ └── sigpwned │ └── chardet4j │ └── ChardetTest.java └── resources └── webpage.html /.gitattributes: -------------------------------------------------------------------------------- 1 | **/*.html -linguist-detectable 2 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @sigpwned -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "maven" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | - package-ecosystem: "github-actions" 8 | directory: "/" 9 | schedule: 10 | interval: "daily" 11 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | types: 9 | - opened 10 | - synchronize 11 | - reopened 12 | branches: 13 | - main 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | with: 21 | fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis 22 | - name: Set up JDK 8 23 | uses: actions/setup-java@v4 24 | with: 25 | java-version: 8 26 | distribution: temurin 27 | cache: maven 28 | - name: Cache Maven packages 29 | uses: actions/cache@v4 30 | with: 31 | path: ~/.m2 32 | key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} 33 | restore-keys: ${{ runner.os }}-m2 34 | - name: Build and analyze 35 | run: | 36 | mvn \ 37 | -B \ 38 | clean \ 39 | verify \ 40 | --file pom.xml 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # From https://github.com/github/gitignore/blob/main/Java.gitignore 2 | 3 | # Compiled class file 4 | *.class 5 | 6 | # Log file 7 | *.log 8 | 9 | # BlueJ files 10 | *.ctxt 11 | 12 | # Mobile Tools for Java (J2ME) 13 | .mtj.tmp/ 14 | 15 | # Package Files # 16 | *.jar 17 | *.war 18 | *.nar 19 | *.ear 20 | *.zip 21 | *.tar.gz 22 | *.rar 23 | 24 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 25 | hs_err_pid* 26 | replay_pid* 27 | 28 | # From https://github.com/github/gitignore/blob/main/Maven.gitignore 29 | 30 | target/ 31 | pom.xml.tag 32 | pom.xml.releaseBackup 33 | pom.xml.versionsBackup 34 | pom.xml.next 35 | release.properties 36 | dependency-reduced-pom.xml 37 | buildNumber.properties 38 | .mvn/timing.properties 39 | # 
https://github.com/takari/maven-wrapper#usage-without-binary-jar 40 | .mvn/wrapper/maven-wrapper.jar 41 | 42 | # Eclipse m2e generated files 43 | # Eclipse Core 44 | .project 45 | # JDT-specific (Eclipse Java Development Tools) 46 | .classpath 47 | 48 | # Other 49 | 50 | # Emacs temporary files 51 | *~ 52 | 53 | # Eclipse m2e 54 | .settings 55 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | ## Special thanks for all the people who have contributed code to this project so far: 4 | 5 | * [sigpwned](https://github.com/sigpwned) (Founder) 6 | 7 | ## Thanks to everyone who has reported issues to this project so far: 8 | 9 | * [chrisbrookes](https://github.com/chrisbrookes) 10 | 11 | ## I would like to join this list. How can I help the project? 12 | 13 | Outstanding! We're currently looking for contributions for the following: 14 | 15 | - [ ] Bug fixes 16 | - [ ] More tests 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CHARDET4J [![tests](https://github.com/sigpwned/chardet4j/actions/workflows/tests.yml/badge.svg)](https://github.com/sigpwned/chardet4j/actions/workflows/tests.yml) [![Maven Central](https://img.shields.io/maven-central/v/com.sigpwned/chardet4j)](https://central.sonatype.com/search?q=g%3Acom.sigpwned%20a%3Achardet4j) [![javadoc](https://javadoc.io/badge2/com.sigpwned/chardet4j/javadoc.svg)](https://javadoc.io/doc/com.sigpwned/chardet4j) 2 | 3 | ## Introduction 4 | 5 | The state-of-the-art character set detection library for Java is 6 | [icu4j](https://github.com/unicode-org/icu). However, the icu4j JAR 7 | file is about 13MB. This is a hefty price to pay for programs that 8 | only require charset detection! There should be a smaller option of 9 | the same quality. 10 | 11 | The chardet4j library pulls the `CharsetDetector` feature from icu4j 12 | and repackages it as this standalone library. This allows programs to 13 | make good use of this important feature without bloating their 14 | JARs. At the time of this writing, the chardet4j JAR comes in around 15 | 85KB. There are no dependencies. 16 | 17 | This library also implements some other important components of 18 | character set detection and decoding, namely byte order mark handling. 19 | 20 | ## Features 21 | 22 | The library assists the user with detecting character set encodings for byte 23 | streams and decoding them into character streams. It offers dedicated 24 | abstractions for byte order marks (BOMs) and methods for identifying 25 | and decoding character encodings for both byte arrays and input streams. 26 | 27 | The library uses the following algorithm to determine the character encoding of 28 | binary data: 29 | 30 | 1. Check for a BOM. If one is present, then trust it, and use the corresponding 31 | charset to decode the data. 32 | 2. Use a battery of bespoke character set detectors to guess which charset is 33 | most likely. Users may provide a declared encoding, which provides a boost 34 | to the given charset in this estimation process. If a charset is identified 35 | with sufficient confidence, then use it to decode the data. 36 | 3. Otherwise, use the default charset, if one is given, to decode the data. 37 | 38 | ## Installation 39 | 40 | The library can be found in Maven Central with the following coordinates: 41 | 42 | <dependency> 43 | <groupId>com.sigpwned</groupId> 44 | <artifactId>chardet4j</artifactId> 45 | <version>75.1.2</version> 46 | </dependency> 47 | 48 | It is compatible with Java versions 8 and later. chardet4j has no dependencies. 49 | 50 | The `$major.$minor.$patch` version of the library is determined by the underlying 51 | icu4j version and the local release version. The `$major` and `$minor` are taken 52 | from the icu4j version, and `$patch` is the release number of this library for 53 | the icu4j version, starting with 0. 54 | 55 | ## Getting Started 56 | 57 | To decode an `InputStream` to a `Reader` by detecting its character set: 58 | 59 | try (Reader chars=Chardet.decode(bytes, StandardCharsets.UTF_8)) { 60 | // Process chars here 61 | } 62 | 63 | Charset detection is important when dealing with content of unknown provenance, 64 | like content downloaded from the internet or text files uploaded by users. In 65 | such cases, users often have a declared encoding, typically from a content type.
66 | The name of the declared encoding can be provided as a hint to charset 67 | detection: 68 | 69 | try (Reader chars=Chardet.decode(bytes, declaredEncoding, StandardCharsets.UTF_8)) { 70 | // Process chars here 71 | } 72 | 73 | Byte arrays can be converted directly to Strings as well: 74 | 75 | String chars=Chardet.decode(bytes, declaredEncoding, StandardCharsets.UTF_8); 76 | 77 | Users only interested in detection can detect the charset directly, or by name 78 | in case the detected charset is not supported by the JVM: 79 | 80 | // Throws an UnsupportedCharsetException if the charset is not supported by the JVM 81 | Optional<Charset> maybeCharset = Chardet.detectCharset(bytes, declaredEncoding); 82 | 83 | // Never throws 84 | Optional<String> maybeCharsetName = Chardet.detectCharsetName(bytes, declaredEncoding); 85 | 86 | ## Advanced Usage 87 | 88 | The following are more sophisticated use cases and edge cases that most users 89 | will not need to worry about. 90 | 91 | ### Working with BOMs Directly 92 | 93 | The easiest way to work with byte order marks directly is with the 94 | `BomAwareInputStream` class: 95 | 96 | try (BomAwareInputStream bomed=BomAwareInputStream.detect(in)) { 97 | if(bomed.bom().isPresent()) { 98 | // A BOM was detected in this byte stream, and can be accessed using 99 | // bomed.bom() 100 | } else { 101 | // No BOM was detected in this byte stream. 102 | } 103 | } 104 | 105 | It is not typically required to work with BOMs directly, but it can be useful 106 | when creating a custom decode pipeline. 107 | 108 | ### Accessing the Character Encoding 109 | 110 | The easiest way to determine which character encoding is in use is with the 111 | `DecodedInputStreamReader` class: 112 | 113 | try (DecodedInputStreamReader chars=Chardet.decode(bytes, StandardCharsets.UTF_8)) { 114 | // The charset that was detected and is being used to decode the given byte 115 | // stream can be accessed using chars.charset() 116 | Charset charset = chars.charset(); 117 | } 118 | 119 | ### Handling Unsupported Charsets 120 | 121 | The Java Standard only requires that distributions support the 122 | [standard charsets](https://docs.oracle.com/javase/8/docs/api/index.html?java/nio/charset/StandardCharsets.html) 123 | ISO-8859-1, US-ASCII, UTF-8, UTF-16, UTF-16BE, and UTF-16LE. This library detects those 124 | charsets and many more besides, so there is a possibility that the detected 125 | charset is not supported by the current JVM. 126 | 127 | Users are unlikely to hit this situation in the wild, since (a) Java generally 128 | supports almost all of the charsets this library detects, and (b) the 129 | unsupported charsets are scarce in the wild, and growing scarcer every year. 130 | 131 | Regardless, there are a couple of ways to manage this situation. 132 | 133 | #### Catch UnsupportedCharsetException 134 | 135 | The library throws an `UnsupportedCharsetException` when the detected charset is not 136 | supported by the current JVM. Users are free to catch this exception and handle it 137 | as desired. 138 | 139 | try (Reader chars=Chardet.decode(bytes, StandardCharsets.UTF_8)) { 140 | // Process chars here 141 | } catch(UnsupportedCharsetException e) { 142 | // The charset was detected, but is not supported by the current JVM.
There are a 143 | // few ways this is typically handled: 144 | // 145 | // - Propagate as an IOException, since the content cannot be decoded properly 146 | // - Ignore the error and use a default charset 147 | } 148 | 149 | #### Detect Charset Names 150 | 151 | Rather than working with charsets, work with charset names instead. This will 152 | never throw an exception. 153 | 154 | Optional<String> maybeCharsetName = Chardet.detectCharsetName(bytes); 155 | if(maybeCharsetName.isPresent()) { 156 | // The charset was detected successfully, and the name can be accessed using 157 | // maybeCharsetName.get() 158 | } else { 159 | // The charset could not be detected 160 | } 161 | 162 | ### Using Custom Charsets 163 | 164 | Users who wish to add new charsets to the JVM should follow the instructions 165 | in the 166 | [CharsetProvider](https://docs.oracle.com/javase/8/docs/api/java/nio/charset/spi/CharsetProvider.html) 167 | class documentation. The library will automatically pick up any such new charsets. 168 | 169 | ## Configuration 170 | 171 | The following configuration variables are available to customize the behavior of 172 | the library. 173 | 174 | ### System Property chardet4j.detect.bufsize 175 | 176 | One way the library detects character encodings is by analyzing the leading 177 | bytes of the input. The more data the library analyzes, the more accurate 178 | the estimate will be, but the longer detection will take. By default, this buffer is 179 | 8192 bytes, or 8KiB. Users can change this value by setting the 180 | `chardet4j.detect.bufsize` system property. For example, to set this value to 181 | 16KiB, use: 182 | 183 | java -Dchardet4j.detect.bufsize=16384 ... 184 | 185 | Adjusting the buffer size can be useful when dealing with particularly large 186 | files where detection accuracy or performance might be a concern.
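The property can also be set programmatically, as long as it is set before the `Chardet` class is first loaded, since the buffer size is captured once in a static field. Below is a minimal sketch; the class name and the `data.bin` input path are placeholders for illustration:

    import java.io.InputStream;
    import java.io.Reader;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import com.sigpwned.chardet4j.Chardet;

    public class DetectBufsizeExample {
        public static void main(String[] args) throws Exception {
            // Must run before Chardet is first used; the value is read once into a static field
            System.setProperty("chardet4j.detect.bufsize", "16384");

            try (InputStream in = Files.newInputStream(Paths.get("data.bin"));
                    Reader chars = Chardet.decode(in, StandardCharsets.UTF_8)) {
                // Up to the first 16KiB of the stream is now used for charset detection
                char[] buf = new char[4096];
                for (int n = chars.read(buf); n != -1; n = chars.read(buf)) {
                    // Process chars here
                }
            }
        }
    }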
187 | 188 | ## Supported Character Encodings 189 | 190 | The chardet4j library and Java in general supports the following character 191 | encodings at the following levels: 192 | 193 | | Name | Standard | ICU4J | BOM | Laptop | 194 | |:------------:|:--------:|:-----:|:---:|:------:| 195 | | Big5 | | ✔ | | ✔ | 196 | | EUC-JP | | ✔ | | ✔ | 197 | | EUC-KR | | ✔ | | ✔ | 198 | | GB18030 | | ✔ | ✔ | ✔ | 199 | | ISO-2022-CN | | ✔ | | ✔ | 200 | | ISO-2022-JP | | ✔ | | ✔ | 201 | | ISO-2022-KR | | ✔ | | ✔ | 202 | | ISO-8859-1 | | ✔ | | ✔ | 203 | | ISO-8859-2 | | ✔ | | ✔ | 204 | | ISO-8859-5 | | ✔ | | ✔ | 205 | | ISO-8859-6 | | ✔ | | ✔ | 206 | | ISO-8859-7 | | ✔ | | ✔ | 207 | | ISO-8859-8 | | ✔ | | ✔ | 208 | | ISO-8859-8-I | | ✔ | | | 209 | | ISO-8859-9 | | ✔ | | ✔ | 210 | | KOI8-R | | ✔ | | ✔ | 211 | | Shift_JIS | | ✔ | | ✔ | 212 | | US-ASCII | ✔ | ✔* | | ✔ | 213 | | UTF-1 | | | ✔ | | 214 | | UTF-16BE | ✔ | ✔ | ✔ | ✔ | 215 | | UTF-16LE | ✔ | ✔ | ✔ | ✔ | 216 | | UTF-32BE | | ✔ | ✔ | ✔ | 217 | | UTF-32LE | | ✔ | ✔ | ✔ | 218 | | UTF-8 | ✔ | ✔ | ✔ | ✔ | 219 | | UTF-EBCDIC | | | ✔ | | 220 | | windows-1250 | | ✔ | | ✔ | 221 | | windows-1251 | | ✔ | | ✔ | 222 | | windows-1252 | | ✔ | | ✔ | 223 | | windows-1253 | | ✔ | | ✔ | 224 | | windows-1254 | | ✔ | | ✔ | 225 | | windows-1255 | | ✔ | | ✔ | 226 | | windows-1256 | | ✔ | | ✔ | 227 | 228 | Notes: 229 | `*`: ICU4J detects US-ASCII as ISO-8859-1, a superset of US-ASCII 230 | 231 | The support levels have the following meanings: 232 | 233 | * `Standard` -- The Java Standard requires that all JVMs support this 234 | character encoding 235 | * `ICU4J` -- The ICU4J project has a bespoke charset recognizer for this 236 | character encoding 237 | * `BOM` -- The character encoding can be detected by Byte Order Mark 238 | * `Laptop` -- The character sets supported by `java version "1.8.0_321"` on my 239 | laptop (Obviously, this test is completely unscientific. If you have a 240 | better suggestion, please open an issue!) 241 | 242 | ## Licensing 243 | 244 | The icu library is released under the ICU license. The chardet4j library is 245 | released under the Apache license. For more details, see the LICENSE file. 
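As a footnote to the `Laptop` column in the table above: which encodings are actually available depends on the running JVM, and this can be checked directly with `java.nio.charset.Charset`. The following sketch is not specific to chardet4j; the charset names are simply copied from the table:

    import java.nio.charset.Charset;

    public class SupportedEncodingsCheck {
        public static void main(String[] args) {
            // Charset names from the table above
            String[] names = {"Big5", "EUC-JP", "EUC-KR", "GB18030", "ISO-2022-CN", "ISO-2022-JP",
                    "ISO-2022-KR", "ISO-8859-1", "ISO-8859-2", "ISO-8859-5", "ISO-8859-6",
                    "ISO-8859-7", "ISO-8859-8", "ISO-8859-8-I", "ISO-8859-9", "KOI8-R", "Shift_JIS",
                    "US-ASCII", "UTF-1", "UTF-16BE", "UTF-16LE", "UTF-32BE", "UTF-32LE", "UTF-8",
                    "UTF-EBCDIC", "windows-1250", "windows-1251", "windows-1252", "windows-1253",
                    "windows-1254", "windows-1255", "windows-1256"};
            for (String name : names) {
                boolean supported;
                try {
                    supported = Charset.isSupported(name);
                } catch (IllegalArgumentException e) {
                    // Covers IllegalCharsetNameException for names this JVM considers malformed
                    supported = false;
                }
                System.out.println(name + ": " + (supported ? "supported" : "not supported"));
            }
        }
    }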
246 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | com.sigpwned 5 | chardet4j 6 | 77.1.1-SNAPSHOT 7 | chardet4j 8 | 2022 9 | Simple, compact charset detection for Java 8+ 10 | https://github.com/sigpwned/chardet4j 11 | jar 12 | 13 | 14 | Andy Boothe 15 | https://www.sigpwned.com/ 16 | 17 | 18 | 19 | scm:git:ssh://git@github.com/sigpwned/chardet4j.git 20 | scm:git:ssh://git@github.com/sigpwned/chardet4j.git 21 | https://github.com/sigpwned/chardet4j/tree/main 22 | v70.1.0 23 | 24 | 25 | 26 | 27 | Apache License, Version 2.0 28 | http://www.apache.org/licenses/LICENSE-2.0.txt 29 | 30 | 31 | 32 | 33 | 34 | Andy Boothe 35 | andy.boothe@gmail.com 36 | 37 | 38 | 39 | 40 | 41 | ossrh 42 | Sonatype Nexus Snapshots 43 | https://oss.sonatype.org/content/repositories/snapshots/ 44 | 45 | 46 | ossrh 47 | Nexus Release Repository 48 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 49 | 50 | 51 | 52 | 53 | GitHub 54 | https://github.com/sigpwned/chardet4j 55 | 56 | 57 | 58 | UTF-8 59 | 1.8 60 | 1.8 61 | 77.1 62 | 33.4.5-jre 63 | 2.0.16 64 | 4.13.2 65 | 1.3 66 | 67 | 68 | 69 | 70 | 71 | org.codehaus.mojo 72 | license-maven-plugin 73 | 2.5.0 74 | 75 | false 76 | 77 | 78 | 79 | update-file-header 80 | 81 | update-file-header 82 | 83 | process-sources 84 | 85 | false 86 | =================================LICENSE_START================================== 87 | ==================================LICENSE_END=================================== 88 | ====================================SECTION===================================== 89 | apache_v2 90 | 91 | src/main/java 92 | src/test/java 93 | 94 | 95 | **/com/ibm/icu/** 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | com.ibm.icu 110 | icu4j 111 | ${icu4j.version} 112 | 113 | 114 | 115 | 116 | 117 | 118 | com.google.guava 119 | guava 120 | ${guava.version} 121 | test 122 | 123 | 124 | junit 125 | junit 126 | ${junit.version} 127 | test 128 | 129 | 130 | org.hamcrest 131 | hamcrest-all 132 | ${hamcrest.version} 133 | test 134 | 135 | 136 | 137 | 138 | 139 | 140 | release 141 | 142 | 143 | 144 | org.apache.maven.plugins 145 | maven-source-plugin 146 | 147 | 148 | attach-sources 149 | 150 | jar-no-fork 151 | 152 | 153 | 154 | 155 | 156 | org.apache.maven.plugins 157 | maven-javadoc-plugin 158 | 159 | 160 | attach-javadocs 161 | 162 | jar 163 | 164 | 165 | false 166 | 167 | 168 | 169 | 170 | 171 | org.apache.maven.plugins 172 | maven-gpg-plugin 173 | 174 | 175 | sign-artifacts 176 | verify 177 | 178 | sign 179 | 180 | 181 | 182 | 183 | 184 | org.apache.maven.plugins 185 | maven-release-plugin 186 | 187 | v@{project.version} 188 | true 189 | false 190 | release 191 | deploy 192 | 193 | 194 | 195 | org.sonatype.plugins 196 | nexus-staging-maven-plugin 197 | true 198 | 199 | ossrh 200 | https://oss.sonatype.org/ 201 | true 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/ByteOrderMark.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 Andy Boothe 6 | * 
====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j; 21 | 22 | import static java.util.Objects.requireNonNull; 23 | import java.io.IOException; 24 | import java.io.InputStream; 25 | import java.nio.charset.Charset; 26 | import java.nio.charset.IllegalCharsetNameException; 27 | import java.nio.charset.StandardCharsets; 28 | import java.nio.charset.UnsupportedCharsetException; 29 | import java.nio.charset.spi.CharsetProvider; 30 | import java.util.Arrays; 31 | import java.util.Comparator; 32 | import java.util.Optional; 33 | import java.util.concurrent.atomic.AtomicReference; 34 | import com.sigpwned.chardet4j.io.BomAwareInputStream; 35 | 36 | /** 37 | * A byte order mark (BOM) that hard-codes charset into an input stream. At this time, this 38 | * implementation only supports BOMs for the character sets the JVM supports, namely UTF-8, 39 | * UTF-16LE, and UTF-16BE. 40 | * 41 | * @see https://en.wikipedia.org/wiki/Byte_order_mark 43 | */ 44 | public enum ByteOrderMark { 45 | /** 46 | * The BOM for a UTF-8 stream 47 | */ 48 | UTF_8(new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}, StandardCharsets.UTF_8, "UTF-8"), 49 | 50 | /** 51 | * The BOM for a UTF-16 big endian stream 52 | */ 53 | UTF_16BE(new byte[] {(byte) 0xFE, (byte) 0xFF}, StandardCharsets.UTF_16BE, "UTF-16BE"), 54 | 55 | /** 56 | * The BOM for a UTF-16 little endian stream 57 | */ 58 | UTF_16LE(new byte[] {(byte) 0xFF, (byte) 0xFE}, StandardCharsets.UTF_16LE, "UTF-16LE"), 59 | 60 | /** 61 | * The BOM for a UTF-32 big endian stream 62 | */ 63 | UTF_32BE(new byte[] {(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF}, null, "UTF-32BE"), 64 | 65 | /** 66 | * The BOM for a UTF-32 little endian stream 67 | */ 68 | UTF_32LE(new byte[] {(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00}, null, "UTF-32LE"), 69 | 70 | /** 71 | * The BOM for a UTF-1 stream 72 | */ 73 | UTF_1(new byte[] {(byte) 0xF7, (byte) 0x64, (byte) 0x4C}, null, "UTF-1"), 74 | 75 | /** 76 | * The BOM for a UTF-EBCDIC 77 | */ 78 | UTF_EBCDIC(new byte[] {(byte) 0xDD, (byte) 0x73, (byte) 0x66, (byte) 0x73}, null, "UTF-EBCDIC"), 79 | 80 | /** 81 | * The BOM for a GB-18030 stream 82 | */ 83 | GB_18030(new byte[] {(byte) 0x84, (byte) 0x31, (byte) 0x95, (byte) 0x33}, null, "GB-18030"); 84 | 85 | // While BOMs for UTF-7, SCSU, and BOCU-1 exist, they are not deterministic and may not observe 86 | // byte boundaries. Also, the JVM generally does not support these charsets out of the box. So, 87 | // to keep things simple, these BOMs are not supported here. 88 | 89 | public static final int MAX_BYTE_LENGTH = 90 | Arrays.stream(values()).mapToInt(bom -> bom.getBytes().length).max().getAsInt(); 91 | 92 | /** 93 | * The values of the enum, sorted by the length of the BOM bytes, with the longest BOMs first. 
94 | */ 95 | private static final ByteOrderMark[] VALUES = Arrays.copyOf(values(), values().length); 96 | static { 97 | Arrays.sort(VALUES, Comparator.comparingInt(bom -> bom.getBytes().length) 98 | .reversed().thenComparing(ByteOrderMark::getCharsetName)); 99 | } 100 | 101 | /** 102 | * Detects the BOM in the given input stream, if any, and returns a {@link BomAwareInputStream} 103 | * that wraps the stream. 104 | * 105 | * @param in the input stream 106 | * @return the {@link BomAwareInputStream} 107 | * @throws IOException if an I/O error 108 | */ 109 | public static BomAwareInputStream detect(InputStream in) throws IOException { 110 | return BomAwareInputStream.detect(in); 111 | } 112 | 113 | /** 114 | * Returns the BOM for the given data, if it is supported. Searches the whole array. 115 | * 116 | * @param data the data to check 117 | * @return the BOM, if found, otherwise empty 118 | * 119 | * @throws NullPointerException if {@code data} is {@code null} 120 | * 121 | * @see #detect(byte[], int) 122 | */ 123 | public static Optional detect(byte[] data) { 124 | if (data == null) 125 | throw new NullPointerException(); 126 | return detect(data, data.length); 127 | } 128 | 129 | /** 130 | * Detects the BOM in the given data, starting at 0, up to the given length. 131 | * 132 | * @param data the data to check 133 | * @param len the length of the data to check 134 | * @return the BOM, if found, otherwise empty 135 | * 136 | * @throws NullPointerException if {@code data} is {@code null} 137 | * @throws IllegalArgumentException if {@code len < 0} 138 | * @throws ArrayIndexOutOfBoundsException if {@code len > data.length} 139 | * 140 | * @see #detect(byte[], int, int) 141 | */ 142 | public static Optional detect(byte[] data, int len) { 143 | return detect(data, 0, len); 144 | } 145 | 146 | /** 147 | * Detects the BOM in the given data, starting at the given offset and continuing for the given 148 | * length. 149 | * 150 | * @param data the data to check 151 | * @param off the offset in the data to start checking 152 | * @param len the length of the data to check 153 | * @return the BOM, if found, otherwise empty 154 | * 155 | * @throws NullPointerException if {@code data} is {@code null} 156 | * @throws IllegalArgumentException if {@code len < 0} 157 | * @throws ArrayIndexOutOfBoundsException if {@code off < 0} or {@code off + len > data.length} 158 | */ 159 | public static Optional detect(byte[] data, int off, int len) { 160 | if (data == null) 161 | throw new NullPointerException(); 162 | if (len < 0) 163 | throw new IllegalArgumentException("len < 0"); 164 | if (off < 0) 165 | throw new ArrayIndexOutOfBoundsException(off); 166 | if (off + len > data.length) 167 | throw new ArrayIndexOutOfBoundsException(off + len); 168 | 169 | for (ByteOrderMark value : VALUES) { 170 | byte[] bom = value.getBytes(); 171 | int bomlen = value.getBytes().length; 172 | if (off + bomlen <= len && equals(data, off, off + bomlen, bom, 0, bomlen)) { 173 | return Optional.of(value); 174 | } 175 | } 176 | 177 | return Optional.empty(); 178 | } 179 | 180 | /** 181 | * Returns true if the two specified arrays of bytes, over the specified ranges, are equal 182 | * to one another. 183 | * 184 | *

185 | * Two arrays are considered equal if the number of elements covered by each range is the same, 186 | * and all corresponding pairs of elements over the specified ranges in the two arrays are equal. 187 | * In other words, two arrays are equal if they contain, over the specified ranges, the same 188 | * elements in the same order. 189 | * 190 | * @param a the first array to be tested for equality 191 | * @param aFromIndex the index (inclusive) of the first element in the first array to be tested 192 | * @param aToIndex the index (exclusive) of the last element in the first array to be tested 193 | * @param b the second array to be tested for equality 194 | * @param bFromIndex the index (inclusive) of the first element in the second array to be tested 195 | * @param bToIndex the index (exclusive) of the last element in the second array to be tested 196 | * @return {@code true} if the two arrays, over the specified ranges, are equal 197 | * @throws IllegalArgumentException if {@code aFromIndex > aToIndex} or if 198 | * {@code bFromIndex > bToIndex} 199 | * @throws ArrayIndexOutOfBoundsException if {@code aFromIndex < 0 or aToIndex > a.length} or if 200 | * {@code bFromIndex < 0 or bToIndex > b.length} 201 | * @throws NullPointerException if either array is {@code null} 202 | */ 203 | private static boolean equals(byte[] a, int aFromIndex, int aToIndex, byte[] b, int bFromIndex, 204 | int bToIndex) { 205 | rangeCheck(a.length, aFromIndex, aToIndex); 206 | rangeCheck(b.length, bFromIndex, bToIndex); 207 | 208 | int aLength = aToIndex - aFromIndex; 209 | int bLength = bToIndex - bFromIndex; 210 | if (aLength != bLength) 211 | return false; 212 | int length = aLength; 213 | 214 | for (int i = 0; i < length; i++) { 215 | if (a[aFromIndex + i] != b[bFromIndex + i]) { 216 | return false; 217 | } 218 | } 219 | 220 | return true; 221 | } 222 | 223 | /** 224 | * Checks that {@code fromIndex} and {@code toIndex} are in the range and throws an exception if 225 | * they aren't. 226 | */ 227 | private static void rangeCheck(int arrayLength, int fromIndex, int toIndex) { 228 | if (fromIndex > toIndex) { 229 | throw new IllegalArgumentException("fromIndex(" + fromIndex + ") > toIndex(" + toIndex + ")"); 230 | } 231 | if (fromIndex < 0) { 232 | throw new ArrayIndexOutOfBoundsException(fromIndex); 233 | } 234 | if (toIndex > arrayLength) { 235 | throw new ArrayIndexOutOfBoundsException(toIndex); 236 | } 237 | } 238 | 239 | private final byte[] bytes; 240 | private final Charset standardCharset; 241 | private final String charsetName; 242 | private volatile AtomicReference charset; 243 | 244 | private ByteOrderMark(byte[] bytes, Charset standardCharset, String charsetName) { 245 | this.bytes = requireNonNull(bytes); 246 | this.standardCharset = standardCharset; 247 | this.charsetName = requireNonNull(charsetName); 248 | if (standardCharset != null) 249 | this.charset = new AtomicReference<>(standardCharset); 250 | } 251 | 252 | /** 253 | * @return the bytes 254 | */ 255 | /* default */ byte[] getBytes() { 256 | return bytes; 257 | } 258 | 259 | public int length() { 260 | return bytes.length; 261 | } 262 | 263 | /** 264 | * Returns the charset for this BOM. Checks for standard charsets first, then attempts to load the 265 | * charset using {@link Charset#forName(String)}. If the charset is not supported, then throws an 266 | * {@link UnsupportedCharsetException}. 
267 | * 268 | * @return the charset 269 | * @throws UnsupportedCharsetException if the charset is not supported, e.g., UTF-32BE 270 | * 271 | * @see #getCharsetIfSupported() 272 | * @see CharsetProvider 273 | */ 274 | public Charset getCharset() { 275 | return getCharsetIfSupported().orElseThrow(() -> new UnsupportedCharsetException(charsetName)); 276 | } 277 | 278 | /** 279 | * Returns the charset for this BOM. Checks for standard charsets first, then attempts to load the 280 | * charset using {@link Charset#forName}. If the charset is not supported, then returns empty. 281 | * 282 | * @return the charset, if supported, otherwise empty 283 | * @see #getCharset() 284 | */ 285 | public Optional getCharsetIfSupported() { 286 | // If it's a standard charset, return it 287 | if (standardCharset != null) 288 | return Optional.of(standardCharset); 289 | 290 | // If it's not a standard charset, then attempt to load it and cache the result. 291 | if (charset == null) { 292 | Charset c; 293 | try { 294 | c = Charset.forName(charsetName); 295 | } catch (IllegalCharsetNameException e) { 296 | // Odd. None of these charset names should be invalid. Just treat it like it's not supported 297 | // and set the cached charset to null. 298 | c = null; 299 | } catch (UnsupportedCharsetException e) { 300 | // If the charset is not supported, then set the cached charset to null. 301 | c = null; 302 | } 303 | charset = new AtomicReference<>(c); 304 | } 305 | 306 | // If the cached charset is null, then return empty. Otherwise, return. 307 | return Optional.ofNullable(charset.get()); 308 | } 309 | 310 | /** 311 | * @return the charset name 312 | */ 313 | public String getCharsetName() { 314 | return charsetName; 315 | } 316 | } 317 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/Chardet.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 Andy Boothe 6 | * ====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j; 21 | 22 | import static java.util.stream.Collectors.toList; 23 | import java.io.ByteArrayInputStream; 24 | import java.io.IOException; 25 | import java.io.InputStream; 26 | import java.io.Reader; 27 | import java.io.SequenceInputStream; 28 | import java.io.StringWriter; 29 | import java.io.UncheckedIOException; 30 | import java.io.Writer; 31 | import java.nio.charset.Charset; 32 | import java.nio.charset.UnsupportedCharsetException; 33 | import java.util.Arrays; 34 | import java.util.Comparator; 35 | import java.util.List; 36 | import java.util.Objects; 37 | import java.util.Optional; 38 | import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetDetector; 39 | import com.sigpwned.chardet4j.io.BomAwareInputStream; 40 | import com.sigpwned.chardet4j.io.DecodedInputStreamReader; 41 | import com.sigpwned.chardet4j.util.ByteStreams; 42 | import com.sigpwned.chardet4j.util.CharStreams; 43 | 44 | /** 45 | * Simple interface to charset detection. 46 | */ 47 | public final class Chardet { 48 | private Chardet() {} 49 | 50 | private static final int MIN_CONFIDENCE = 0; 51 | private static final int MAX_CONFIDENCE = 100; 52 | 53 | private static final int DECLARED_ENCODING_BUMP = Optional 54 | .ofNullable(System.getProperty("chardet4j.detect.bump")).map(Integer::parseInt).orElse(10); 55 | 56 | /** 57 | * We have to do this because the ICU detector ignores the declared encoding, but the CharsetMatch 58 | * values are immutable and the constructor isn't visible. 59 | */ 60 | private static class ChardetMatch implements Comparable { 61 | public static ChardetMatch of(String name, int confidence) { 62 | return new ChardetMatch(name, confidence); 63 | } 64 | 65 | private final String name; 66 | private final int confidence; 67 | 68 | public ChardetMatch(String name, int confidence) { 69 | if (name == null) 70 | throw new NullPointerException(); 71 | if (confidence < MIN_CONFIDENCE || confidence > MAX_CONFIDENCE) 72 | throw new IllegalArgumentException("confidence out of range " + confidence); 73 | this.name = name; 74 | this.confidence = confidence; 75 | } 76 | 77 | /** 78 | * @return the name 79 | */ 80 | public String getName() { 81 | return name; 82 | } 83 | 84 | /** 85 | * @return the confidence 86 | */ 87 | public int getConfidence() { 88 | return confidence; 89 | } 90 | 91 | @Override 92 | public int hashCode() { 93 | return Objects.hash(confidence, name); 94 | } 95 | 96 | @Override 97 | public boolean equals(Object obj) { 98 | if (this == obj) 99 | return true; 100 | if (obj == null) 101 | return false; 102 | if (getClass() != obj.getClass()) 103 | return false; 104 | ChardetMatch other = (ChardetMatch) obj; 105 | return confidence == other.confidence && Objects.equals(name, other.name); 106 | } 107 | 108 | @Override 109 | public String toString() { 110 | return "PossibleMatch [name=" + name + ", confidence=" + confidence + "]"; 111 | } 112 | 113 | @Override 114 | public int compareTo(ChardetMatch o) { 115 | return Integer.compare(getConfidence(), o.getConfidence()); 116 | } 117 | } 118 | 119 | // detectCharset ///////////////////////////////////////////////////////////////////////////////// 120 | 121 | /** 122 | * Detect the charset of the given byte data. Input includes the entire array. If the character 123 | * encoding is detected, but not supported, then an {@link UnsupportedCharsetException} is thrown. 
124 | * 125 | * @throws NullPointerException if data is null 126 | * @throws UnsupportedOperationException If the charset can be detected, but is not supported. 127 | */ 128 | public static Optional detectCharset(byte[] data) { 129 | return detectCharset(data, null); 130 | } 131 | 132 | /** 133 | * Detect the charset of the given byte data with the given encoding as a hint. Input includes the 134 | * entire array. If the character encoding is detected, but not supported, then an 135 | * {@link UnsupportedCharsetException} is thrown. 136 | * 137 | * @param data the byte data 138 | * @param declaredEncoding the declared encoding, treated as a hint 139 | * @return the charset, if one can be detected 140 | * 141 | * @throws NullPointerException if data is null 142 | * @throws UnsupportedOperationException If the charset can be detected, but is not supported. 143 | */ 144 | public static Optional detectCharset(byte[] data, String declaredEncoding) { 145 | if (data == null) 146 | throw new NullPointerException(); 147 | return detectCharset(data, data.length, declaredEncoding); 148 | } 149 | 150 | /** 151 | * Detect the charset encoding of the given byte data in the first len bytes of the given array. 152 | * If the character encoding is detected, but not supported, then an 153 | * {@link UnsupportedCharsetException} is thrown. 154 | * 155 | * @param data the byte data 156 | * @param len the number of bytes to consider, starting from 0 157 | * @param declaredEncoding the optional declared encoding, which is treated as a hint 158 | * @return the charset encoding, if one can be detected 159 | * 160 | * @throws NullPointerException if data is null 161 | * @throws IllegalArgumentException if len < 0 162 | * @throws ArrayIndexOutOfBoundsException if len > data.length 163 | * @throws UnsupportedOperationException If the charset can be detected, but is not supported. 164 | */ 165 | public static Optional detectCharset(byte[] data, int len, String declaredEncoding) { 166 | return detectCharset(data, 0, len, declaredEncoding); 167 | } 168 | 169 | /** 170 | * Detect the charset encoding of the given byte data in the given range of the given array. If 171 | * the character encoding is detected, but not supported, then an 172 | * {@link UnsupportedCharsetException} is thrown. 173 | * 174 | * @param data the byte data 175 | * @param off the offset into the byte data 176 | * @param len the number of bytes to consider 177 | * @param declaredEncoding the optional declared encoding, which is treated as a hint 178 | * @return the charset encoding, if one can be detected 179 | * 180 | * @throws NullPointerException if data is null 181 | * @throws IllegalArgumentException if len < 0 182 | * @throws ArrayIndexOutOfBoundsException if off < 0 or off + len > data.length 183 | * @throws UnsupportedOperationException If the charset can be detected, but is not supported. To 184 | * get the charset name whether it is supported or not, use 185 | * {@link #detectCharsetName(byte[], int, String)}. 186 | */ 187 | public static Optional detectCharset(byte[] data, int off, int len, 188 | String declaredEncoding) { 189 | return detectCharsetName(data, off, len, declaredEncoding).map(Charset::forName); 190 | } 191 | 192 | // detectCharsetName ///////////////////////////////////////////////////////////////////////////// 193 | 194 | /** 195 | * Detect the charset of the given byte data. Input includes the entire array. 
196 | * 197 | * @throws NullPointerException if data is null 198 | */ 199 | public static Optional detectCharsetName(byte[] data) { 200 | return detectCharsetName(data, null); 201 | } 202 | 203 | /** 204 | * Detect the charset of the given byte data. Input includes the entire array. 205 | * 206 | * @param data the byte data 207 | * @param declaredEncoding the declared encoding, treated as a hint 208 | * @return the charset name, if one is detected 209 | * 210 | * @throws NullPointerException if data is null 211 | */ 212 | public static Optional detectCharsetName(byte[] data, String declaredEncoding) { 213 | if (data == null) 214 | throw new NullPointerException(); 215 | return detectCharsetName(data, data.length, declaredEncoding); 216 | } 217 | 218 | /** 219 | * Detect the name of the charset encoding of the given byte data in the first len bytes of the 220 | * given array. 221 | * 222 | * @param data the byte data 223 | * @param len the number of bytes to consider, starting from 0 224 | * @param declaredEncoding the optional declared encoding, which is treated as a hint 225 | * @return the charset encoding, if one can be detected 226 | * 227 | * @throws NullPointerException if data is null 228 | * @throws IllegalArgumentException if len < 0 229 | * @throws ArrayIndexOutOfBoundsException if len > data.length 230 | */ 231 | public static Optional detectCharsetName(byte[] data, int len, String declaredEncoding) { 232 | return detectCharsetName(data, 0, len, declaredEncoding); 233 | } 234 | 235 | /** 236 | * Detect the name of the charset encoding of the given range of the given array. 237 | * 238 | * @param data the byte data 239 | * @param len the number of bytes to consider, starting from 0 240 | * @param declaredEncoding the optional declared encoding, which is treated as a hint 241 | * @return the charset encoding, if one can be detected 242 | * 243 | * @throws NullPointerException if data is null 244 | * @throws IllegalArgumentException if len < 0 245 | * @throws ArrayIndexOutOfBoundsException if off < 0 or off + len > data.length 246 | * @throws UncheckedIOException if an I/O error occurs, which should not happen because all I/O 247 | * operations are performed in-memory 248 | */ 249 | public static Optional detectCharsetName(byte[] data, int off, int len, 250 | String declaredEncoding) { 251 | if (data == null) 252 | throw new NullPointerException(); 253 | if (len < 0) 254 | throw new IllegalArgumentException("len < 0"); 255 | if (off < 0) 256 | throw new ArrayIndexOutOfBoundsException(off); 257 | if (off + len > data.length) 258 | throw new ArrayIndexOutOfBoundsException(off + len); 259 | 260 | Optional maybeBom = ByteOrderMark.detect(data, off, len); 261 | if (maybeBom.isPresent()) { 262 | return maybeBom.map(ByteOrderMark::getCharsetName); 263 | } 264 | 265 | CharsetDetector chardet = new CharsetDetector(); 266 | 267 | if (off == 0 && len == data.length) { 268 | // Let's avoid a byte copy if we can 269 | chardet.setText(data); 270 | } else { 271 | try { 272 | chardet.setText(new ByteArrayInputStream(data, off, len)); 273 | } catch (IOException e) { 274 | // This should never happen in a ByteArrayInputStream 275 | throw new UncheckedIOException("unexpected exception when reading from byte array", e); 276 | } 277 | } 278 | 279 | // Ideally, we'd just use this methods from the CharsetDetector class, but the declared encoding 280 | // is ignored. So we have to do it ourselves. 
281 | // if (declaredEncoding != null) 282 | // chardet.setDeclaredEncoding(declaredEncoding); 283 | 284 | List matches = Arrays.stream(chardet.detectAll()).map(mi -> { 285 | String name = mi.getName(); 286 | 287 | int confidence = mi.getConfidence(); 288 | if (declaredEncoding != null && name.equalsIgnoreCase(declaredEncoding)) 289 | confidence = Math.min(confidence + DECLARED_ENCODING_BUMP, MAX_CONFIDENCE); 290 | 291 | return ChardetMatch.of(name, confidence); 292 | }).sorted(Comparator.reverseOrder()).collect(toList()); 293 | 294 | if (matches.isEmpty()) { 295 | return Optional.empty(); 296 | } 297 | 298 | return Optional.of(matches.get(0).getName()); 299 | } 300 | 301 | // decode //////////////////////////////////////////////////////////////////////////////////////// 302 | 303 | /** 304 | * The default is chosen based on a reading of the CharsetDetector source code, which sets buffer 305 | * size for byte frequency analysis at 8000. (Ample) extra space is left for BOMs. 306 | */ 307 | public static final int DECODE_DETECT_BUFSIZE = 308 | Optional.ofNullable(System.getProperty("chardet4j.detect.bufsize")).map(Integer::parseInt) 309 | .orElse(8192); 310 | 311 | /** 312 | * Returns a character-decoded version of the given byte stream. Any leading BOMs are discarded. 313 | * If no character set can be detected, then the given default is used. 314 | * 315 | * @param input the input stream 316 | * @param defaultCharset the default charset to use if no other can be detected 317 | * 318 | * @throws NullPointerException if input is null 319 | * @throws NullPointerException if defaultCharset is null 320 | * @throws IOException if an I/O error occurs 321 | * @throws UnsupportedCharsetException if the detected charset is not supported 322 | */ 323 | public static DecodedInputStreamReader decode(InputStream input, Charset defaultCharset) 324 | throws IOException { 325 | return decode(input, null, defaultCharset); 326 | } 327 | 328 | /** 329 | * Returns a character-decoded version of the given byte stream. The declared encoding is treated 330 | * as a hint. Any leading BOMs are discarded. If no character set can be detected, then the given 331 | * default is used. If the character set is detected, but not supported, then an 332 | * {@link UnsupportedCharsetException} is thrown. 333 | * 334 | * @param input the input stream 335 | * @param declaredEncoding the declared encoding, treated as a hint 336 | * @param defaultCharset the default charset to use if no other can be detected 337 | * @return the character-decoded stream 338 | * 339 | * @throws NullPointerException if input is null 340 | * @throws NullPointerException if defaultCharset is null 341 | * @throws IOException if an I/O error occurs 342 | * @throws UnsupportedCharsetException if the detected charset is not supported 343 | */ 344 | public static DecodedInputStreamReader decode(InputStream input, String declaredEncoding, 345 | Charset defaultCharset) throws IOException { 346 | if (input == null) 347 | throw new NullPointerException(); 348 | if (defaultCharset == null) 349 | throw new NullPointerException(); 350 | 351 | // Detect the BOM, if any. If there is one, then trust it and use the corresponding charset. 352 | final BomAwareInputStream bomed = BomAwareInputStream.detect(input); 353 | if (bomed.bom().isPresent()) 354 | return new DecodedInputStreamReader(bomed, bomed.bom().get().getCharset()); 355 | 356 | // If there is no BOM, then read some bytes to detect the charset. 
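// The sniffed bytes are not discarded: they are stitched back in front of the remaining stream via a SequenceInputStream below, so the returned reader still sees the full input.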
357 | final byte[] buf = ByteStreams.readNBytes(bomed, DECODE_DETECT_BUFSIZE); 358 | 359 | // Note that charset cannot be null, since we check defaultCharset above. 360 | Charset charset = detectCharset(buf, declaredEncoding).orElse(defaultCharset); 361 | 362 | return new DecodedInputStreamReader( 363 | new SequenceInputStream(new ByteArrayInputStream(buf), bomed), charset); 364 | } 365 | 366 | /** 367 | * Returns a character-decoded String version of the given bytes. The declared encoding is treated 368 | * as a hint. Any leading BOMs are discarded. If no character set can be detected, then the given 369 | * default is used. If the character set is detected, but not supported, then an 370 | * {@link UnsupportedCharsetException} is thrown. 371 | * 372 | * @param data the byte data 373 | * @param declaredEncoding the declared encoding, treated as a hint 374 | * @param defaultCharset the default charset to use if no other can be detected 375 | * @return the character-decoded string 376 | * 377 | * @throws NullPointerException if data is null 378 | * @throws UnsupportedCharsetException if the detected charset is not supported 379 | * @throws UncheckedIOException if an I/O error occurs, which should not happen because all I/O 380 | * operations are performed in-memory 381 | */ 382 | public static String decode(byte[] data, Charset defaultCharset) { 383 | return decode(data, null, defaultCharset); 384 | } 385 | 386 | /** 387 | * Returns a character-decoded String version of the given bytes. The declared encoding is treated 388 | * as a hint. Any leading BOMs are discarded. If no character set can be detected, then the given 389 | * default is used. If the character set is detected, but not supported, then an 390 | * {@link UnsupportedCharsetException} is thrown. 391 | * 392 | * @param data the byte data 393 | * @param declaredEncoding the declared encoding, treated as a hint 394 | * @param defaultCharset the default charset to use if no other can be detected 395 | * @return the character-decoded string 396 | * 397 | * @throws NullPointerException if data is null 398 | * @throws UnsupportedCharsetException if the detected charset is not supported 399 | * @throws UncheckedIOException if an I/O error occurs, which should not happen because all I/O 400 | * operations are performed in-memory 401 | */ 402 | public static String decode(byte[] data, String declaredEncoding, Charset defaultCharset) { 403 | if (data == null) 404 | throw new NullPointerException(); 405 | return decode(data, data.length, declaredEncoding, defaultCharset); 406 | } 407 | 408 | /** 409 | * Returns a character-decoded String version of the given bytes. The declared encoding is treated 410 | * as a hint. Any leading BOMs are discarded. If no character set can be detected, then the given 411 | * default is used. If the character set is detected, but not supported, then an 412 | * {@link UnsupportedCharsetException} is thrown. 
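* An illustrative sketch, assuming a webpage.html file on disk and an ISO-8859-1 hint taken from
* an HTTP header:
*
*   byte[] data = Files.readAllBytes(Paths.get("webpage.html"));
*   String text = Chardet.decode(data, "ISO-8859-1", StandardCharsets.UTF_8);
*   String head = Chardet.decode(data, Math.min(data.length, 1024), "ISO-8859-1", StandardCharsets.UTF_8);
*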
413 | * 414 | * @param data the byte data 415 | * @param len the number of bytes to consider, starting from 0 416 | * @param declaredEncoding the declared encoding, treated as a hint 417 | * @param defaultCharset the default charset to use if no other can be detected 418 | * @return the character-decoded string 419 | * 420 | * @throws NullPointerException if data is null 421 | * @throws IllegalArgumentException if len < 0 422 | * @throws ArrayIndexOutOfBoundsException if len > data.length 423 | * @throws UnsupportedCharsetException if the detected charset is detected, but not supported 424 | * @throws UncheckedIOException if an I/O error occurs, which should not happen because all I/O 425 | * operations are performed in-memory 426 | */ 427 | public static String decode(byte[] data, int len, String declaredEncoding, 428 | Charset defaultCharset) { 429 | return decode(data, 0, len, declaredEncoding, defaultCharset); 430 | } 431 | 432 | /** 433 | * Returns a character-decoded String version of the given bytes. The declared encoding is treated 434 | * as a hint. Any leading BOMs are discarded. If no character set can be detected, then the given 435 | * default is used. If the character set is detected, but not supported, then an 436 | * {@link UnsupportedCharsetException} is thrown. 437 | * 438 | * @param data the byte data 439 | * @param off the offset into the byte data 440 | * @param len the number of bytes to consider, starting at off 441 | * @param declaredEncoding the declared encoding, treated as a hint 442 | * @param defaultCharset the default charset to use if no other can be detected 443 | * @return the character-decoded string 444 | * 445 | * @throws NullPointerException if data is null 446 | * @throws NullPointerException if defaultCharset is null 447 | * @throws IllegalArgumentException if len < 0 448 | * @throws ArrayIndexOutOfBoundsException if off < 0 or off + len > data.length 449 | * @throws UnsupportedCharsetException if the detected charset is detected, but not supported 450 | * @throws UncheckedIOException if an I/O error occurs, which should not happen because all I/O 451 | * operations are performed in-memory 452 | */ 453 | public static String decode(byte[] data, int off, int len, String declaredEncoding, 454 | Charset defaultCharset) { 455 | if (data == null) 456 | throw new NullPointerException(); 457 | if (defaultCharset == null) 458 | throw new NullPointerException(); 459 | if (len < 0) 460 | throw new IllegalArgumentException("len < 0"); 461 | if (off < 0) 462 | throw new ArrayIndexOutOfBoundsException(off); 463 | if (off + len > data.length) 464 | throw new ArrayIndexOutOfBoundsException(off + len); 465 | 466 | try (InputStream in = new ByteArrayInputStream(data, off, len); 467 | Reader r = decode(in, declaredEncoding, defaultCharset); 468 | Writer w = new StringWriter()) { 469 | CharStreams.transferTo(r, w); 470 | return w.toString(); 471 | } catch (IOException e) { 472 | throw new UncheckedIOException(e); 473 | } 474 | } 475 | } 476 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetDetector.java: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 
2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | /** 4 | ******************************************************************************* 5 | * Copyright (C) 2005-2016, International Business Machines Corporation and * 6 | * others. All Rights Reserved. * 7 | ******************************************************************************* 8 | */ 9 | package com.sigpwned.chardet4j.com.ibm.icu.text; 10 | 11 | import java.io.IOException; 12 | import java.io.InputStream; 13 | import java.io.Reader; 14 | import java.util.ArrayList; 15 | import java.util.Arrays; 16 | import java.util.Collections; 17 | import java.util.List; 18 | 19 | 20 | /** 21 | * CharsetDetector provides a facility for detecting the 22 | * charset or encoding of character data in an unknown format. 23 | * The input data can either be from an input stream or an array of bytes. 24 | * The result of the detection operation is a list of possibly matching 25 | * charsets, or, for simple use, you can just ask for a Java Reader that 26 | * will will work over the input data. 27 | *
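* A minimal usage sketch, assuming the caller already holds the input in a byte array named bytes:
*
*   CharsetDetector detector = new CharsetDetector();
*   detector.setText(bytes);                    // or setText(InputStream) for streaming input
*   CharsetMatch best = detector.detect();      // best single match, or null if none
*   CharsetMatch[] all = detector.detectAll();  // every plausible match, best quality first
*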

28 | * Character set detection is at best an imprecise operation. The detection 29 | * process will attempt to identify the charset that best matches the characteristics 30 | * of the byte data, but the process is partly statistical in nature, and 31 | * the results can not be guaranteed to always be correct. 32 | *

33 | * For best accuracy in charset detection, the input data should be primarily 34 | * in a single language, and a minimum of a few hundred bytes worth of plain text 35 | * in the language is needed. The detection process will attempt to 36 | * ignore html or xml style markup that could otherwise obscure the content. 37 | *
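* Note that markup stripping is applied only when input filtering has been enabled; a sketch,
* assuming a hypothetical htmlBytes array of raw HTML:
*
*   CharsetDetector detector = new CharsetDetector();
*   detector.enableInputFilter(true); // strip angle-bracket markup before analysis
*   detector.setText(htmlBytes);
*   CharsetMatch match = detector.detect();
*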

38 | * @stable ICU 3.4 39 | */ 40 | public class CharsetDetector { 41 | 42 | // Question: Should we have getters corresponding to the setters for input text 43 | // and declared encoding? 44 | 45 | // A thought: If we were to create our own type of Java Reader, we could defer 46 | // figuring out an actual charset for data that starts out with too much English 47 | // only ASCII until the user actually read through to something that didn't look 48 | // like 7 bit English. If nothing else ever appeared, we would never need to 49 | // actually choose the "real" charset. All assuming that the application just 50 | // wants the data, and doesn't care about a char set name. 51 | 52 | /** 53 | * Constructor 54 | * 55 | * @stable ICU 3.4 56 | */ 57 | public CharsetDetector() { 58 | } 59 | 60 | /** 61 | * Set the declared encoding for charset detection. 62 | * The declared encoding of an input text is an encoding obtained 63 | * from an http header or xml declaration or similar source that 64 | * can be provided as additional information to the charset detector. 65 | * A match between a declared encoding and a possible detected encoding 66 | * will raise the quality of that detected encoding by a small delta, 67 | * and will also appear as a "reason" for the match. 68 | *
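* For example, a charset name taken from an HTTP Content-Type header can be supplied as a hint
* (an illustrative sketch, assuming a bytes array from the caller):
*
*   CharsetDetector detector = new CharsetDetector();
*   detector.setDeclaredEncoding("ISO-8859-1"); // e.g. from "Content-Type: text/html; charset=ISO-8859-1"
*   detector.setText(bytes);
*   CharsetMatch match = detector.detect();
*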

69 | * A declared encoding that is incompatible with the input data being 70 | * analyzed will not be added to the list of possible encodings. 71 | * 72 | * @param encoding The declared encoding 73 | * 74 | * @stable ICU 3.4 75 | */ 76 | public CharsetDetector setDeclaredEncoding(String encoding) { 77 | fDeclaredEncoding = encoding; 78 | return this; 79 | } 80 | 81 | /** 82 | * Set the input text (byte) data whose charset is to be detected. 83 | * 84 | * @param in the input text of unknown encoding 85 | * 86 | * @return This CharsetDetector 87 | * 88 | * @stable ICU 3.4 89 | */ 90 | public CharsetDetector setText(byte [] in) { 91 | fRawInput = in; 92 | fRawLength = in.length; 93 | 94 | return this; 95 | } 96 | 97 | private static final int kBufSize = 8000; 98 | 99 | /** 100 | * Set the input text (byte) data whose charset is to be detected. 101 | *
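* An illustrative sketch, assuming content fetched from a URL; the stream is buffered so that the
* mark/reset requirement described below is met:
*
*   try (InputStream in = new BufferedInputStream(new URL("https://example.com/").openStream())) {
*     CharsetMatch match = new CharsetDetector().setText(in).detect();
*   }
*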

102 | * The input stream that supplies the character data must have markSupported() 103 | * == true; the charset detection process will read a small amount of data, 104 | * then return the stream to its original position via 105 | * the InputStream.reset() operation. The exact amount that will 106 | * be read depends on the characteristics of the data itself. 107 | * 108 | * @param in the input text of unknown encoding 109 | * 110 | * @return This CharsetDetector 111 | * 112 | * @stable ICU 3.4 113 | */ 114 | 115 | public CharsetDetector setText(InputStream in) throws IOException { 116 | fInputStream = in; 117 | fInputStream.mark(kBufSize); 118 | fRawInput = new byte[kBufSize]; // Always make a new buffer because the 119 | // previous one may have come from the caller, 120 | // in which case we can't touch it. 121 | fRawLength = 0; 122 | int remainingLength = kBufSize; 123 | while (remainingLength > 0 ) { 124 | // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop. 125 | int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength); 126 | if (bytesRead <= 0) { 127 | break; 128 | } 129 | fRawLength += bytesRead; 130 | remainingLength -= bytesRead; 131 | } 132 | fInputStream.reset(); 133 | 134 | return this; 135 | } 136 | 137 | 138 | /** 139 | * Return the charset that best matches the supplied input data. 140 | * 141 | * Note though, that because the detection 142 | * only looks at the start of the input data, 143 | * there is a possibility that the returned charset will fail to handle 144 | * the full set of input data. 145 | *

146 | * Raise an exception if
147 | *
148 | *   • no charset appears to match the data.
149 | *   • no input text has been provided
150 | *
151 | * 152 | * @return a CharsetMatch object representing the best matching charset, or 153 | * null if there are no matches. 154 | * 155 | * @stable ICU 3.4 156 | */ 157 | public CharsetMatch detect() { 158 | // TODO: A better implementation would be to copy the detect loop from 159 | // detectAll(), and cut it short as soon as a match with a high confidence 160 | // is found. This is something to be done later, after things are otherwise 161 | // working. 162 | CharsetMatch matches[] = detectAll(); 163 | 164 | if (matches == null || matches.length == 0) { 165 | return null; 166 | } 167 | 168 | return matches[0]; 169 | } 170 | 171 | /** 172 | * Return an array of all charsets that appear to be plausible 173 | * matches with the input data. The array is ordered with the 174 | * best quality match first. 175 | *

176 | * Raise an exception if
177 | *
178 | *   • no charsets appear to match the input data.
179 | *   • no input text has been provided
180 | *
181 | * 182 | * @return An array of CharsetMatch objects representing possibly matching charsets. 183 | * 184 | * @stable ICU 3.4 185 | */ 186 | public CharsetMatch[] detectAll() { 187 | ArrayList matches = new ArrayList(); 188 | 189 | MungeInput(); // Strip html markup, collect byte stats. 190 | 191 | // Iterate over all possible charsets, remember all that 192 | // give a match quality > 0. 193 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 194 | CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); 195 | boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled; 196 | if (active) { 197 | CharsetMatch m = rcinfo.recognizer.match(this); 198 | if (m != null) { 199 | matches.add(m); 200 | } 201 | } 202 | } 203 | Collections.sort(matches); // CharsetMatch compares on confidence 204 | Collections.reverse(matches); // Put best match first. 205 | CharsetMatch [] resultArray = new CharsetMatch[matches.size()]; 206 | resultArray = matches.toArray(resultArray); 207 | return resultArray; 208 | } 209 | 210 | 211 | /** 212 | * Autodetect the charset of an inputStream, and return a Java Reader 213 | * to access the converted input data. 214 | *
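* An illustrative sketch, assuming a webpage.html file on disk; the stream is wrapped in a
* BufferedInputStream so that the mark/reset requirement described below is satisfied:
*
*   try (InputStream in = new BufferedInputStream(new FileInputStream("webpage.html"))) {
*     Reader reader = new CharsetDetector().getReader(in, null);
*     if (reader == null) {
*       // no plausible charset was detected
*     }
*   }
*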

215 | * This is a convenience method that is equivalent to 216 | * this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader(); 217 | *

218 | * For the input stream that supplies the character data, markSupported() 219 | * must be true; the charset detection will read a small amount of data, 220 | * then return the stream to its original position via 221 | * the InputStream.reset() operation. The exact amount that will 222 | * be read depends on the characteristics of the data itself. 223 | *

224 | * Raise an exception if no charsets appear to match the input data. 225 | * 226 | * @param in The source of the byte data in the unknown charset. 227 | * 228 | * @param declaredEncoding A declared encoding for the data, if available, 229 | * or null or an empty string if none is available. 230 | * 231 | * @stable ICU 3.4 232 | */ 233 | public Reader getReader(InputStream in, String declaredEncoding) { 234 | fDeclaredEncoding = declaredEncoding; 235 | 236 | try { 237 | setText(in); 238 | 239 | CharsetMatch match = detect(); 240 | 241 | if (match == null) { 242 | return null; 243 | } 244 | 245 | return match.getReader(); 246 | } catch (IOException e) { 247 | return null; 248 | } 249 | } 250 | 251 | /** 252 | * Autodetect the charset of an inputStream, and return a String 253 | * containing the converted input data. 254 | *
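* An illustrative sketch, assuming a webpage.html file on disk; note that this overload accepts a
* byte array rather than a stream:
*
*   byte[] data = Files.readAllBytes(Paths.get("webpage.html"));
*   String text = new CharsetDetector().getString(data, null);
*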

255 | * This is a convenience method that is equivalent to 256 | * this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString(); 257 | *

258 | * Raise an exception if no charsets appear to match the input data. 259 | * 260 | * @param in The source of the byte data in the unknown charset. 261 | * 262 | * @param declaredEncoding A declared encoding for the data, if available, 263 | * or null or an empty string if none is available. 264 | * 265 | * @stable ICU 3.4 266 | */ 267 | public String getString(byte[] in, String declaredEncoding) 268 | { 269 | fDeclaredEncoding = declaredEncoding; 270 | 271 | try { 272 | setText(in); 273 | 274 | CharsetMatch match = detect(); 275 | 276 | if (match == null) { 277 | return null; 278 | } 279 | 280 | return match.getString(-1); 281 | } catch (IOException e) { 282 | return null; 283 | } 284 | } 285 | 286 | 287 | /** 288 | * Get the names of all charsets supported by CharsetDetector class. 289 | *
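* For example, the supported names can simply be listed (an illustrative sketch):
*
*   for (String name : CharsetDetector.getAllDetectableCharsets()) {
*     System.out.println(name); // e.g. "UTF-8", "Shift_JIS", "ISO-8859-1"
*   }
*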

290 | * Note: Multiple different charset encodings in a same family may use 291 | * a single shared name in this implementation. For example, this method returns 292 | * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" 293 | * (Windows Latin 1). However, actual detection result could be "windows-1252" 294 | * when the input data matches Latin 1 code points with any points only available 295 | * in "windows-1252". 296 | * 297 | * @return an array of the names of all charsets supported by 298 | * CharsetDetector class. 299 | * 300 | * @stable ICU 3.4 301 | */ 302 | public static String[] getAllDetectableCharsets() { 303 | String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()]; 304 | for (int i = 0; i < allCharsetNames.length; i++) { 305 | allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName(); 306 | } 307 | return allCharsetNames; 308 | } 309 | 310 | /** 311 | * Test whether or not input filtering is enabled. 312 | * 313 | * @return true if input text will be filtered. 314 | * 315 | * @see #enableInputFilter 316 | * 317 | * @stable ICU 3.4 318 | */ 319 | public boolean inputFilterEnabled() 320 | { 321 | return fStripTags; 322 | } 323 | 324 | /** 325 | * Enable filtering of input text. If filtering is enabled, 326 | * text within angle brackets ("<" and ">") will be removed 327 | * before detection. 328 | * 329 | * @param filter true to enable input text filtering. 330 | * 331 | * @return The previous setting. 332 | * 333 | * @stable ICU 3.4 334 | */ 335 | public boolean enableInputFilter(boolean filter) 336 | { 337 | boolean previous = fStripTags; 338 | 339 | fStripTags = filter; 340 | 341 | return previous; 342 | } 343 | 344 | /* 345 | * MungeInput - after getting a set of raw input data to be analyzed, preprocess 346 | * it by removing what appears to be html markup. 347 | */ 348 | private void MungeInput() { 349 | int srci = 0; 350 | int dsti = 0; 351 | byte b; 352 | boolean inMarkup = false; 353 | int openTags = 0; 354 | int badTags = 0; 355 | 356 | // 357 | // html / xml markup stripping. 358 | // quick and dirty, not 100% accurate, but hopefully good enough, statistically. 359 | // discard everything within < brackets > 360 | // Count how many total '<' and illegal (nested) '<' occur, so we can make some 361 | // guess as to whether the input was actually marked up at all. 362 | if (fStripTags) { 363 | for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) { 364 | b = fRawInput[srci]; 365 | if (b == (byte)'<') { 366 | if (inMarkup) { 367 | badTags++; 368 | } 369 | inMarkup = true; 370 | openTags++; 371 | } 372 | 373 | if (! inMarkup) { 374 | fInputBytes[dsti++] = b; 375 | } 376 | 377 | if (b == (byte)'>') { 378 | inMarkup = false; 379 | } 380 | } 381 | 382 | fInputLen = dsti; 383 | } 384 | 385 | // 386 | // If it looks like this input wasn't marked up, or if it looks like it's 387 | // essentially nothing but markup abandon the markup stripping. 388 | // Detection will have to work on the unstripped input. 
389 | // 390 | if (openTags<5 || openTags/5 < badTags || 391 | (fInputLen < 100 && fRawLength>600)) { 392 | int limit = fRawLength; 393 | 394 | if (limit > kBufSize) { 395 | limit = kBufSize; 396 | } 397 | 398 | for (srci=0; srci ALL_CS_RECOGNIZERS; 476 | 477 | static { 478 | List list = new ArrayList(); 479 | 480 | list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true)); 481 | list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true)); 482 | list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true)); 483 | list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true)); 484 | list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true)); 485 | 486 | list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true)); 487 | list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true)); 488 | list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true)); 489 | list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true)); 490 | list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true)); 491 | list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true)); 492 | list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true)); 493 | list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true)); 494 | 495 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true)); 496 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true)); 497 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true)); 498 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true)); 499 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true)); 500 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true)); 501 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true)); 502 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true)); 503 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true)); 504 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true)); 505 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true)); 506 | 507 | // IBM 420/424 recognizers are disabled by default 508 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false)); 509 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false)); 510 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false)); 511 | list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false)); 512 | 513 | ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list); 514 | } 515 | 516 | /** 517 | * Get the names of charsets that can be recognized by this CharsetDetector instance. 518 | * 519 | * @return an array of the names of charsets that can be recognized by this CharsetDetector 520 | * instance. 521 | * 522 | * @internal 523 | * @deprecated This API is ICU internal only. 
524 | */ 525 | @Deprecated 526 | public String[] getDetectableCharsets() { 527 | List csnames = new ArrayList(ALL_CS_RECOGNIZERS.size()); 528 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 529 | CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); 530 | boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i]; 531 | if (active) { 532 | csnames.add(rcinfo.recognizer.getName()); 533 | } 534 | } 535 | return csnames.toArray(new String[csnames.size()]); 536 | } 537 | 538 | /** 539 | * Enable or disable individual charset encoding. 540 | * A name of charset encoding must be included in the names returned by 541 | * {@link #getAllDetectableCharsets()}. 542 | * 543 | * @param encoding the name of charset encoding. 544 | * @param enabled true to enable, or false to disable the 545 | * charset encoding. 546 | * @return A reference to this CharsetDetector. 547 | * @throws IllegalArgumentException when the name of charset encoding is 548 | * not supported. 549 | * 550 | * @internal 551 | * @deprecated This API is ICU internal only. 552 | */ 553 | @Deprecated 554 | public CharsetDetector setDetectableCharset(String encoding, boolean enabled) { 555 | int modIdx = -1; 556 | boolean isDefaultVal = false; 557 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 558 | CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i); 559 | if (csrinfo.recognizer.getName().equals(encoding)) { 560 | modIdx = i; 561 | isDefaultVal = (csrinfo.isDefaultEnabled == enabled); 562 | break; 563 | } 564 | } 565 | if (modIdx < 0) { 566 | // No matching encoding found 567 | throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\""); 568 | } 569 | 570 | if (fEnabledRecognizers == null && !isDefaultVal) { 571 | // Create an array storing the non default setting 572 | fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()]; 573 | 574 | // Initialize the array with default info 575 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 576 | fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled; 577 | } 578 | } 579 | 580 | if (fEnabledRecognizers != null) { 581 | fEnabledRecognizers[modIdx] = enabled; 582 | } 583 | 584 | return this; 585 | } 586 | } 587 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetMatch.java: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | /** 4 | ******************************************************************************* 5 | * Copyright (C) 2005-2016, International Business Machines Corporation and * 6 | * others. All Rights Reserved. * 7 | ******************************************************************************* 8 | */ 9 | package com.sigpwned.chardet4j.com.ibm.icu.text; 10 | 11 | import java.io.ByteArrayInputStream; 12 | import java.io.IOException; 13 | import java.io.InputStream; 14 | import java.io.InputStreamReader; 15 | import java.io.Reader; 16 | 17 | 18 | /** 19 | * This class represents a charset that has been identified by a CharsetDetector 20 | * as a possible encoding for a set of input data. From an instance of this 21 | * class, you can ask for a confidence level in the charset identification, 22 | * or for Java Reader or String to access the original byte data in Unicode form. 23 | *
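* A typical consumption pattern, sketched under the assumption that detector is a CharsetDetector
* whose input text has already been set:
*
*   for (CharsetMatch match : detector.detectAll()) {
*     System.out.println(match.getName() + " " + match.getConfidence() + " " + match.getLanguage());
*   }
*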

24 | * Instances of this class are created only by CharsetDetectors. 25 | *

26 | * Note: this class has a natural ordering that is inconsistent with equals. 27 | * The natural ordering is based on the match confidence value. 28 | * 29 | * @stable ICU 3.4 30 | */ 31 | public class CharsetMatch implements Comparable { 32 | 33 | 34 | /** 35 | * Create a java.io.Reader for reading the Unicode character data corresponding 36 | * to the original byte data supplied to the Charset detect operation. 37 | *
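* An illustrative sketch, assuming match is a non-null result of CharsetDetector.detect():
*
*   Reader reader = match.getReader(); // may be null if the input cannot be reset
*   if (reader != null) {
*     char[] buf = new char[1024];
*     for (int n = reader.read(buf); n >= 0; n = reader.read(buf)) {
*       // process buf[0..n)
*     }
*     reader.close();
*   }
*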

38 | * CAUTION: if the source of the byte data was an InputStream, a Reader 39 | * can be created for only one matching char set using this method. If more 40 | * than one charset needs to be tried, the caller will need to reset 41 | * the InputStream and create InputStreamReaders itself, based on the charset name. 42 | * 43 | * @return the Reader for the Unicode character data. 44 | * 45 | * @stable ICU 3.4 46 | */ 47 | public Reader getReader() { 48 | InputStream inputStream = fInputStream; 49 | 50 | if (inputStream == null) { 51 | inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength); 52 | } 53 | 54 | try { 55 | inputStream.reset(); 56 | return new InputStreamReader(inputStream, getName()); 57 | } catch (IOException e) { 58 | return null; 59 | } 60 | } 61 | 62 | /** 63 | * Create a Java String from Unicode character data corresponding 64 | * to the original byte data supplied to the Charset detect operation. 65 | * 66 | * @return a String created from the converted input data. 67 | * 68 | * @stable ICU 3.4 69 | */ 70 | public String getString() throws java.io.IOException { 71 | return getString(-1); 72 | 73 | } 74 | 75 | /** 76 | * Create a Java String from Unicode character data corresponding 77 | * to the original byte data supplied to the Charset detect operation. 78 | * The length of the returned string is limited to the specified size; 79 | * the string will be trunctated to this length if necessary. A limit value of 80 | * zero or less is ignored, and treated as no limit. 81 | * 82 | * @param maxLength The maximum length of the String to be created when the 83 | * source of the data is an input stream, or -1 for 84 | * unlimited length. 85 | * @return a String created from the converted input data. 86 | * 87 | * @stable ICU 3.4 88 | */ 89 | public String getString(int maxLength) throws java.io.IOException { 90 | String result = null; 91 | if (fInputStream != null) { 92 | StringBuilder sb = new StringBuilder(); 93 | char[] buffer = new char[1024]; 94 | Reader reader = getReader(); 95 | int max = maxLength < 0? Integer.MAX_VALUE : maxLength; 96 | int bytesRead = 0; 97 | 98 | while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) { 99 | sb.append(buffer, 0, bytesRead); 100 | max -= bytesRead; 101 | } 102 | 103 | reader.close(); 104 | 105 | return sb.toString(); 106 | } else { 107 | String name = getName(); 108 | /* 109 | * getName() may return a name with a suffix 'rtl' or 'ltr'. This cannot 110 | * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or 'ltr' 111 | * should be stripped off before creating the string. 112 | */ 113 | int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl"); 114 | if (startSuffix > 0) { 115 | name = name.substring(0, startSuffix); 116 | } 117 | result = new String(fRawInput, name); 118 | } 119 | return result; 120 | 121 | } 122 | 123 | /** 124 | * Get an indication of the confidence in the charset detected. 125 | * Confidence values range from 0-100, with larger numbers indicating 126 | * a better match of the input data to the characteristics of the 127 | * charset. 128 | * 129 | * @return the confidence in the charset match 130 | * 131 | * @stable ICU 3.4 132 | */ 133 | public int getConfidence() { 134 | return fConfidence; 135 | } 136 | 137 | /** 138 | * Get the name of the detected charset. 139 | * The name will be one that can be used with other APIs on the 140 | * platform that accept charset names. 
It is the "Canonical name" 141 | * as defined by the class java.nio.charset.Charset; for 142 | * charsets that are registered with the IANA charset registry, 143 | * this is the MIME-preferred registerd name. 144 | * 145 | * @see java.nio.charset.Charset 146 | * @see java.io.InputStreamReader 147 | * 148 | * @return The name of the charset. 149 | * 150 | * @stable ICU 3.4 151 | */ 152 | public String getName() { 153 | return fCharsetName; 154 | } 155 | 156 | /** 157 | * Get the ISO code for the language of the detected charset. 158 | * 159 | * @return The ISO code for the language or null if the language cannot be determined. 160 | * 161 | * @stable ICU 3.4 162 | */ 163 | public String getLanguage() { 164 | return fLang; 165 | } 166 | 167 | /** 168 | * Compare to other CharsetMatch objects. 169 | * Comparison is based on the match confidence value, which 170 | * allows CharsetDetector.detectAll() to order its results. 171 | * 172 | * @param other the CharsetMatch object to compare against. 173 | * @return a negative integer, zero, or a positive integer as the 174 | * confidence level of this CharsetMatch 175 | * is less than, equal to, or greater than that of 176 | * the argument. 177 | * @throws ClassCastException if the argument is not a CharsetMatch. 178 | * @stable ICU 4.4 179 | */ 180 | @Override 181 | public int compareTo (CharsetMatch other) { 182 | int compareResult = 0; 183 | if (this.fConfidence > other.fConfidence) { 184 | compareResult = 1; 185 | } else if (this.fConfidence < other.fConfidence) { 186 | compareResult = -1; 187 | } 188 | return compareResult; 189 | } 190 | 191 | /* 192 | * Constructor. Implementation internal 193 | */ 194 | CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) { 195 | fConfidence = conf; 196 | 197 | // The references to the original application input data must be copied out 198 | // of the charset recognizer to here, in case the application resets the 199 | // recognizer before using this CharsetMatch. 200 | if (det.fInputStream == null) { 201 | // We only want the existing input byte data if it came straight from the user, 202 | // not if is just the head of a stream. 203 | fRawInput = det.fRawInput; 204 | fRawLength = det.fRawLength; 205 | } 206 | fInputStream = det.fInputStream; 207 | fCharsetName = rec.getName(); 208 | fLang = rec.getLanguage(); 209 | } 210 | 211 | /* 212 | * Constructor. Implementation internal 213 | */ 214 | CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) { 215 | fConfidence = conf; 216 | 217 | // The references to the original application input data must be copied out 218 | // of the charset recognizer to here, in case the application resets the 219 | // recognizer before using this CharsetMatch. 220 | if (det.fInputStream == null) { 221 | // We only want the existing input byte data if it came straight from the user, 222 | // not if is just the head of a stream. 223 | fRawInput = det.fRawInput; 224 | fRawLength = det.fRawLength; 225 | } 226 | fInputStream = det.fInputStream; 227 | fCharsetName = csName; 228 | fLang = lang; 229 | } 230 | 231 | 232 | // 233 | // Private Data 234 | // 235 | private int fConfidence; 236 | private byte[] fRawInput = null; // Original, untouched input bytes. 237 | // If user gave us a byte array, this is it. 238 | private int fRawLength; // Length of data in fRawInput array. 239 | 240 | private InputStream fInputStream = null; // User's input stream, or null if the user 241 | // gave us a byte array. 
242 | 243 | private String fCharsetName; // The name of the charset this CharsetMatch 244 | // represents. Filled in by the recognizer. 245 | private String fLang; // The language, if one was determined by 246 | // the recognizer during the detect operation. 247 | } 248 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetRecog_2022.java: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | /* 4 | ******************************************************************************* 5 | * Copyright (C) 2005 - 2012, International Business Machines Corporation and * 6 | * others. All Rights Reserved. * 7 | ******************************************************************************* 8 | */ 9 | package com.sigpwned.chardet4j.com.ibm.icu.text; 10 | 11 | /** 12 | * class CharsetRecog_2022 part of the ICU charset detection implementation. 13 | * This is a superclass for the individual detectors for 14 | * each of the detectable members of the ISO 2022 family 15 | * of encodings. 16 | * 17 | * The separate classes are nested within this class. 18 | */ 19 | abstract class CharsetRecog_2022 extends CharsetRecognizer { 20 | 21 | 22 | /** 23 | * Matching function shared among the 2022 detectors JP, CN and KR 24 | * Counts up the number of legal an unrecognized escape sequences in 25 | * the sample of text, and computes a score based on the total number & 26 | * the proportion that fit the encoding. 27 | * 28 | * 29 | * @param text the byte buffer containing text to analyse 30 | * @param textLen the size of the text in the byte. 31 | * @param escapeSequences the byte escape sequences to test for. 32 | * @return match quality, in the range of 0-100. 33 | */ 34 | int match(byte [] text, int textLen, byte [][] escapeSequences) { 35 | int i, j; 36 | int escN; 37 | int hits = 0; 38 | int misses = 0; 39 | int shifts = 0; 40 | int quality; 41 | scanInput: 42 | for (i=0; i= 3 && 35 | (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) { 36 | hasBOM = true; 37 | } 38 | 39 | // Scan for multi-byte sequences 40 | for (i=0; i=det.fRawLength) { 62 | break; 63 | } 64 | b = input[i]; 65 | if ((b & 0xc0) != 0x080) { 66 | numInvalid++; 67 | break; 68 | } 69 | if (--trailBytes == 0) { 70 | numValid++; 71 | break; 72 | } 73 | } 74 | } 75 | 76 | // Cook up some sort of confidence score, based on presence of a BOM 77 | // and the existence of valid and/or invalid multi-byte sequences. 78 | confidence = 0; 79 | if (hasBOM && numInvalid==0) { 80 | confidence = 100; 81 | } else if (hasBOM && numValid > numInvalid*10) { 82 | confidence = 80; 83 | } else if (numValid > 3 && numInvalid == 0) { 84 | confidence = 100; 85 | } else if (numValid > 0 && numInvalid == 0) { 86 | confidence = 80; 87 | } else if (numValid == 0 && numInvalid == 0) { 88 | // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which 89 | // accepts ASCII with confidence = 10. 90 | // TODO: add plain ASCII as an explicitly detected type. 91 | confidence = 15; 92 | } else if (numValid > numInvalid*10) { 93 | // Probably corrupt utf-8 data. Valid sequences aren't likely by chance. 94 | confidence = 25; 95 | } 96 | return confidence == 0 ? 
null : new CharsetMatch(det, this, confidence); 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetRecog_Unicode.java: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | /* 4 | ******************************************************************************* 5 | * Copyright (C) 1996-2013, International Business Machines Corporation and * 6 | * others. All Rights Reserved. * 7 | ******************************************************************************* 8 | * 9 | */ 10 | 11 | package com.sigpwned.chardet4j.com.ibm.icu.text; 12 | 13 | /** 14 | * This class matches UTF-16 and UTF-32, both big- and little-endian. The 15 | * BOM will be used if it is present. 16 | */ 17 | abstract class CharsetRecog_Unicode extends CharsetRecognizer { 18 | 19 | /* (non-Javadoc) 20 | * @see com.ibm.icu.text.CharsetRecognizer#getName() 21 | */ 22 | @Override 23 | abstract String getName(); 24 | 25 | /* (non-Javadoc) 26 | * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector) 27 | */ 28 | @Override 29 | abstract CharsetMatch match(CharsetDetector det); 30 | 31 | static int codeUnit16FromBytes(byte hi, byte lo) { 32 | return ((hi & 0xff) << 8) | (lo & 0xff); 33 | } 34 | 35 | // UTF-16 confidence calculation. Very simple minded, but better than nothing. 36 | // Any 8 bit non-control characters bump the confidence up. These have a zero high byte, 37 | // and are very likely to be UTF-16, although they could also be part of a UTF-32 code. 38 | // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32. 39 | // NULs should be rare in actual text. 
40 | static int adjustConfidence(int codeUnit, int confidence) { 41 | if (codeUnit == 0) { 42 | confidence -= 10; 43 | } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { 44 | confidence += 10; 45 | } 46 | if (confidence < 0) { 47 | confidence = 0; 48 | } else if (confidence > 100) { 49 | confidence = 100; 50 | } 51 | return confidence; 52 | } 53 | 54 | static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode 55 | { 56 | @Override 57 | String getName() 58 | { 59 | return "UTF-16BE"; 60 | } 61 | 62 | @Override 63 | CharsetMatch match(CharsetDetector det) 64 | { 65 | byte[] input = det.fRawInput; 66 | int confidence = 10; 67 | 68 | int bytesToCheck = Math.min(input.length, 30); 69 | for (int charIndex=0; charIndex 0) { 84 | return new CharsetMatch(det, this, confidence); 85 | } 86 | return null; 87 | } 88 | } 89 | 90 | static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode 91 | { 92 | @Override 93 | String getName() 94 | { 95 | return "UTF-16LE"; 96 | } 97 | 98 | @Override 99 | CharsetMatch match(CharsetDetector det) 100 | { 101 | byte[] input = det.fRawInput; 102 | int confidence = 10; 103 | 104 | int bytesToCheck = Math.min(input.length, 30); 105 | for (int charIndex=0; charIndex 0) { 120 | return new CharsetMatch(det, this, confidence); 121 | } 122 | return null; 123 | } 124 | } 125 | 126 | static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode 127 | { 128 | abstract int getChar(byte[] input, int index); 129 | 130 | @Override 131 | abstract String getName(); 132 | 133 | @Override 134 | CharsetMatch match(CharsetDetector det) 135 | { 136 | byte[] input = det.fRawInput; 137 | int limit = (det.fRawLength / 4) * 4; 138 | int numValid = 0; 139 | int numInvalid = 0; 140 | boolean hasBOM = false; 141 | int confidence = 0; 142 | 143 | if (limit==0) { 144 | return null; 145 | } 146 | if (getChar(input, 0) == 0x0000FEFF) { 147 | hasBOM = true; 148 | } 149 | 150 | for(int i = 0; i < limit; i += 4) { 151 | int ch = getChar(input, i); 152 | 153 | if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { 154 | numInvalid += 1; 155 | } else { 156 | numValid += 1; 157 | } 158 | } 159 | 160 | 161 | // Cook up some sort of confidence score, based on presence of a BOM 162 | // and the existence of valid and/or invalid multi-byte sequences. 163 | if (hasBOM && numInvalid==0) { 164 | confidence = 100; 165 | } else if (hasBOM && numValid > numInvalid*10) { 166 | confidence = 80; 167 | } else if (numValid > 3 && numInvalid == 0) { 168 | confidence = 100; 169 | } else if (numValid > 0 && numInvalid == 0) { 170 | confidence = 80; 171 | } else if (numValid > numInvalid*10) { 172 | // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance. 173 | confidence = 25; 174 | } 175 | 176 | return confidence == 0 ? 
null : new CharsetMatch(det, this, confidence); 177 | } 178 | } 179 | 180 | static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 181 | { 182 | @Override 183 | int getChar(byte[] input, int index) 184 | { 185 | return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 | 186 | (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF); 187 | } 188 | 189 | @Override 190 | String getName() 191 | { 192 | return "UTF-32BE"; 193 | } 194 | } 195 | 196 | 197 | static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 198 | { 199 | @Override 200 | int getChar(byte[] input, int index) 201 | { 202 | return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 | 203 | (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF); 204 | } 205 | 206 | @Override 207 | String getName() 208 | { 209 | return "UTF-32LE"; 210 | } 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetRecog_mbcs.java: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | /* 4 | **************************************************************************** 5 | * Copyright (C) 2005-2012, International Business Machines Corporation and * 6 | * others. All Rights Reserved. * 7 | **************************************************************************** 8 | * 9 | */ 10 | package com.sigpwned.chardet4j.com.ibm.icu.text; 11 | 12 | import java.util.Arrays; 13 | 14 | /** 15 | * CharsetRecognizer implementation for Asian - double or multi-byte - charsets. 16 | * Match is determined mostly by the input data adhering to the 17 | * encoding scheme for the charset, and, optionally, 18 | * frequency-of-occurrence of characters. 19 | *

20 | * Instances of this class are singletons, one per encoding 21 | * being recognized. They are created in the main 22 | * CharsetDetector class and kept in the global list of available 23 | * encodings to be checked. The specific encoding being recognized 24 | * is determined by subclass. 25 | */ 26 | abstract class CharsetRecog_mbcs extends CharsetRecognizer { 27 | 28 | /** 29 | * Get the IANA name of this charset. 30 | * @return the charset name. 31 | */ 32 | @Override 33 | abstract String getName() ; 34 | 35 | 36 | /** 37 | * Test the match of this charset with the input text data 38 | * which is obtained via the CharsetDetector object. 39 | * 40 | * @param det The CharsetDetector, which contains the input text 41 | * to be checked for being in this charset. 42 | * @return Two values packed into one int (Damn java, anyhow) 43 | *
44 | * bits 0-7: the match confidence, ranging from 0-100 45 | *
46 | * bits 8-15: The match reason, an enum-like value. 47 | */ 48 | int match(CharsetDetector det, int [] commonChars) { 49 | @SuppressWarnings("unused") 50 | int singleByteCharCount = 0; //TODO Do we really need this? 51 | int doubleByteCharCount = 0; 52 | int commonCharCount = 0; 53 | int badCharCount = 0; 54 | int totalCharCount = 0; 55 | int confidence = 0; 56 | iteratedChar iter = new iteratedChar(); 57 | 58 | detectBlock: { 59 | for (iter.reset(); nextChar(iter, det);) { 60 | totalCharCount++; 61 | if (iter.error) { 62 | badCharCount++; 63 | } else { 64 | long cv = iter.charValue & 0xFFFFFFFFL; 65 | 66 | if (cv <= 0xff) { 67 | singleByteCharCount++; 68 | } else { 69 | doubleByteCharCount++; 70 | if (commonChars != null) { 71 | // NOTE: This assumes that there are no 4-byte common chars. 72 | if (Arrays.binarySearch(commonChars, (int) cv) >= 0) { 73 | commonCharCount++; 74 | } 75 | } 76 | } 77 | } 78 | if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { 79 | // Bail out early if the byte data is not matching the encoding scheme. 80 | break detectBlock; 81 | } 82 | } 83 | 84 | if (doubleByteCharCount <= 10 && badCharCount== 0) { 85 | // Not many multi-byte chars. 86 | if (doubleByteCharCount == 0 && totalCharCount < 10) { 87 | // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. 88 | // We don't have enough data to have any confidence. 89 | // Statistical analysis of single byte non-ASCII characters would probably help here. 90 | confidence = 0; 91 | } 92 | else { 93 | // ASCII or ISO file? It's probably not our encoding, 94 | // but is not incompatible with our encoding, so don't give it a zero. 95 | confidence = 10; 96 | } 97 | 98 | break detectBlock; 99 | } 100 | 101 | // 102 | // No match if there are too many characters that don't fit the encoding scheme. 103 | // (should we have zero tolerance for these?) 104 | // 105 | if (doubleByteCharCount < 20*badCharCount) { 106 | confidence = 0; 107 | break detectBlock; 108 | } 109 | 110 | if (commonChars == null) { 111 | // We have no statistics on frequently occurring characters. 112 | // Assess confidence purely on having a reasonable number of 113 | // multi-byte characters (the more the better 114 | confidence = 30 + doubleByteCharCount - 20*badCharCount; 115 | if (confidence > 100) { 116 | confidence = 100; 117 | } 118 | }else { 119 | // 120 | // Frequency of occurrence statistics exist. 121 | // 122 | double maxVal = Math.log((float)doubleByteCharCount / 4); 123 | double scaleFactor = 90.0 / maxVal; 124 | confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10); 125 | confidence = Math.min(confidence, 100); 126 | } 127 | } // end of detectBlock: 128 | 129 | return confidence; 130 | } 131 | 132 | // "Character" iterated character class. 133 | // Recognizers for specific mbcs encodings make their "characters" available 134 | // by providing a nextChar() function that fills in an instance of iteratedChar 135 | // with the next char from the input. 136 | // The returned characters are not converted to Unicode, but remain as the raw 137 | // bytes (concatenated into an int) from the codepage data. 138 | // 139 | // For Asian charsets, use the raw input rather than the input that has been 140 | // stripped of markup. Detection only considers multi-byte chars, effectively 141 | // stripping markup anyway, and double byte chars do occur in markup too. 
142 | // 143 | static class iteratedChar { 144 | int charValue = 0; // 1-4 bytes from the raw input data 145 | int nextIndex = 0; 146 | boolean error = false; 147 | boolean done = false; 148 | 149 | void reset() { 150 | charValue = 0; 151 | nextIndex = 0; 152 | error = false; 153 | done = false; 154 | } 155 | 156 | int nextByte(CharsetDetector det) { 157 | if (nextIndex >= det.fRawLength) { 158 | done = true; 159 | return -1; 160 | } 161 | int byteValue = det.fRawInput[nextIndex++] & 0x00ff; 162 | return byteValue; 163 | } 164 | } 165 | 166 | /** 167 | * Get the next character (however many bytes it is) from the input data 168 | * Subclasses for specific charset encodings must implement this function 169 | * to get characters according to the rules of their encoding scheme. 170 | * 171 | * This function is not a method of class iteratedChar only because 172 | * that would require a lot of extra derived classes, which is awkward. 173 | * @param it The iteratedChar "struct" into which the returned char is placed. 174 | * @param det The charset detector, which is needed to get at the input byte data 175 | * being iterated over. 176 | * @return True if a character was returned, false at end of input. 177 | */ 178 | abstract boolean nextChar(iteratedChar it, CharsetDetector det); 179 | 180 | 181 | 182 | 183 | 184 | /** 185 | * Shift-JIS charset recognizer. 186 | * 187 | */ 188 | static class CharsetRecog_sjis extends CharsetRecog_mbcs { 189 | static int [] commonChars = 190 | // TODO: This set of data comes from the character frequency- 191 | // of-occurrence analysis tool. The data needs to be moved 192 | // into a resource and loaded from there. 193 | {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 194 | 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 195 | 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 196 | 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 197 | 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 198 | 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; 199 | 200 | @Override 201 | boolean nextChar(iteratedChar it, CharsetDetector det) { 202 | it.error = false; 203 | int firstByte; 204 | firstByte = it.charValue = it.nextByte(det); 205 | if (firstByte < 0) { 206 | return false; 207 | } 208 | 209 | if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) { 210 | return true; 211 | } 212 | 213 | int secondByte = it.nextByte(det); 214 | if (secondByte < 0) { 215 | return false; 216 | } 217 | it.charValue = (firstByte << 8) | secondByte; 218 | if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) { 219 | // Illegal second byte value. 220 | it.error = true; 221 | } 222 | return true; 223 | } 224 | 225 | @Override 226 | CharsetMatch match(CharsetDetector det) { 227 | int confidence = match(det, commonChars); 228 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 229 | } 230 | 231 | @Override 232 | String getName() { 233 | return "Shift_JIS"; 234 | } 235 | 236 | @Override 237 | public String getLanguage() 238 | { 239 | return "ja"; 240 | } 241 | 242 | 243 | } 244 | 245 | 246 | /** 247 | * Big5 charset recognizer. 248 | * 249 | */ 250 | static class CharsetRecog_big5 extends CharsetRecog_mbcs { 251 | static int [] commonChars = 252 | // TODO: This set of data comes from the character frequency- 253 | // of-occurrence analysis tool. 
The data needs to be moved 254 | // into a resource and loaded from there. 255 | {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 256 | 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 257 | 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 258 | 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 259 | 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 260 | 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 261 | 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 262 | 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 263 | 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 264 | 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; 265 | 266 | @Override 267 | boolean nextChar(iteratedChar it, CharsetDetector det) { 268 | it.error = false; 269 | int firstByte; 270 | firstByte = it.charValue = it.nextByte(det); 271 | if (firstByte < 0) { 272 | return false; 273 | } 274 | 275 | if (firstByte <= 0x7f || firstByte==0xff) { 276 | // single byte character. 277 | return true; 278 | } 279 | 280 | int secondByte = it.nextByte(det); 281 | if (secondByte < 0) { 282 | return false; 283 | } 284 | it.charValue = (it.charValue << 8) | secondByte; 285 | 286 | if (secondByte < 0x40 || 287 | secondByte ==0x7f || 288 | secondByte == 0xff) { 289 | it.error = true; 290 | } 291 | return true; 292 | } 293 | 294 | @Override 295 | CharsetMatch match(CharsetDetector det) { 296 | int confidence = match(det, commonChars); 297 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 298 | } 299 | 300 | @Override 301 | String getName() { 302 | return "Big5"; 303 | } 304 | 305 | 306 | @Override 307 | public String getLanguage() 308 | { 309 | return "zh"; 310 | } 311 | } 312 | 313 | 314 | /** 315 | * EUC charset recognizers. One abstract class that provides the common function 316 | * for getting the next character according to the EUC encoding scheme, 317 | * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. 318 | * 319 | */ 320 | abstract static class CharsetRecog_euc extends CharsetRecog_mbcs { 321 | 322 | /* 323 | * (non-Javadoc) 324 | * Get the next character value for EUC based encodings. 325 | * Character "value" is simply the raw bytes that make up the character 326 | * packed into an int. 327 | */ 328 | @Override 329 | boolean nextChar(iteratedChar it, CharsetDetector det) { 330 | it.error = false; 331 | int firstByte = 0; 332 | int secondByte = 0; 333 | int thirdByte = 0; 334 | //int fourthByte = 0; 335 | 336 | buildChar: { 337 | firstByte = it.charValue = it.nextByte(det); 338 | if (firstByte < 0) { 339 | // Ran off the end of the input data 340 | it.done = true; 341 | break buildChar; 342 | } 343 | if (firstByte <= 0x8d) { 344 | // single byte char 345 | break buildChar; 346 | } 347 | 348 | secondByte = it.nextByte(det); 349 | it.charValue = (it.charValue << 8) | secondByte; 350 | 351 | if (firstByte >= 0xA1 && firstByte <= 0xfe) { 352 | // Two byte Char 353 | if (secondByte < 0xa1) { 354 | it.error = true; 355 | } 356 | break buildChar; 357 | } 358 | if (firstByte == 0x8e) { 359 | // Code Set 2. 360 | // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 361 | // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 362 | // We don't know which we've got. 
363 | // Treat it like EUC-JP. If the data really was EUC-TW, the following two 364 | // bytes will look like a well formed 2 byte char. 365 | if (secondByte < 0xa1) { 366 | it.error = true; 367 | } 368 | break buildChar; 369 | } 370 | 371 | if (firstByte == 0x8f) { 372 | // Code set 3. 373 | // Three byte total char size, two bytes of actual char value. 374 | thirdByte = it.nextByte(det); 375 | it.charValue = (it.charValue << 8) | thirdByte; 376 | if (thirdByte < 0xa1) { 377 | it.error = true; 378 | } 379 | } 380 | } 381 | 382 | return (it.done == false); 383 | } 384 | 385 | /** 386 | * The charset recognize for EUC-JP. A singleton instance of this class 387 | * is created and kept by the public CharsetDetector class 388 | */ 389 | static class CharsetRecog_euc_jp extends CharsetRecog_euc { 390 | static int [] commonChars = 391 | // TODO: This set of data comes from the character frequency- 392 | // of-occurrence analysis tool. The data needs to be moved 393 | // into a resource and loaded from there. 394 | {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 395 | 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 396 | 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 397 | 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 398 | 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 399 | 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 400 | 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 401 | 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 402 | 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 403 | 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; 404 | @Override 405 | String getName() { 406 | return "EUC-JP"; 407 | } 408 | 409 | @Override 410 | CharsetMatch match(CharsetDetector det) { 411 | int confidence = match(det, commonChars); 412 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 413 | } 414 | 415 | @Override 416 | public String getLanguage() 417 | { 418 | return "ja"; 419 | } 420 | } 421 | 422 | /** 423 | * The charset recognize for EUC-KR. A singleton instance of this class 424 | * is created and kept by the public CharsetDetector class 425 | */ 426 | static class CharsetRecog_euc_kr extends CharsetRecog_euc { 427 | static int [] commonChars = 428 | // TODO: This set of data comes from the character frequency- 429 | // of-occurrence analysis tool. The data needs to be moved 430 | // into a resource and loaded from there. 
431 | {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 432 | 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 433 | 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 434 | 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 435 | 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 436 | 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 437 | 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 438 | 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 439 | 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 440 | 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; 441 | 442 | @Override 443 | String getName() { 444 | return "EUC-KR"; 445 | } 446 | 447 | @Override 448 | CharsetMatch match(CharsetDetector det) { 449 | int confidence = match(det, commonChars); 450 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 451 | } 452 | 453 | @Override 454 | public String getLanguage() 455 | { 456 | return "ko"; 457 | } 458 | } 459 | } 460 | 461 | /** 462 | * 463 | * GB-18030 recognizer. Uses simplified Chinese statistics. 464 | * 465 | */ 466 | static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs { 467 | 468 | /* 469 | * (non-Javadoc) 470 | * Get the next character value for EUC based encodings. 471 | * Character "value" is simply the raw bytes that make up the character 472 | * packed into an int. 473 | */ 474 | @Override 475 | boolean nextChar(iteratedChar it, CharsetDetector det) { 476 | it.error = false; 477 | int firstByte = 0; 478 | int secondByte = 0; 479 | int thirdByte = 0; 480 | int fourthByte = 0; 481 | 482 | buildChar: { 483 | firstByte = it.charValue = it.nextByte(det); 484 | 485 | if (firstByte < 0) { 486 | // Ran off the end of the input data 487 | it.done = true; 488 | break buildChar; 489 | } 490 | 491 | if (firstByte <= 0x80) { 492 | // single byte char 493 | break buildChar; 494 | } 495 | 496 | secondByte = it.nextByte(det); 497 | it.charValue = (it.charValue << 8) | secondByte; 498 | 499 | if (firstByte >= 0x81 && firstByte <= 0xFE) { 500 | // Two byte Char 501 | if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) { 502 | break buildChar; 503 | } 504 | 505 | // Four byte char 506 | if (secondByte >= 0x30 && secondByte <= 0x39) { 507 | thirdByte = it.nextByte(det); 508 | 509 | if (thirdByte >= 0x81 && thirdByte <= 0xFE) { 510 | fourthByte = it.nextByte(det); 511 | 512 | if (fourthByte >= 0x30 && fourthByte <= 0x39) { 513 | it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte; 514 | break buildChar; 515 | } 516 | } 517 | } 518 | 519 | it.error = true; 520 | break buildChar; 521 | } 522 | } 523 | 524 | return (it.done == false); 525 | } 526 | 527 | static int [] commonChars = 528 | // TODO: This set of data comes from the character frequency- 529 | // of-occurrence analysis tool. The data needs to be moved 530 | // into a resource and loaded from there. 
531 | {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 532 | 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 533 | 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 534 | 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 535 | 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 536 | 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 537 | 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 538 | 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 539 | 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 540 | 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; 541 | 542 | 543 | @Override 544 | String getName() { 545 | return "GB18030"; 546 | } 547 | 548 | @Override 549 | CharsetMatch match(CharsetDetector det) { 550 | int confidence = match(det, commonChars); 551 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 552 | } 553 | 554 | @Override 555 | public String getLanguage() 556 | { 557 | return "zh"; 558 | } 559 | } 560 | 561 | 562 | } 563 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetRecognizer.java: -------------------------------------------------------------------------------- 1 | // © 2016 and later: Unicode, Inc. and others. 2 | // License & terms of use: http://www.unicode.org/copyright.html 3 | /** 4 | ******************************************************************************* 5 | * Copyright (C) 2005-2012, International Business Machines Corporation and * 6 | * others. All Rights Reserved. * 7 | ******************************************************************************* 8 | */ 9 | package com.sigpwned.chardet4j.com.ibm.icu.text; 10 | 11 | /** 12 | * Abstract class for recognizing a single charset. 13 | * Part of the implementation of ICU's CharsetDetector. 14 | * 15 | * Each specific charset that can be recognized will have an instance 16 | * of some subclass of this class. All interaction between the overall 17 | * CharsetDetector and the stuff specific to an individual charset happens 18 | * via the interface provided here. 19 | * 20 | * Instances of CharsetDetector DO NOT have or maintain 21 | * state pertaining to a specific match or detect operation. 22 | * The WILL be shared by multiple instances of CharsetDetector. 23 | * They encapsulate const charset-specific information. 24 | */ 25 | abstract class CharsetRecognizer { 26 | /** 27 | * Get the IANA name of this charset. 28 | * @return the charset name. 29 | */ 30 | abstract String getName(); 31 | 32 | /** 33 | * Get the ISO language code for this charset. 34 | * @return the language code, or null if the language cannot be determined. 35 | */ 36 | public String getLanguage() 37 | { 38 | return null; 39 | } 40 | 41 | /** 42 | * Test the match of this charset with the input text data 43 | * which is obtained via the CharsetDetector object. 44 | * 45 | * @param det The CharsetDetector, which contains the input text 46 | * to be checked for being in this charset. 47 | * @return A CharsetMatch object containing details of match 48 | * with this charset, or null if there was no match. 
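 *         (Illustrative note: the concrete multi-byte recognizers in this package compute this
 *         as {@code int confidence = match(det, commonChars);} and return {@code null} when the
 *         confidence is zero.)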
49 | */ 50 | abstract CharsetMatch match(CharsetDetector det); 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This package contains code from the icu4j project. This was originally released under the ICU 3 | * license. This project is released under the Apache 2 license. 4 | * 5 | * @see https://github.com/unicode-org/icu 6 | */ 7 | package com.sigpwned.chardet4j.com.ibm.icu.text; 8 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/io/BomAwareInputStream.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 - 2024 Andy Boothe 6 | * ====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j.io; 21 | 22 | import java.io.ByteArrayInputStream; 23 | import java.io.FilterInputStream; 24 | import java.io.IOException; 25 | import java.io.InputStream; 26 | import java.io.SequenceInputStream; 27 | import java.util.Optional; 28 | import com.sigpwned.chardet4j.ByteOrderMark; 29 | import com.sigpwned.chardet4j.util.ByteStreams; 30 | 31 | /** 32 | * A wrapper {@link InputStream} that remembers the {@link ByteOrderMark} that was detected at the 33 | * beginning of the stream. 34 | */ 35 | public final class BomAwareInputStream extends FilterInputStream { 36 | /** 37 | * Detect the {@link ByteOrderMark} at the beginning of the stream, if any, and return a 38 | * {@link BomAwareInputStream} that wraps the stream. 
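 * <p>
 * A minimal illustrative sketch ({@code bytes} is a hypothetical input array):
 *
 * <pre>{@code
 * try (BomAwareInputStream in = BomAwareInputStream.detect(new ByteArrayInputStream(bytes))) {
 *   Optional<ByteOrderMark> bom = in.bom();
 *   // the wrapped stream now starts after any BOM that was detected
 * }
 * }</pre>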
39 | * 40 | * @param in the input stream 41 | * @return the {@link BomAwareInputStream} 42 | * @throws IOException if an I/O error occurs 43 | */ 44 | public static BomAwareInputStream detect(InputStream in) throws IOException { 45 | final byte[] buf = ByteStreams.readNBytes(in, ByteOrderMark.MAX_BYTE_LENGTH); 46 | 47 | ByteOrderMark bom = ByteOrderMark.detect(buf).orElse(null); 48 | 49 | // If there is no BOM, then return all the bytes read so far, followed by the rest of the stream 50 | if (bom == null) 51 | return new BomAwareInputStream(new SequenceInputStream(new ByteArrayInputStream(buf), in), 52 | null); 53 | 54 | final int bomlen = bom.length(); 55 | 56 | // If there is a BOM and it is the same length as the bytes read so far, then return the rest of 57 | // the stream 58 | if (bomlen == buf.length) 59 | return new BomAwareInputStream(in, bom); 60 | 61 | // If there is a BOM and it is shorter than the bytes read so far, then return the BOM followed 62 | // by the rest of the bytes read so far, followed by the rest of the stream 63 | return new BomAwareInputStream( 64 | new SequenceInputStream(new ByteArrayInputStream(buf, bomlen, buf.length - bomlen), in), 65 | bom); 66 | } 67 | 68 | private final ByteOrderMark bom; 69 | 70 | private BomAwareInputStream(InputStream delegate, ByteOrderMark bom) { 71 | super(delegate); 72 | this.bom = bom; 73 | } 74 | 75 | /** 76 | * The {@link ByteOrderMark} that was detected at the beginning of the stream, if any, or else 77 | * empty. 78 | * 79 | * @return the {@link ByteOrderMark} 80 | */ 81 | public Optional bom() { 82 | return Optional.ofNullable(bom); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/io/DecodedInputStreamReader.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 - 2024 Andy Boothe 6 | * ====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j.io; 21 | 22 | import static java.util.Objects.requireNonNull; 23 | import java.io.InputStream; 24 | import java.io.InputStreamReader; 25 | import java.nio.charset.Charset; 26 | 27 | /** 28 | * A simple wrapper around an InputStreamReader that remembers the charset that was used to decode 29 | * the input stream. 
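 * <p>
 * A minimal illustrative sketch ({@code in} is a hypothetical input stream):
 *
 * <pre>{@code
 * try (DecodedInputStreamReader reader = new DecodedInputStreamReader(in, StandardCharsets.UTF_8)) {
 *   Charset charset = reader.charset(); // UTF-8
 *   // read characters as usual via the Reader API
 * }
 * }</pre>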
30 | */ 31 | public final class DecodedInputStreamReader extends InputStreamReader { 32 | private final Charset charset; 33 | 34 | public DecodedInputStreamReader(InputStream in, Charset charset) { 35 | super(in, charset); 36 | this.charset = requireNonNull(charset); 37 | } 38 | 39 | /** 40 | * The charset that was used to decode the input stream. 41 | * 42 | * @return the charset 43 | */ 44 | public Charset charset() { 45 | return charset; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/util/ByteStreams.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 - 2024 Andy Boothe 6 | * ====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j.util; 21 | 22 | import java.io.IOException; 23 | import java.io.InputStream; 24 | import java.util.Arrays; 25 | 26 | /** 27 | * Utility methods for working with {@link InputStream byte streams}. 28 | */ 29 | public final class ByteStreams { 30 | private ByteStreams() {} 31 | 32 | /** 33 | * Read as many bytes as possible from the the given {@link InputStream}, up to count, and return 34 | * them as a byte array. If the stream ends before count bytes can be read, then the returned 35 | * array will be shorter than count. Equivalent to the Java 9+ {@code InputStream} method of the 36 | * same name. 
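 * <p>
 * A minimal illustrative sketch of the end-of-stream behavior:
 *
 * <pre>{@code
 * byte[] bytes = ByteStreams.readNBytes(new ByteArrayInputStream(new byte[] {1, 2, 3}), 4);
 * // bytes.length == 3: the stream ended before 4 bytes could be read
 * }</pre>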
37 | * 38 | * @param in the input stream 39 | * @param count the maximum number of bytes to read 40 | * @return the bytes read 41 | * @throws NullPointerException if in is null 42 | * @throws IllegalArgumentException if count is negative 43 | * @throws IOException if an I/O error occurs 44 | */ 45 | public static byte[] readNBytes(InputStream in, int count) throws IOException { 46 | if (in == null) 47 | throw new NullPointerException(); 48 | if (count < 0) 49 | throw new IllegalArgumentException("count must not be negative"); 50 | 51 | final byte[] buf = new byte[count]; 52 | if (count == 0) 53 | return buf; 54 | 55 | int len = 0; 56 | for (int nread = in.read(buf); nread != -1; nread = in.read(buf, len, buf.length - len)) { 57 | len = len + nread; 58 | if (len == buf.length) 59 | break; 60 | } 61 | 62 | if (len == buf.length) 63 | return buf; 64 | 65 | return Arrays.copyOf(buf, len); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/com/sigpwned/chardet4j/util/CharStreams.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 - 2024 Andy Boothe 6 | * ====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j.util; 21 | 22 | import java.io.IOException; 23 | import java.io.Reader; 24 | import java.io.Writer; 25 | 26 | public final class CharStreams { 27 | private CharStreams() {} 28 | 29 | /** 30 | * Copy all characters from the given {@link Reader} to the given {@link Writer} and return the 31 | * total number of characters copied. Equivalent to the Java 9+ {@code Reader} method of the same 32 | * name. 
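 * <p>
 * A minimal illustrative sketch:
 *
 * <pre>{@code
 * StringWriter out = new StringWriter();
 * long copied = CharStreams.transferTo(new StringReader("Hello, world!"), out);
 * // copied == 13 and out.toString() equals "Hello, world!"
 * }</pre>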
33 | * 34 | * @param in the input reader 35 | * @param out the output writer 36 | * @return the total number of characters copied 37 | * @throws NullPointerException if in or out is null 38 | * @throws IOException if an I/O error occurs 39 | */ 40 | public static long transferTo(Reader in, Writer out) throws IOException { 41 | if (in == null) 42 | throw new NullPointerException(); 43 | if (out == null) 44 | throw new NullPointerException(); 45 | 46 | long total = 0; 47 | 48 | final char[] buf = new char[8192]; 49 | for (int nread = in.read(buf); nread != -1; nread = in.read(buf)) { 50 | out.write(buf, 0, nread); 51 | total = total + nread; 52 | } 53 | 54 | return total; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/test/java/com/sigpwned/chardet4j/ChardetTest.java: -------------------------------------------------------------------------------- 1 | /*- 2 | * =================================LICENSE_START================================== 3 | * chardet4j 4 | * ====================================SECTION===================================== 5 | * Copyright (C) 2022 Andy Boothe 6 | * ====================================SECTION===================================== 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * ==================================LICENSE_END=================================== 19 | */ 20 | package com.sigpwned.chardet4j; 21 | 22 | import static java.util.Arrays.asList; 23 | import static java.util.Objects.requireNonNull; 24 | import static org.hamcrest.CoreMatchers.anyOf; 25 | import static org.hamcrest.CoreMatchers.is; 26 | import static org.hamcrest.MatcherAssert.assertThat; 27 | import java.io.ByteArrayInputStream; 28 | import java.io.ByteArrayOutputStream; 29 | import java.io.IOException; 30 | import java.io.Reader; 31 | import java.io.SequenceInputStream; 32 | import java.io.StringWriter; 33 | import java.nio.charset.Charset; 34 | import java.nio.charset.StandardCharsets; 35 | import java.nio.charset.UnsupportedCharsetException; 36 | import java.util.List; 37 | import java.util.Optional; 38 | import org.junit.Test; 39 | import com.google.common.io.CharStreams; 40 | import com.google.common.io.Resources; 41 | import com.sigpwned.chardet4j.io.DecodedInputStreamReader; 42 | 43 | public class ChardetTest { 44 | @Test 45 | public void iso8859Test() { 46 | Charset charset = 47 | Chardet.detectCharset("Hello, world!".getBytes(StandardCharsets.ISO_8859_1)).get(); 48 | 49 | assertThat(charset, is(StandardCharsets.ISO_8859_1)); 50 | } 51 | 52 | @Test 53 | public void iso8859Utf8Test() { 54 | Charset charset = 55 | Chardet.detectCharset("Hello, world!".getBytes(StandardCharsets.UTF_8), "utf-8").get(); 56 | 57 | assertThat(charset, is(StandardCharsets.UTF_8)); 58 | } 59 | 60 | @Test 61 | public void utf8Test() { 62 | Charset charset = Chardet.detectCharset("Hellö, world!".getBytes(StandardCharsets.UTF_8)).get(); 63 | 64 | assertThat(charset, is(StandardCharsets.UTF_8)); 65 | } 66 | 67 | @Test 68 | public void utf16BeTest() { 69 | Charset charset = 70 | Chardet.detectCharset("Hellö, world!".getBytes(StandardCharsets.UTF_16BE)).get(); 71 | 72 | assertThat(charset, is(StandardCharsets.UTF_16BE)); 73 | } 74 | 75 | @Test 76 | public void utf16LeTest() { 77 | Charset charset = 78 | Chardet.detectCharset("Hellö, world!".getBytes(StandardCharsets.UTF_16LE)).get(); 79 | 80 | assertThat(charset, is(StandardCharsets.UTF_16LE)); 81 | } 82 | 83 | @Test 84 | public void utf8BomTest() throws IOException { 85 | ByteArrayOutputStream buf = new ByteArrayOutputStream(); 86 | buf.write(ByteOrderMark.UTF_8.getBytes()); 87 | buf.write("Hello, world!".getBytes(StandardCharsets.UTF_8)); 88 | 89 | Charset charset = Chardet.detectCharset(buf.toByteArray()).get(); 90 | 91 | assertThat(charset, is(StandardCharsets.UTF_8)); 92 | } 93 | 94 | @Test 95 | public void utf16BeBomTest() throws IOException { 96 | ByteArrayOutputStream buf = new ByteArrayOutputStream(); 97 | buf.write(ByteOrderMark.UTF_16BE.getBytes()); 98 | buf.write("Hello, world!".getBytes(StandardCharsets.UTF_16BE)); 99 | 100 | Charset charset = Chardet.detectCharset(buf.toByteArray()).get(); 101 | 102 | assertThat(charset, is(StandardCharsets.UTF_16BE)); 103 | } 104 | 105 | @Test 106 | public void utf16LeBomTest() throws IOException { 107 | ByteArrayOutputStream buf = new ByteArrayOutputStream(); 108 | buf.write(ByteOrderMark.UTF_16LE.getBytes()); 109 | buf.write("Hello, world!".getBytes(StandardCharsets.UTF_16LE)); 110 | 111 | Charset charset = Chardet.detectCharset(buf.toByteArray()).get(); 112 | 113 | assertThat(charset, is(StandardCharsets.UTF_16LE)); 114 | } 115 | 116 | /** 117 | * We should detect the correct charset if the declared hint is wrong 118 | */ 119 | @Test 120 | public void mismatchedDeclaredEncodingTest() { 121 | Charset charset = 
122 | Chardet.detectCharset("Hellö, world!".getBytes(StandardCharsets.UTF_8), "UTF-16").get(); 123 | 124 | assertThat(charset, is(StandardCharsets.UTF_8)); 125 | } 126 | 127 | /** 128 | * We should detect the correct charset if the declared hint is not a valid charset 129 | */ 130 | @Test 131 | public void invalidDeclaredEncodingTest() { 132 | Charset charset = 133 | Chardet.detectCharset("Hellö, world!".getBytes(StandardCharsets.UTF_8), "FOOBAR").get(); 134 | 135 | assertThat(charset, is(StandardCharsets.UTF_8)); 136 | } 137 | 138 | /** 139 | * We should ignore the BOM 140 | */ 141 | @Test 142 | public void decodeStreamTest() throws IOException { 143 | ByteArrayOutputStream buf = new ByteArrayOutputStream(); 144 | buf.write(ByteOrderMark.UTF_8.getBytes()); 145 | buf.write("Hello, world!".getBytes(StandardCharsets.UTF_8)); 146 | 147 | String decoded; 148 | try (Reader r = Chardet.decode(new ByteArrayInputStream(buf.toByteArray()), "utf-8", 149 | StandardCharsets.UTF_8)) { 150 | decoded = CharStreams.toString(r); 151 | } 152 | 153 | assertThat(decoded, is("Hello, world!")); 154 | } 155 | 156 | /** 157 | * We should ignore the BOM 158 | */ 159 | @Test 160 | public void decodeArrayTest() throws IOException { 161 | ByteArrayOutputStream buf = new ByteArrayOutputStream(); 162 | buf.write(ByteOrderMark.UTF_8.getBytes()); 163 | buf.write("Hello, world!".getBytes(StandardCharsets.UTF_8)); 164 | 165 | String decoded = Chardet.decode(buf.toByteArray(), "utf-8", StandardCharsets.UTF_8); 166 | 167 | assertThat(decoded, is("Hello, world!")); 168 | } 169 | 170 | /** 171 | * We should ignore the BOM 172 | */ 173 | @Test 174 | public void longTest() throws IOException { 175 | byte[] data = Resources.toByteArray(Resources.getResource("webpage.html")); 176 | 177 | Charset charset = Chardet.detectCharset(data, "utf-8").get(); 178 | 179 | assertThat(charset, is(StandardCharsets.UTF_8)); 180 | } 181 | 182 | public static class TestableCharset { 183 | public final boolean standard; 184 | public final String charsetName; 185 | public final ByteOrderMark bom; 186 | 187 | public TestableCharset(boolean standard, String charsetName, ByteOrderMark bom) { 188 | this.standard = standard; 189 | this.charsetName = requireNonNull(charsetName); 190 | this.bom = requireNonNull(bom); 191 | } 192 | 193 | public Optional getCharset() { 194 | try { 195 | return Optional.of(Charset.forName(charsetName)); 196 | } catch (UnsupportedCharsetException e) { 197 | return Optional.empty(); 198 | } 199 | } 200 | } 201 | 202 | public static byte[] concat(byte[] xs, byte[] ys) { 203 | byte[] zs = new byte[xs.length + ys.length]; 204 | System.arraycopy(xs, 0, zs, 0, xs.length); 205 | System.arraycopy(ys, 0, zs, xs.length, ys.length); 206 | return zs; 207 | } 208 | 209 | /** 210 | * These are the charsets we'll test decoding with. We'll test decoding with/out a BOM. 
211 | */ 212 | public static final List DETECT_CHARSET_TEST_CHARSETS = 213 | asList(new TestableCharset(true, "UTF-16BE", ByteOrderMark.UTF_16BE), 214 | new TestableCharset(true, "UTF-16LE", ByteOrderMark.UTF_16LE), 215 | new TestableCharset(true, "UTF-8", ByteOrderMark.UTF_8), 216 | new TestableCharset(false, "UTF-32BE", ByteOrderMark.UTF_32BE), 217 | new TestableCharset(false, "UTF-32LE", ByteOrderMark.UTF_32LE), 218 | new TestableCharset(false, "UTF-1", ByteOrderMark.UTF_1), 219 | new TestableCharset(false, "UTF-EBCDIC", ByteOrderMark.UTF_EBCDIC)); 220 | 221 | /** 222 | * Test a variety of charsets using a known text and detect them 223 | */ 224 | @Test 225 | public void detectCharsetTest() throws IOException { 226 | // Stopping by Woods on a Snowy Evening, by Robert Frost 227 | // We'll encode this in various charsets and decode them 228 | // We use a text without diacritics to avoid any issues with encoding. We're not here to test 229 | // the correctness of charset implementations, only correct application of same. 230 | // Note: The poem is public domain. 231 | final String originalText = "Whose woods these are I think I know. \n" 232 | + "His house is in the village though; \n" + "He will not see me stopping here \n" 233 | + "To watch his woods fill up with snow. \n" + "\n" 234 | + "My little horse must think it queer \n" + "To stop without a farmhouse near \n" 235 | + "Between the woods and frozen lake \n" + "The darkest evening of the year. \n" + "\n" 236 | + "He gives his harness bells a shake \n" + "To ask if there is some mistake. \n" 237 | + "The only other sound’s the sweep \n" + "Of easy wind and downy flake. \n" + "\n" 238 | + "The woods are lovely, dark and deep, \n" + "But I have promises to keep, \n" 239 | + "And miles to go before I sleep, \n" + "And miles to go before I sleep."; 240 | 241 | // These are all the charsets that Java is required to support 242 | for (TestableCharset testableCharset : DETECT_CHARSET_TEST_CHARSETS) { 243 | if (!testableCharset.getCharset().isPresent()) { 244 | if (testableCharset.standard) 245 | throw new AssertionError( 246 | "JVM does not support standard charset " + testableCharset.charsetName); 247 | continue; 248 | } 249 | 250 | final Charset charset = testableCharset.getCharset().get(); 251 | 252 | 253 | // Make sure we get the right charset when we decode WITHOUT a BOM 254 | final byte[] plainEncodedText = originalText.getBytes(charset); 255 | final Charset plainDetectedCharset = Chardet.detectCharset(plainEncodedText).get(); 256 | if (testableCharset.charsetName.equals("UTF-8")) { 257 | // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text. 258 | assertThat(plainDetectedCharset, 259 | anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8))); 260 | } else { 261 | assertThat(plainDetectedCharset, is(charset)); 262 | } 263 | 264 | // Make sure we get the right charset when we decode WITHOUT a BOM 265 | final byte[] bomEncodedText = 266 | concat(testableCharset.bom.getBytes(), originalText.getBytes(charset)); 267 | final Charset bomDetectedCharset = Chardet.detectCharset(bomEncodedText).get(); 268 | if (testableCharset.charsetName.equals("UTF-8")) { 269 | // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text. 270 | assertThat(bomDetectedCharset, 271 | anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8))); 272 | } else { 273 | assertThat(bomDetectedCharset, is(charset)); 274 | } 275 | } 276 | } 277 | 278 | /** 279 | * These are the charsets we'll test decoding with. 
We'll test decoding with/out a BOM. 280 | */ 281 | public static final List DECODE_TEST_CHARSETS = 282 | asList(new TestableCharset(true, "UTF-16BE", ByteOrderMark.UTF_16BE), 283 | new TestableCharset(true, "UTF-16LE", ByteOrderMark.UTF_16LE), 284 | new TestableCharset(true, "UTF-8", ByteOrderMark.UTF_8), 285 | new TestableCharset(false, "UTF-32BE", ByteOrderMark.UTF_32BE), 286 | new TestableCharset(false, "UTF-32LE", ByteOrderMark.UTF_32LE), 287 | new TestableCharset(false, "UTF-1", ByteOrderMark.UTF_1), 288 | new TestableCharset(false, "UTF-EBCDIC", ByteOrderMark.UTF_EBCDIC)); 289 | 290 | /** 291 | * Test the ability to decode an InputStream 292 | * 293 | * @see Chardet#decode(byte[], Charset) 294 | */ 295 | @Test 296 | public void decodeTest() throws IOException { 297 | // Stopping by Woods on a Snowy Evening, by Robert Frost 298 | // We'll encode this in various charsets and decode them 299 | // We use a text without diacritics to avoid any issues with encoding. We're not here to test 300 | // the correctness of charset implementations, only correct application of same. 301 | // Note: The poem is public domain. 302 | final String originalText = "Whose woods these are I think I know. \n" 303 | + "His house is in the village though; \n" + "He will not see me stopping here \n" 304 | + "To watch his woods fill up with snow. \n" + "\n" 305 | + "My little horse must think it queer \n" + "To stop without a farmhouse near \n" 306 | + "Between the woods and frozen lake \n" + "The darkest evening of the year. \n" + "\n" 307 | + "He gives his harness bells a shake \n" + "To ask if there is some mistake. \n" 308 | + "The only other sound’s the sweep \n" + "Of easy wind and downy flake. \n" + "\n" 309 | + "The woods are lovely, dark and deep, \n" + "But I have promises to keep, \n" 310 | + "And miles to go before I sleep, \n" + "And miles to go before I sleep."; 311 | 312 | for (TestableCharset testableCharset : DECODE_TEST_CHARSETS) { 313 | if (!testableCharset.getCharset().isPresent()) { 314 | if (testableCharset.standard) 315 | throw new AssertionError( 316 | "JVM does not support standard charset " + testableCharset.charsetName); 317 | continue; 318 | } 319 | 320 | final Charset charset = testableCharset.getCharset().get(); 321 | 322 | final byte[] encodedText = originalText.getBytes(charset); 323 | 324 | // Make sure we get the right charset when we decode WITHOUT a BOM 325 | final StringWriter plainWriter = new StringWriter(); 326 | try (DecodedInputStreamReader plainReader = 327 | Chardet.decode(new ByteArrayInputStream(encodedText), charset)) { 328 | final Charset detectedCharset = plainReader.charset(); 329 | if (testableCharset.charsetName.equals("UTF-8")) { 330 | // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text. 
331 | assertThat(detectedCharset, 332 | anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8))); 333 | } else { 334 | assertThat(detectedCharset, is(charset)); 335 | } 336 | CharStreams.copy(plainReader, plainWriter); 337 | } 338 | assertThat(plainWriter.toString(), is(originalText)); 339 | 340 | // Make sure we get the right charset when we decode WITH a BOM 341 | final StringWriter bomWriter = new StringWriter(); 342 | try (DecodedInputStreamReader bomReader = Chardet 343 | .decode(new SequenceInputStream(new ByteArrayInputStream(testableCharset.bom.getBytes()), 344 | new ByteArrayInputStream(encodedText)), charset)) { 345 | final Charset detectedCharset = bomReader.charset(); 346 | if (testableCharset.charsetName.equals("UTF-8")) { 347 | // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text. 348 | assertThat(detectedCharset, 349 | anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8))); 350 | } else { 351 | assertThat(detectedCharset, is(charset)); 352 | } 353 | CharStreams.copy(bomReader, bomWriter); 354 | } 355 | assertThat(bomWriter.toString(), is(originalText)); 356 | } 357 | } 358 | } 359 | --------------------------------------------------------------------------------