185 | * Two arrays are considered equal if the number of elements covered by each range is the same,
186 | * and all corresponding pairs of elements over the specified ranges in the two arrays are equal.
187 | * In other words, two arrays are equal if they contain, over the specified ranges, the same
188 | * elements in the same order.
189 | *
190 | * @param a the first array to be tested for equality
191 | * @param aFromIndex the index (inclusive) of the first element in the first array to be tested
192 | * @param aToIndex the index (exclusive) of the last element in the first array to be tested
193 | * @param b the second array to be tested for equality
194 | * @param bFromIndex the index (inclusive) of the first element in the second array to be tested
195 | * @param bToIndex the index (exclusive) of the last element in the second array to be tested
196 | * @return {@code true} if the two arrays, over the specified ranges, are equal
197 | * @throws IllegalArgumentException if {@code aFromIndex > aToIndex} or if
198 | * {@code bFromIndex > bToIndex}
199 | * @throws ArrayIndexOutOfBoundsException if {@code aFromIndex < 0 or aToIndex > a.length} or if
200 | * {@code bFromIndex < 0 or bToIndex > b.length}
201 | * @throws NullPointerException if either array is {@code null}
202 | */
203 | private static boolean equals(byte[] a, int aFromIndex, int aToIndex, byte[] b, int bFromIndex,
204 | int bToIndex) {
205 | rangeCheck(a.length, aFromIndex, aToIndex);
206 | rangeCheck(b.length, bFromIndex, bToIndex);
207 |
208 | int aLength = aToIndex - aFromIndex;
209 | int bLength = bToIndex - bFromIndex;
210 | if (aLength != bLength)
211 | return false;
212 | int length = aLength;
213 |
214 | for (int i = 0; i < length; i++) {
215 | if (a[aFromIndex + i] != b[bFromIndex + i]) {
216 | return false;
217 | }
218 | }
219 |
220 | return true;
221 | }
222 |
223 | /**
224 | * Checks that {@code fromIndex} and {@code toIndex} are in the range and throws an exception if
225 | * they aren't.
226 | */
227 | private static void rangeCheck(int arrayLength, int fromIndex, int toIndex) {
228 | if (fromIndex > toIndex) {
229 | throw new IllegalArgumentException("fromIndex(" + fromIndex + ") > toIndex(" + toIndex + ")");
230 | }
231 | if (fromIndex < 0) {
232 | throw new ArrayIndexOutOfBoundsException(fromIndex);
233 | }
234 | if (toIndex > arrayLength) {
235 | throw new ArrayIndexOutOfBoundsException(toIndex);
236 | }
237 | }
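
On Java 9 and later, the two private helpers above mirror the behavior of java.util.Arrays.equals(byte[], int, int, byte[], int, int), including the IllegalArgumentException / ArrayIndexOutOfBoundsException contract enforced by rangeCheck. A minimal, self-contained sketch of that equivalence (the demo class name is made up for illustration):

import java.util.Arrays;

class RangeEqualsDemo {
  public static void main(String[] args) {
    byte[] a = {1, 2, 3, 4, 5};
    byte[] b = {9, 2, 3, 4, 9};

    // Compare a[1..4) with b[1..4): both ranges cover {2, 3, 4}, so this prints true.
    // Java 9+ provides this range overload directly; the private helper above
    // backports the same contract.
    System.out.println(Arrays.equals(a, 1, 4, b, 1, 4));
  }
}
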
238 |
239 | private final byte[] bytes;
240 | private final Charset standardCharset;
241 | private final String charsetName;
242 | private volatile AtomicReference
--------------------------------------------------------------------------------
/src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetDetector.java:
--------------------------------------------------------------------------------
28 | * Character set detection is at best an imprecise operation. The detection
29 | * process will attempt to identify the charset that best matches the characteristics
30 | * of the byte data, but the process is partly statistical in nature, and
31 | * the results cannot be guaranteed to always be correct.
32 | *
33 | * For best accuracy in charset detection, the input data should be primarily
34 | * in a single language, and a minimum of a few hundred bytes worth of plain text
35 | * in the language are needed. The detection process will attempt to
36 | * ignore html or xml style markup that could otherwise obscure the content.
37 | *
38 | * @stable ICU 3.4
39 | */
40 | public class CharsetDetector {
41 |
42 | // Question: Should we have getters corresponding to the setters for input text
43 | // and declared encoding?
44 |
45 | // A thought: If we were to create our own type of Java Reader, we could defer
46 | // figuring out an actual charset for data that starts out with too much English-
47 | // only ASCII until the user actually read through to something that didn't look
48 | // like 7-bit English. If nothing else ever appeared, we would never need to
49 | // actually choose the "real" charset. All assuming that the application just
50 | // wants the data, and doesn't care about a char set name.
51 |
52 | /**
53 | * Constructor
54 | *
55 | * @stable ICU 3.4
56 | */
57 | public CharsetDetector() {
58 | }
59 |
60 | /**
61 | * Set the declared encoding for charset detection.
62 | * The declared encoding of an input text is an encoding obtained
63 | * from an http header or xml declaration or similar source that
64 | * can be provided as additional information to the charset detector.
65 | * A match between a declared encoding and a possible detected encoding
66 | * will raise the quality of that detected encoding by a small delta,
67 | * and will also appear as a "reason" for the match.
68 | *
69 | * A declared encoding that is incompatible with the input data being
70 | * analyzed will not be added to the list of possible encodings.
71 | *
72 | * @param encoding The declared encoding
73 | *
74 | * @stable ICU 3.4
75 | */
76 | public CharsetDetector setDeclaredEncoding(String encoding) {
77 | fDeclaredEncoding = encoding;
78 | return this;
79 | }
80 |
81 | /**
82 | * Set the input text (byte) data whose charset is to be detected.
83 | *
84 | * @param in the input text of unknown encoding
85 | *
86 | * @return This CharsetDetector
87 | *
88 | * @stable ICU 3.4
89 | */
90 | public CharsetDetector setText(byte [] in) {
91 | fRawInput = in;
92 | fRawLength = in.length;
93 |
94 | return this;
95 | }
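
A minimal usage sketch for byte-array input, using only the methods documented in this class (the sample text and class name are illustrative). As the class Javadoc notes, detection quality improves with longer, single-language input:

import java.nio.charset.StandardCharsets;
import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetDetector;
import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetMatch;

class DetectFromBytes {
  public static void main(String[] args) {
    // Some non-ASCII text, encoded as UTF-8 bytes of "unknown" origin for the demo.
    byte[] data = "déjà vu, garçon, façade, jalapeño, smörgåsbord"
        .getBytes(StandardCharsets.UTF_8);

    CharsetMatch match = new CharsetDetector().setText(data).detect();
    if (match != null) {
      System.out.println(match.getName() + " (confidence " + match.getConfidence() + ")");
    }
  }
}
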
96 |
97 | private static final int kBufSize = 8000;
98 |
99 | /**
100 | * Set the input text (byte) data whose charset is to be detected.
101 | *
102 | * The input stream that supplies the character data must have markSupported()
103 | * == true; the charset detection process will read a small amount of data,
104 | * then return the stream to its original position via
105 | * the InputStream.reset() operation. The exact amount that will
106 | * be read depends on the characteristics of the data itself.
107 | *
108 | * @param in the input text of unknown encoding
109 | *
110 | * @return This CharsetDetector
111 | *
112 | * @stable ICU 3.4
113 | */
114 |
115 | public CharsetDetector setText(InputStream in) throws IOException {
116 | fInputStream = in;
117 | fInputStream.mark(kBufSize);
118 | fRawInput = new byte[kBufSize]; // Always make a new buffer because the
119 | // previous one may have come from the caller,
120 | // in which case we can't touch it.
121 | fRawLength = 0;
122 | int remainingLength = kBufSize;
123 | while (remainingLength > 0 ) {
124 | // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop.
125 | int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
126 | if (bytesRead <= 0) {
127 | break;
128 | }
129 | fRawLength += bytesRead;
130 | remainingLength -= bytesRead;
131 | }
132 | fInputStream.reset();
133 |
134 | return this;
135 | }
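
A minimal sketch of stream-based detection under the markSupported() requirement described above. Plain file streams generally do not support mark/reset, so the sketch wraps one in a BufferedInputStream; the file name is hypothetical:

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetDetector;
import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetMatch;

class DetectFromStream {
  public static void main(String[] args) throws IOException {
    // File streams do not support mark/reset, so wrap the stream before calling setText.
    try (InputStream in = new BufferedInputStream(Files.newInputStream(Paths.get("input.dat")))) {
      CharsetMatch match = new CharsetDetector().setText(in).detect();
      System.out.println(match == null ? "no match" : match.getName());
    }
  }
}
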
136 |
137 |
138 | /**
139 | * Return the charset that best matches the supplied input data.
140 | *
141 | * Note, though, that because the detection
142 | * only looks at the start of the input data,
143 | * there is a possibility that the returned charset will fail to handle
144 | * the full set of input data.
145 | *
146 | * Raise an exception if no charsets appear to match the input data.
147 | *
215 | * This is a convenience method that is equivalent to
216 | *   this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();
218 | * For the input stream that supplies the character data, markSupported()
219 | * must be true; the charset detection will read a small amount of data,
220 | * then return the stream to its original position via
221 | * the InputStream.reset() operation. The exact amount that will
222 | * be read depends on the characteristics of the data itself.
223 | *
224 | * Raise an exception if no charsets appear to match the input data.
225 | *
226 | * @param in The source of the byte data in the unknown charset.
227 | *
228 | * @param declaredEncoding A declared encoding for the data, if available,
229 | * or null or an empty string if none is available.
230 | *
231 | * @stable ICU 3.4
232 | */
233 | public Reader getReader(InputStream in, String declaredEncoding) {
234 | fDeclaredEncoding = declaredEncoding;
235 |
236 | try {
237 | setText(in);
238 |
239 | CharsetMatch match = detect();
240 |
241 | if (match == null) {
242 | return null;
243 | }
244 |
245 | return match.getReader();
246 | } catch (IOException e) {
247 | return null;
248 | }
249 | }
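
A usage sketch for the getReader convenience method, reading the decoded characters into a StringBuilder. The file name and the declared-encoding hint are illustrative; per the Javadoc above, null (or an empty string) is also allowed when no hint is available, and the BufferedInputStream wrapper satisfies the markSupported() requirement:

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Paths;
import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetDetector;

class ReadWithDetectedCharset {
  public static void main(String[] args) throws IOException {
    try (InputStream in = new BufferedInputStream(Files.newInputStream(Paths.get("page.html")))) {
      // Declared encoding from, e.g., an HTTP header; null would mean "no hint".
      Reader reader = new CharsetDetector().getReader(in, "ISO-8859-1");
      if (reader == null) {
        System.err.println("no plausible charset found");
        return;
      }
      StringBuilder text = new StringBuilder();
      char[] buf = new char[4096];
      for (int n; (n = reader.read(buf)) >= 0; ) {
        text.append(buf, 0, n);
      }
      reader.close();
      System.out.println(text);
    }
  }
}
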
250 |
251 | /**
252 | * Autodetect the charset of the supplied byte data, and return a String
253 | * containing the converted input data.
254 | *
255 | * This is a convenience method that is equivalent to
256 | *   this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();
258 | * Raise an exception if no charsets appear to match the input data.
259 | *
260 | * @param in The source of the byte data in the unknown charset.
261 | *
262 | * @param declaredEncoding A declared encoding for the data, if available,
263 | * or null or an empty string if none is available.
264 | *
265 | * @stable ICU 3.4
266 | */
267 | public String getString(byte[] in, String declaredEncoding)
268 | {
269 | fDeclaredEncoding = declaredEncoding;
270 |
271 | try {
272 | setText(in);
273 |
274 | CharsetMatch match = detect();
275 |
276 | if (match == null) {
277 | return null;
278 | }
279 |
280 | return match.getString(-1);
281 | } catch (IOException e) {
282 | return null;
283 | }
284 | }
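
A usage sketch for the byte-array getString convenience method; the sample bytes are fabricated for illustration, and a null declared encoding means no hint is available:

import java.nio.charset.StandardCharsets;
import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetDetector;

class BytesToString {
  public static void main(String[] args) {
    // Fabricated "legacy" bytes: Latin-1 encoded text with no charset label attached.
    byte[] data = "café au lait, s'il vous plaît, garçon, l'addition"
        .getBytes(StandardCharsets.ISO_8859_1);

    // null declared encoding: we have no hint from any header or declaration.
    String text = new CharsetDetector().getString(data, null);
    System.out.println(text != null ? text : "no plausible charset found");
  }
}
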
285 |
286 |
287 | /**
288 | * Get the names of all charsets supported by
289 | * CharsetDetector class.
290 | * Note: Multiple different charset encodings in the same family may use
291 | * a single shared name in this implementation. For example, this method returns
292 | * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
293 | * (Windows Latin 1). However, the actual detection result could be "windows-1252"
294 | * when the input data contains code points that are only available in
295 | * "windows-1252".
296 | *
297 | * @return an array of the names of all charsets supported by
298 | * CharsetDetector class.
24 | * Instances of this class are created only by CharsetDetectors.
25 | *
26 | * Note: this class has a natural ordering that is inconsistent with equals.
27 | * The natural ordering is based on the match confidence value.
28 | *
29 | * @stable ICU 3.4
30 | */
31 | public class CharsetMatch implements Comparable<CharsetMatch> {
38 | * CAUTION: if the source of the byte data was an InputStream, a Reader
39 | * can be created for only one matching char set using this method. If more
40 | * than one charset needs to be tried, the caller will need to reset
41 | * the InputStream and create InputStreamReaders itself, based on the charset name.
42 | *
43 | * @return the Reader for the Unicode character data.
44 | *
45 | * @stable ICU 3.4
46 | */
47 | public Reader getReader() {
48 | InputStream inputStream = fInputStream;
49 |
50 | if (inputStream == null) {
51 | inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
52 | }
53 |
54 | try {
55 | inputStream.reset();
56 | return new InputStreamReader(inputStream, getName());
57 | } catch (IOException e) {
58 | return null;
59 | }
60 | }
61 |
62 | /**
63 | * Create a Java String from Unicode character data corresponding
64 | * to the original byte data supplied to the Charset detect operation.
65 | *
66 | * @return a String created from the converted input data.
67 | *
68 | * @stable ICU 3.4
69 | */
70 | public String getString() throws java.io.IOException {
71 | return getString(-1);
72 |
73 | }
74 |
75 | /**
76 | * Create a Java String from Unicode character data corresponding
77 | * to the original byte data supplied to the Charset detect operation.
78 | * The length of the returned string is limited to the specified size;
79 | * the string will be truncated to this length if necessary. A limit value of
80 | * zero or less is ignored, and treated as no limit.
81 | *
82 | * @param maxLength The maximum length of the String to be created when the
83 | * source of the data is an input stream, or -1 for
84 | * unlimited length.
85 | * @return a String created from the converted input data.
86 | *
87 | * @stable ICU 3.4
88 | */
89 | public String getString(int maxLength) throws java.io.IOException {
90 | String result = null;
91 | if (fInputStream != null) {
92 | StringBuilder sb = new StringBuilder();
93 | char[] buffer = new char[1024];
94 | Reader reader = getReader();
95 | int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
96 | int bytesRead = 0;
97 |
98 | while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
99 | sb.append(buffer, 0, bytesRead);
100 | max -= bytesRead;
101 | }
102 |
103 | reader.close();
104 |
105 | return sb.toString();
106 | } else {
107 | String name = getName();
108 | /*
109 | * getName() may return a name with a suffix 'rtl' or 'ltr'. This cannot
110 | * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or '_ltr'
111 | * should be stripped off before creating the string.
112 | */
113 | int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl");
114 | if (startSuffix > 0) {
115 | name = name.substring(0, startSuffix);
116 | }
117 | result = new String(fRawInput, name);
118 | }
119 | return result;
120 |
121 | }
122 |
123 | /**
124 | * Get an indication of the confidence in the charset detected.
125 | * Confidence values range from 0-100, with larger numbers indicating
126 | * a better match of the input data to the characteristics of the
127 | * charset.
128 | *
129 | * @return the confidence in the charset match
130 | *
131 | * @stable ICU 3.4
132 | */
133 | public int getConfidence() {
134 | return fConfidence;
135 | }
136 |
137 | /**
138 | * Get the name of the detected charset.
139 | * The name will be one that can be used with other APIs on the
140 | * platform that accept charset names. It is the "Canonical name"
141 | * as defined by the class java.nio.charset.Charset; for
142 | * charsets that are registered with the IANA charset registry,
143 | * this is the MIME-preferred registered name.
144 | *
145 | * @see java.nio.charset.Charset
146 | * @see java.io.InputStreamReader
147 | *
148 | * @return The name of the charset.
149 | *
150 | * @stable ICU 3.4
151 | */
152 | public String getName() {
153 | return fCharsetName;
154 | }
155 |
156 | /**
157 | * Get the ISO code for the language of the detected charset.
158 | *
159 | * @return The ISO code for the language, or null if the language cannot be determined.
21 | * CharsetDetector provides a facility for detecting the
22 | * charset or encoding of character data in an unknown format.
23 | * The input data can either be from an input stream or an array of bytes.
24 | * The result of the detection operation is a list of possibly matching
25 | * charsets, or, for simple use, you can just ask for a Java Reader that
26 | * will work over the input data.
27 | *
148 | *
151 | *
152 | * @return a CharsetMatch object representing the best matching charset, or
153 | *         null if there are no matches.
154 | *
155 | * @stable ICU 3.4
156 | */
157 | public CharsetMatch detect() {
158 | // TODO: A better implementation would be to copy the detect loop from
159 | // detectAll(), and cut it short as soon as a match with a high confidence
160 | // is found. This is something to be done later, after things are otherwise
161 | // working.
162 | CharsetMatch matches[] = detectAll();
163 |
164 | if (matches == null || matches.length == 0) {
165 | return null;
166 | }
167 |
168 | return matches[0];
169 | }
170 |
171 | /**
172 | * Return an array of all charsets that appear to be plausible
173 | * matches with the input data. The array is ordered with the
174 | * best quality match first.
175 | *
178 | *
181 | *
182 | * @return An array of CharsetMatch objects representing possibly matching charsets.
183 | *
184 | * @stable ICU 3.4
185 | */
186 | public CharsetMatch[] detectAll() {
187 | ArrayList
299 | *
300 | * @stable ICU 3.4
301 | */
302 | public static String[] getAllDetectableCharsets() {
303 | String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()];
304 | for (int i = 0; i < allCharsetNames.length; i++) {
305 | allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName();
306 | }
307 | return allCharsetNames;
308 | }
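
A short sketch that lists the detectable charset names; the names in the comment are examples drawn from the recognizers elsewhere in this library, and the exact list depends on the build:

import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetDetector;

class ListDetectableCharsets {
  public static void main(String[] args) {
    // Prints names such as Shift_JIS, Big5, EUC-JP, EUC-KR, GB18030, ISO-8859-1, ...
    for (String name : CharsetDetector.getAllDetectableCharsets()) {
      System.out.println(name);
    }
  }
}
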
309 |
310 | /**
311 | * Test whether or not input filtering is enabled.
312 | *
313 | * @return true if input text will be filtered.
314 | *
315 | * @see #enableInputFilter
316 | *
317 | * @stable ICU 3.4
318 | */
319 | public boolean inputFilterEnabled()
320 | {
321 | return fStripTags;
322 | }
323 |
324 | /**
325 | * Enable filtering of input text. If filtering is enabled,
326 | * text within angle brackets ("<" and ">") will be removed
327 | * before detection.
328 | *
329 | * @param filter true to enable input text filtering.
330 | *
331 | * @return The previous setting.
332 | *
333 | * @stable ICU 3.4
334 | */
335 | public boolean enableInputFilter(boolean filter)
336 | {
337 | boolean previous = fStripTags;
338 |
339 | fStripTags = filter;
340 |
341 | return previous;
342 | }
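
A sketch of input filtering: enabling the filter asks the detector to strip text within angle brackets before analysis. As the MungeInput() helper below notes, the stripping is heuristic and is abandoned when the input does not look meaningfully marked up, so the tiny sample here illustrates the API rather than a realistic workload:

import java.nio.charset.StandardCharsets;
import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetDetector;
import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetMatch;

class DetectWithMarkupFilter {
  public static void main(String[] args) {
    byte[] html = "<html><body><p>Grüße aus München, schöne Straßen überall.</p></body></html>"
        .getBytes(StandardCharsets.ISO_8859_1);

    CharsetDetector detector = new CharsetDetector();
    boolean wasEnabled = detector.enableInputFilter(true); // returns the previous setting (false)
    detector.setText(html);

    CharsetMatch match = detector.detect();
    System.out.println("filter was previously " + wasEnabled + "; best match: "
        + (match == null ? "none" : match.getName()));
  }
}
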
343 |
344 | /*
345 | * MungeInput - after getting a set of raw input data to be analyzed, preprocess
346 | * it by removing what appears to be html markup.
347 | */
348 | private void MungeInput() {
349 | int srci = 0;
350 | int dsti = 0;
351 | byte b;
352 | boolean inMarkup = false;
353 | int openTags = 0;
354 | int badTags = 0;
355 |
356 | //
357 | // html / xml markup stripping.
358 | // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
359 | // discard everything within < brackets >
360 | // Count how many total '<' and illegal (nested) '<' occur, so we can make some
361 | // guess as to whether the input was actually marked up at all.
362 | if (fStripTags) {
363 | for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
364 | b = fRawInput[srci];
365 | if (b == (byte)'<') {
366 | if (inMarkup) {
367 | badTags++;
368 | }
369 | inMarkup = true;
370 | openTags++;
371 | }
372 |
373 | if (! inMarkup) {
374 | fInputBytes[dsti++] = b;
375 | }
376 |
377 | if (b == (byte)'>') {
378 | inMarkup = false;
379 | }
380 | }
381 |
382 | fInputLen = dsti;
383 | }
384 |
385 | //
386 | // If it looks like this input wasn't marked up, or if it looks like it's
387 | // essentially nothing but markup, abandon the markup stripping.
388 | // Detection will have to work on the unstripped input.
389 | //
390 | if (openTags<5 || openTags/5 < badTags ||
391 | (fInputLen < 100 && fRawLength>600)) {
392 | int limit = fRawLength;
393 |
394 | if (limit > kBufSize) {
395 | limit = kBufSize;
396 | }
397 |
398 | for (srci=0; srci
544 | * @param enabled true to enable, or false to disable the
545 | * charset encoding.
546 | * @return A reference to this CharsetDetector.
547 | * @throws IllegalArgumentException when the name of charset encoding is
548 | * not supported.
549 | *
550 | * @internal
551 | * @deprecated This API is ICU internal only.
552 | */
553 | @Deprecated
554 | public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
555 | int modIdx = -1;
556 | boolean isDefaultVal = false;
557 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
558 | CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i);
559 | if (csrinfo.recognizer.getName().equals(encoding)) {
560 | modIdx = i;
561 | isDefaultVal = (csrinfo.isDefaultEnabled == enabled);
562 | break;
563 | }
564 | }
565 | if (modIdx < 0) {
566 | // No matching encoding found
567 | throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
568 | }
569 |
570 | if (fEnabledRecognizers == null && !isDefaultVal) {
571 | // Create an array storing the non default setting
572 | fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
573 |
574 | // Initialize the array with default info
575 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
576 | fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled;
577 | }
578 | }
579 |
580 | if (fEnabledRecognizers != null) {
581 | fEnabledRecognizers[modIdx] = enabled;
582 | }
583 |
584 | return this;
585 | }
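
A sketch of disabling an individual recognizer through the deprecated, ICU-internal setDetectableCharset API; the charset name must match a recognizer name exactly or, per the Javadoc above, an IllegalArgumentException is thrown:

import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetDetector;

class RestrictDetectableCharsets {
  @SuppressWarnings("deprecation")
  public static void main(String[] args) {
    CharsetDetector detector = new CharsetDetector();

    // Exclude Big5 from consideration for this detector instance. An unknown
    // name would throw IllegalArgumentException.
    detector.setDetectableCharset("Big5", false);
  }
}
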
586 | }
587 |
--------------------------------------------------------------------------------
/src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetMatch.java:
--------------------------------------------------------------------------------
1 | // © 2016 and later: Unicode, Inc. and others.
2 | // License & terms of use: http://www.unicode.org/copyright.html
3 | /**
4 | *******************************************************************************
5 | * Copyright (C) 2005-2016, International Business Machines Corporation and *
6 | * others. All Rights Reserved. *
7 | *******************************************************************************
8 | */
9 | package com.sigpwned.chardet4j.com.ibm.icu.text;
10 |
11 | import java.io.ByteArrayInputStream;
12 | import java.io.IOException;
13 | import java.io.InputStream;
14 | import java.io.InputStreamReader;
15 | import java.io.Reader;
16 |
17 |
18 | /**
19 | * This class represents a charset that has been identified by a CharsetDetector
20 | * as a possible encoding for a set of input data. From an instance of this
21 | * class, you can ask for a confidence level in the charset identification,
22 | * or for Java Reader or String to access the original byte data in Unicode form.
23 | * null if the language cannot be determined.
160 | *
161 | * @stable ICU 3.4
162 | */
163 | public String getLanguage() {
164 | return fLang;
165 | }
166 |
167 | /**
168 | * Compare to other CharsetMatch objects.
169 | * Comparison is based on the match confidence value, which
170 | * allows CharsetDetector.detectAll() to order its results.
171 | *
172 | * @param other the CharsetMatch object to compare against.
173 | * @return a negative integer, zero, or a positive integer as the
174 | * confidence level of this CharsetMatch
175 | * is less than, equal to, or greater than that of
176 | * the argument.
177 | * @throws ClassCastException if the argument is not a CharsetMatch.
178 | * @stable ICU 4.4
179 | */
180 | @Override
181 | public int compareTo (CharsetMatch other) {
182 | int compareResult = 0;
183 | if (this.fConfidence > other.fConfidence) {
184 | compareResult = 1;
185 | } else if (this.fConfidence < other.fConfidence) {
186 | compareResult = -1;
187 | }
188 | return compareResult;
189 | }
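
A sketch showing how the natural ordering can be used: compareTo() sorts by ascending confidence, so a caller who re-sorts detectAll() results (which already arrive best-first) must reverse the comparator. The class name and sample text are illustrative:

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetDetector;
import com.sigpwned.chardet4j.com.ibm.icu.text.CharsetMatch;

class RankMatches {
  public static void main(String[] args) {
    byte[] data = "plain ASCII text is deliberately ambiguous for charset detection"
        .getBytes(StandardCharsets.US_ASCII);

    CharsetMatch[] matches = new CharsetDetector().setText(data).detectAll();
    if (matches == null) {
      return;
    }

    // compareTo() orders by ascending confidence, so reverse it to list the strongest
    // candidates first (detectAll() already returns them in that order).
    Arrays.sort(matches, Collections.reverseOrder());
    for (CharsetMatch m : matches) {
      System.out.println(m.getConfidence() + "  " + m.getName());
    }
  }
}
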
190 |
191 | /*
192 | * Constructor. Implementation internal
193 | */
194 | CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
195 | fConfidence = conf;
196 |
197 | // The references to the original application input data must be copied out
198 | // of the charset recognizer to here, in case the application resets the
199 | // recognizer before using this CharsetMatch.
200 | if (det.fInputStream == null) {
201 | // We only want the existing input byte data if it came straight from the user,
202 | // not if it is just the head of a stream.
203 | fRawInput = det.fRawInput;
204 | fRawLength = det.fRawLength;
205 | }
206 | fInputStream = det.fInputStream;
207 | fCharsetName = rec.getName();
208 | fLang = rec.getLanguage();
209 | }
210 |
211 | /*
212 | * Constructor. Implementation internal
213 | */
214 | CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) {
215 | fConfidence = conf;
216 |
217 | // The references to the original application input data must be copied out
218 | // of the charset recognizer to here, in case the application resets the
219 | // recognizer before using this CharsetMatch.
220 | if (det.fInputStream == null) {
221 | // We only want the existing input byte data if it came straight from the user,
222 | // not if it is just the head of a stream.
223 | fRawInput = det.fRawInput;
224 | fRawLength = det.fRawLength;
225 | }
226 | fInputStream = det.fInputStream;
227 | fCharsetName = csName;
228 | fLang = lang;
229 | }
230 |
231 |
232 | //
233 | // Private Data
234 | //
235 | private int fConfidence;
236 | private byte[] fRawInput = null; // Original, untouched input bytes.
237 | // If user gave us a byte array, this is it.
238 | private int fRawLength; // Length of data in fRawInput array.
239 |
240 | private InputStream fInputStream = null; // User's input stream, or null if the user
241 | // gave us a byte array.
242 |
243 | private String fCharsetName; // The name of the charset this CharsetMatch
244 | // represents. Filled in by the recognizer.
245 | private String fLang; // The language, if one was determined by
246 | // the recognizer during the detect operation.
247 | }
248 |
--------------------------------------------------------------------------------
/src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetRecog_2022.java:
--------------------------------------------------------------------------------
1 | // © 2016 and later: Unicode, Inc. and others.
2 | // License & terms of use: http://www.unicode.org/copyright.html
3 | /*
4 | *******************************************************************************
5 | * Copyright (C) 2005 - 2012, International Business Machines Corporation and *
6 | * others. All Rights Reserved. *
7 | *******************************************************************************
8 | */
9 | package com.sigpwned.chardet4j.com.ibm.icu.text;
10 |
11 | /**
12 | * class CharsetRecog_2022 is part of the ICU charset detection implementation.
13 | * This is a superclass for the individual detectors for
14 | * each of the detectable members of the ISO 2022 family
15 | * of encodings.
16 | *
17 | * The separate classes are nested within this class.
18 | */
19 | abstract class CharsetRecog_2022 extends CharsetRecognizer {
20 |
21 |
22 | /**
23 | * Matching function shared among the 2022 detectors JP, CN and KR
24 | * Counts up the number of legal and unrecognized escape sequences in
25 | * the sample of text, and computes a score based on the total number &
26 | * the proportion that fit the encoding.
27 | *
28 | *
29 | * @param text the byte buffer containing text to analyse
30 | * @param textLen the size of the text in bytes.
31 | * @param escapeSequences the byte escape sequences to test for.
32 | * @return match quality, in the range of 0-100.
33 | */
34 | int match(byte [] text, int textLen, byte [][] escapeSequences) {
35 | int i, j;
36 | int escN;
37 | int hits = 0;
38 | int misses = 0;
39 | int shifts = 0;
40 | int quality;
41 | scanInput:
42 | for (i=0; i
--------------------------------------------------------------------------------
/src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetRecog_mbcs.java:
--------------------------------------------------------------------------------
44 | * bits 0-7: the match confidence, ranging from 0-100
45 | *
46 | * bits 8-15: The match reason, an enum-like value.
47 | */
48 | int match(CharsetDetector det, int [] commonChars) {
49 | @SuppressWarnings("unused")
50 | int singleByteCharCount = 0; //TODO Do we really need this?
51 | int doubleByteCharCount = 0;
52 | int commonCharCount = 0;
53 | int badCharCount = 0;
54 | int totalCharCount = 0;
55 | int confidence = 0;
56 | iteratedChar iter = new iteratedChar();
57 |
58 | detectBlock: {
59 | for (iter.reset(); nextChar(iter, det);) {
60 | totalCharCount++;
61 | if (iter.error) {
62 | badCharCount++;
63 | } else {
64 | long cv = iter.charValue & 0xFFFFFFFFL;
65 |
66 | if (cv <= 0xff) {
67 | singleByteCharCount++;
68 | } else {
69 | doubleByteCharCount++;
70 | if (commonChars != null) {
71 | // NOTE: This assumes that there are no 4-byte common chars.
72 | if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
73 | commonCharCount++;
74 | }
75 | }
76 | }
77 | }
78 | if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
79 | // Bail out early if the byte data is not matching the encoding scheme.
80 | break detectBlock;
81 | }
82 | }
83 |
84 | if (doubleByteCharCount <= 10 && badCharCount== 0) {
85 | // Not many multi-byte chars.
86 | if (doubleByteCharCount == 0 && totalCharCount < 10) {
87 | // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
88 | // We don't have enough data to have any confidence.
89 | // Statistical analysis of single byte non-ASCII characters would probably help here.
90 | confidence = 0;
91 | }
92 | else {
93 | // ASCII or ISO file? It's probably not our encoding,
94 | // but is not incompatible with our encoding, so don't give it a zero.
95 | confidence = 10;
96 | }
97 |
98 | break detectBlock;
99 | }
100 |
101 | //
102 | // No match if there are too many characters that don't fit the encoding scheme.
103 | // (should we have zero tolerance for these?)
104 | //
105 | if (doubleByteCharCount < 20*badCharCount) {
106 | confidence = 0;
107 | break detectBlock;
108 | }
109 |
110 | if (commonChars == null) {
111 | // We have no statistics on frequently occurring characters.
112 | // Assess confidence purely on having a reasonable number of
113 | // multi-byte characters (the more the better).
114 | confidence = 30 + doubleByteCharCount - 20*badCharCount;
115 | if (confidence > 100) {
116 | confidence = 100;
117 | }
118 | }else {
119 | //
120 | // Frequency of occurrence statistics exist.
121 | //
122 | double maxVal = Math.log((float)doubleByteCharCount / 4);
123 | double scaleFactor = 90.0 / maxVal;
124 | confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
125 | confidence = Math.min(confidence, 100);
126 | }
127 | } // end of detectBlock:
128 |
129 | return confidence;
130 | }
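
As a rough worked example of the statistics branch above (the counts are invented for illustration): with doubleByteCharCount = 400 and commonCharCount = 50, maxVal = ln(100), about 4.61; scaleFactor = 90 / 4.61, about 19.5; and confidence = ln(51) * 19.5 + 10, about 86, which is then capped at 100. The same arithmetic as a standalone sketch:

class MbcsConfidenceSketch {
  public static void main(String[] args) {
    int doubleByteCharCount = 400; // invented counts, not measured from real input
    int commonCharCount = 50;

    double maxVal = Math.log((float) doubleByteCharCount / 4); // ln(100), about 4.61
    double scaleFactor = 90.0 / maxVal;                        // about 19.5
    int confidence = (int) (Math.log(commonCharCount + 1) * scaleFactor + 10);
    confidence = Math.min(confidence, 100);

    System.out.println(confidence); // prints 86
  }
}
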
131 |
132 | // "Character" iterated character class.
133 | // Recognizers for specific mbcs encodings make their "characters" available
134 | // by providing a nextChar() function that fills in an instance of iteratedChar
135 | // with the next char from the input.
136 | // The returned characters are not converted to Unicode, but remain as the raw
137 | // bytes (concatenated into an int) from the codepage data.
138 | //
139 | // For Asian charsets, use the raw input rather than the input that has been
140 | // stripped of markup. Detection only considers multi-byte chars, effectively
141 | // stripping markup anyway, and double byte chars do occur in markup too.
142 | //
143 | static class iteratedChar {
144 | int charValue = 0; // 1-4 bytes from the raw input data
145 | int nextIndex = 0;
146 | boolean error = false;
147 | boolean done = false;
148 |
149 | void reset() {
150 | charValue = 0;
151 | nextIndex = 0;
152 | error = false;
153 | done = false;
154 | }
155 |
156 | int nextByte(CharsetDetector det) {
157 | if (nextIndex >= det.fRawLength) {
158 | done = true;
159 | return -1;
160 | }
161 | int byteValue = det.fRawInput[nextIndex++] & 0x00ff;
162 | return byteValue;
163 | }
164 | }
165 |
166 | /**
167 | * Get the next character (however many bytes it is) from the input data
168 | * Subclasses for specific charset encodings must implement this function
169 | * to get characters according to the rules of their encoding scheme.
170 | *
171 | * This function is not a method of class iteratedChar only because
172 | * that would require a lot of extra derived classes, which is awkward.
173 | * @param it The iteratedChar "struct" into which the returned char is placed.
174 | * @param det The charset detector, which is needed to get at the input byte data
175 | * being iterated over.
176 | * @return True if a character was returned, false at end of input.
177 | */
178 | abstract boolean nextChar(iteratedChar it, CharsetDetector det);
179 |
180 |
181 |
182 |
183 |
184 | /**
185 | * Shift-JIS charset recognizer.
186 | *
187 | */
188 | static class CharsetRecog_sjis extends CharsetRecog_mbcs {
189 | static int [] commonChars =
190 | // TODO: This set of data comes from the character frequency-
191 | // of-occurrence analysis tool. The data needs to be moved
192 | // into a resource and loaded from there.
193 | {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
194 | 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
195 | 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
196 | 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
197 | 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
198 | 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
199 |
200 | @Override
201 | boolean nextChar(iteratedChar it, CharsetDetector det) {
202 | it.error = false;
203 | int firstByte;
204 | firstByte = it.charValue = it.nextByte(det);
205 | if (firstByte < 0) {
206 | return false;
207 | }
208 |
209 | if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
210 | return true;
211 | }
212 |
213 | int secondByte = it.nextByte(det);
214 | if (secondByte < 0) {
215 | return false;
216 | }
217 | it.charValue = (firstByte << 8) | secondByte;
218 | if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
219 | // Illegal second byte value.
220 | it.error = true;
221 | }
222 | return true;
223 | }
224 |
225 | @Override
226 | CharsetMatch match(CharsetDetector det) {
227 | int confidence = match(det, commonChars);
228 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
229 | }
230 |
231 | @Override
232 | String getName() {
233 | return "Shift_JIS";
234 | }
235 |
236 | @Override
237 | public String getLanguage()
238 | {
239 | return "ja";
240 | }
241 |
242 |
243 | }
244 |
245 |
246 | /**
247 | * Big5 charset recognizer.
248 | *
249 | */
250 | static class CharsetRecog_big5 extends CharsetRecog_mbcs {
251 | static int [] commonChars =
252 | // TODO: This set of data comes from the character frequency-
253 | // of-occurrence analysis tool. The data needs to be moved
254 | // into a resource and loaded from there.
255 | {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
256 | 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
257 | 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
258 | 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
259 | 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
260 | 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
261 | 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
262 | 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
263 | 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
264 | 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
265 |
266 | @Override
267 | boolean nextChar(iteratedChar it, CharsetDetector det) {
268 | it.error = false;
269 | int firstByte;
270 | firstByte = it.charValue = it.nextByte(det);
271 | if (firstByte < 0) {
272 | return false;
273 | }
274 |
275 | if (firstByte <= 0x7f || firstByte==0xff) {
276 | // single byte character.
277 | return true;
278 | }
279 |
280 | int secondByte = it.nextByte(det);
281 | if (secondByte < 0) {
282 | return false;
283 | }
284 | it.charValue = (it.charValue << 8) | secondByte;
285 |
286 | if (secondByte < 0x40 ||
287 | secondByte ==0x7f ||
288 | secondByte == 0xff) {
289 | it.error = true;
290 | }
291 | return true;
292 | }
293 |
294 | @Override
295 | CharsetMatch match(CharsetDetector det) {
296 | int confidence = match(det, commonChars);
297 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
298 | }
299 |
300 | @Override
301 | String getName() {
302 | return "Big5";
303 | }
304 |
305 |
306 | @Override
307 | public String getLanguage()
308 | {
309 | return "zh";
310 | }
311 | }
312 |
313 |
314 | /**
315 | * EUC charset recognizers. One abstract class that provides the common function
316 | * for getting the next character according to the EUC encoding scheme,
317 | * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
318 | *
319 | */
320 | abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
321 |
322 | /*
323 | * (non-Javadoc)
324 | * Get the next character value for EUC based encodings.
325 | * Character "value" is simply the raw bytes that make up the character
326 | * packed into an int.
327 | */
328 | @Override
329 | boolean nextChar(iteratedChar it, CharsetDetector det) {
330 | it.error = false;
331 | int firstByte = 0;
332 | int secondByte = 0;
333 | int thirdByte = 0;
334 | //int fourthByte = 0;
335 |
336 | buildChar: {
337 | firstByte = it.charValue = it.nextByte(det);
338 | if (firstByte < 0) {
339 | // Ran off the end of the input data
340 | it.done = true;
341 | break buildChar;
342 | }
343 | if (firstByte <= 0x8d) {
344 | // single byte char
345 | break buildChar;
346 | }
347 |
348 | secondByte = it.nextByte(det);
349 | it.charValue = (it.charValue << 8) | secondByte;
350 |
351 | if (firstByte >= 0xA1 && firstByte <= 0xfe) {
352 | // Two byte Char
353 | if (secondByte < 0xa1) {
354 | it.error = true;
355 | }
356 | break buildChar;
357 | }
358 | if (firstByte == 0x8e) {
359 | // Code Set 2.
360 | // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
361 | // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
362 | // We don't know which we've got.
363 | // Treat it like EUC-JP. If the data really was EUC-TW, the following two
364 | // bytes will look like a well formed 2 byte char.
365 | if (secondByte < 0xa1) {
366 | it.error = true;
367 | }
368 | break buildChar;
369 | }
370 |
371 | if (firstByte == 0x8f) {
372 | // Code set 3.
373 | // Three byte total char size, two bytes of actual char value.
374 | thirdByte = it.nextByte(det);
375 | it.charValue = (it.charValue << 8) | thirdByte;
376 | if (thirdByte < 0xa1) {
377 | it.error = true;
378 | }
379 | }
380 | }
381 |
382 | return (it.done == false);
383 | }
384 |
385 | /**
386 | * The charset recognizer for EUC-JP. A singleton instance of this class
387 | * is created and kept by the public CharsetDetector class
388 | */
389 | static class CharsetRecog_euc_jp extends CharsetRecog_euc {
390 | static int [] commonChars =
391 | // TODO: This set of data comes from the character frequency-
392 | // of-occurrence analysis tool. The data needs to be moved
393 | // into a resource and loaded from there.
394 | {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
395 | 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
396 | 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
397 | 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
398 | 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
399 | 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
400 | 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
401 | 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
402 | 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
403 | 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
404 | @Override
405 | String getName() {
406 | return "EUC-JP";
407 | }
408 |
409 | @Override
410 | CharsetMatch match(CharsetDetector det) {
411 | int confidence = match(det, commonChars);
412 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
413 | }
414 |
415 | @Override
416 | public String getLanguage()
417 | {
418 | return "ja";
419 | }
420 | }
421 |
422 | /**
423 | * The charset recognizer for EUC-KR. A singleton instance of this class
424 | * is created and kept by the public CharsetDetector class
425 | */
426 | static class CharsetRecog_euc_kr extends CharsetRecog_euc {
427 | static int [] commonChars =
428 | // TODO: This set of data comes from the character frequency-
429 | // of-occurrence analysis tool. The data needs to be moved
430 | // into a resource and loaded from there.
431 | {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
432 | 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
433 | 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
434 | 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
435 | 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
436 | 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
437 | 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
438 | 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
439 | 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
440 | 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
441 |
442 | @Override
443 | String getName() {
444 | return "EUC-KR";
445 | }
446 |
447 | @Override
448 | CharsetMatch match(CharsetDetector det) {
449 | int confidence = match(det, commonChars);
450 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
451 | }
452 |
453 | @Override
454 | public String getLanguage()
455 | {
456 | return "ko";
457 | }
458 | }
459 | }
460 |
461 | /**
462 | *
463 | * GB-18030 recognizer. Uses simplified Chinese statistics.
464 | *
465 | */
466 | static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
467 |
468 | /*
469 | * (non-Javadoc)
470 | * Get the next character value for the GB-18030 encoding.
471 | * Character "value" is simply the raw bytes that make up the character
472 | * packed into an int.
473 | */
474 | @Override
475 | boolean nextChar(iteratedChar it, CharsetDetector det) {
476 | it.error = false;
477 | int firstByte = 0;
478 | int secondByte = 0;
479 | int thirdByte = 0;
480 | int fourthByte = 0;
481 |
482 | buildChar: {
483 | firstByte = it.charValue = it.nextByte(det);
484 |
485 | if (firstByte < 0) {
486 | // Ran off the end of the input data
487 | it.done = true;
488 | break buildChar;
489 | }
490 |
491 | if (firstByte <= 0x80) {
492 | // single byte char
493 | break buildChar;
494 | }
495 |
496 | secondByte = it.nextByte(det);
497 | it.charValue = (it.charValue << 8) | secondByte;
498 |
499 | if (firstByte >= 0x81 && firstByte <= 0xFE) {
500 | // Two byte Char
501 | if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
502 | break buildChar;
503 | }
504 |
505 | // Four byte char
506 | if (secondByte >= 0x30 && secondByte <= 0x39) {
507 | thirdByte = it.nextByte(det);
508 |
509 | if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
510 | fourthByte = it.nextByte(det);
511 |
512 | if (fourthByte >= 0x30 && fourthByte <= 0x39) {
513 | it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
514 | break buildChar;
515 | }
516 | }
517 | }
518 |
519 | it.error = true;
520 | break buildChar;
521 | }
522 | }
523 |
524 | return (it.done == false);
525 | }
526 |
527 | static int [] commonChars =
528 | // TODO: This set of data comes from the character frequency-
529 | // of-occurrence analysis tool. The data needs to be moved
530 | // into a resource and loaded from there.
531 | {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
532 | 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
533 | 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
534 | 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
535 | 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
536 | 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
537 | 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
538 | 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
539 | 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
540 | 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
541 |
542 |
543 | @Override
544 | String getName() {
545 | return "GB18030";
546 | }
547 |
548 | @Override
549 | CharsetMatch match(CharsetDetector det) {
550 | int confidence = match(det, commonChars);
551 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
552 | }
553 |
554 | @Override
555 | public String getLanguage()
556 | {
557 | return "zh";
558 | }
559 | }
560 |
561 |
562 | }
563 |
--------------------------------------------------------------------------------
/src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/CharsetRecognizer.java:
--------------------------------------------------------------------------------
1 | // © 2016 and later: Unicode, Inc. and others.
2 | // License & terms of use: http://www.unicode.org/copyright.html
3 | /**
4 | *******************************************************************************
5 | * Copyright (C) 2005-2012, International Business Machines Corporation and *
6 | * others. All Rights Reserved. *
7 | *******************************************************************************
8 | */
9 | package com.sigpwned.chardet4j.com.ibm.icu.text;
10 |
11 | /**
12 | * Abstract class for recognizing a single charset.
13 | * Part of the implementation of ICU's CharsetDetector.
14 | *
15 | * Each specific charset that can be recognized will have an instance
16 | * of some subclass of this class. All interaction between the overall
17 | * CharsetDetector and the stuff specific to an individual charset happens
18 | * via the interface provided here.
19 | *
20 | * Instances of CharsetRecognizer DO NOT have or maintain
21 | * state pertaining to a specific match or detect operation.
22 | * They WILL be shared by multiple instances of CharsetDetector.
23 | * They encapsulate const charset-specific information.
24 | */
25 | abstract class CharsetRecognizer {
26 | /**
27 | * Get the IANA name of this charset.
28 | * @return the charset name.
29 | */
30 | abstract String getName();
31 |
32 | /**
33 | * Get the ISO language code for this charset.
34 | * @return the language code, or null if the language cannot be determined.
35 | */
36 | public String getLanguage()
37 | {
38 | return null;
39 | }
40 |
41 | /**
42 | * Test the match of this charset with the input text data
43 | * which is obtained via the CharsetDetector object.
44 | *
45 | * @param det The CharsetDetector, which contains the input text
46 | * to be checked for being in this charset.
47 | * @return A CharsetMatch object containing details of match
48 | * with this charset, or null if there was no match.
49 | */
50 | abstract CharsetMatch match(CharsetDetector det);
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/src/main/java/com/sigpwned/chardet4j/com/ibm/icu/text/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * This package contains code from the icu4j project. This was originally released under the ICU
3 | * license. This project is released under the Apache 2 license.
4 | *
5 | * @see https://github.com/unicode-org/icu
6 | */
7 | package com.sigpwned.chardet4j.com.ibm.icu.text;
8 |
--------------------------------------------------------------------------------
/src/main/java/com/sigpwned/chardet4j/io/BomAwareInputStream.java:
--------------------------------------------------------------------------------
1 | /*-
2 | * =================================LICENSE_START==================================
3 | * chardet4j
4 | * ====================================SECTION=====================================
5 | * Copyright (C) 2022 - 2024 Andy Boothe
6 | * ====================================SECTION=====================================
7 | * Licensed under the Apache License, Version 2.0 (the "License");
8 | * you may not use this file except in compliance with the License.
9 | * You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | * ==================================LICENSE_END===================================
19 | */
20 | package com.sigpwned.chardet4j.io;
21 |
22 | import java.io.ByteArrayInputStream;
23 | import java.io.FilterInputStream;
24 | import java.io.IOException;
25 | import java.io.InputStream;
26 | import java.io.SequenceInputStream;
27 | import java.util.Optional;
28 | import com.sigpwned.chardet4j.ByteOrderMark;
29 | import com.sigpwned.chardet4j.util.ByteStreams;
30 |
31 | /**
32 | * A wrapper {@link InputStream} that remembers the {@link ByteOrderMark} that was detected at the
33 | * beginning of the stream.
34 | */
35 | public final class BomAwareInputStream extends FilterInputStream {
36 | /**
37 | * Detect the {@link ByteOrderMark} at the beginning of the stream, if any, and return a
38 | * {@link BomAwareInputStream} that wraps the stream.
39 | *
40 | * @param in the input stream
41 | * @return the {@link BomAwareInputStream}
42 | * @throws IOException if an I/O error occurs
43 | */
44 | public static BomAwareInputStream detect(InputStream in) throws IOException {
45 | final byte[] buf = ByteStreams.readNBytes(in, ByteOrderMark.MAX_BYTE_LENGTH);
46 |
47 | ByteOrderMark bom = ByteOrderMark.detect(buf).orElse(null);
48 |
49 | // If there is no BOM, then return all the bytes read so far, followed by the rest of the stream
50 | if (bom == null)
51 | return new BomAwareInputStream(new SequenceInputStream(new ByteArrayInputStream(buf), in),
52 | null);
53 |
54 | final int bomlen = bom.length();
55 |
56 | // If there is a BOM and it is the same length as the bytes read so far, then return the rest of
57 | // the stream
58 | if (bomlen == buf.length)
59 | return new BomAwareInputStream(in, bom);
60 |
61 | // If there is a BOM and it is shorter than the bytes read so far, then skip the BOM and return
62 | // the remaining bytes read so far, followed by the rest of the stream
63 | return new BomAwareInputStream(
64 | new SequenceInputStream(new ByteArrayInputStream(buf, bomlen, buf.length - bomlen), in),
65 | bom);
66 | }
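
A usage sketch for BomAwareInputStream.detect(); the file name is hypothetical, and the reader is hard-wired to UTF-8 for brevity, whereas real code would normally choose the charset based on the ByteOrderMark the stream reports:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import com.sigpwned.chardet4j.io.BomAwareInputStream;

class BomAwareExample {
  public static void main(String[] args) throws IOException {
    try (InputStream raw = Files.newInputStream(Paths.get("maybe-bom.txt"))) {
      // detect() strips a leading BOM (if present) and remembers which one it saw;
      // the bytes read from the returned stream no longer include the BOM itself.
      BomAwareInputStream in = BomAwareInputStream.detect(raw);
      try (BufferedReader reader =
          new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
        reader.lines().forEach(System.out::println);
      }
    }
  }
}
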
67 |
68 | private final ByteOrderMark bom;
69 |
70 | private BomAwareInputStream(InputStream delegate, ByteOrderMark bom) {
71 | super(delegate);
72 | this.bom = bom;
73 | }
74 |
75 | /**
76 | * The {@link ByteOrderMark} that was detected at the beginning of the stream, if any, or else
77 | * empty.
78 | *
79 | * @return the {@link ByteOrderMark}
80 | */
81 | public Optional