();
188 |
189 | MungeInput(); // Strip html markup, collect byte stats.
190 |
191 | // Iterate over all possible charsets, remember all that
192 | // give a match quality > 0.
193 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
194 | CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
195 | boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled;
196 | if (active) {
197 | CharsetMatch m = rcinfo.recognizer.match(this);
198 | if (m != null) {
199 | matches.add(m);
200 | }
201 | }
202 | }
203 | Collections.sort(matches); // CharsetMatch compares on confidence
204 | Collections.reverse(matches); // Put best match first.
205 | CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
206 | resultArray = matches.toArray(resultArray);
207 | return resultArray;
208 | }
209 |
210 |
211 | /**
212 | * Autodetect the charset of an inputStream, and return a Java Reader
213 | * to access the converted input data.
214 | *
215 | * This is a convenience method that is equivalent to
216 | * this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();
217 | *
218 | * For the input stream that supplies the character data, markSupported()
219 | * must be true; the charset detection will read a small amount of data,
220 | * then return the stream to its original position via
221 | * the InputStream.reset() operation. The exact amount that will
222 | * be read depends on the characteristics of the data itself.
223 | *
224 | * Raise an exception if no charsets appear to match the input data.
225 | *
226 | * @param in The source of the byte data in the unknown charset.
227 | *
228 | * @param declaredEncoding A declared encoding for the data, if available,
229 | * or null or an empty string if none is available.
230 | *
231 | * @stable ICU 3.4
232 | */
233 | public Reader getReader(InputStream in, String declaredEncoding) {
234 | fDeclaredEncoding = declaredEncoding;
235 |
236 | try {
237 | setText(in);
238 |
239 | CharsetMatch match = detect();
240 |
241 | if (match == null) {
242 | return null;
243 | }
244 |
245 | return match.getReader();
246 | } catch (IOException e) {
247 | return null;
248 | }
249 | }
250 |
251 | /**
252 | * Autodetect the charset of an inputStream, and return a String
253 | * containing the converted input data.
254 | *
255 | * This is a convenience method that is equivalent to
256 | * this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();
257 | *
258 | * Raise an exception if no charsets appear to match the input data.
259 | *
260 | * @param in The source of the byte data in the unknown charset.
261 | *
262 | * @param declaredEncoding A declared encoding for the data, if available,
263 | * or null or an empty string if none is available.
264 | *
265 | * @stable ICU 3.4
266 | */
267 | public String getString(byte[] in, String declaredEncoding)
268 | {
269 | fDeclaredEncoding = declaredEncoding;
270 |
271 | try {
272 | setText(in);
273 |
274 | CharsetMatch match = detect();
275 |
276 | if (match == null) {
277 | return null;
278 | }
279 |
280 | return match.getString(-1);
281 | } catch (IOException e) {
282 | return null;
283 | }
284 | }
285 |
286 |
287 | /**
288 | * Get the names of all charsets supported by CharsetDetector
class.
289 | *
290 | * Note: Multiple different charset encodings in a same family may use
291 | * a single shared name in this implementation. For example, this method returns
292 | * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
293 | * (Windows Latin 1). However, actual detection result could be "windows-1252"
294 | * when the input data matches Latin 1 code points with any points only available
295 | * in "windows-1252".
296 | *
297 | * @return an array of the names of all charsets supported by
298 | * CharsetDetector
class.
299 | *
300 | * @stable ICU 3.4
301 | */
302 | public static String[] getAllDetectableCharsets() {
303 | String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()];
304 | for (int i = 0; i < allCharsetNames.length; i++) {
305 | allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName();
306 | }
307 | return allCharsetNames;
308 | }
309 |
310 | /**
311 | * Test whether or not input filtering is enabled.
312 | *
313 | * @return true
if input text will be filtered.
314 | *
315 | * @see #enableInputFilter
316 | *
317 | * @stable ICU 3.4
318 | */
319 | public boolean inputFilterEnabled()
320 | {
321 | return fStripTags;
322 | }
323 |
324 | /**
325 | * Enable filtering of input text. If filtering is enabled,
326 | * text within angle brackets ("<" and ">") will be removed
327 | * before detection.
328 | *
329 | * @param filter true
to enable input text filtering.
330 | *
331 | * @return The previous setting.
332 | *
333 | * @stable ICU 3.4
334 | */
335 | public boolean enableInputFilter(boolean filter)
336 | {
337 | boolean previous = fStripTags;
338 |
339 | fStripTags = filter;
340 |
341 | return previous;
342 | }
343 |
344 | /*
345 | * MungeInput - after getting a set of raw input data to be analyzed, preprocess
346 | * it by removing what appears to be html markup.
347 | */
348 | private void MungeInput() {
349 | int srci = 0;
350 | int dsti = 0;
351 | byte b;
352 | boolean inMarkup = false;
353 | int openTags = 0;
354 | int badTags = 0;
355 |
356 | //
357 | // html / xml markup stripping.
358 | // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
359 | // discard everything within < brackets >
360 | // Count how many total '<' and illegal (nested) '<' occur, so we can make some
361 | // guess as to whether the input was actually marked up at all.
362 | if (fStripTags) {
363 | for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
364 | b = fRawInput[srci];
365 | if (b == (byte)'<') {
366 | if (inMarkup) {
367 | badTags++;
368 | }
369 | inMarkup = true;
370 | openTags++;
371 | }
372 |
373 | if (! inMarkup) {
374 | fInputBytes[dsti++] = b;
375 | }
376 |
377 | if (b == (byte)'>') {
378 | inMarkup = false;
379 | }
380 | }
381 |
382 | fInputLen = dsti;
383 | }
384 |
385 | //
386 | // If it looks like this input wasn't marked up, or if it looks like it's
387 | // essentially nothing but markup abandon the markup stripping.
388 | // Detection will have to work on the unstripped input.
389 | //
390 | if (openTags<5 || openTags/5 < badTags ||
391 | (fInputLen < 100 && fRawLength>600)) {
392 | int limit = fRawLength;
393 |
394 | if (limit > kBufSize) {
395 | limit = kBufSize;
396 | }
397 |
398 | for (srci=0; srci ALL_CS_RECOGNIZERS;
476 |
// Builds the immutable, ordered list of every recognizer known to the detector.
// The position of each entry matters: other members index into this list in
// parallel with the per-instance fEnabledRecognizers array.
static {
    // Typed list: raw List/ArrayList would make the assignment below unchecked.
    List<CSRecognizerInfo> list = new ArrayList<CSRecognizerInfo>();

    // Unicode encodings.
    list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true));

    // Multi-byte and ISO-2022 encodings.
    list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true));

    // Single-byte encodings.
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true));

    // IBM 420/424 recognizers are disabled by default
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false));
    list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false));

    // Publish a read-only view so the shared registry cannot be altered later.
    ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list);
}
515 |
516 | /**
517 | * Get the names of charsets that can be recognized by this CharsetDetector instance.
518 | *
519 | * @return an array of the names of charsets that can be recognized by this CharsetDetector
520 | * instance.
521 | *
522 | * @internal
523 | * @deprecated This API is ICU internal only.
524 | */
525 | @Deprecated
526 | public String[] getDetectableCharsets() {
527 | List csnames = new ArrayList(ALL_CS_RECOGNIZERS.size());
528 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
529 | CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
530 | boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i];
531 | if (active) {
532 | csnames.add(rcinfo.recognizer.getName());
533 | }
534 | }
535 | return csnames.toArray(new String[csnames.size()]);
536 | }
537 |
538 | /**
539 | * Enable or disable individual charset encoding.
540 | * A name of charset encoding must be included in the names returned by
541 | * {@link #getAllDetectableCharsets()}.
542 | *
543 | * @param encoding the name of charset encoding.
544 | * @param enabled true
to enable, or false
to disable the
545 | * charset encoding.
546 | * @return A reference to this CharsetDetector
.
547 | * @throws IllegalArgumentException when the name of charset encoding is
548 | * not supported.
549 | *
550 | * @internal
551 | * @deprecated This API is ICU internal only.
552 | */
553 | @Deprecated
554 | public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
555 | int modIdx = -1;
556 | boolean isDefaultVal = false;
557 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
558 | CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i);
559 | if (csrinfo.recognizer.getName().equals(encoding)) {
560 | modIdx = i;
561 | isDefaultVal = (csrinfo.isDefaultEnabled == enabled);
562 | break;
563 | }
564 | }
565 | if (modIdx < 0) {
566 | // No matching encoding found
567 | throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
568 | }
569 |
570 | if (fEnabledRecognizers == null && !isDefaultVal) {
571 | // Create an array storing the non default setting
572 | fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
573 |
574 | // Initialize the array with default info
575 | for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
576 | fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled;
577 | }
578 | }
579 |
580 | if (fEnabledRecognizers != null) {
581 | fEnabledRecognizers[modIdx] = enabled;
582 | }
583 |
584 | return this;
585 | }
586 | }
587 |
--------------------------------------------------------------------------------
/src/main/java/com/ibm/icu/text/CharsetMatch.java:
--------------------------------------------------------------------------------
1 | // © 2016 and later: Unicode, Inc. and others.
2 | // License & terms of use: http://www.unicode.org/copyright.html
3 | /**
4 | *******************************************************************************
5 | * Copyright (C) 2005-2016, International Business Machines Corporation and *
6 | * others. All Rights Reserved. *
7 | *******************************************************************************
8 | */
9 | package com.ibm.icu.text;
10 |
11 | import java.io.ByteArrayInputStream;
12 | import java.io.IOException;
13 | import java.io.InputStream;
14 | import java.io.InputStreamReader;
15 | import java.io.Reader;
16 |
17 |
18 | /**
19 | * This class represents a charset that has been identified by a CharsetDetector
20 | * as a possible encoding for a set of input data. From an instance of this
21 | * class, you can ask for a confidence level in the charset identification,
22 | * or for Java Reader or String to access the original byte data in Unicode form.
23 | *
24 | * Instances of this class are created only by CharsetDetectors.
25 | *
26 | * Note: this class has a natural ordering that is inconsistent with equals.
27 | * The natural ordering is based on the match confidence value.
28 | *
29 | * @stable ICU 3.4
30 | */
31 | public class CharsetMatch implements Comparable {
32 |
33 |
34 | /**
35 | * Create a java.io.Reader for reading the Unicode character data corresponding
36 | * to the original byte data supplied to the Charset detect operation.
37 | *
38 | * CAUTION: if the source of the byte data was an InputStream, a Reader
39 | * can be created for only one matching char set using this method. If more
40 | * than one charset needs to be tried, the caller will need to reset
41 | * the InputStream and create InputStreamReaders itself, based on the charset name.
42 | *
43 | * @return the Reader for the Unicode character data.
44 | *
45 | * @stable ICU 3.4
46 | */
47 | public Reader getReader() {
48 | InputStream inputStream = fInputStream;
49 |
50 | if (inputStream == null) {
51 | inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
52 | }
53 |
54 | try {
55 | inputStream.reset();
56 | return new InputStreamReader(inputStream, getName());
57 | } catch (IOException e) {
58 | return null;
59 | }
60 | }
61 |
62 | /**
63 | * Create a Java String from Unicode character data corresponding
64 | * to the original byte data supplied to the Charset detect operation.
65 | *
66 | * @return a String created from the converted input data.
67 | *
68 | * @stable ICU 3.4
69 | */
70 | public String getString() throws java.io.IOException {
71 | return getString(-1);
72 |
73 | }
74 |
75 | /**
76 | * Create a Java String from Unicode character data corresponding
77 | * to the original byte data supplied to the Charset detect operation.
78 | * The length of the returned string is limited to the specified size;
79 | * the string will be trunctated to this length if necessary. A limit value of
80 | * zero or less is ignored, and treated as no limit.
81 | *
82 | * @param maxLength The maximum length of the String to be created when the
83 | * source of the data is an input stream, or -1 for
84 | * unlimited length.
85 | * @return a String created from the converted input data.
86 | *
87 | * @stable ICU 3.4
88 | */
89 | public String getString(int maxLength) throws java.io.IOException {
90 | String result = null;
91 | if (fInputStream != null) {
92 | StringBuilder sb = new StringBuilder();
93 | char[] buffer = new char[1024];
94 | Reader reader = getReader();
95 | int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
96 | int bytesRead = 0;
97 |
98 | while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
99 | sb.append(buffer, 0, bytesRead);
100 | max -= bytesRead;
101 | }
102 |
103 | reader.close();
104 |
105 | return sb.toString();
106 | } else {
107 | String name = getName();
108 | /*
109 | * getName() may return a name with a suffix 'rtl' or 'ltr'. This cannot
110 | * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or 'ltr'
111 | * should be stripped off before creating the string.
112 | */
113 | int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl");
114 | if (startSuffix > 0) {
115 | name = name.substring(0, startSuffix);
116 | }
117 | result = new String(fRawInput, name);
118 | }
119 | return result;
120 |
121 | }
122 |
123 | /**
124 | * Get an indication of the confidence in the charset detected.
125 | * Confidence values range from 0-100, with larger numbers indicating
126 | * a better match of the input data to the characteristics of the
127 | * charset.
128 | *
129 | * @return the confidence in the charset match
130 | *
131 | * @stable ICU 3.4
132 | */
133 | public int getConfidence() {
134 | return fConfidence;
135 | }
136 |
137 | /**
138 | * Get the name of the detected charset.
139 | * The name will be one that can be used with other APIs on the
140 | * platform that accept charset names. It is the "Canonical name"
141 | * as defined by the class java.nio.charset.Charset; for
142 | * charsets that are registered with the IANA charset registry,
143 | * this is the MIME-preferred registerd name.
144 | *
145 | * @see java.nio.charset.Charset
146 | * @see java.io.InputStreamReader
147 | *
148 | * @return The name of the charset.
149 | *
150 | * @stable ICU 3.4
151 | */
152 | public String getName() {
153 | return fCharsetName;
154 | }
155 |
156 | /**
157 | * Get the ISO code for the language of the detected charset.
158 | *
159 | * @return The ISO code for the language or null
if the language cannot be determined.
160 | *
161 | * @stable ICU 3.4
162 | */
163 | public String getLanguage() {
164 | return fLang;
165 | }
166 |
167 | /**
168 | * Compare to other CharsetMatch objects.
169 | * Comparison is based on the match confidence value, which
170 | * allows CharsetDetector.detectAll() to order its results.
171 | *
172 | * @param other the CharsetMatch object to compare against.
173 | * @return a negative integer, zero, or a positive integer as the
174 | * confidence level of this CharsetMatch
175 | * is less than, equal to, or greater than that of
176 | * the argument.
177 | * @throws ClassCastException if the argument is not a CharsetMatch.
178 | * @stable ICU 4.4
179 | */
180 | @Override
181 | public int compareTo (CharsetMatch other) {
182 | int compareResult = 0;
183 | if (this.fConfidence > other.fConfidence) {
184 | compareResult = 1;
185 | } else if (this.fConfidence < other.fConfidence) {
186 | compareResult = -1;
187 | }
188 | return compareResult;
189 | }
190 |
191 | /*
192 | * Constructor. Implementation internal
193 | */
194 | CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
195 | fConfidence = conf;
196 |
197 | // The references to the original application input data must be copied out
198 | // of the charset recognizer to here, in case the application resets the
199 | // recognizer before using this CharsetMatch.
200 | if (det.fInputStream == null) {
201 | // We only want the existing input byte data if it came straight from the user,
202 | // not if is just the head of a stream.
203 | fRawInput = det.fRawInput;
204 | fRawLength = det.fRawLength;
205 | }
206 | fInputStream = det.fInputStream;
207 | fCharsetName = rec.getName();
208 | fLang = rec.getLanguage();
209 | }
210 |
211 | /*
212 | * Constructor. Implementation internal
213 | */
214 | CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) {
215 | fConfidence = conf;
216 |
217 | // The references to the original application input data must be copied out
218 | // of the charset recognizer to here, in case the application resets the
219 | // recognizer before using this CharsetMatch.
220 | if (det.fInputStream == null) {
221 | // We only want the existing input byte data if it came straight from the user,
222 | // not if is just the head of a stream.
223 | fRawInput = det.fRawInput;
224 | fRawLength = det.fRawLength;
225 | }
226 | fInputStream = det.fInputStream;
227 | fCharsetName = csName;
228 | fLang = lang;
229 | }
230 |
231 |
232 | //
233 | // Private Data
234 | //
235 | private int fConfidence;
236 | private byte[] fRawInput = null; // Original, untouched input bytes.
237 | // If user gave us a byte array, this is it.
238 | private int fRawLength; // Length of data in fRawInput array.
239 |
240 | private InputStream fInputStream = null; // User's input stream, or null if the user
241 | // gave us a byte array.
242 |
243 | private String fCharsetName; // The name of the charset this CharsetMatch
244 | // represents. Filled in by the recognizer.
245 | private String fLang; // The language, if one was determined by
246 | // the recognizer during the detect operation.
247 | }
248 |
--------------------------------------------------------------------------------
/src/main/java/com/ibm/icu/text/CharsetRecog_2022.java:
--------------------------------------------------------------------------------
1 | // © 2016 and later: Unicode, Inc. and others.
2 | // License & terms of use: http://www.unicode.org/copyright.html
3 | /*
4 | *******************************************************************************
5 | * Copyright (C) 2005 - 2012, International Business Machines Corporation and *
6 | * others. All Rights Reserved. *
7 | *******************************************************************************
8 | */
9 | package com.ibm.icu.text;
10 |
11 | /**
12 | * class CharsetRecog_2022 part of the ICU charset detection implementation.
13 | * This is a superclass for the individual detectors for
14 | * each of the detectable members of the ISO 2022 family
15 | * of encodings.
16 | *
17 | * The separate classes are nested within this class.
18 | */
19 | abstract class CharsetRecog_2022 extends CharsetRecognizer {
20 |
21 |
22 | /**
23 | * Matching function shared among the 2022 detectors JP, CN and KR
24 | * Counts up the number of legal and unrecognized escape sequences in
25 | * the sample of text, and computes a score based on the total number &
26 | * the proportion that fit the encoding.
27 | *
28 | *
29 | * @param text the byte buffer containing text to analyse
30 | * @param textLen the size of the text in the byte.
31 | * @param escapeSequences the byte escape sequences to test for.
32 | * @return match quality, in the range of 0-100.
33 | */
34 | int match(byte [] text, int textLen, byte [][] escapeSequences) {
35 | int i, j;
36 | int escN;
37 | int hits = 0;
38 | int misses = 0;
39 | int shifts = 0;
40 | int quality;
41 | scanInput:
42 | for (i=0; i= 3 &&
35 | (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
36 | hasBOM = true;
37 | }
38 |
39 | // Scan for multi-byte sequences
40 | for (i=0; i=det.fRawLength) {
62 | break;
63 | }
64 | b = input[i];
65 | if ((b & 0xc0) != 0x080) {
66 | numInvalid++;
67 | break;
68 | }
69 | if (--trailBytes == 0) {
70 | numValid++;
71 | break;
72 | }
73 | }
74 | }
75 |
76 | // Cook up some sort of confidence score, based on presence of a BOM
77 | // and the existence of valid and/or invalid multi-byte sequences.
78 | confidence = 0;
79 | if (hasBOM && numInvalid==0) {
80 | confidence = 100;
81 | } else if (hasBOM && numValid > numInvalid*10) {
82 | confidence = 80;
83 | } else if (numValid > 3 && numInvalid == 0) {
84 | confidence = 100;
85 | } else if (numValid > 0 && numInvalid == 0) {
86 | confidence = 80;
87 | } else if (numValid == 0 && numInvalid == 0) {
88 | // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
89 | // accepts ASCII with confidence = 10.
90 | // TODO: add plain ASCII as an explicitly detected type.
91 | confidence = 15;
92 | } else if (numValid > numInvalid*10) {
93 | // Probably corrupt utf-8 data. Valid sequences aren't likely by chance.
94 | confidence = 25;
95 | }
96 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
97 | }
98 |
99 | }
100 |
--------------------------------------------------------------------------------
/src/main/java/com/ibm/icu/text/CharsetRecog_Unicode.java:
--------------------------------------------------------------------------------
1 | // © 2016 and later: Unicode, Inc. and others.
2 | // License & terms of use: http://www.unicode.org/copyright.html
3 | /*
4 | *******************************************************************************
5 | * Copyright (C) 1996-2013, International Business Machines Corporation and *
6 | * others. All Rights Reserved. *
7 | *******************************************************************************
8 | *
9 | */
10 |
11 | package com.ibm.icu.text;
12 |
13 | /**
14 | * This class matches UTF-16 and UTF-32, both big- and little-endian. The
15 | * BOM will be used if it is present.
16 | */
17 | abstract class CharsetRecog_Unicode extends CharsetRecognizer {
18 |
19 | /* (non-Javadoc)
20 | * @see com.ibm.icu.text.CharsetRecognizer#getName()
21 | */
22 | @Override
23 | abstract String getName();
24 |
25 | /* (non-Javadoc)
26 | * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
27 | */
28 | @Override
29 | abstract CharsetMatch match(CharsetDetector det);
30 |
31 | static int codeUnit16FromBytes(byte hi, byte lo) {
32 | return ((hi & 0xff) << 8) | (lo & 0xff);
33 | }
34 |
35 | // UTF-16 confidence calculation. Very simple minded, but better than nothing.
36 | // Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
37 | // and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
38 | // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
39 | // NULs should be rare in actual text.
40 | static int adjustConfidence(int codeUnit, int confidence) {
41 | if (codeUnit == 0) {
42 | confidence -= 10;
43 | } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
44 | confidence += 10;
45 | }
46 | if (confidence < 0) {
47 | confidence = 0;
48 | } else if (confidence > 100) {
49 | confidence = 100;
50 | }
51 | return confidence;
52 | }
53 |
54 | static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
55 | {
56 | @Override
57 | String getName()
58 | {
59 | return "UTF-16BE";
60 | }
61 |
62 | @Override
63 | CharsetMatch match(CharsetDetector det)
64 | {
65 | byte[] input = det.fRawInput;
66 | int confidence = 10;
67 |
68 | int bytesToCheck = Math.min(input.length, 30);
69 | for (int charIndex=0; charIndex 0) {
84 | return new CharsetMatch(det, this, confidence);
85 | }
86 | return null;
87 | }
88 | }
89 |
90 | static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode
91 | {
92 | @Override
93 | String getName()
94 | {
95 | return "UTF-16LE";
96 | }
97 |
98 | @Override
99 | CharsetMatch match(CharsetDetector det)
100 | {
101 | byte[] input = det.fRawInput;
102 | int confidence = 10;
103 |
104 | int bytesToCheck = Math.min(input.length, 30);
105 | for (int charIndex=0; charIndex 0) {
120 | return new CharsetMatch(det, this, confidence);
121 | }
122 | return null;
123 | }
124 | }
125 |
126 | static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode
127 | {
128 | abstract int getChar(byte[] input, int index);
129 |
130 | @Override
131 | abstract String getName();
132 |
133 | @Override
134 | CharsetMatch match(CharsetDetector det)
135 | {
136 | byte[] input = det.fRawInput;
137 | int limit = (det.fRawLength / 4) * 4;
138 | int numValid = 0;
139 | int numInvalid = 0;
140 | boolean hasBOM = false;
141 | int confidence = 0;
142 |
143 | if (limit==0) {
144 | return null;
145 | }
146 | if (getChar(input, 0) == 0x0000FEFF) {
147 | hasBOM = true;
148 | }
149 |
150 | for(int i = 0; i < limit; i += 4) {
151 | int ch = getChar(input, i);
152 |
153 | if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
154 | numInvalid += 1;
155 | } else {
156 | numValid += 1;
157 | }
158 | }
159 |
160 |
161 | // Cook up some sort of confidence score, based on presence of a BOM
162 | // and the existence of valid and/or invalid multi-byte sequences.
163 | if (hasBOM && numInvalid==0) {
164 | confidence = 100;
165 | } else if (hasBOM && numValid > numInvalid*10) {
166 | confidence = 80;
167 | } else if (numValid > 3 && numInvalid == 0) {
168 | confidence = 100;
169 | } else if (numValid > 0 && numInvalid == 0) {
170 | confidence = 80;
171 | } else if (numValid > numInvalid*10) {
172 | // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance.
173 | confidence = 25;
174 | }
175 |
176 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
177 | }
178 | }
179 |
180 | static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32
181 | {
182 | @Override
183 | int getChar(byte[] input, int index)
184 | {
185 | return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
186 | (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
187 | }
188 |
189 | @Override
190 | String getName()
191 | {
192 | return "UTF-32BE";
193 | }
194 | }
195 |
196 |
197 | static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32
198 | {
199 | @Override
200 | int getChar(byte[] input, int index)
201 | {
202 | return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
203 | (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);
204 | }
205 |
206 | @Override
207 | String getName()
208 | {
209 | return "UTF-32LE";
210 | }
211 | }
212 | }
213 |
--------------------------------------------------------------------------------
/src/main/java/com/ibm/icu/text/CharsetRecog_mbcs.java:
--------------------------------------------------------------------------------
1 | // © 2016 and later: Unicode, Inc. and others.
2 | // License & terms of use: http://www.unicode.org/copyright.html
3 | /*
4 | ****************************************************************************
5 | * Copyright (C) 2005-2012, International Business Machines Corporation and *
6 | * others. All Rights Reserved. *
7 | ****************************************************************************
8 | *
9 | */
10 | package com.ibm.icu.text;
11 |
12 | import java.util.Arrays;
13 |
14 | /**
15 | * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
16 | * Match is determined mostly by the input data adhering to the
17 | * encoding scheme for the charset, and, optionally,
18 | * frequency-of-occurrence of characters.
19 | *
20 | * Instances of this class are singletons, one per encoding
21 | * being recognized. They are created in the main
22 | * CharsetDetector class and kept in the global list of available
23 | * encodings to be checked. The specific encoding being recognized
24 | * is determined by subclass.
25 | */
26 | abstract class CharsetRecog_mbcs extends CharsetRecognizer {
27 |
28 | /**
29 | * Get the IANA name of this charset.
30 | * @return the charset name.
31 | */
32 | @Override
33 | abstract String getName() ;
34 |
35 |
36 | /**
37 | * Test the match of this charset with the input text data
38 | * which is obtained via the CharsetDetector object.
39 | *
40 | * @param det The CharsetDetector, which contains the input text
41 | * to be checked for being in this charset.
42 | * @return Two values packed into one int (Damn java, anyhow)
43 | *
44 | * bits 0-7: the match confidence, ranging from 0-100
45 | *
46 | * bits 8-15: The match reason, an enum-like value.
47 | */
48 | int match(CharsetDetector det, int [] commonChars) {
49 | @SuppressWarnings("unused")
50 | int singleByteCharCount = 0; //TODO Do we really need this?
51 | int doubleByteCharCount = 0;
52 | int commonCharCount = 0;
53 | int badCharCount = 0;
54 | int totalCharCount = 0;
55 | int confidence = 0;
56 | iteratedChar iter = new iteratedChar();
57 |
58 | detectBlock: {
59 | for (iter.reset(); nextChar(iter, det);) {
60 | totalCharCount++;
61 | if (iter.error) {
62 | badCharCount++;
63 | } else {
64 | long cv = iter.charValue & 0xFFFFFFFFL;
65 |
66 | if (cv <= 0xff) {
67 | singleByteCharCount++;
68 | } else {
69 | doubleByteCharCount++;
70 | if (commonChars != null) {
71 | // NOTE: This assumes that there are no 4-byte common chars.
72 | if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
73 | commonCharCount++;
74 | }
75 | }
76 | }
77 | }
78 | if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
79 | // Bail out early if the byte data is not matching the encoding scheme.
80 | break detectBlock;
81 | }
82 | }
83 |
84 | if (doubleByteCharCount <= 10 && badCharCount== 0) {
85 | // Not many multi-byte chars.
86 | if (doubleByteCharCount == 0 && totalCharCount < 10) {
87 | // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
88 | // We don't have enough data to have any confidence.
89 | // Statistical analysis of single byte non-ASCII characters would probably help here.
90 | confidence = 0;
91 | }
92 | else {
93 | // ASCII or ISO file? It's probably not our encoding,
94 | // but is not incompatible with our encoding, so don't give it a zero.
95 | confidence = 10;
96 | }
97 |
98 | break detectBlock;
99 | }
100 |
101 | //
102 | // No match if there are too many characters that don't fit the encoding scheme.
103 | // (should we have zero tolerance for these?)
104 | //
105 | if (doubleByteCharCount < 20*badCharCount) {
106 | confidence = 0;
107 | break detectBlock;
108 | }
109 |
110 | if (commonChars == null) {
111 | // We have no statistics on frequently occurring characters.
112 | // Assess confidence purely on having a reasonable number of
113 | // multi-byte characters (the more the better
114 | confidence = 30 + doubleByteCharCount - 20*badCharCount;
115 | if (confidence > 100) {
116 | confidence = 100;
117 | }
118 | }else {
119 | //
120 | // Frequency of occurrence statistics exist.
121 | //
122 | double maxVal = Math.log((float)doubleByteCharCount / 4);
123 | double scaleFactor = 90.0 / maxVal;
124 | confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
125 | confidence = Math.min(confidence, 100);
126 | }
127 | } // end of detectBlock:
128 |
129 | return confidence;
130 | }
131 |
132 | // "Character" iterated character class.
133 | // Recognizers for specific mbcs encodings make their "characters" available
134 | // by providing a nextChar() function that fills in an instance of iteratedChar
135 | // with the next char from the input.
136 | // The returned characters are not converted to Unicode, but remain as the raw
137 | // bytes (concatenated into an int) from the codepage data.
138 | //
139 | // For Asian charsets, use the raw input rather than the input that has been
140 | // stripped of markup. Detection only considers multi-byte chars, effectively
141 | // stripping markup anyway, and double byte chars do occur in markup too.
142 | //
143 | static class iteratedChar {
144 | int charValue = 0; // 1-4 bytes from the raw input data
145 | int nextIndex = 0;
146 | boolean error = false;
147 | boolean done = false;
148 |
149 | void reset() {
150 | charValue = 0;
151 | nextIndex = 0;
152 | error = false;
153 | done = false;
154 | }
155 |
156 | int nextByte(CharsetDetector det) {
157 | if (nextIndex >= det.fRawLength) {
158 | done = true;
159 | return -1;
160 | }
161 | int byteValue = det.fRawInput[nextIndex++] & 0x00ff;
162 | return byteValue;
163 | }
164 | }
165 |
166 | /**
167 | * Get the next character (however many bytes it is) from the input data
168 | * Subclasses for specific charset encodings must implement this function
169 | * to get characters according to the rules of their encoding scheme.
170 | *
171 | * This function is not a method of class iteratedChar only because
172 | * that would require a lot of extra derived classes, which is awkward.
173 | * @param it The iteratedChar "struct" into which the returned char is placed.
174 | * @param det The charset detector, which is needed to get at the input byte data
175 | * being iterated over.
176 | * @return True if a character was returned, false at end of input.
177 | */
178 | abstract boolean nextChar(iteratedChar it, CharsetDetector det);
179 |
180 |
181 |
182 |
183 |
184 | /**
185 | * Shift-JIS charset recognizer.
186 | *
187 | */
188 | static class CharsetRecog_sjis extends CharsetRecog_mbcs {
189 | static int [] commonChars =
190 | // TODO: This set of data comes from the character frequency-
191 | // of-occurrence analysis tool. The data needs to be moved
192 | // into a resource and loaded from there.
193 | {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
194 | 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
195 | 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
196 | 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
197 | 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
198 | 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
199 |
200 | @Override
201 | boolean nextChar(iteratedChar it, CharsetDetector det) {
202 | it.error = false;
203 | int firstByte;
204 | firstByte = it.charValue = it.nextByte(det);
205 | if (firstByte < 0) {
206 | return false;
207 | }
208 |
209 | if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
210 | return true;
211 | }
212 |
213 | int secondByte = it.nextByte(det);
214 | if (secondByte < 0) {
215 | return false;
216 | }
217 | it.charValue = (firstByte << 8) | secondByte;
218 | if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
219 | // Illegal second byte value.
220 | it.error = true;
221 | }
222 | return true;
223 | }
224 |
225 | @Override
226 | CharsetMatch match(CharsetDetector det) {
227 | int confidence = match(det, commonChars);
228 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
229 | }
230 |
231 | @Override
232 | String getName() {
233 | return "Shift_JIS";
234 | }
235 |
236 | @Override
237 | public String getLanguage()
238 | {
239 | return "ja";
240 | }
241 |
242 |
243 | }
244 |
245 |
246 | /**
247 | * Big5 charset recognizer.
248 | *
249 | */
250 | static class CharsetRecog_big5 extends CharsetRecog_mbcs {
251 | static int [] commonChars =
252 | // TODO: This set of data comes from the character frequency-
253 | // of-occurrence analysis tool. The data needs to be moved
254 | // into a resource and loaded from there.
255 | {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
256 | 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
257 | 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
258 | 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
259 | 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
260 | 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
261 | 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
262 | 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
263 | 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
264 | 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
265 |
266 | @Override
267 | boolean nextChar(iteratedChar it, CharsetDetector det) {
268 | it.error = false;
269 | int firstByte;
270 | firstByte = it.charValue = it.nextByte(det);
271 | if (firstByte < 0) {
272 | return false;
273 | }
274 |
275 | if (firstByte <= 0x7f || firstByte==0xff) {
276 | // single byte character.
277 | return true;
278 | }
279 |
280 | int secondByte = it.nextByte(det);
281 | if (secondByte < 0) {
282 | return false;
283 | }
284 | it.charValue = (it.charValue << 8) | secondByte;
285 |
286 | if (secondByte < 0x40 ||
287 | secondByte ==0x7f ||
288 | secondByte == 0xff) {
289 | it.error = true;
290 | }
291 | return true;
292 | }
293 |
294 | @Override
295 | CharsetMatch match(CharsetDetector det) {
296 | int confidence = match(det, commonChars);
297 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
298 | }
299 |
300 | @Override
301 | String getName() {
302 | return "Big5";
303 | }
304 |
305 |
306 | @Override
307 | public String getLanguage()
308 | {
309 | return "zh";
310 | }
311 | }
312 |
313 |
314 | /**
315 | * EUC charset recognizers. One abstract class that provides the common function
316 | * for getting the next character according to the EUC encoding scheme,
317 | * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
318 | *
319 | */
320 | abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
321 |
322 | /*
323 | * (non-Javadoc)
324 | * Get the next character value for EUC based encodings.
325 | * Character "value" is simply the raw bytes that make up the character
326 | * packed into an int.
327 | */
328 | @Override
329 | boolean nextChar(iteratedChar it, CharsetDetector det) {
330 | it.error = false;
331 | int firstByte = 0;
332 | int secondByte = 0;
333 | int thirdByte = 0;
334 | //int fourthByte = 0;
335 |
336 | buildChar: {
337 | firstByte = it.charValue = it.nextByte(det);
338 | if (firstByte < 0) {
339 | // Ran off the end of the input data
340 | it.done = true;
341 | break buildChar;
342 | }
343 | if (firstByte <= 0x8d) {
344 | // single byte char
345 | break buildChar;
346 | }
347 |
348 | secondByte = it.nextByte(det);
349 | it.charValue = (it.charValue << 8) | secondByte;
350 |
351 | if (firstByte >= 0xA1 && firstByte <= 0xfe) {
352 | // Two byte Char
353 | if (secondByte < 0xa1) {
354 | it.error = true;
355 | }
356 | break buildChar;
357 | }
358 | if (firstByte == 0x8e) {
359 | // Code Set 2.
360 | // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
361 | // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
362 | // We don't know which we've got.
363 | // Treat it like EUC-JP. If the data really was EUC-TW, the following two
364 | // bytes will look like a well formed 2 byte char.
365 | if (secondByte < 0xa1) {
366 | it.error = true;
367 | }
368 | break buildChar;
369 | }
370 |
371 | if (firstByte == 0x8f) {
372 | // Code set 3.
373 | // Three byte total char size, two bytes of actual char value.
374 | thirdByte = it.nextByte(det);
375 | it.charValue = (it.charValue << 8) | thirdByte;
376 | if (thirdByte < 0xa1) {
377 | it.error = true;
378 | }
379 | }
380 | }
381 |
382 | return (it.done == false);
383 | }
384 |
385 | /**
386 | * The charset recognize for EUC-JP. A singleton instance of this class
387 | * is created and kept by the public CharsetDetector class
388 | */
389 | static class CharsetRecog_euc_jp extends CharsetRecog_euc {
390 | static int [] commonChars =
391 | // TODO: This set of data comes from the character frequency-
392 | // of-occurrence analysis tool. The data needs to be moved
393 | // into a resource and loaded from there.
394 | {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
395 | 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
396 | 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
397 | 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
398 | 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
399 | 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
400 | 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
401 | 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
402 | 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
403 | 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
404 | @Override
405 | String getName() {
406 | return "EUC-JP";
407 | }
408 |
409 | @Override
410 | CharsetMatch match(CharsetDetector det) {
411 | int confidence = match(det, commonChars);
412 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
413 | }
414 |
415 | @Override
416 | public String getLanguage()
417 | {
418 | return "ja";
419 | }
420 | }
421 |
422 | /**
423 | * The charset recognize for EUC-KR. A singleton instance of this class
424 | * is created and kept by the public CharsetDetector class
425 | */
426 | static class CharsetRecog_euc_kr extends CharsetRecog_euc {
427 | static int [] commonChars =
428 | // TODO: This set of data comes from the character frequency-
429 | // of-occurrence analysis tool. The data needs to be moved
430 | // into a resource and loaded from there.
431 | {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
432 | 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
433 | 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
434 | 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
435 | 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
436 | 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
437 | 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
438 | 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
439 | 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
440 | 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
441 |
442 | @Override
443 | String getName() {
444 | return "EUC-KR";
445 | }
446 |
447 | @Override
448 | CharsetMatch match(CharsetDetector det) {
449 | int confidence = match(det, commonChars);
450 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
451 | }
452 |
453 | @Override
454 | public String getLanguage()
455 | {
456 | return "ko";
457 | }
458 | }
459 | }
460 |
461 | /**
462 | *
463 | * GB-18030 recognizer. Uses simplified Chinese statistics.
464 | *
465 | */
466 | static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
467 |
468 | /*
469 | * (non-Javadoc)
470 | * Get the next character value for EUC based encodings.
471 | * Character "value" is simply the raw bytes that make up the character
472 | * packed into an int.
473 | */
474 | @Override
475 | boolean nextChar(iteratedChar it, CharsetDetector det) {
476 | it.error = false;
477 | int firstByte = 0;
478 | int secondByte = 0;
479 | int thirdByte = 0;
480 | int fourthByte = 0;
481 |
482 | buildChar: {
483 | firstByte = it.charValue = it.nextByte(det);
484 |
485 | if (firstByte < 0) {
486 | // Ran off the end of the input data
487 | it.done = true;
488 | break buildChar;
489 | }
490 |
491 | if (firstByte <= 0x80) {
492 | // single byte char
493 | break buildChar;
494 | }
495 |
496 | secondByte = it.nextByte(det);
497 | it.charValue = (it.charValue << 8) | secondByte;
498 |
499 | if (firstByte >= 0x81 && firstByte <= 0xFE) {
500 | // Two byte Char
501 | if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
502 | break buildChar;
503 | }
504 |
505 | // Four byte char
506 | if (secondByte >= 0x30 && secondByte <= 0x39) {
507 | thirdByte = it.nextByte(det);
508 |
509 | if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
510 | fourthByte = it.nextByte(det);
511 |
512 | if (fourthByte >= 0x30 && fourthByte <= 0x39) {
513 | it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
514 | break buildChar;
515 | }
516 | }
517 | }
518 |
519 | it.error = true;
520 | break buildChar;
521 | }
522 | }
523 |
524 | return (it.done == false);
525 | }
526 |
527 | static int [] commonChars =
528 | // TODO: This set of data comes from the character frequency-
529 | // of-occurrence analysis tool. The data needs to be moved
530 | // into a resource and loaded from there.
531 | {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
532 | 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
533 | 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
534 | 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
535 | 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
536 | 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
537 | 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
538 | 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
539 | 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
540 | 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
541 |
542 |
543 | @Override
544 | String getName() {
545 | return "GB18030";
546 | }
547 |
548 | @Override
549 | CharsetMatch match(CharsetDetector det) {
550 | int confidence = match(det, commonChars);
551 | return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
552 | }
553 |
554 | @Override
555 | public String getLanguage()
556 | {
557 | return "zh";
558 | }
559 | }
560 |
561 |
562 | }
563 |
--------------------------------------------------------------------------------
/src/main/java/com/ibm/icu/text/CharsetRecognizer.java:
--------------------------------------------------------------------------------
1 | // © 2016 and later: Unicode, Inc. and others.
2 | // License & terms of use: http://www.unicode.org/copyright.html
3 | /**
4 | *******************************************************************************
5 | * Copyright (C) 2005-2012, International Business Machines Corporation and *
6 | * others. All Rights Reserved. *
7 | *******************************************************************************
8 | */
9 | package com.ibm.icu.text;
10 |
11 | /**
12 | * Abstract class for recognizing a single charset.
13 | * Part of the implementation of ICU's CharsetDetector.
14 | *
15 | * Each specific charset that can be recognized will have an instance
16 | * of some subclass of this class. All interaction between the overall
17 | * CharsetDetector and the stuff specific to an individual charset happens
18 | * via the interface provided here.
19 | *
20 | * Instances of CharsetDetector DO NOT have or maintain
21 | * state pertaining to a specific match or detect operation.
22 | * The WILL be shared by multiple instances of CharsetDetector.
23 | * They encapsulate const charset-specific information.
24 | */
25 | abstract class CharsetRecognizer {
26 | /**
27 | * Get the IANA name of this charset.
28 | * @return the charset name.
29 | */
30 | abstract String getName();
31 |
32 | /**
33 | * Get the ISO language code for this charset.
34 | * @return the language code, or null
if the language cannot be determined.
35 | */
36 | public String getLanguage()
37 | {
38 | return null;
39 | }
40 |
41 | /**
42 | * Test the match of this charset with the input text data
43 | * which is obtained via the CharsetDetector object.
44 | *
45 | * @param det The CharsetDetector, which contains the input text
46 | * to be checked for being in this charset.
47 | * @return A CharsetMatch object containing details of match
48 | * with this charset, or null if there was no match.
49 | */
50 | abstract CharsetMatch match(CharsetDetector det);
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/src/main/java/org/billthefarmer/print/Print.java:
--------------------------------------------------------------------------------
1 | ////////////////////////////////////////////////////////////////////////////////
2 | //
3 | // Print - print text and HTML
4 | //
5 | // Copyright (C) 2019 Bill Farmer
6 | //
7 | // This program is free software: you can redistribute it and/or modify
8 | // it under the terms of the GNU General Public License as published by
9 | // the Free Software Foundation, either version 3 of the License, or
10 | // (at your option) any later version.
11 | //
12 | // This program is distributed in the hope that it will be useful,
13 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | // GNU General Public License for more details.
16 | //
17 | // You should have received a copy of the GNU General Public License
18 | // along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | //
20 | ////////////////////////////////////////////////////////////////////////////////
21 |
22 | package org.billthefarmer.print;
23 |
24 | import android.app.Activity;
25 | import android.app.AlertDialog;
26 | import android.app.Dialog;
27 | import android.content.Intent;
28 | import android.content.SharedPreferences;
29 | import android.net.Uri;
30 | import android.os.AsyncTask;
31 | import android.os.Bundle;
32 | import android.preference.PreferenceManager;
33 | import android.print.PrintAttributes;
34 | import android.print.PrintDocumentAdapter;
35 | import android.print.PrintManager;
36 | import android.text.SpannableStringBuilder;
37 | import android.text.method.LinkMovementMethod;
38 | import android.util.Log;
39 | import android.view.KeyEvent;
40 | import android.view.Menu;
41 | import android.view.MenuItem;
42 | import android.view.View;
43 | import android.webkit.URLUtil;
44 | import android.webkit.WebSettings;
45 | import android.webkit.WebView;
46 | import android.webkit.WebViewClient;
47 | import android.widget.ProgressBar;
48 | import android.widget.TextView;
49 |
50 | import org.commonmark.Extension;
51 | import org.commonmark.ext.autolink.AutolinkExtension;
52 | import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension;
53 | import org.commonmark.ext.gfm.tables.TablesExtension;
54 | import org.commonmark.ext.ins.InsExtension;
55 | import org.commonmark.ext.sub.SubExtension;
56 | import org.commonmark.ext.sup.SupExtension;
57 | import org.commonmark.ext.task.list.items.TaskListItemsExtension;
58 | import org.commonmark.node.Node;
59 | import org.commonmark.parser.Parser;
60 | import org.commonmark.renderer.html.HtmlRenderer;
61 |
62 | import java.io.BufferedInputStream;
63 | import java.io.BufferedReader;
64 | import java.io.InputStreamReader;
65 |
66 | import java.lang.ref.WeakReference;
67 |
68 | import java.text.DateFormat;
69 |
70 | import java.util.Arrays;
71 | import java.util.List;
72 | import java.util.regex.Matcher;
73 | import java.util.regex.Pattern;
74 |
75 | @SuppressWarnings("deprecation")
76 | public class Print extends Activity
77 | {
78 | public static final String TAG = "Print";
79 |
80 | public static final String HTML_HEAD =
81 | "\n\n\n\n" +
82 | "\n\n\n";
84 | public static final String HTML_TAIL = "\n\n\n";
85 | public static final String ANDROID_ASSET = "file:///android_asset/";
86 | public static final String UTF_8 = "utf-8";
87 |
88 | public static final String TEXT_PLAIN = "text/plain";
89 | public static final String TEXT_HTML = "text/html";
90 | public static final String TEXT_WILD = "text/*";
91 |
92 | public static final String PREF_MARKDOWN = "pref_markdown";
93 |
94 | public static final String ASSET_URL =
95 | "file:///android_asset/print.html";
96 |
97 | private static final int OPEN_DOCUMENT = 1;
98 |
99 | private WebView webView;
100 | private ProgressBar progress;
101 | private boolean markdown = true;
102 |
103 | // Called when the activity is first created.
104 | @Override
105 | public void onCreate(Bundle savedInstanceState)
106 | {
107 | super.onCreate(savedInstanceState);
108 |
109 | SharedPreferences preferences =
110 | PreferenceManager.getDefaultSharedPreferences(this);
111 |
112 | markdown = preferences.getBoolean(PREF_MARKDOWN, true);
113 |
114 | setContentView(R.layout.main);
115 |
116 | webView = findViewById(R.id.webview);
117 | progress = findViewById(R.id.progress);
118 |
119 | if (webView != null)
120 | {
121 | // Enable javascript, web sites don't work unless JavaScript
122 | // is enabled
123 | WebSettings settings = webView.getSettings();
124 | settings.setJavaScriptEnabled(true);
125 |
126 | // Enable zoom
127 | settings.setBuiltInZoomControls(true);
128 | settings.setDisplayZoomControls(false);
129 |
130 | // Follow links and set title
131 | webView.setWebViewClient(new WebViewClient()
132 | {
133 | // onPageFinished
134 | @Override
135 | public void onPageFinished(WebView view, String url)
136 | {
137 | // Remove progress
138 | progress.setVisibility(View.GONE);
139 |
140 | // Get page title
141 | if (URLUtil.isNetworkUrl(url) && view.getTitle() != null)
142 | setTitle(view.getTitle());
143 |
144 | else
145 | setTitle(R.string.appName);
146 |
147 | if (view.canGoBack())
148 | getActionBar().setDisplayHomeAsUpEnabled(true);
149 |
150 | else
151 | getActionBar().setDisplayHomeAsUpEnabled(false);
152 | }
153 | });
154 |
155 | if (savedInstanceState != null)
156 | // Restore state
157 | webView.restoreState(savedInstanceState);
158 |
159 | else
160 | {
161 | Intent intent = getIntent();
162 | switch (intent.getAction())
163 | {
164 | case Intent.ACTION_VIEW:
165 | {
166 | // Get uri
167 | Uri uri = intent.getData();
168 | if (uri != null)
169 | readFile(uri);
170 | break;
171 | }
172 |
173 | case Intent.ACTION_SEND:
174 | {
175 | // Get uri
176 | Uri uri = intent.getParcelableExtra(Intent.EXTRA_STREAM);
177 | String html = intent.getStringExtra(Intent.EXTRA_HTML_TEXT);
178 | String text = intent.getStringExtra(Intent.EXTRA_TEXT);
179 | if (uri != null)
180 | readFile(uri);
181 |
182 | else if (html != null)
183 | {
184 | if (URLUtil.isNetworkUrl(html))
185 | webView.loadUrl(html);
186 |
187 | else
188 | loadText(html);
189 | }
190 |
191 | else if (text != null)
192 | {
193 | if (URLUtil.isNetworkUrl(text))
194 | webView.loadUrl(text);
195 |
196 | else
197 | loadText(text);
198 | }
199 | break;
200 | }
201 | default:
202 | webView.loadUrl(ASSET_URL);
203 | }
204 | }
205 | }
206 | }
207 |
208 | // onPause
209 | @Override
210 | public void onPause()
211 | {
212 | super.onPause();
213 |
214 | SharedPreferences preferences =
215 | PreferenceManager.getDefaultSharedPreferences(this);
216 | SharedPreferences.Editor editor = preferences.edit();
217 |
218 | editor.putBoolean(PREF_MARKDOWN, markdown);
219 | editor.apply();
220 | }
221 |
222 | // On save instance state
223 | @Override
224 | public void onSaveInstanceState(Bundle outState)
225 | {
226 | super.onSaveInstanceState(outState);
227 |
228 | if (webView != null)
229 | // Save state
230 | webView.saveState(outState);
231 | }
232 |
233 | // On create option menu
234 | @Override
235 | public boolean onCreateOptionsMenu(Menu menu)
236 | {
237 | // Inflate the menu; this adds items to the action bar if it
238 | // is present.
239 | getMenuInflater().inflate(R.menu.main, menu);
240 | return true;
241 | }
242 |
243 | // onPrepareOptionsMenu
244 | @Override
245 | public boolean onPrepareOptionsMenu(Menu menu)
246 | {
247 | menu.findItem(R.id.action_markdown).setChecked(markdown);
248 |
249 | return true;
250 | }
251 |
252 | // On options item
253 | @Override
254 | public boolean onOptionsItemSelected(MenuItem item)
255 | {
256 | // Get id
257 | int id = item.getItemId();
258 | switch (id)
259 | {
260 | // Home
261 | case android.R.id.home:
262 | // Back navigation
263 | if (webView != null && webView.canGoBack())
264 | webView.goBack();
265 |
266 | else
267 | finish();
268 | break;
269 |
270 | // Print
271 | case R.id.action_print:
272 | print();
273 | break;
274 |
275 | // Open
276 | case R.id.action_open:
277 | open();
278 | break;
279 |
280 | // Open
281 | case R.id.action_markdown:
282 | markdown(item);
283 | break;
284 |
285 | // About
286 | case R.id.action_about:
287 | about();
288 | break;
289 |
290 | default:
291 | return false;
292 | }
293 |
294 | return true;
295 | }
296 |
297 | // On back pressed
298 | @Override
299 | public void onBackPressed()
300 | {
301 | // Back navigation
302 | if (webView != null && webView.canGoBack())
303 | webView.goBack();
304 |
305 | else
306 | finish();
307 | }
308 |
309 | // onActivityResult
310 | @Override
311 | protected void onActivityResult(int requestCode, int resultCode,
312 | Intent data)
313 | {
314 | if (resultCode == RESULT_CANCELED)
315 | return;
316 |
317 | switch (requestCode)
318 | {
319 | case OPEN_DOCUMENT:
320 | Uri uri = data.getData();
321 | readFile(uri);
322 | break;
323 | }
324 | }
325 |
326 | // dispatchKeyEvent
327 | @Override
328 | public boolean dispatchKeyEvent(KeyEvent event)
329 | {
330 | // Check Ctrl key
331 | if (event.isCtrlPressed())
332 | {
333 | switch (event.getAction())
334 | {
335 | case KeyEvent.ACTION_DOWN:
336 | switch (event.getKeyCode())
337 | {
338 | // Open
339 | case KeyEvent.KEYCODE_O:
340 | open();
341 | break;
342 |
343 | // Print
344 | case KeyEvent.KEYCODE_P:
345 | print();
346 | break;
347 | }
348 | }
349 | }
350 |
351 | return super.dispatchKeyEvent(event);
352 | }
353 |
354 | // readFile
355 | private void readFile(Uri uri)
356 | {
357 | // Show progress
358 | progress.setVisibility(View.VISIBLE);
359 |
360 | String url = uri.toString();
361 | if (URLUtil.isContentUrl(url))
362 | {
363 | ReadTask readTask = new ReadTask(this);
364 | readTask.execute(uri);
365 | }
366 |
367 | else
368 | webView.loadUrl(url);
369 | }
370 |
371 | private void open()
372 | {
373 | Intent intent = new Intent(Intent.ACTION_OPEN_DOCUMENT);
374 | intent.setType(TEXT_WILD);
375 | intent.addCategory(Intent.CATEGORY_OPENABLE);
376 | startActivityForResult(intent, OPEN_DOCUMENT);
377 | }
378 |
379 | // loadText
380 | private void loadText(String text)
381 | {
382 | if (markdown)
383 | {
384 | // Use commonmark
385 | List extensions =
386 | Arrays.asList(InsExtension.create(),
387 | SubExtension.create(),
388 | SupExtension.create(),
389 | TablesExtension.create(),
390 | AutolinkExtension.create(),
391 | StrikethroughExtension.create(),
392 | TaskListItemsExtension.create());
393 | Parser parser = Parser.builder().extensions(extensions).build();
394 | Node document = parser.parse(text);
395 | HtmlRenderer renderer = HtmlRenderer.builder()
396 | .extensions(extensions).build();
397 |
398 | String html = renderer.render(document);
399 |
400 | webView.loadDataWithBaseURL(ANDROID_ASSET,
401 | HTML_HEAD + html + HTML_TAIL,
402 | TEXT_HTML, UTF_8, null);
403 | }
404 |
405 | else
406 | webView.loadDataWithBaseURL(ANDROID_ASSET, text,
407 | TEXT_HTML, UTF_8, null);
408 | }
409 |
410 | // print
411 | private void print()
412 | {
413 | // Get a PrintManager instance
414 | PrintManager printManager = (PrintManager)
415 | getSystemService(PRINT_SERVICE);
416 |
417 | String jobName = getString(R.string.appName) + " Document";
418 |
419 | // Get a print adapter instance
420 | PrintDocumentAdapter printAdapter =
421 | webView.createPrintDocumentAdapter(jobName);
422 |
423 | // Create a print job with name and adapter instance
424 | printManager.print(jobName, printAdapter,
425 | new PrintAttributes.Builder()
426 | .setMediaSize(PrintAttributes.MediaSize.ISO_A4)
427 | .build());
428 | }
429 |
430 | // markdown
431 | private void markdown(MenuItem item)
432 | {
433 | markdown = !markdown;
434 | item.setChecked(markdown);
435 | }
436 |
437 | // about
438 | private void about()
439 | {
440 | AlertDialog.Builder builder = new AlertDialog.Builder(this);
441 | builder.setTitle(R.string.appName);
442 |
443 | DateFormat dateFormat = DateFormat.getDateTimeInstance();
444 | SpannableStringBuilder spannable =
445 | new SpannableStringBuilder(getText(R.string.version));
446 | Pattern pattern = Pattern.compile("%s");
447 | Matcher matcher = pattern.matcher(spannable);
448 | if (matcher.find())
449 | spannable.replace(matcher.start(), matcher.end(),
450 | BuildConfig.VERSION_NAME);
451 | matcher.reset(spannable);
452 | if (matcher.find())
453 | spannable.replace(matcher.start(), matcher.end(),
454 | dateFormat.format(BuildConfig.BUILT));
455 | builder.setMessage(spannable);
456 |
457 | // Add the button
458 | builder.setPositiveButton(android.R.string.ok, null);
459 |
460 | // Create the AlertDialog
461 | Dialog dialog = builder.show();
462 |
463 | // Set movement method
464 | TextView text = dialog.findViewById(android.R.id.message);
465 | if (text != null)
466 | text.setMovementMethod(LinkMovementMethod.getInstance());
467 | }
468 |
469 | // alertDialog
470 | private void alertDialog(int title, String message, int neutralButton)
471 | {
472 | AlertDialog.Builder builder = new AlertDialog.Builder(this);
473 | builder.setTitle(title);
474 | builder.setMessage(message);
475 |
476 | // Add the buttons
477 | builder.setNeutralButton(neutralButton, null);
478 |
479 | // Create the AlertDialog
480 | builder.show();
481 | }
482 |
483 | // ReadTask
484 | private static class ReadTask extends AsyncTask
485 | {
486 | WeakReference printWeakReference;
487 |
488 | ReadTask(Print print)
489 | {
490 | printWeakReference = new WeakReference(print);
491 | }
492 |
493 | @Override
494 | protected CharSequence doInBackground(Uri uris[])
495 | {
496 | StringBuilder stringBuilder = new StringBuilder();
497 | final Print print = printWeakReference.get();
498 | if (print == null)
499 | return stringBuilder;
500 |
501 | try (BufferedInputStream in = new BufferedInputStream
502 | (print.getContentResolver().openInputStream(uris[0])))
503 | {
504 | BufferedReader reader = new
505 | BufferedReader(new InputStreamReader(in));
506 |
507 | String line;
508 | while ((line = reader.readLine()) != null)
509 | {
510 | stringBuilder.append(line);
511 | stringBuilder.append(System.getProperty("line.separator"));
512 | }
513 | }
514 |
515 | catch (Exception e)
516 | {
517 | print.runOnUiThread(() ->
518 | print.alertDialog(R.string.appName,
519 | e.getMessage(),
520 | android.R.string.ok));
521 | e.printStackTrace();
522 | }
523 |
524 | return stringBuilder;
525 | }
526 |
527 | // onPostExecute
528 | @Override
529 | protected void onPostExecute(CharSequence result)
530 | {
531 | final Print print = printWeakReference.get();
532 | if (print == null)
533 | return;
534 |
535 | print.loadText(result.toString());
536 | }
537 | }
538 | }
539 |
--------------------------------------------------------------------------------
/src/main/java/org/commonmark/ext/sub/Sub.java:
--------------------------------------------------------------------------------
1 | package org.commonmark.ext.sub;
2 |
3 | import org.commonmark.node.CustomNode;
4 | import org.commonmark.node.Delimited;
5 |
6 | /**
7 | * A sub node containing text and other inline nodes as children.
8 | */
9 | public class Sub extends CustomNode implements Delimited {
10 |
11 | private static final String DELIMITER = "~";
12 |
13 | @Override
14 | public String getOpeningDelimiter() {
15 | return DELIMITER;
16 | }
17 |
18 | @Override
19 | public String getClosingDelimiter() {
20 | return DELIMITER;
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/java/org/commonmark/ext/sub/SubExtension.java:
--------------------------------------------------------------------------------
1 | package org.commonmark.ext.sub;
2 |
3 | import org.commonmark.Extension;
4 | import org.commonmark.ext.sub.internal.SubDelimiterProcessor;
5 | import org.commonmark.ext.sub.internal.SubHtmlNodeRenderer;
6 | import org.commonmark.ext.sub.internal.SubTextContentNodeRenderer;
7 | import org.commonmark.parser.Parser;
8 | import org.commonmark.renderer.NodeRenderer;
9 | import org.commonmark.renderer.html.HtmlNodeRendererContext;
10 | import org.commonmark.renderer.html.HtmlNodeRendererFactory;
11 | import org.commonmark.renderer.html.HtmlRenderer;
12 | import org.commonmark.renderer.text.TextContentNodeRendererContext;
13 | import org.commonmark.renderer.text.TextContentNodeRendererFactory;
14 | import org.commonmark.renderer.text.TextContentRenderer;
15 |
16 | /**
17 | * Extension for sub using ~
18 | *
19 | * Create it with {@link #create()} and then configure it on the builders
20 | * ({@link org.commonmark.parser.Parser.Builder#extensions(Iterable)},
21 | * {@link HtmlRenderer.Builder#extensions(Iterable)}).
22 | *
23 | *
24 | * The parsed sub text regions are turned into {@link Sub} nodes.
25 | *
26 | */
27 | public class SubExtension implements Parser.ParserExtension,
28 | HtmlRenderer.HtmlRendererExtension,
29 | TextContentRenderer.TextContentRendererExtension {
30 |
31 | private SubExtension() {
32 | }
33 |
34 | public static Extension create() {
35 | return new SubExtension();
36 | }
37 |
38 | @Override
39 | public void extend(Parser.Builder parserBuilder) {
40 | parserBuilder.customDelimiterProcessor(new SubDelimiterProcessor());
41 | }
42 |
43 | @Override
44 | public void extend(HtmlRenderer.Builder rendererBuilder) {
45 | rendererBuilder.nodeRendererFactory(new HtmlNodeRendererFactory() {
46 | @Override
47 | public NodeRenderer create(HtmlNodeRendererContext context) {
48 | return new SubHtmlNodeRenderer(context);
49 | }
50 | });
51 | }
52 |
53 | @Override
54 | public void extend(TextContentRenderer.Builder rendererBuilder) {
55 | rendererBuilder.nodeRendererFactory(new TextContentNodeRendererFactory() {
56 | @Override
57 | public NodeRenderer create(TextContentNodeRendererContext context) {
58 | return new SubTextContentNodeRenderer(context);
59 | }
60 | });
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/java/org/commonmark/ext/sub/internal/SubDelimiterProcessor.java:
--------------------------------------------------------------------------------
1 | package org.commonmark.ext.sub.internal;
2 |
3 | import org.commonmark.ext.sub.Sub;
4 | import org.commonmark.node.Node;
5 | import org.commonmark.node.Nodes;
6 | import org.commonmark.node.SourceSpans;
7 | import org.commonmark.node.Text;
8 | import org.commonmark.parser.delimiter.DelimiterProcessor;
9 | import org.commonmark.parser.delimiter.DelimiterRun;
10 |
11 | public class SubDelimiterProcessor implements DelimiterProcessor {
12 |
13 | @Override
14 | public char getOpeningCharacter() {
15 | return '~';
16 | }
17 |
18 | @Override
19 | public char getClosingCharacter() {
20 | return '~';
21 | }
22 |
23 | @Override
24 | public int getMinLength() {
25 | return 1;
26 | }
27 |
28 | @Override
29 | public int process(DelimiterRun openingRun, DelimiterRun closingRun) {
30 | if (openingRun.length() == 1 && closingRun.length() == 1) {
31 | // Use exactly one delimiter.
32 |
33 | Text opener = openingRun.getOpener();
34 |
35 | // Wrap nodes between delimiters in sub.
36 | Node sub = new Sub();
37 |
38 | SourceSpans sourceSpans = new SourceSpans();
39 | sourceSpans.addAllFrom(openingRun.getOpeners(1));
40 |
41 | for (Node node : Nodes.between(opener, closingRun.getCloser())) {
42 | sub.appendChild(node);
43 | sourceSpans.addAll(node.getSourceSpans());
44 | }
45 |
46 | sourceSpans.addAllFrom(closingRun.getClosers(1));
47 | sub.setSourceSpans(sourceSpans.getSourceSpans());
48 |
49 | opener.insertAfter(sub);
50 |
51 | return 1;
52 | } else {
53 | return 0;
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/src/main/java/org/commonmark/ext/sub/internal/SubHtmlNodeRenderer.java:
--------------------------------------------------------------------------------
1 | package org.commonmark.ext.sub.internal;
2 |
3 | import org.commonmark.node.Node;
4 | import org.commonmark.renderer.html.HtmlNodeRendererContext;
5 | import org.commonmark.renderer.html.HtmlWriter;
6 |
7 | import java.util.Collections;
8 | import java.util.Map;
9 |
10 | public class SubHtmlNodeRenderer extends SubNodeRenderer {
11 |
12 | private final HtmlNodeRendererContext context;
13 | private final HtmlWriter html;
14 |
15 | public SubHtmlNodeRenderer(HtmlNodeRendererContext context) {
16 | this.context = context;
17 | this.html = context.getWriter();
18 | }
19 |
20 | @Override
21 | public void render(Node node) {
22 | Map attributes = context.extendAttributes(node, "sub", Collections.emptyMap());
23 | html.tag("sub", attributes);
24 | renderChildren(node);
25 | html.tag("/sub");
26 | }
27 |
28 | private void renderChildren(Node parent) {
29 | Node node = parent.getFirstChild();
30 | while (node != null) {
31 | Node next = node.getNext();
32 | context.render(node);
33 | node = next;
34 | }
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/org/commonmark/ext/sub/internal/SubNodeRenderer.java:
--------------------------------------------------------------------------------
1 | package org.commonmark.ext.sub.internal;
2 |
3 | import org.commonmark.ext.sub.Sub;
4 | import org.commonmark.node.Node;
5 | import org.commonmark.renderer.NodeRenderer;
6 |
7 | import java.util.Collections;
8 | import java.util.Set;
9 |
10 | abstract class SubNodeRenderer implements NodeRenderer {
11 |
12 | @Override
13 | public Set> getNodeTypes() {
14 | return Collections.>singleton(Sub.class);
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/java/org/commonmark/ext/sub/internal/SubTextContentNodeRenderer.java:
--------------------------------------------------------------------------------
1 | package org.commonmark.ext.sub.internal;
2 |
3 | import org.commonmark.node.Node;
4 | import org.commonmark.renderer.text.TextContentNodeRendererContext;
5 |
6 | public class SubTextContentNodeRenderer extends SubNodeRenderer {
7 |
8 | private final TextContentNodeRendererContext context;
9 |
10 | public SubTextContentNodeRenderer(TextContentNodeRendererContext context) {
11 | this.context = context;
12 | }
13 |
14 | @Override
15 | public void render(Node node) {
16 | renderChildren(node);
17 | }
18 |
19 | private void renderChildren(Node parent) {
20 | Node node = parent.getFirstChild();
21 | while (node != null) {
22 | Node next = node.getNext();
23 | context.render(node);
24 | node = next;
25 | }
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/java/org/commonmark/ext/sup/Sup.java:
--------------------------------------------------------------------------------
1 | package org.commonmark.ext.sup;
2 |
3 | import org.commonmark.node.CustomNode;
4 | import org.commonmark.node.Delimited;
5 |
6 | /**
7 | * A sup node containing text and other inline nodes as children.
8 | */
9 | public class Sup extends CustomNode implements Delimited {
10 |
11 | private static final String DELIMITER = "^";
12 |
13 | @Override
14 | public String getOpeningDelimiter() {
15 | return DELIMITER;
16 | }
17 |
18 | @Override
19 | public String getClosingDelimiter() {
20 | return DELIMITER;
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/java/org/commonmark/ext/sup/SupExtension.java:
--------------------------------------------------------------------------------
1 | package org.commonmark.ext.sup;
2 |
3 | import org.commonmark.Extension;
4 | import org.commonmark.ext.sup.internal.SupDelimiterProcessor;
5 | import org.commonmark.ext.sup.internal.SupHtmlNodeRenderer;
6 | import org.commonmark.ext.sup.internal.SupTextContentNodeRenderer;
7 | import org.commonmark.parser.Parser;
8 | import org.commonmark.renderer.NodeRenderer;
9 | import org.commonmark.renderer.html.HtmlNodeRendererContext;
10 | import org.commonmark.renderer.html.HtmlNodeRendererFactory;
11 | import org.commonmark.renderer.html.HtmlRenderer;
12 | import org.commonmark.renderer.text.TextContentNodeRendererContext;
13 | import org.commonmark.renderer.text.TextContentNodeRendererFactory;
14 | import org.commonmark.renderer.text.TextContentRenderer;
15 |
16 | /**
17 | * Extension for sup using ^
18 | *
19 | * Create it with {@link #create()} and then configure it on the builders
20 | * ({@link org.commonmark.parser.Parser.Builder#extensions(Iterable)},
21 | * {@link HtmlRenderer.Builder#extensions(Iterable)}).
22 | *
23 | *
24 | * The parsed sup text regions are turned into {@link Sup} nodes.
25 | *
26 | */
27 | public class SupExtension implements Parser.ParserExtension,
28 | HtmlRenderer.HtmlRendererExtension,
29 | TextContentRenderer.TextContentRendererExtension {
30 |
31 | private SupExtension() {
32 | }
33 |
34 | public static Extension create() {
35 | return new SupExtension();
36 | }
37 |
38 | @Override
39 | public void extend(Parser.Builder parserBuilder) {
40 | parserBuilder.customDelimiterProcessor(new SupDelimiterProcessor());
41 | }
42 |
43 | @Override
44 | public void extend(HtmlRenderer.Builder rendererBuilder) {
45 | rendererBuilder.nodeRendererFactory(new HtmlNodeRendererFactory() {
46 | @Override
47 | public NodeRenderer create(HtmlNodeRendererContext context) {
48 | return new SupHtmlNodeRenderer(context);
49 | }
50 | });
51 | }
52 |
53 | @Override
54 | public void extend(TextContentRenderer.Builder rendererBuilder) {
55 | rendererBuilder.nodeRendererFactory(new TextContentNodeRendererFactory() {
56 | @Override
57 | public NodeRenderer create(TextContentNodeRendererContext context) {
58 | return new SupTextContentNodeRenderer(context);
59 | }
60 | });
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/java/org/commonmark/ext/sup/internal/SupDelimiterProcessor.java:
--------------------------------------------------------------------------------
1 | package org.commonmark.ext.sup.internal;
2 |
3 | import org.commonmark.ext.sup.Sup;
4 | import org.commonmark.node.Node;
5 | import org.commonmark.node.Nodes;
6 | import org.commonmark.node.SourceSpans;
7 | import org.commonmark.node.Text;
8 | import org.commonmark.parser.delimiter.DelimiterProcessor;
9 | import org.commonmark.parser.delimiter.DelimiterRun;
10 |
11 | public class SupDelimiterProcessor implements DelimiterProcessor {
12 |
13 | @Override
14 | public char getOpeningCharacter() {
15 | return '^';
16 | }
17 |
18 | @Override
19 | public char getClosingCharacter() {
20 | return '^';
21 | }
22 |
23 | @Override
24 | public int getMinLength() {
25 | return 1;
26 | }
27 |
28 | @Override
29 | public int process(DelimiterRun openingRun, DelimiterRun closingRun) {
30 | if (openingRun.length() == 1 && closingRun.length() == 1) {
31 | // Use exactly one delimiter.
32 |
33 | Text opener = openingRun.getOpener();
34 |
35 | // Wrap nodes between delimiters in sup.
36 | Node sup = new Sup();
37 |
38 | SourceSpans sourceSpans = new SourceSpans();
39 | sourceSpans.addAllFrom(openingRun.getOpeners(1));
40 |
41 | for (Node node : Nodes.between(opener, closingRun.getCloser())) {
42 | sup.appendChild(node);
43 | sourceSpans.addAll(node.getSourceSpans());
44 | }
45 |
46 | sourceSpans.addAllFrom(closingRun.getClosers(1));
47 | sup.setSourceSpans(sourceSpans.getSourceSpans());
48 |
49 | opener.insertAfter(sup);
50 |
51 | return 1;
52 | } else {
53 | return 0;
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/src/main/java/org/commonmark/ext/sup/internal/SupHtmlNodeRenderer.java:
--------------------------------------------------------------------------------
1 | package org.commonmark.ext.sup.internal;
2 |
3 | import org.commonmark.node.Node;
4 | import org.commonmark.renderer.html.HtmlNodeRendererContext;
5 | import org.commonmark.renderer.html.HtmlWriter;
6 |
7 | import java.util.Collections;
8 | import java.util.Map;
9 |
10 | public class SupHtmlNodeRenderer extends SupNodeRenderer {
11 |
12 | private final HtmlNodeRendererContext context;
13 | private final HtmlWriter html;
14 |
15 | public SupHtmlNodeRenderer(HtmlNodeRendererContext context) {
16 | this.context = context;
17 | this.html = context.getWriter();
18 | }
19 |
20 | @Override
21 | public void render(Node node) {
22 | Map attributes = context.extendAttributes(node, "sup", Collections.emptyMap());
23 | html.tag("sup", attributes);
24 | renderChildren(node);
25 | html.tag("/sup");
26 | }
27 |
28 | private void renderChildren(Node parent) {
29 | Node node = parent.getFirstChild();
30 | while (node != null) {
31 | Node next = node.getNext();
32 | context.render(node);
33 | node = next;
34 | }
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/org/commonmark/ext/sup/internal/SupNodeRenderer.java:
--------------------------------------------------------------------------------
1 | package org.commonmark.ext.sup.internal;
2 |
3 | import org.commonmark.ext.sup.Sup;
4 | import org.commonmark.node.Node;
5 | import org.commonmark.renderer.NodeRenderer;
6 |
7 | import java.util.Collections;
8 | import java.util.Set;
9 |
10 | abstract class SupNodeRenderer implements NodeRenderer {
11 |
12 | @Override
13 | public Set> getNodeTypes() {
14 | return Collections.>singleton(Sup.class);
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/java/org/commonmark/ext/sup/internal/SupTextContentNodeRenderer.java:
--------------------------------------------------------------------------------
1 | package org.commonmark.ext.sup.internal;
2 |
3 | import org.commonmark.node.Node;
4 | import org.commonmark.renderer.text.TextContentNodeRendererContext;
5 |
6 | public class SupTextContentNodeRenderer extends SupNodeRenderer {
7 |
8 | private final TextContentNodeRendererContext context;
9 |
10 | public SupTextContentNodeRenderer(TextContentNodeRendererContext context) {
11 | this.context = context;
12 | }
13 |
14 | @Override
15 | public void render(Node node) {
16 | renderChildren(node);
17 | }
18 |
19 | private void renderChildren(Node parent) {
20 | Node node = parent.getFirstChild();
21 | while (node != null) {
22 | Node next = node.getNext();
23 | context.render(node);
24 | node = next;
25 | }
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/res/drawable/ic_folder_open_white_24dp.xml:
--------------------------------------------------------------------------------
1 |
6 |
9 |
10 |
--------------------------------------------------------------------------------
/src/main/res/drawable/ic_launcher.xml:
--------------------------------------------------------------------------------
1 |
2 |
8 |
9 |
12 |
15 |
18 |
21 |
24 |
27 |
30 |
33 |
36 |
39 |
42 |
45 |
48 |
49 |
--------------------------------------------------------------------------------
/src/main/res/drawable/ic_print_white_24dp.xml:
--------------------------------------------------------------------------------
1 |
6 |
9 |
10 |
--------------------------------------------------------------------------------
/src/main/res/layout/main.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
9 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/src/main/res/menu/main.xml:
--------------------------------------------------------------------------------
1 |
2 |
29 |
--------------------------------------------------------------------------------
/src/main/res/values-fr/strings.xml (Traduction française du fichier strings.xml):
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Imprimante
5 |
6 | À propos
7 | Ouvrir
8 | Imprimer
9 | Markdown
10 | %s\n\nBuilt
12 | %s\n\nCopyright \u00A9 2021 Bill Farmer\n\nLicence
14 | GNU GPLv3
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/src/main/res/values-tr/strings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Yazdır
5 |
6 | Hakkında
7 | Açık
8 | Yazdır
9 | İşaretle
10 | %s\n\nBuilt
12 | %s\n\nCopyright \u00A9 2021 Bill Farmer\n\nLicence
14 | GNU GPLv3
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/src/main/res/values/strings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Print
5 |
6 | About
7 | Open
8 | Print
9 | Markdown
10 | %s\n\nBuilt
12 | %s\n\nCopyright \u00A9 2021 Bill Farmer\n\nLicence
14 | GNU GPLv3
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/src/main/res/values/styles.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
9 |
16 |
17 |
18 |
24 |
25 |
26 |
--------------------------------------------------------------------------------