├── .gitignore ├── .travis.yml ├── Changelog ├── INSTALL ├── LICENSE ├── Makefile.am ├── Makefile.in ├── README.md ├── acconfig.h ├── autogen ├── chardet-config.h.in ├── chardet-config.in ├── chardet.pc.in ├── configure ├── configure.ac ├── include ├── nscore.h ├── prmem.h └── version.h ├── libchardet.spec.in ├── m4 ├── ax_gcc_visibiliity.m4 ├── libtool.m4 ├── ltoptions.m4 ├── ltsugar.m4 ├── ltversion.m4 └── lt~obsolete.m4 ├── man ├── Makefile.am ├── Makefile.in ├── en │ ├── detect.3 │ ├── detect_destroy.3 │ ├── detect_handledata.3 │ ├── detect_handledata_r.3 │ ├── detect_init.3 │ ├── detect_obj_free.3 │ ├── detect_obj_init.3 │ ├── detect_r.3 │ └── detect_reset.3 └── ko │ ├── detect.3 │ ├── detect_destroy.3 │ ├── detect_handledata.3 │ ├── detect_handledata_r.3 │ ├── detect_init.3 │ ├── detect_obj_free.3 │ ├── detect_obj_init.3 │ ├── detect_r.3 │ └── detect_reset.3 ├── patch └── original-encode-detect-1.01.patch ├── project └── libchardet.cbp ├── src ├── CharDistribution.cpp ├── CharDistribution.h ├── JpCntx.cpp ├── JpCntx.h ├── Makefile.am ├── Makefile.in ├── chardet.cpp ├── chardet.h ├── nsBig5Prober.cpp ├── nsBig5Prober.h ├── nsCharSetProber.cpp ├── nsCharSetProber.h ├── nsCodingStateMachine.h ├── nsEUCJPProber.cpp ├── nsEUCJPProber.h ├── nsEUCKRProber.cpp ├── nsEUCKRProber.h ├── nsEUCTWProber.cpp ├── nsEUCTWProber.h ├── nsEscCharsetProber.cpp ├── nsEscCharsetProber.h ├── nsEscSM.cpp ├── nsGB2312Prober.cpp ├── nsGB2312Prober.h ├── nsHebrewProber.cpp ├── nsHebrewProber.h ├── nsLatin1Prober.cpp ├── nsLatin1Prober.h ├── nsMBCSGroupProber.cpp ├── nsMBCSGroupProber.h ├── nsMBCSSM.cpp ├── nsPkgInt.h ├── nsSBCSGroupProber.cpp ├── nsSBCSGroupProber.h ├── nsSBCharSetProber.cpp ├── nsSBCharSetProber.h ├── nsSJISProber.cpp ├── nsSJISProber.h ├── nsUTF8Prober.cpp ├── nsUTF8Prober.h ├── nsUniversalDetector.cpp ├── nsUniversalDetector.h └── tables │ ├── Big5Freq.tab │ ├── EUCKRFreq.tab │ ├── EUCTWFreq.tab │ ├── GB2312Freq.tab │ ├── JISFreq.tab │ ├── LangArabicModel.cpp │ ├── 
LangBulgarianModel.cpp │ ├── LangCyrillicModel.cpp │ ├── LangDanishModel.cpp │ ├── LangEsperantoModel.cpp │ ├── LangFrenchModel.cpp │ ├── LangGermanModel.cpp │ ├── LangGreekModel.cpp │ ├── LangHebrewModel.cpp │ ├── LangHungarianModel.cpp │ ├── LangSpanishModel.cpp │ ├── LangThaiModel.cpp │ ├── LangTurkishModel.cpp │ └── LangVietnameseModel.cpp ├── test ├── Makefile.am ├── Makefile.in ├── bom-test.c ├── sample.c ├── sample1.c ├── utf-8-bom.txt └── utf-8.txt └── tools ├── ar-lib ├── compile ├── config.guess ├── config.sub ├── install-sh ├── ltmain.sh ├── missing └── test-driver /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *~ 3 | *.o 4 | *.lo 5 | *.la 6 | .libs 7 | Makefile 8 | config.h 9 | config.log 10 | config.status 11 | chardet-config 12 | chardet-config.h 13 | *.pc 14 | *.spec 15 | libtool 16 | stamp-h1 17 | tags 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | 3 | compiler: 4 | - gcc 5 | - clang 6 | 7 | addons: 8 | apt: 9 | packages: 10 | - automake 11 | - autoconf 12 | - gdb 13 | - libtool 14 | - autotools-dev 15 | - valgrind 16 | 17 | 18 | matrix: 19 | exclude: 20 | - compiler: "gcc" 21 | - compiler: "clang" 22 | include: 23 | - compiler: "gcc" 24 | env: CFLAGS="-g -O2 -Wall" 25 | - compiler: "clang" 26 | env: CFLAGS="-g -O2 -Wall" 27 | 28 | script: 29 | - ./configure && make 30 | 31 | notifications: 32 | email: false 33 | -------------------------------------------------------------------------------- /Changelog: -------------------------------------------------------------------------------- 1 | Mozilla's Universal Charset Detector C/C++ library 2 | 3 | *** 1.0.7 4 | 5 | 2021/05/25 KST 6 | - give a little weight to "probable sequences" about single byte charset 7 | - update langage table for Greek, Thai and Vietnamese 8 | 9 | 10 | *** 1.0.6 11 | 
12 | 2021/05/14 KST 13 | - fixed #14 can't detect short euc-kr 14 | 15 | 2021/05/13 KST 16 | - fixed #18 SECURITY! Invalid memory approach (heap-use-after-free) (@gaoxiang-ut) 17 | 18 | 2019/08/01 KST 19 | - fixed #13 bom member has been added to the DetectObj structure 20 | - fixed #15 support automake style 'make check' 21 | 22 | 2017/03/07 KST 23 | - fixed #12 No include guard 24 | 25 | 2016/08/20 KST 26 | - fixed #9 configure.ac needs subdir-objects 27 | - fixed #10 autogen failure because AM_PROG_AR with automake 1.11.1 28 | 29 | 30 | *** 1.0.5 31 | 32 | 2016/05/11 KST 33 | - release 1.0.5 34 | - fixed #6 can not detect EUC extended area 35 | 36 | 2016/05/05 KST 37 | - fixed #5 no detect control character on US-ASCII 38 | - fixed #7 wrong detect Danash ISO-8859-15 39 | - fixed #8 fixed binary safe problems 40 | . replace detect to detect_r 41 | . replace detect_handledata to detect_handledata_r 42 | 43 | 2016/05/04 KST 44 | - fixed #1 separate model directory 45 | - merge uchardet's improvement 46 | . fixed #2 Improve single-byte charset detection confidence algorithm 47 | . fixed #3 update model of Greek, Hungarian and Thai 48 | . fixed #4 new language model (Arabic, Danish, Esperanto, German, Spanish, 49 | Turkish, Vietnamese) 50 | 51 | 2015/12/10 KST 52 | - fixed man pages wrong macro bug (martin.gansser@gmail.com) 53 | 54 | 55 | *** 1.0.4 56 | 57 | 2014/02/14 KST 58 | - fixed duplicated path on chardet.pc 59 | - release 1.0.4 60 | 61 | 2014/02/13 KST 62 | - support windows native library with MinGW 63 | - support Code::Blocks project 64 | 65 | 2014/02/11 KST 66 | - fixed build error and shared library on cygwin 67 | 68 | 69 | *** 1.0.3 70 | 71 | 2014/02/10 KST 72 | - add chardet.pc by Lee ByungYoung 73 | - applied automake 74 | - add english man page 75 | 76 | 2012/08/14 KST 77 | - fixed comparison on JpCntx.cpp 78 | 79 | 80 | *** 1.0.2 81 | 82 | 2010/07/05 KST 83 | - support visibility attribute on gcc4. 
84 | - add version information api 85 | 86 | 87 | *** 1.0.1 88 | 89 | 2009/02/23 KST 90 | - fix wrong detect TIS-620 charset 91 | - release 1.0.1 92 | 93 | 94 | *** 1.0.0 95 | 96 | 2009/02/23 KST JoungKyun.Kim 97 | - add version constant 98 | 99 | 2009/02/22 KST JoungKyun.Kim 100 | - release 1.0.0 based on perl Encode-Detect-1.01 101 | 102 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | libchardet Installation Documentation 2 | 3 | Author: JoungKyun.Kim 4 | 2015/12/11 5 | 6 | 1. Distribution support 7 | 8 | Ubuntu: from 14.04 9 | apt-get install libchardet1 libchardet-dev 10 | 11 | Fedora: from Fedora 23 12 | dnf install libchardet libchardet-devel 13 | yum install libchardet libchardet-devel 14 | 15 | Mageia Cauldron : 16 | urpmi libchardet libchardet-devel 17 | 18 | Arch / ChakraOS : 19 | pacman -Syu; pacman -S libchardet 20 | 21 | 22 | 2. Source compile 23 | 24 | 2-1. Quick Install 25 | 26 | shell> ./configure 27 | shell> make 28 | shell> make install 29 | 30 | 2-2. configure 31 | 32 | libchardet uses GNU autoconf and has no options except the default options of 33 | GNU configure. You can also use the following environment variables. 34 | 35 | CFLAGS, CXXFLAGS, DEFS, LIBS, LDFLAGS, CPPFLAGS, and so on. 36 | 37 | 2-3. make 38 | 39 | You can enable parallel builds with the -j option. 40 | 41 | 2-4. make install 42 | 43 | If you set the DESTDIR variable, you can do a test install wherever you want. 44 | For example, execute `make DESTDIR=PATH install` 45 | 46 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | # Version: MPL 1.1/GPL 2.0/LGPL 2.1 2 | # 3 | # The contents of this file are subject to the Mozilla Public License Version 4 | # 1.1 (the "License"); you may not use this file except in compliance with 5 | # the License. 
You may obtain a copy of the License at 6 | # http://www.mozilla.org/MPL/ 7 | # 8 | # Software distributed under the License is distributed on an "AS IS" basis, 9 | # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 10 | # for the specific language governing rights and limitations under the 11 | # License. 12 | # 13 | # Mozilla's universal charset detector C/C++ Wrapping API 14 | # Writer(s) : 15 | # Detect class by John Gardiner Myers 16 | # C wrapping API by JoungKyun.Kim 17 | # 18 | # Alternatively, the contents of this file may be used under the terms of 19 | # either the GNU General Public License Version 2 or later (the "GPL"), or 20 | # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 21 | # in which case the provisions of the GPL or the LGPL are applicable instead 22 | # of those above. If you wish to allow use of your version of this file only 23 | # under the terms of either the GPL or the LGPL, and not to allow others to 24 | # use your version of this file under the terms of the MPL, indicate your 25 | # decision by deleting the provisions above and replace them with the notice 26 | # and other provisions required by the GPL or the LGPL. If you do not delete 27 | # the provisions above, a recipient may use your version of this file under 28 | # the terms of any one of the MPL, the GPL or the LGPL. 29 | 30 | DISTCLEANFILES = 31 | EXTRA_DIST = Changelog LICENSE libchardet.spec \ 32 | project/libchardet.cbp test/utf-8.txt test/utf-8-bom.txt 33 | 34 | ACLOCAL_AMFLAGS = -I m4 35 | SUBDIRS = . 
src man test 36 | 37 | #docdir = $(datadir)/doc/$(PACKAGE)-$(VERSION) 38 | dist_doc_DATA = Changelog LICENSE 39 | 40 | bin_SCRIPTS = chardet-config 41 | pkgconfigdir=$(libdir)/pkgconfig 42 | pkgconfig_DATA = chardet.pc 43 | 44 | DISTCLEANFILES += *.bz2 aclocal.m4 *~ 45 | 46 | test-install: 47 | $(MAKE) DESTDIR=`pwd`/test-install install 48 | 49 | distclean-local: 50 | rm -rf autom4te.cache test-install 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | libchardet - Mozilla's Universal Charset Detector C/C++ API 2 | === 3 | [![Build Status](https://travis-ci.org/Joungkyun/libchardet.svg?branch=master)](https://travis-ci.org/Joungkyun/libchardet) 4 | [![GitHub license](https://img.shields.io/badge/license-MPL%201.1-blue.svg)](https://raw.githubusercontent.com/Joungkyun/libchardet/master/LICENSE) 5 | [![GitHub last release](https://img.shields.io/github/release/joungkyun/libchardet.svg)](https://github.com/joungkyun/libchardet/releases) 6 | [![GitHub closed issues](https://img.shields.io/github/issues-closed-raw/joungkyun/libchardet.svg)](https://github.com/joungkyun/libchardet/issues?q=is%3Aissue+is%3Aclosed) 7 | [![GitHub closed pull requests](https://img.shields.io/github/issues-pr-closed-raw/joungkyun/libchardet.svg)](https://github.com/joungkyun/libchardet/pulls?q=is%3Apr+is%3Aclosed) 8 | 9 | ## License 10 | Copyright (c) 2019 JoungKyun.Kim All rights reserved. 11 | 12 | This program is under MPL 1.1 or LGPL 2.1 13 | 14 | ## Description 15 | libchardet is based on Mozilla Universal Charset Detector library and, detects 16 | the character set used to encode data. 
17 | 18 | [Original code](http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/) was written by Netscape Communications Corporation. Techniques used by universalchardet are described at <http://www-archive.mozilla.org/projects/intl/UniversalCharsetDetection.html>. 19 | 20 | libchardet also draws on John Gardiner Myers's [Encode-Detect-1.01](http://search.cpan.org/~jgmyers/Encode-Detect-1.01/) 21 | perl module, and adds a C wrapping API and a library build environment with libtool. 22 | 23 | From 1.0.5, libchardet incorporates the single-byte charset detection confidence 24 | algorithm of [uchardet](https://github.com/BYVoid/uchardet/) and new language models. 25 | (Arabic, Danish, Esperanto, German, Spanish, Turkish, Vietnamese) 26 | 27 | From 1.0.6, a bom member has been added to the DetectObj structure. 28 | A value of 1 for the bom member means that a BOM has been detected. 29 | Support for the bom member can be determined by the existence of the CHARDET_BOM_CHECK 30 | constant. See example below. 31 | 32 | ## Installation 33 | 34 | See also [INSTALL](INSTALL) document 35 | 36 | ## Sample Codes 37 | 38 | See also test directory of source code 39 | 40 | ```c 41 | #include <chardet.h> 42 | 43 | int main (void) { 44 | DetectObj *obj; 45 | char * str = "안녕하세요"; 46 | 47 | if ( (obj = detect_obj_init ()) == NULL ) { 48 | fprintf (stderr, "Memory Allocation failed\n"); 49 | return CHARDET_MEM_ALLOCATED_FAIL; 50 | } 51 | 52 | #ifndef CHARDET_BINARY_SAFE 53 | // before 1.0.5. 
This API is deprecated on 1.0.5 54 | switch (detect (str, &obj)) 55 | #else 56 | // from 1.0.5 57 | switch (detect_r (str, strlen (str), &obj)) 58 | #endif 59 | { 60 | case CHARDET_OUT_OF_MEMORY : 61 | fprintf (stderr, "On handle processing, occured out of memory\n"); 62 | detect_obj_free (&obj); 63 | return CHARDET_OUT_OF_MEMORY; 64 | case CHARDET_NULL_OBJECT : 65 | fprintf (stderr, 66 | "2st argument of chardet() is must memory allocation " 67 | "with detect_obj_init API\n"); 68 | return CHARDET_NULL_OBJECT; 69 | } 70 | 71 | #ifndef CHARDET_BOM_CHECK 72 | printf ("encoding: %s, confidence: %f\n", obj->encoding, obj->confidence); 73 | #else 74 | // from 1.0.6 support return whether exists BOM 75 | printf ( 76 | "encoding: %s, confidence: %f, exist BOM: %d\n", 77 | obj->encoding, obj->confidence, obj->bom 78 | ); 79 | #endif 80 | detect_obj_free (&obj); 81 | 82 | return 0; 83 | } 84 | ``` 85 | 86 | or looping code 87 | 88 | ```c 89 | #include <chardet.h> 90 | 91 | int main (void) { 92 | Detect * d; 93 | DetectObj * obj; 94 | char * str = "안녕하세요"; 95 | 96 | if ( (d = detect_init ()) == NULL ) { 97 | fprintf (stderr, "chardet handle initialize failed\n"); 98 | return CHARDET_MEM_ALLOCATED_FAIL; 99 | } 100 | 101 | while ( 1 ) { 102 | detect_reset (&d); 103 | 104 | if ( (obj = detect_obj_init ()) == NULL ) { 105 | fprintf (stderr, "Memory Allocation failed\n"); 106 | return CHARDET_MEM_ALLOCATED_FAIL; 107 | } 108 | 109 | #ifndef CHARDET_BINARY_SAFE 110 | // before 1.0.5. 
This API is deprecated on 1.0.5 111 | switch (detect_handledata (&d, str, &obj)) 112 | #else 113 | // from 1.0.5 114 | switch (detect_handledata_r (&d, str, strlen (str), &obj)) 115 | #endif 116 | { 117 | case CHARDET_OUT_OF_MEMORY : 118 | fprintf (stderr, "On handle processing, occured out of memory\n"); 119 | detect_obj_free (&obj); 120 | return CHARDET_OUT_OF_MEMORY; 121 | case CHARDET_NULL_OBJECT : 122 | fprintf (stderr, 123 | "2st argument of chardet() is must memory allocation " 124 | "with detect_obj_init API\n"); 125 | return CHARDET_NULL_OBJECT; 126 | } 127 | 128 | #ifndef CHARDET_BOM_CHECK 129 | printf ("encoding: %s, confidence: %f\n", obj->encoding, obj->confidence); 130 | #else 131 | // from 1.0.6 support return whether exists BOM 132 | printf ( 133 | "encoding: %s, confidence: %f, exist BOM: %d\n", 134 | obj->encoding, obj->confidence, obj->bom 135 | ); 136 | #endif 137 | detect_obj_free (&obj); 138 | 139 | if ( 1 ) 140 | break; 141 | } 142 | detect_destroy (&d); 143 | 144 | return 0; 145 | } 146 | ``` 147 | 148 | ## APIs 149 | * PHP Extension - https://github.com/OOPS-ORG-PHP/mod_chardet 150 | * PYTHON C Binding - https://github.com/Joungkyun/python-chardet 151 | * PERL - http://search.cpan.org/~jgmyers/Encode-Detect-1.01/Detect.pm 152 | -------------------------------------------------------------------------------- /acconfig.h: -------------------------------------------------------------------------------- 1 | #ifndef CHARDET_CONFIG_H 2 | #define CHARDET_CONFIG_H 3 | 4 | 5 | /* config.h: a general config file */ 6 | 7 | @TOP@ 8 | 9 | @BOTTOM@ 10 | 11 | #endif /* CHARDET_CONFIG_H */ 12 | -------------------------------------------------------------------------------- /autogen: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export LIBTOOL=/usr/bin/libtool 4 | export AUTOMAKE=/usr/bin/automake 5 | export ACLOCAL=/usr/bin/aclocal 6 | rm -f configure tools/* 7 | autoreconf --install 8 | rm -rf 
autom4te* aclocal.m4 *~ 9 | 10 | exit 0 11 | -------------------------------------------------------------------------------- /chardet-config.h.in: -------------------------------------------------------------------------------- 1 | /* chardet-config.h.in. Generated from configure.ac by autoheader. */ 2 | #ifndef CHARDET_CONFIG_H 3 | #define CHARDET_CONFIG_H 4 | 5 | 6 | /* config.h: a general config file */ 7 | 8 | 9 | 10 | /* Define to 1 if you have the header file. */ 11 | #undef HAVE_DLFCN_H 12 | 13 | /* Define to 1, depending whether the compiler DLL_EXPORT declarations. */ 14 | #undef HAVE_DLL_EXPORT 15 | 16 | /* Define to 1 if you have the header file. */ 17 | #undef HAVE_INTTYPES_H 18 | 19 | /* Define to 1 if your system has a GNU libc compatible `malloc' function, and 20 | to 0 otherwise. */ 21 | #undef HAVE_MALLOC 22 | 23 | /* Define to 1 if you have the header file. */ 24 | #undef HAVE_MEMORY_H 25 | 26 | /* Define to 1 if `stat' has the bug that it succeeds when given the 27 | zero-length file name argument. */ 28 | #undef HAVE_STAT_EMPTY_STRING_BUG 29 | 30 | /* Define to 1 if you have the header file. */ 31 | #undef HAVE_STDINT_H 32 | 33 | /* Define to 1 if you have the header file. */ 34 | #undef HAVE_STDLIB_H 35 | 36 | /* Define to 1 if you have the header file. */ 37 | #undef HAVE_STRINGS_H 38 | 39 | /* Define to 1 if you have the header file. */ 40 | #undef HAVE_STRING_H 41 | 42 | /* Define to 1 if you have the header file. */ 43 | #undef HAVE_SYS_STAT_H 44 | 45 | /* Define to 1 if you have the header file. */ 46 | #undef HAVE_SYS_TYPES_H 47 | 48 | /* Define to 1 if you have the header file. */ 49 | #undef HAVE_UNISTD_H 50 | 51 | /* Define to 1, depending whether the compiler supports simple visibility 52 | declarations. */ 53 | #undef HAVE_VISIBILITY 54 | 55 | /* Define to 1 if `lstat' dereferences a symlink specified with a trailing 56 | slash. 
*/ 57 | #undef LSTAT_FOLLOWS_SLASHED_SYMLINK 58 | 59 | /* Define to the sub-directory in which libtool stores uninstalled libraries. 60 | */ 61 | #undef LT_OBJDIR 62 | 63 | /* Name of package */ 64 | #undef PACKAGE 65 | 66 | /* Define to the address where bug reports for this package should be sent. */ 67 | #undef PACKAGE_BUGREPORT 68 | 69 | /* Define to the full name of this package. */ 70 | #undef PACKAGE_NAME 71 | 72 | /* Define to the full name and version of this package. */ 73 | #undef PACKAGE_STRING 74 | 75 | /* Define to the one symbol short name of this package. */ 76 | #undef PACKAGE_TARNAME 77 | 78 | /* Define to the home page for this package. */ 79 | #undef PACKAGE_URL 80 | 81 | /* Define to the version of this package. */ 82 | #undef PACKAGE_VERSION 83 | 84 | /* Define to 1 if you have the ANSI C header files. */ 85 | #undef STDC_HEADERS 86 | 87 | /* Define to 1 if you can safely include both and . */ 88 | #undef TIME_WITH_SYS_TIME 89 | 90 | /* Version number of package */ 91 | #undef VERSION 92 | 93 | /* Define to empty if `const' does not conform to ANSI C. */ 94 | #undef const 95 | 96 | /* Define to rpl_malloc if the replacement function should be used. */ 97 | #undef malloc 98 | 99 | /* Define to `unsigned int' if does not define. */ 100 | #undef size_t 101 | 102 | #endif /* CHARDET_CONFIG_H */ 103 | -------------------------------------------------------------------------------- /chardet-config.in: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | prefix="@prefix@" 4 | exec_prefix="@exec_prefix@" 5 | exec_prefix_set=no 6 | includedir="@includedir@" 7 | la_file="@libdir@/lib@PROG@.la" 8 | 9 | usage="\ 10 | Usage: chardet-config [--prefix] [--exec-prefix] [--version] [--libs] [--defs] [--cflags]" 11 | 12 | if test $# -eq 0; then 13 | echo "${usage}" 1>&2 14 | exit 1 15 | fi 16 | 17 | . 
$la_file 18 | 19 | while test $# -gt 0; do 20 | case "$1" in 21 | -*=*) optarg=`echo "$1" | sed 's/[-_a-zA-Z0-9]*=//'` ;; 22 | *) optarg= ;; 23 | esac 24 | 25 | case $1 in 26 | --prefix=*) 27 | prefix="$optarg" 28 | if test "$exec_prefix_set" = no ; then 29 | exec_prefix="$optarg" 30 | fi 31 | ;; 32 | --prefix) 33 | echo -n "$prefix" 34 | ;; 35 | --exec-prefix=*) 36 | exec_prefix="$optarg" 37 | exec_prefix_set=yes 38 | ;; 39 | --exec-prefix) 40 | echo -n "$exec_prefix" 41 | ;; 42 | --version) 43 | echo "@PACKAGE_VERSION@" 44 | exit 0 45 | ;; 46 | --cflags) 47 | echo -n " @CFLAGS@" 48 | ;; 49 | --defs) 50 | includes="-I${includedir}" 51 | echo -n " -DHAVE_CONFIG_H $includes/@PROG@" 52 | ;; 53 | --libs) 54 | echo -n " -L${libdir} -l@PROG@ ${dependency_libs}" 55 | ;; 56 | *) 57 | echo "${usage}" 1>&2 58 | exit 1 59 | ;; 60 | esac 61 | shift 62 | done 63 | 64 | exit 0 65 | -------------------------------------------------------------------------------- /chardet.pc.in: -------------------------------------------------------------------------------- 1 | # Package Information for pkg-config 2 | 3 | prefix=@prefix@ 4 | exec_prefix=@exec_prefix@ 5 | libdir=@libdir@ 6 | includedir=@includedir@ 7 | 8 | Name: @PACKAGE_NAME@ 9 | Description: Mozilla's Universal Charset Detector C/C++ API 10 | Version: @PACKAGE_VERSION@ 11 | Libs: -L${libdir} -lchardet 12 | Cflags: -I${includedir}/chardet @CFLAGS@ 13 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | # Process this file with autoconf to produce a configure script. 
2 | # Configure template for chardet library 3 | # 4 | 5 | # 6 | AC_PREREQ(2.59) 7 | AC_INIT([libchardet], [1.0.7], [https://github.com/Joungkyun/libchardet/issues]) 8 | AC_CONFIG_AUX_DIR([tools]) 9 | AM_INIT_AUTOMAKE([1.12.0 -Wall -Werror -Wno-override foreign no-dependencies subdir-objects]) 10 | AM_MAINTAINER_MODE 11 | 12 | AC_CONFIG_SRCDIR([src/nsUniversalDetector.h]) 13 | AC_CONFIG_HEADER([chardet-config.h]) 14 | AC_PREFIX_DEFAULT([/usr/local]) 15 | 16 | AC_SUBST(SONAME_INFO) 17 | AC_SUBST(SONAME_VERSION) 18 | AC_SUBST(SONAME_MAJOR) 19 | AC_SUBST(SONAME_MINOR) 20 | AC_SUBST(SONAME_REVISION) 21 | AC_SUBST(PROG) 22 | 23 | PROG=`echo $PACKAGE_NAME | sed 's/^lib//g'` 24 | 25 | SONAME_MAJOR=1 26 | SONAME_MINOR=0 27 | SONAME_REVISION=0 28 | SONAME_VERSION=$SONAME_MAJOR.$SONAME_MINOR.$SONAME_REVISION 29 | 30 | SONAME_VALUE=$(($SONAME_MAJOR + $SONAME_MINOR)) 31 | SONAME_INFO="$SONAME_VALUE:$SONAME_REVISION:$SONAME_MINOR" 32 | 33 | # Checks for programs. 34 | AC_PROG_CC 35 | AC_PROG_CXX 36 | AC_PROG_CPP 37 | m4_ifdef([AM_PROG_AR], [AM_PROG_AR]) dnl Workaround for Automake 1.11 38 | AX_GL_VISIBILITY 39 | AC_PROG_INSTALL 40 | AC_PROG_LN_S 41 | AC_PROG_AWK 42 | AC_PROG_MAKE_SET 43 | AC_PROG_RANLIB 44 | AC_PROG_LIBTOOL 45 | 46 | AC_PATH_PROG(PERL, perl) 47 | if test ! -f "$ac_cv_path_PERL"; then 48 | AC_MSG_ERROR(Can't not found perl) 49 | fi 50 | 51 | AC_PATH_PROG(DATE, date) 52 | if test ! -f "$ac_cv_path_DATE"; then 53 | AC_MSG_ERROR(Can't not found date) 54 | fi 55 | 56 | AC_SUBST(PACKAGE_DATE) 57 | PACKAGE_DATE=$(LANG= $DATE +"%a %b %d %Y") 58 | 59 | # Checks for header files. 60 | AC_HEADER_STDC 61 | AC_CHECK_HEADERS([stdlib.h]) 62 | 63 | # Checks for typedefs, structures, and compiler characteristics. 64 | AC_C_CONST 65 | AC_TYPE_SIZE_T 66 | AC_HEADER_TIME 67 | 68 | # Checks for library functions. 
69 | AC_FUNC_MALLOC 70 | AC_FUNC_STAT 71 | 72 | AC_CONFIG_FILES([ 73 | Makefile 74 | src/Makefile 75 | man/Makefile 76 | test/Makefile 77 | chardet-config 78 | chardet.pc 79 | libchardet.spec 80 | ]) 81 | AC_OUTPUT 82 | -------------------------------------------------------------------------------- /include/nscore.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDED_NSCORE_H 2 | #define INCLUDED_NSCORE_H 3 | 4 | typedef short PRInt16; 5 | typedef unsigned short PRUint16; 6 | 7 | typedef int PRInt32; 8 | typedef unsigned PRUint32; 9 | 10 | typedef int PRBool; 11 | #define PR_TRUE 1 12 | #define PR_FALSE 0 13 | 14 | #define nsnull 0 15 | 16 | typedef PRUint32 nsresult; 17 | #define NS_OK 0 18 | #define NS_ERROR_OUT_OF_MEMORY ((nsresult)(0x8007000eL)) 19 | 20 | #endif /* INCLUDED_NSCORE_H */ 21 | -------------------------------------------------------------------------------- /include/prmem.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDED_PRMEM_H 2 | #define INCLUDED_PRMEM_H 3 | 4 | #include <stdlib.h> 5 | 6 | #define PR_Malloc(size) malloc(size) 7 | #define PR_Free(size) free(size) 8 | 9 | #define PR_FREEIF(ptr) if (ptr) { free(ptr); (ptr) = 0; } 10 | 11 | #endif /* INCLUDED_PRMEM_H */ 12 | -------------------------------------------------------------------------------- /include/version.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDED_VERSION_H 2 | #define INCLUDED_VERSION_H 3 | 4 | #define LIBCHARDET_MAJOR_VER 1 5 | #define LIBCHARDET_MINOR_VER 0 6 | #define LIBCHARDET_PATCH_VER 7 7 | 8 | #define LIBCHARDET_VERSION "1.0.7" /* keep in sync with AC_INIT in configure.ac */ 9 | #define LIBCHARDET_UVERSION "001000007" /* major, minor, patch as three zero-padded decimal digits each */ 10 | #define LIBCHARDET_LVERSION 0x01000007 11 | 12 | #endif /* INCLUDED_VERSION_H */ 13 | -------------------------------------------------------------------------------- /libchardet.spec.in: 
-------------------------------------------------------------------------------- 1 | %define _unpackaged_files_terminate_build 0 2 | 3 | Summary: Mozilla Universal Chardet library 4 | Summary(ko): 모질라 유니버샬 캐릭터셋 디텍트 라이브러리 5 | Name: @PACKAGE_NAME@ 6 | Version: @PACKAGE_VERSION@ 7 | Release: 1 8 | Epoch: 1 9 | License: MPL 10 | Group: System Environment/Libraries 11 | Source0: https://github.com/Joungkyun/libchardet/archive/%{version}.tar.gz#/%{name}-%{version}.tar.gz 12 | URL: https://github.com/Joungkyun/libchardet 13 | BuildRequires: libstdc++-devel 14 | Requires: libstdc++ 15 | 16 | BuildRoot:%(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) 17 | 18 | %description 19 | libchardet provides an interface to Mozilla's universal charset detector, 20 | which detects the charset used to encode data. 21 | 22 | %package devel 23 | Summary: Header and object files for development using libchardet 24 | Summary(ko): libchardet 를 이용하여 개발하기 위한 header 파일과 목적 파일들 25 | Group: System Environment/Libraries 26 | Requires: %{name} = %{epoch}:%{version}-%{release}, libstdc++-devel 27 | 28 | %description devel 29 | The libchardet-devel package contains the header and object files necessary 30 | for developing programs which use the libchardet libraries. 
31 | 32 | %prep 33 | [ "%{buildroot}" != "/" ] && %{__rm} -rf %{buildroot} 34 | %setup -q 35 | 36 | %build 37 | %configure 38 | 39 | %{__make} %{?_smp_mflags} 40 | 41 | %install 42 | %{__make} DESTDIR=%{buildroot} install 43 | 44 | %clean 45 | %{__rm} -rf %{buildroot} 46 | 47 | %post 48 | /sbin/ldconfig 49 | 50 | %postun 51 | /sbin/ldconfig 52 | 53 | %files 54 | %defattr(0755,root,root) 55 | %{_libdir}/%{name}.so.* 56 | 57 | %files devel 58 | %defattr(0644,root,root,0755) 59 | %attr(0755,root,root) %{_bindir}/chardet-config 60 | %{_libdir}/*.so 61 | %{_libdir}/*.a 62 | %{_libdir}/*.la 63 | %{_libdir}/pkgconfig/chardet.pc 64 | %{_includedir}/chardet/*.h 65 | %{_mandir}/man3/* 66 | %{_mandir}/ko/man3/* 67 | 68 | %changelog 69 | * @PACKAGE_DATE@ JoungKyun.Kim 1:@PACKAGE_VERSION@-1 70 | - packaged @PACKAGE_VERSION@ 71 | -------------------------------------------------------------------------------- /m4/ax_gcc_visibiliity.m4: -------------------------------------------------------------------------------- 1 | dnl 2 | dnl Written by JoungKyun.Kim 3 | dnl Copyright (c) 2019 JoungKyun.Kim 4 | dnl 5 | dnl ---------------------------------------------------------------------------- 6 | dnl Redistribution and use in source and binary forms, with or without 7 | dnl modification, are permitted provided that the following conditions are met: 8 | dnl 9 | dnl * Redistributions of source code must retain the above copyright notice, 10 | dnl this list of conditions and the following disclaimer. 11 | dnl 12 | dnl * Redistributions in binary form must reproduce the above copyright 13 | dnl notice, this list of conditions and the following disclaimer in the 14 | dnl documentation and/or other materials provided with the distribution. 15 | dnl 16 | dnl * Neither the name of JoungKyun.Kim nor the url of oops.org 17 | dnl nor the names of their contributors may be used to endorse or 18 | dnl promote products derived from this software without specific prior 19 | dnl written permission. 
20 | dnl 21 | dnl THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | dnl AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | dnl IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | dnl ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 25 | dnl LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 | dnl CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 | dnl SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 | dnl INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 | dnl CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 | dnl ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 | dnl POSSIBILITY OF SUCH DAMAGE. 32 | dnl ---------------------------------------------------------------------------- 33 | dnl This file is part of olibc. 34 | dnl 35 | dnl 36 | 37 | dnl Notice! call after AC_PROG_LIBTOOL 38 | 39 | AC_SUBST(GCC_MAJOR_VERSION) 40 | AC_SUBST(GCC_MINOR_VERSION) 41 | AC_SUBST(GCC_PATCH_VERSION) 42 | 43 | dnl Get GCC major.minor.patch version 44 | AC_DEFUN([AX_GCC_VERSION], 45 | [ 46 | if test $GCC = "yes"; then 47 | if test -z "$PERL"; then 48 | AC_PATH_PROG(PERL, perl) 49 | if test ! 
-f "$ac_cv_path_PERL"; then 50 | AC_MSG_ERROR(Can't not found perl) 51 | fi 52 | fi 53 | 54 | if test -n "$PERL"; then 55 | GCC_VERSION="`$CC -dumpversion 2> /dev/null | $PERL -pe 's/([[0-9]]+)\.([[0-9]]+)\.([[0-9]]+)/GCC_MAJOR_VERSION=\1;GCC_MINOR_VERSION=\2;GCC_PATCH_VERSION=\3;/g'`" 56 | eval "$GCC_VERSION" 57 | fi 58 | fi 59 | ]) 60 | 61 | AC_DEFUN([AX_GCC_VISIBILITY], 62 | [ 63 | if test "$GCC" = "yes"; then 64 | AX_GCC_VERSION 65 | if test "${GCC_MAJOR_VERSION:-0}" -ge 4; then # fall back to 0 when the version could not be parsed 66 | CFLAGS="$CFLAGS -fvisibility=hidden" 67 | CPPFLAGS="$CPPFLAGS -fvisibility=hidden" 68 | fi 69 | fi 70 | ]) 71 | 72 | dnl Copyright (C) 2005 Free Software Foundation, Inc. 73 | dnl This file is free software; the Free Software Foundation 74 | dnl gives unlimited permission to copy and/or distribute it, 75 | dnl with or without modifications, as long as this notice is preserved. 76 | 77 | dnl visibility.m4 serial 1 (gettext-0.15) 78 | dnl From Bruno Haible. 79 | dnl Modified by Joungkyun.Kim 80 | 81 | dnl Tests whether the compiler supports the command-line option 82 | dnl -fvisibility=hidden and the function and variable attributes 83 | dnl __attribute__((__visibility__("hidden"))) and 84 | dnl __attribute__((__visibility__("default"))). 85 | dnl Does *not* test for __visibility__("protected") - which has tricky 86 | dnl semantics (see the 'vismain' test in glibc) and does not exist e.g. on 87 | dnl MacOS X. 88 | dnl Does *not* test for __visibility__("internal") - which has processor 89 | dnl dependent semantics. 90 | dnl Does *not* test for #pragma GCC visibility push(hidden) - which is 91 | dnl "really only recommended for legacy code". 92 | dnl Set the __visibility__("hidden") option in variables CFLAGS and CPPFLAGS. 93 | dnl Defines and sets the variable HAVE_VISIBILITY. 
94 | 95 | AC_DEFUN([AX_GL_VISIBILITY], 96 | [ 97 | AC_REQUIRE([AC_PROG_CC]) 98 | HAVE_VISIBILITY=0 99 | if test -n "$GCC"; then 100 | AC_MSG_CHECKING([for simple visibility declarations]) 101 | AC_CACHE_VAL(gl_cv_cc_visibility, [ 102 | gl_save_CFLAGS="$CFLAGS" 103 | CFLAGS="$CFLAGS -fvisibility=hidden" 104 | AC_TRY_COMPILE( 105 | [extern __attribute__((__visibility__("hidden"))) int hiddenvar; 106 | extern __attribute__((__visibility__("default"))) int exportedvar; 107 | extern __attribute__((__visibility__("hidden"))) int hiddenfunc (void); 108 | extern __attribute__((__visibility__("default"))) int exportedfunc (void);], 109 | [], 110 | gl_cv_cc_visibility=yes, 111 | gl_cv_cc_visibility=no) 112 | CFLAGS="$gl_save_CFLAGS"]) 113 | AC_MSG_RESULT([$gl_cv_cc_visibility]) 114 | if test $gl_cv_cc_visibility = yes; then 115 | CFLAGS="$CFLAGS -fvisibility=hidden" 116 | CPPFLAGS="$CPPFLAGS -fvisibility=hidden" 117 | HAVE_VISIBILITY=1 118 | AC_DEFINE_UNQUOTED([HAVE_VISIBILITY], [$HAVE_VISIBILITY], 119 | [Define to 1, depending whether the compiler supports simple visibility declarations.]) 120 | fi 121 | fi 122 | 123 | # DLL_EXPORT check 124 | AX_DLL_EXPORT_CHECK 125 | ]) 126 | 127 | AC_SUBST(MINGW_CYGWIN_DYNLIB) 128 | 129 | AC_DEFUN([AX_DLL_EXPORT_CHECK], 130 | [ 131 | AC_REQUIRE([AC_PROG_LIBTOOL]) 132 | HAVE_DLL_EXPORT=0 133 | AC_MSG_CHECKING([for DLL_EXPORT declarations]) 134 | 135 | case $host_os in 136 | mingw* | cygwin* | pw32* | os2* | cegcc*) 137 | HAVE_DLL_EXPORT=1 138 | MINGW_CYGWIN_DYNLIB="-no-undefined" 139 | AC_MSG_RESULT([yes]) 140 | AC_DEFINE_UNQUOTED([HAVE_DLL_EXPORT], [$HAVE_DLL_EXPORT], 141 | [Define to 1, depending whether the compiler DLL_EXPORT declarations.]) 142 | ;; 143 | *) 144 | AC_MSG_RESULT([no]) 145 | ;; 146 | esac 147 | ]) 148 | 149 | -------------------------------------------------------------------------------- /m4/ltsugar.m4: -------------------------------------------------------------------------------- 1 | # ltsugar.m4 -- libtool m4 base 
layer. -*-Autoconf-*- 2 | # 3 | # Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc. 4 | # Written by Gary V. Vaughan, 2004 5 | # 6 | # This file is free software; the Free Software Foundation gives 7 | # unlimited permission to copy and/or distribute it, with or without 8 | # modifications, as long as this notice is preserved. 9 | 10 | # serial 6 ltsugar.m4 11 | 12 | # This is to help aclocal find these macros, as it can't see m4_define. 13 | AC_DEFUN([LTSUGAR_VERSION], [m4_if([0.1])]) 14 | 15 | 16 | # lt_join(SEP, ARG1, [ARG2...]) 17 | # ----------------------------- 18 | # Produce ARG1SEPARG2...SEPARGn, omitting [] arguments and their 19 | # associated separator. 20 | # Needed until we can rely on m4_join from Autoconf 2.62, since all earlier 21 | # versions in m4sugar had bugs. 22 | m4_define([lt_join], 23 | [m4_if([$#], [1], [], 24 | [$#], [2], [[$2]], 25 | [m4_if([$2], [], [], [[$2]_])$0([$1], m4_shift(m4_shift($@)))])]) 26 | m4_define([_lt_join], 27 | [m4_if([$#$2], [2], [], 28 | [m4_if([$2], [], [], [[$1$2]])$0([$1], m4_shift(m4_shift($@)))])]) 29 | 30 | 31 | # lt_car(LIST) 32 | # lt_cdr(LIST) 33 | # ------------ 34 | # Manipulate m4 lists. 35 | # These macros are necessary as long as will still need to support 36 | # Autoconf-2.59 which quotes differently. 37 | m4_define([lt_car], [[$1]]) 38 | m4_define([lt_cdr], 39 | [m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])], 40 | [$#], 1, [], 41 | [m4_dquote(m4_shift($@))])]) 42 | m4_define([lt_unquote], $1) 43 | 44 | 45 | # lt_append(MACRO-NAME, STRING, [SEPARATOR]) 46 | # ------------------------------------------ 47 | # Redefine MACRO-NAME to hold its former content plus `SEPARATOR'`STRING'. 48 | # Note that neither SEPARATOR nor STRING are expanded; they are appended 49 | # to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked). 50 | # No SEPARATOR is output if MACRO-NAME was previously undefined (different 51 | # than defined and empty). 
52 | # 53 | # This macro is needed until we can rely on Autoconf 2.62, since earlier 54 | # versions of m4sugar mistakenly expanded SEPARATOR but not STRING. 55 | m4_define([lt_append], 56 | [m4_define([$1], 57 | m4_ifdef([$1], [m4_defn([$1])[$3]])[$2])]) 58 | 59 | 60 | 61 | # lt_combine(SEP, PREFIX-LIST, INFIX, SUFFIX1, [SUFFIX2...]) 62 | # ---------------------------------------------------------- 63 | # Produce a SEP delimited list of all paired combinations of elements of 64 | # PREFIX-LIST with SUFFIX1 through SUFFIXn. Each element of the list 65 | # has the form PREFIXmINFIXSUFFIXn. 66 | # Needed until we can rely on m4_combine added in Autoconf 2.62. 67 | m4_define([lt_combine], 68 | [m4_if(m4_eval([$# > 3]), [1], 69 | [m4_pushdef([_Lt_sep], [m4_define([_Lt_sep], m4_defn([lt_car]))])]]dnl 70 | [[m4_foreach([_Lt_prefix], [$2], 71 | [m4_foreach([_Lt_suffix], 72 | ]m4_dquote(m4_dquote(m4_shift(m4_shift(m4_shift($@)))))[, 73 | [_Lt_sep([$1])[]m4_defn([_Lt_prefix])[$3]m4_defn([_Lt_suffix])])])])]) 74 | 75 | 76 | # lt_if_append_uniq(MACRO-NAME, VARNAME, [SEPARATOR], [UNIQ], [NOT-UNIQ]) 77 | # ----------------------------------------------------------------------- 78 | # Iff MACRO-NAME does not yet contain VARNAME, then append it (delimited 79 | # by SEPARATOR if supplied) and expand UNIQ, else NOT-UNIQ. 
80 | m4_define([lt_if_append_uniq], 81 | [m4_ifdef([$1], 82 | [m4_if(m4_index([$3]m4_defn([$1])[$3], [$3$2$3]), [-1], 83 | [lt_append([$1], [$2], [$3])$4], 84 | [$5])], 85 | [lt_append([$1], [$2], [$3])$4])]) 86 | 87 | 88 | # lt_dict_add(DICT, KEY, VALUE) 89 | # ----------------------------- 90 | m4_define([lt_dict_add], 91 | [m4_define([$1($2)], [$3])]) 92 | 93 | 94 | # lt_dict_add_subkey(DICT, KEY, SUBKEY, VALUE) 95 | # -------------------------------------------- 96 | m4_define([lt_dict_add_subkey], 97 | [m4_define([$1($2:$3)], [$4])]) 98 | 99 | 100 | # lt_dict_fetch(DICT, KEY, [SUBKEY]) 101 | # ---------------------------------- 102 | m4_define([lt_dict_fetch], 103 | [m4_ifval([$3], 104 | m4_ifdef([$1($2:$3)], [m4_defn([$1($2:$3)])]), 105 | m4_ifdef([$1($2)], [m4_defn([$1($2)])]))]) 106 | 107 | 108 | # lt_if_dict_fetch(DICT, KEY, [SUBKEY], VALUE, IF-TRUE, [IF-FALSE]) 109 | # ----------------------------------------------------------------- 110 | m4_define([lt_if_dict_fetch], 111 | [m4_if(lt_dict_fetch([$1], [$2], [$3]), [$4], 112 | [$5], 113 | [$6])]) 114 | 115 | 116 | # lt_dict_filter(DICT, [SUBKEY], VALUE, [SEPARATOR], KEY, [...]) 117 | # -------------------------------------------------------------- 118 | m4_define([lt_dict_filter], 119 | [m4_if([$5], [], [], 120 | [lt_join(m4_quote(m4_default([$4], [[, ]])), 121 | lt_unquote(m4_split(m4_normalize(m4_foreach(_Lt_key, lt_car([m4_shiftn(4, $@)]), 122 | [lt_if_dict_fetch([$1], _Lt_key, [$2], [$3], [_Lt_key ])])))))])[]dnl 123 | ]) 124 | -------------------------------------------------------------------------------- /m4/ltversion.m4: -------------------------------------------------------------------------------- 1 | # ltversion.m4 -- version numbers -*- Autoconf -*- 2 | # 3 | # Copyright (C) 2004 Free Software Foundation, Inc. 
4 | # Written by Scott James Remnant, 2004 5 | # 6 | # This file is free software; the Free Software Foundation gives 7 | # unlimited permission to copy and/or distribute it, with or without 8 | # modifications, as long as this notice is preserved. 9 | 10 | # @configure_input@ 11 | 12 | # serial 3337 ltversion.m4 13 | # This file is part of GNU Libtool 14 | 15 | m4_define([LT_PACKAGE_VERSION], [2.4.2]) 16 | m4_define([LT_PACKAGE_REVISION], [1.3337]) 17 | 18 | AC_DEFUN([LTVERSION_VERSION], 19 | [macro_version='2.4.2' 20 | macro_revision='1.3337' 21 | _LT_DECL(, macro_version, 0, [Which release of libtool.m4 was used?]) 22 | _LT_DECL(, macro_revision, 0) 23 | ]) 24 | -------------------------------------------------------------------------------- /m4/lt~obsolete.m4: -------------------------------------------------------------------------------- 1 | # lt~obsolete.m4 -- aclocal satisfying obsolete definitions. -*-Autoconf-*- 2 | # 3 | # Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc. 4 | # Written by Scott James Remnant, 2004. 5 | # 6 | # This file is free software; the Free Software Foundation gives 7 | # unlimited permission to copy and/or distribute it, with or without 8 | # modifications, as long as this notice is preserved. 9 | 10 | # serial 5 lt~obsolete.m4 11 | 12 | # These exist entirely to fool aclocal when bootstrapping libtool. 13 | # 14 | # In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN) 15 | # which have later been changed to m4_define as they aren't part of the 16 | # exported API, or moved to Autoconf or Automake where they belong. 17 | # 18 | # The trouble is, aclocal is a bit thick. It'll see the old AC_DEFUN 19 | # in /usr/share/aclocal/libtool.m4 and remember it, then when it sees us 20 | # using a macro with the same name in our local m4/libtool.m4 it'll 21 | # pull the old libtool.m4 in (it doesn't see our shiny new m4_define 22 | # and doesn't know about Autoconf macros at all.) 
23 | # 24 | # So we provide this file, which has a silly filename so it's always 25 | # included after everything else. This provides aclocal with the 26 | # AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything 27 | # because those macros already exist, or will be overwritten later. 28 | # We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6. 29 | # 30 | # Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here. 31 | # Yes, that means every name once taken will need to remain here until 32 | # we give up compatibility with versions before 1.7, at which point 33 | # we need to keep only those names which we still refer to. 34 | 35 | # This is to help aclocal find these macros, as it can't see m4_define. 36 | AC_DEFUN([LTOBSOLETE_VERSION], [m4_if([1])]) 37 | 38 | m4_ifndef([AC_LIBTOOL_LINKER_OPTION], [AC_DEFUN([AC_LIBTOOL_LINKER_OPTION])]) 39 | m4_ifndef([AC_PROG_EGREP], [AC_DEFUN([AC_PROG_EGREP])]) 40 | m4_ifndef([_LT_AC_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_AC_PROG_ECHO_BACKSLASH])]) 41 | m4_ifndef([_LT_AC_SHELL_INIT], [AC_DEFUN([_LT_AC_SHELL_INIT])]) 42 | m4_ifndef([_LT_AC_SYS_LIBPATH_AIX], [AC_DEFUN([_LT_AC_SYS_LIBPATH_AIX])]) 43 | m4_ifndef([_LT_PROG_LTMAIN], [AC_DEFUN([_LT_PROG_LTMAIN])]) 44 | m4_ifndef([_LT_AC_TAGVAR], [AC_DEFUN([_LT_AC_TAGVAR])]) 45 | m4_ifndef([AC_LTDL_ENABLE_INSTALL], [AC_DEFUN([AC_LTDL_ENABLE_INSTALL])]) 46 | m4_ifndef([AC_LTDL_PREOPEN], [AC_DEFUN([AC_LTDL_PREOPEN])]) 47 | m4_ifndef([_LT_AC_SYS_COMPILER], [AC_DEFUN([_LT_AC_SYS_COMPILER])]) 48 | m4_ifndef([_LT_AC_LOCK], [AC_DEFUN([_LT_AC_LOCK])]) 49 | m4_ifndef([AC_LIBTOOL_SYS_OLD_ARCHIVE], [AC_DEFUN([AC_LIBTOOL_SYS_OLD_ARCHIVE])]) 50 | m4_ifndef([_LT_AC_TRY_DLOPEN_SELF], [AC_DEFUN([_LT_AC_TRY_DLOPEN_SELF])]) 51 | m4_ifndef([AC_LIBTOOL_PROG_CC_C_O], [AC_DEFUN([AC_LIBTOOL_PROG_CC_C_O])]) 52 | m4_ifndef([AC_LIBTOOL_SYS_HARD_LINK_LOCKS], [AC_DEFUN([AC_LIBTOOL_SYS_HARD_LINK_LOCKS])]) 53 | m4_ifndef([AC_LIBTOOL_OBJDIR], [AC_DEFUN([AC_LIBTOOL_OBJDIR])]) 54 
| m4_ifndef([AC_LTDL_OBJDIR], [AC_DEFUN([AC_LTDL_OBJDIR])]) 55 | m4_ifndef([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH], [AC_DEFUN([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH])]) 56 | m4_ifndef([AC_LIBTOOL_SYS_LIB_STRIP], [AC_DEFUN([AC_LIBTOOL_SYS_LIB_STRIP])]) 57 | m4_ifndef([AC_PATH_MAGIC], [AC_DEFUN([AC_PATH_MAGIC])]) 58 | m4_ifndef([AC_PROG_LD_GNU], [AC_DEFUN([AC_PROG_LD_GNU])]) 59 | m4_ifndef([AC_PROG_LD_RELOAD_FLAG], [AC_DEFUN([AC_PROG_LD_RELOAD_FLAG])]) 60 | m4_ifndef([AC_DEPLIBS_CHECK_METHOD], [AC_DEFUN([AC_DEPLIBS_CHECK_METHOD])]) 61 | m4_ifndef([AC_LIBTOOL_PROG_COMPILER_NO_RTTI], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_NO_RTTI])]) 62 | m4_ifndef([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE], [AC_DEFUN([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE])]) 63 | m4_ifndef([AC_LIBTOOL_PROG_COMPILER_PIC], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_PIC])]) 64 | m4_ifndef([AC_LIBTOOL_PROG_LD_SHLIBS], [AC_DEFUN([AC_LIBTOOL_PROG_LD_SHLIBS])]) 65 | m4_ifndef([AC_LIBTOOL_POSTDEP_PREDEP], [AC_DEFUN([AC_LIBTOOL_POSTDEP_PREDEP])]) 66 | m4_ifndef([LT_AC_PROG_EGREP], [AC_DEFUN([LT_AC_PROG_EGREP])]) 67 | m4_ifndef([LT_AC_PROG_SED], [AC_DEFUN([LT_AC_PROG_SED])]) 68 | m4_ifndef([_LT_CC_BASENAME], [AC_DEFUN([_LT_CC_BASENAME])]) 69 | m4_ifndef([_LT_COMPILER_BOILERPLATE], [AC_DEFUN([_LT_COMPILER_BOILERPLATE])]) 70 | m4_ifndef([_LT_LINKER_BOILERPLATE], [AC_DEFUN([_LT_LINKER_BOILERPLATE])]) 71 | m4_ifndef([_AC_PROG_LIBTOOL], [AC_DEFUN([_AC_PROG_LIBTOOL])]) 72 | m4_ifndef([AC_LIBTOOL_SETUP], [AC_DEFUN([AC_LIBTOOL_SETUP])]) 73 | m4_ifndef([_LT_AC_CHECK_DLFCN], [AC_DEFUN([_LT_AC_CHECK_DLFCN])]) 74 | m4_ifndef([AC_LIBTOOL_SYS_DYNAMIC_LINKER], [AC_DEFUN([AC_LIBTOOL_SYS_DYNAMIC_LINKER])]) 75 | m4_ifndef([_LT_AC_TAGCONFIG], [AC_DEFUN([_LT_AC_TAGCONFIG])]) 76 | m4_ifndef([AC_DISABLE_FAST_INSTALL], [AC_DEFUN([AC_DISABLE_FAST_INSTALL])]) 77 | m4_ifndef([_LT_AC_LANG_CXX], [AC_DEFUN([_LT_AC_LANG_CXX])]) 78 | m4_ifndef([_LT_AC_LANG_F77], [AC_DEFUN([_LT_AC_LANG_F77])]) 79 | m4_ifndef([_LT_AC_LANG_GCJ], [AC_DEFUN([_LT_AC_LANG_GCJ])]) 80 | 
m4_ifndef([AC_LIBTOOL_LANG_C_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_C_CONFIG])]) 81 | m4_ifndef([_LT_AC_LANG_C_CONFIG], [AC_DEFUN([_LT_AC_LANG_C_CONFIG])]) 82 | m4_ifndef([AC_LIBTOOL_LANG_CXX_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_CXX_CONFIG])]) 83 | m4_ifndef([_LT_AC_LANG_CXX_CONFIG], [AC_DEFUN([_LT_AC_LANG_CXX_CONFIG])]) 84 | m4_ifndef([AC_LIBTOOL_LANG_F77_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_F77_CONFIG])]) 85 | m4_ifndef([_LT_AC_LANG_F77_CONFIG], [AC_DEFUN([_LT_AC_LANG_F77_CONFIG])]) 86 | m4_ifndef([AC_LIBTOOL_LANG_GCJ_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_GCJ_CONFIG])]) 87 | m4_ifndef([_LT_AC_LANG_GCJ_CONFIG], [AC_DEFUN([_LT_AC_LANG_GCJ_CONFIG])]) 88 | m4_ifndef([AC_LIBTOOL_LANG_RC_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_RC_CONFIG])]) 89 | m4_ifndef([_LT_AC_LANG_RC_CONFIG], [AC_DEFUN([_LT_AC_LANG_RC_CONFIG])]) 90 | m4_ifndef([AC_LIBTOOL_CONFIG], [AC_DEFUN([AC_LIBTOOL_CONFIG])]) 91 | m4_ifndef([_LT_AC_FILE_LTDLL_C], [AC_DEFUN([_LT_AC_FILE_LTDLL_C])]) 92 | m4_ifndef([_LT_REQUIRED_DARWIN_CHECKS], [AC_DEFUN([_LT_REQUIRED_DARWIN_CHECKS])]) 93 | m4_ifndef([_LT_AC_PROG_CXXCPP], [AC_DEFUN([_LT_AC_PROG_CXXCPP])]) 94 | m4_ifndef([_LT_PREPARE_SED_QUOTE_VARS], [AC_DEFUN([_LT_PREPARE_SED_QUOTE_VARS])]) 95 | m4_ifndef([_LT_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_PROG_ECHO_BACKSLASH])]) 96 | m4_ifndef([_LT_PROG_F77], [AC_DEFUN([_LT_PROG_F77])]) 97 | m4_ifndef([_LT_PROG_FC], [AC_DEFUN([_LT_PROG_FC])]) 98 | m4_ifndef([_LT_PROG_CXX], [AC_DEFUN([_LT_PROG_CXX])]) 99 | -------------------------------------------------------------------------------- /man/Makefile.am: -------------------------------------------------------------------------------- 1 | # Version: MPL 1.1/GPL 2.0/LGPL 2.1 2 | # 3 | # The contents of this file are subject to the Mozilla Public License Version 4 | # 1.1 (the "License"); you may not use this file except in compliance with 5 | # the License. 
You may obtain a copy of the License at 6 | # http://www.mozilla.org/MPL/ 7 | # 8 | # Software distributed under the License is distributed on an "AS IS" basis, 9 | # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 10 | # for the specific language governing rights and limitations under the 11 | # License. 12 | # 13 | # Mozilla's universal charset detector C/C++ Wrapping API 14 | # Writer(s) : 15 | # Detect class by John Gardiner Myers 16 | # C wrapping API by JoungKyun.Kim 17 | # 18 | # Alternatively, the contents of this file may be used under the terms of 19 | # either the GNU General Public License Version 2 or later (the "GPL"), or 20 | # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 21 | # in which case the provisions of the GPL or the LGPL are applicable instead 22 | # of those above. If you wish to allow use of your version of this file only 23 | # under the terms of either the GPL or the LGPL, and not to allow others to 24 | # use your version of this file under the terms of the MPL, indicate your 25 | # decision by deleting the provisions above and replace them with the notice 26 | # and other provisions required by the GPL or the LGPL. If you do not delete 27 | # the provisions above, a recipient may use your version of this file under 28 | # the terms of any one of the MPL, the GPL or the LGPL. 
29 | 30 | EXTRA_DIST = ko/detect.3 \ 31 | ko/detect_handledata.3 \ 32 | ko/detect_obj_free.3 \ 33 | ko/detect_reset.3 \ 34 | ko/detect_destroy.3 \ 35 | ko/detect_init.3 \ 36 | ko/detect_obj_init.3 37 | 38 | dist_man_MANS = en/detect.3 \ 39 | en/detect_handledata.3 \ 40 | en/detect_obj_free.3 \ 41 | en/detect_reset.3 \ 42 | en/detect_destroy.3 \ 43 | en/detect_init.3 \ 44 | en/detect_obj_init.3 45 | 46 | ko_man3dir = $(mandir)/ko/man3 47 | ko_man3_DATA = $(EXTRA_DIST) 48 | 49 | -------------------------------------------------------------------------------- /man/en/detect.3: -------------------------------------------------------------------------------- 1 | .TH detect 3 2019-08-01 "libchardet manuals" 2 | .\" Process with 3 | .\" nroff -man detect.3 4 | .\" 2019-08-01 JoungKyun.Kim 5 | 6 | .SH NAME 7 | detect, detect_r \- Detecting character set and measuring accuracy of charset 8 | 9 | .SH SYNOPSIS 10 | .B "#include <chardet.h>" 11 | .sp 12 | .BI "short detect (char * inbuf, DetectObj ** outbuf);" 13 | .sp 14 | .BI "short detect_r (char * inbuf, size_t inlen, DetectObj ** outbuf);" 15 | 16 | .SH DESCRIPTION 17 | Storing charset and accuracy of 18 | .B inbuf to 19 | .B outbuf 20 | 21 | The 22 | .BI detect 23 | API is deprecated because it is not binary safe. Use 24 | .BI detect_r 25 | instead. 26 | 27 | .SS Arguments: 28 | .TP 29 | .B inbuf 30 | .br 31 | input string for detecting 32 | 33 | .TP 34 | .B inlen 35 | .br 36 | length of input string for detecting 37 | 38 | .TP 39 | .B outbuf 40 | .br 41 | Stores information about 42 | .B inbuf. 43 | The structure of 44 | .B outbuf 45 | is as follows. 46 | 47 | .nf 48 | typedef struct DetectObject { 49 | char * encoding; 50 | float confidence; 51 | } DetectObj; 52 | .fi 53 | 54 | The 55 | .B outbuf 56 | variable must be initialized by 57 | .BI detect_obj_init 58 | API before calling this detect api. 59 | 60 | .SH "RETURN VALUE" 61 | Returns one of the following values.
62 | 63 | .TP 64 | .B CHARDET_SUCCESS 65 | .br 66 | Detecting success 67 | 68 | .TP 69 | .B CHARDET_NO_RESULT 70 | .br 71 | Detection failure 72 | 73 | .TP 74 | .B CHARDET_NULL_OBJECT 75 | .br 76 | Did not initialize 77 | .B outbuf 78 | with 79 | .BI detect_obj_init 80 | 81 | .TP 82 | .B CHARDET_OUT_OF_MEMORY 83 | .br 84 | An 85 | .B "out of memory" 86 | error occurred in an internal API 87 | 88 | .SH EXAMPLE 89 | .nf 90 | #include <chardet.h> 91 | 92 | int main (void) { 93 | DetectObj *obj; 94 | char * checkstr = "안녕하세요"; 95 | 96 | if ( (obj = detect_obj_init ()) == NULL ) { 97 | fprintf (stderr, "Memory Allocation failed\\n"); 98 | return CHARDET_MEM_ALLOCATED_FAIL; 99 | } 100 | 101 | //switch (detect (checkstr, &obj)) 102 | switch (detect_r (checkstr, strlen (checkstr), &obj)) 103 | { 104 | case CHARDET_OUT_OF_MEMORY : 105 | fprintf (stderr, "On handle processing, occured out of memory\\n"); 106 | detect_obj_free (&obj); 107 | return CHARDET_OUT_OF_MEMORY; 108 | case CHARDET_NULL_OBJECT : 109 | fprintf (stderr, 110 | "2st argument of chardet() is must memory allocation " 111 | "with detect_obj_init API\\n"); 112 | return CHARDET_NULL_OBJECT; 113 | } 114 | 115 | // check support for obj->bom with the CHARDET_BOM_CHECK constant 116 | printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); 117 | detect_obj_free (&obj); 118 | 119 | return 0; 120 | } 121 | .fi 122 | 123 | .SH AUTHORS 124 | JoungKyun.Kim 125 | 126 | .SH "BUG REPORTS" 127 | Use QnA board on https://github.com/Joungkyun/libchardet/issues 128 | 129 | .SH "SEE ALSO" 130 | detect_handledata(3), detect_obj_init(3), detect_obj_free(3) 131 | -------------------------------------------------------------------------------- /man/en/detect_destroy.3: -------------------------------------------------------------------------------- 1 | .TH detect_destroy 3 2019-08-01 "libchardet manuals" 2 | .\" Process with 3 | .\" nroff -man detect_destroy.3 4 | .\" 2019-08-01 JoungKyun Kim 5 | 6 | .SH NAME 7 |
detect_destroy \- free Detector structure 8 | 9 | .SH SYNOPSIS 10 | .B "#include " 11 | .sp 12 | .BI "void detect_destroy (Detect ** handle);" 13 | 14 | .SH DESCRIPTION 15 | Free resource that allocated by 16 | .BI detect_init 17 | api. 18 | 19 | .SH "RETURN VALUE" 20 | void 21 | 22 | .SH EXAMPLE 23 | .nf 24 | #include 25 | 26 | int main (void) { 27 | Detect * d; 28 | DetectObj * obj; 29 | 30 | if ( (d = detect_init ()) == NULL ) { 31 | fprintf (stderr, "chardet handle initialize failed\\n"); 32 | return CHARDET_MEM_ALLOCATED_FAIL; 33 | } 34 | 35 | detect_reset (&d); 36 | 37 | if ( (obj = detect_obj_init ()) == NULL ) { 38 | fprintf (stderr, "Memory Allocation failed\\n"); 39 | return CHARDET_MEM_ALLOCATED_FAIL; 40 | } 41 | 42 | switch (detect_handledata (&d, "안녕하세요", &obj)) { 43 | case CHARDET_OUT_OF_MEMORY : 44 | fprintf (stderr, "On handle processing, occured out of memory\\n"); 45 | detect_obj_free (&obj); 46 | return CHARDET_OUT_OF_MEMORY; 47 | case CHARDET_NULL_OBJECT : 48 | fprintf (stderr, 49 | "3st argument of chardet_handledata() is must memory allocation " 50 | "with detect_obj_init API\\n"); 51 | return CHARDET_NULL_OBJECT; 52 | } 53 | 54 | # check support obj->bom with CHARDET_BOM_CHECK constant 55 | printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); 56 | detect_obj_free (&obj); 57 | detect_destroy (&d); 58 | 59 | return 0; 60 | } 61 | .fi 62 | 63 | .SH AUTHORS 64 | JoungKyun.Kim 65 | 66 | .SH "BUG REPORTS" 67 | Use QnA board on https://github.com/Joungkyun/libchardet/issues 68 | 69 | .SH "SEE ALSO" 70 | detect_init(3), detect_reset(3) 71 | -------------------------------------------------------------------------------- /man/en/detect_handledata.3: -------------------------------------------------------------------------------- 1 | .TH detect_handledata 3 2019-08-01 "libchardet manuals" 2 | .\" Process with 3 | .\" nroff -man detect_handledata.3 4 | .\" 2019-08-01 JoungKyun.Kim 5 | 6 | .SH NAME 7 | 
detect_handledata, detect_handledata_r \- Detecting character set and measuring accuracy of charset 8 | 9 | .SH SYNOPSIS 10 | .B "#include <chardet.h>" 11 | .sp 12 | .BI "short detect_handledata (Detect ** handle, const char * inbuf, DetectObj ** outbuf);" 13 | .sp 14 | .BI "short detect_handledata_r (Detect ** handle, const char * inbuf, size_t inlen, DetectObj ** outbuf);" 15 | 16 | .SH DESCRIPTION 17 | Storing charset and accuracy of 18 | .B inbuf to 19 | .B outbuf 20 | 21 | The 22 | .BI detect_handledata 23 | API is deprecated because it is not binary safe. Use 24 | .BI detect_handledata_r 25 | instead. 26 | 27 | .SS Arguments: 28 | .TP 29 | .B handle 30 | .br 31 | .B Detect 32 | handle resource that was allocated by 33 | .B detect_init 34 | api. 35 | 36 | .TP 37 | .B inbuf 38 | .br 39 | input string for detecting 40 | 41 | .TP 42 | .B inlen 43 | .br 44 | length of input string for detecting 45 | 46 | .TP 47 | .B outbuf 48 | .br 49 | Stores information about 50 | .B inbuf. 51 | The structure of 52 | .B outbuf 53 | is as follows. 54 | 55 | .nf 56 | typedef struct DetectObject { 57 | char * encoding; 58 | float confidence; 59 | } DetectObj; 60 | .fi 61 | 62 | The 63 | .B outbuf 64 | variable must be initialized by 65 | .BI detect_obj_init 66 | API before calling this detect api. 67 | 68 | .SH "RETURN VALUES" 69 | Returns one of the following values. 70 | 71 | .TP 72 | .B CHARDET_SUCCESS 73 | .br 74 | Detecting success 75 | 76 | .TP 77 | .B CHARDET_NO_RESULT 78 | .br 79 | Detection failure 80 | 81 | .TP 82 | .B CHARDET_NULL_OBJECT 83 | .br 84 | Did not initialize 85 | .B outbuf 86 | with 87 | .BI detect_obj_init 88 | 89 | .TP 90 | .B CHARDET_OUT_OF_MEMORY 91 | .br 92 | An 93 | .B "out of memory" 94 | error occurred in an internal API 95 | 96 | .SH EXAMPLE 97 | .nf 98 | #include <chardet.h> 99 | 100 | int main (void) { 101 | Detect * d; 102 | DetectObj * obj; 103 | int i, arrayNum; 104 | char *str[] = { 105 | "this is ascii", 106 | "이건 euc-kr 입니다."
107 | }; 108 | 109 | arrayNum = sizeof (str) / sizeof (str[0]); 110 | 111 | if ( (d = detect_init ()) == NULL ) { 112 | fprintf (stderr, "chardet handle initialize failed\\n"); 113 | return CHARDET_MEM_ALLOCATED_FAIL; 114 | } 115 | 116 | for ( i=0; ibom with CHARDET_BOM_CHECK constant 139 | printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); 140 | detect_obj_free (&obj); 141 | } 142 | detect_destroy (&d); 143 | 144 | return 0; 145 | } 146 | .fi 147 | 148 | .SH AUTHORS 149 | JoungKyun.Kim 150 | 151 | .SH "BUG REPORTS" 152 | Use QnA board on https://github.com/Joungkyun/libchardet/issues 153 | 154 | .SH "SEE ALSO" 155 | detect_obj_init(3), detect_obj_free(3), detect_init(3), detect_reset(3), detect_destroy(3) 156 | -------------------------------------------------------------------------------- /man/en/detect_handledata_r.3: -------------------------------------------------------------------------------- 1 | .so man3/detect_handledata.3 2 | -------------------------------------------------------------------------------- /man/en/detect_init.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joungkyun/libchardet/dcdcdfc6a207eb98196b277d72c967beb36cb250/man/en/detect_init.3 -------------------------------------------------------------------------------- /man/en/detect_obj_free.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joungkyun/libchardet/dcdcdfc6a207eb98196b277d72c967beb36cb250/man/en/detect_obj_free.3 -------------------------------------------------------------------------------- /man/en/detect_obj_init.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joungkyun/libchardet/dcdcdfc6a207eb98196b277d72c967beb36cb250/man/en/detect_obj_init.3 
-------------------------------------------------------------------------------- /man/en/detect_r.3: -------------------------------------------------------------------------------- 1 | .so man3/detect.3 2 | -------------------------------------------------------------------------------- /man/en/detect_reset.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joungkyun/libchardet/dcdcdfc6a207eb98196b277d72c967beb36cb250/man/en/detect_reset.3 -------------------------------------------------------------------------------- /man/ko/detect.3: -------------------------------------------------------------------------------- 1 | .TH detect 3 2019-08-01 "libchardet manuals" 2 | .\" Process with 3 | .\" nroff -man detect.3 4 | .\" 2019-08-01 JoungKyun.Kim 5 | 6 | .SH 이름 7 | detect, detect_r \- 문자열의 문자셋과 정확도를 측정 8 | 9 | .SH 사용법 10 | .B "#include <chardet.h>" 11 | .sp 12 | .BI "short detect (char * inbuf, DetectObj ** outbuf);" 13 | .sp 14 | .BI "short detect_r (char * inbuf, size_t inlen, DetectObj ** outbuf);" 15 | 16 | .SH 설명 17 | .B inbuf 18 | 의 문자셋과 정확도를 19 | .B outbuf에 저장한다. 20 | 21 | .BI detect 22 | API는 binary safe 문제로 더이상 사용을 권장하지 않습니다. 23 | .BI detect_r 24 | API를 이용하십시오. 25 | 26 | .SS API 아규먼트 27 | .TP 28 | .B inbuf 29 | .br 30 | 문자셋과 정확도를 측정할 입력 문자열 31 | .TP 32 | .B inlen 33 | .br 34 | 문자셋과 정확도를 측정할 입력 문자열의 길이 35 | 36 | .TP 37 | .B outbuf 38 | .br 39 | .B inbuf 40 | 의 문자셋과 정확도를 구하여 저장을 한다. 41 | .B outbuf 42 | 의 구조는 다음과 같다. 43 | 44 | .nf 45 | typedef struct DetectObject { 46 | char * encoding; 47 | float confidence; 48 | short bom; 49 | } DetectObj; 50 | .fi 51 | 52 | .B outbuf는 53 | .BI detect_obj_init 54 | api를 이용하여 초기화를 한 후에 55 | .BI detect 56 | api로 넘겨줘야 한다. 57 | 58 | .SH 반환값 59 | 경우에 따라 아래의 상태를 반환한다.
60 | 61 | .TP 62 | .B CHARDET_SUCCESS 63 | .br 64 | 성공 65 | 66 | .TP 67 | .B CHARDET_NO_RESULT 68 | .br 69 | 탐지하지 못했을 경우 70 | 71 | .TP 72 | .B CHARDET_NULL_OBJECT 73 | .br 74 | .B outbuf 75 | 을 76 | .BI chardet_obj_init 77 | api를 이용하여 메모리 할당을 하지 않았을 경우. 78 | 79 | .TP 80 | .B CHARDET_OUT_OF_MEMORY 81 | .br 82 | 내부 API에서 83 | .B "out of memory" 84 | 가 발생했을 경우 85 | 86 | .SH 예제 87 | .nf 88 | #include 89 | 90 | int main (void) { 91 | DetectObj *obj; 92 | 93 | if ( (obj = detect_obj_init ()) == NULL ) { 94 | fprintf (stderr, "Memory Allocation failed\\n"); 95 | return CHARDET_MEM_ALLOCATED_FAIL; 96 | } 97 | 98 | //switch (detect ("안녕하세요", &obj)) 99 | switch (detect_r ("안녕하세요", 10, &obj)) 100 | { 101 | case CHARDET_OUT_OF_MEMORY : 102 | fprintf (stderr, "On handle processing, occured out of memory\\n"); 103 | detect_obj_free (&obj); 104 | return CHARDET_OUT_OF_MEMORY; 105 | case CHARDET_NULL_OBJECT : 106 | fprintf (stderr, 107 | "2st argument of chardet() is must memory allocation " 108 | "with detect_obj_init API\\n"); 109 | return CHARDET_NULL_OBJECT; 110 | } 111 | 112 | # obj-bom 은 CHARDET_BOM_CHECK 상수 지원여부로 지원을 판단할 수 있습니다. 113 | printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); 114 | detect_obj_free (&obj); 115 | 116 | return 0; 117 | } 118 | .fi 119 | 120 | .SH 저자 121 | 김정균 122 | 123 | .SH 버그 리포트 124 | https://github.com/Joungkyun/libchardet/issues 125 | 126 | .SH "참고" 127 | detect_handledata(3), detect_obj_init(3), detect_obj_free(3) 128 | -------------------------------------------------------------------------------- /man/ko/detect_destroy.3: -------------------------------------------------------------------------------- 1 | .TH detect_destroy 3 2019-08-01 "libchardet manuals" 2 | .\" Process with 3 | .\" nroff -man detect_destroy.3 4 | .\" 2019-08-01 JoungKyun Kim 5 | 6 | .SH 이름 7 | detect_destroy \- chardet resource를 해제한다. 
8 | 9 | .SH 사용법 10 | .B "#include " 11 | .sp 12 | .BI "void detect_destroy (Detect ** handle);" 13 | 14 | .SH 설명 15 | .BI detect_init 16 | 에 의하여 할당된 resource를 해제한다. 17 | 18 | .SH 반환값 19 | void 20 | 21 | .SH 예제 22 | .nf 23 | #include 24 | 25 | int main (void) { 26 | Detect * d; 27 | DetectObj * obj; 28 | 29 | if ( (d = detect_init ()) == NULL ) { 30 | fprintf (stderr, "chardet handle initialize failed\\n"); 31 | return CHARDET_MEM_ALLOCATED_FAIL; 32 | } 33 | 34 | detect_reset (&d); 35 | 36 | if ( (obj = detect_obj_init ()) == NULL ) { 37 | fprintf (stderr, "Memory Allocation failed\\n"); 38 | return CHARDET_MEM_ALLOCATED_FAIL; 39 | } 40 | 41 | switch (detect_handledata (&d, "안녕하세요", &obj)) { 42 | case CHARDET_OUT_OF_MEMORY : 43 | fprintf (stderr, "On handle processing, occured out of memory\\n"); 44 | detect_obj_free (&obj); 45 | return CHARDET_OUT_OF_MEMORY; 46 | case CHARDET_NULL_OBJECT : 47 | fprintf (stderr, 48 | "3st argument of chardet_handledata() is must memory allocation " 49 | "with detect_obj_init API\\n"); 50 | return CHARDET_NULL_OBJECT; 51 | } 52 | 53 | # obj-bom 은 CHARDET_BOM_CHECK 상수 지원여부로 지원을 판단할 수 있습니다. 
54 | printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); 55 | detect_obj_free (&obj); 56 | detect_destroy (&d); 57 | 58 | return 0; 59 | } 60 | .fi 61 | 62 | .SH 저자 63 | 김정균 64 | 65 | .SH 버그 리포트 66 | https://github.com/Joungkyun/libchardet/issues 67 | 68 | .SH "참고" 69 | detect_init(3), detect_reset(3) 70 | 71 | -------------------------------------------------------------------------------- /man/ko/detect_handledata.3: -------------------------------------------------------------------------------- 1 | .TH detect_handledata 3 2019-08-01 "libchardet manuals" 2 | .\" Process with 3 | .\" nroff -man detect_handledata.3 4 | .\" 2019-08-01 JoungKyun.Kim 5 | 6 | .SH 이름 7 | detect_handledata, detect_handledata_r \- 문자셋과 정확도를 측정 8 | 9 | .SH 사용법 10 | .B "#include " 11 | .sp 12 | .BI "short detect_handledata (Detect ** handle, const char * inbuf, DetectObj ** outbuf);" 13 | .sp 14 | .BI "short detect_handledata_r (Detect ** handle, const char * inbuf, size_t inlen, DetectObj ** outbuf);" 15 | 16 | .SH 설명 17 | .B inbuf 18 | 의 문자셋과 정확도를 19 | .B outbuf에 저장한다. 20 | 21 | .BI detect_handledata 22 | API는 binary safe 문제로 더이상 사용을 권장하지 않습니다. 23 | .BI detect_handledata_r 24 | API를 이용하십시오. 25 | 26 | .SS API 아규먼트 27 | .TP 28 | .B handle 29 | .br 30 | .BI detect_init 31 | api에 의하여 할당된 detect handle resource. 32 | 33 | .TP 34 | .B inbuf 35 | .br 36 | 문자셋과 정확도를 측정할 입력 문자열 37 | 38 | .B inlen 39 | .br 40 | 문자셋과 정확도를 측정할 입력 문자열의 길이 41 | 42 | .TP 43 | .B outbuf 44 | .br 45 | .B inbuf 46 | 의 문자셋과 정확도를 구하여 저장을 한다. 47 | .B outbuf 48 | 의 구조는 다음과 같다. 49 | 50 | .nf 51 | typedef struct DetectObject { 52 | char * encoding; 53 | float confidence; 54 | } DetectObj; 55 | .fi 56 | 57 | .B outbuf는 58 | .BI detect_obj_init 59 | api를 이용하여 초기화를 한 후에 60 | .BI detect 61 | api로 넘겨줘야 한다. 62 | 63 | .SH 반환값 64 | 경우에 따라 아래의 상태를 반환한다. 
65 | 66 | .TP 67 | .B CHARDET_SUCCESS 68 | .br 69 | 성공 70 | 71 | .TP 72 | .B CHARDET_NO_RESULT 73 | .br 74 | 탐지하지 못했을 경우 75 | 76 | .TP 77 | .B CHARDET_NULL_OBJECT 78 | .br 79 | .B outbuf 80 | 을 81 | .BI chardet_obj_init 82 | api를 이용하여 메모리 할당을 하지 않았을 경우. 83 | 84 | .TP 85 | .B CHARDET_OUT_OF_MEMORY 86 | .br 87 | 내부 API에서 88 | .B "out of memory" 89 | 가 발생했을 경우 90 | 91 | .SH 예제 92 | .nf 93 | #include 94 | 95 | int main (void) { 96 | Detect * d; 97 | DetectObj * obj; 98 | int i, arrayNum; 99 | char *str[] = { 100 | "this is ascii", 101 | "이건 euc-kr 입니다." 102 | }; 103 | 104 | arrayNum = sizeof (str) / sizeof (str[0]); 105 | 106 | if ( (d = detect_init ()) == NULL ) { 107 | fprintf (stderr, "chardet handle initialize failed\\n"); 108 | return CHARDET_MEM_ALLOCATED_FAIL; 109 | } 110 | 111 | for ( i=0; iencoding, obj->confidence, obj->bom); 135 | detect_obj_free (&obj); 136 | } 137 | detect_destroy (&d); 138 | 139 | return 0; 140 | } 141 | .fi 142 | 143 | .SH 저자 144 | 김정균 145 | 146 | .SH 버그 리포트 147 | https://github.com/Joungkyun/libchardet/issues 148 | 149 | .SH "참고" 150 | detect_obj_init(3), detect_obj_free(3), detect_init(3), detect_reset(3), detect_destroy(3) 151 | -------------------------------------------------------------------------------- /man/ko/detect_handledata_r.3: -------------------------------------------------------------------------------- 1 | .so man3/detect_handledata.3 2 | -------------------------------------------------------------------------------- /man/ko/detect_init.3: -------------------------------------------------------------------------------- 1 | .TH detect_init 3 2019-08-01 "libchardet manuals" 2 | .\" Process with 3 | .\" nroff -man detect_init.3 4 | .\" 2019-08-01 JoungKyun Kim 5 | 6 | .SH 이름 7 | detect_init - chardet file handle 초기화 8 | .SH 사용법 9 | .I #include 10 | .br 11 | .I Detect * chardet_init (void); 12 | .SH 설명 13 | chardet library 의 file handle 을 초기화 한다. 14 | .SH 반환값 15 | 실패시에 NULL을 반환하며, 성공시에 Detect_t structure를 반환한다. 
16 | .nf 17 | 18 | typedef struct Detect_t { 19 | Detector *detect; 20 | } Detect; 21 | .PP 22 | .SH 예제 23 | .nf 24 | #include 25 | 26 | int main (void) { 27 | Detect * d; 28 | DetectObj * obj; 29 | 30 | if ( (d = detect_init ()) == NULL ) { 31 | fprintf (stderr, "chardet handle initialize failed\\n"); 32 | return CHARDET_MEM_ALLOCATED_FAIL; 33 | } 34 | 35 | detect_reset (&d); 36 | 37 | if ( (obj = detect_obj_init ()) == NULL ) { 38 | fprintf (stderr, "Memory Allocation failed\\n"); 39 | return CHARDET_MEM_ALLOCATED_FAIL; 40 | } 41 | 42 | switch (detect_handledata (&d, "안녕하세요", &obj)) { 43 | case CHARDET_OUT_OF_MEMORY : 44 | fprintf (stderr, "On handle processing, occured out of memory\\n"); 45 | detect_obj_free (&obj); 46 | return CHARDET_OUT_OF_MEMORY; 47 | case CHARDET_NULL_OBJECT : 48 | fprintf (stderr, 49 | "2st argument of chardet() is must memory allocation " 50 | "with detect_obj_init API\\n"); 51 | return CHARDET_NULL_OBJECT; 52 | } 53 | 54 | # obj-bom 은 CHARDET_BOM_CHECK 상수 지원여부로 지원을 판단할 수 있습니다. 55 | printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); 56 | detect_obj_free (&obj); 57 | detect_destroy (&d); 58 | 59 | return 0; 60 | } 61 | .fi 62 | .SH 저자 63 | 김정균 64 | .SH 버그 리포트 65 | https://github.com/Joungkyun/libchardet/issues 66 | .SH 저작권 67 | Copyright (c) 2019 JoungKyun.Kim 68 | 69 | 이 프로그램은 MPL/GPL2/LGPL2.1 을 따르며, 사용시의 어떠한 문제에 대하여 보증하지 않는다. 
70 | .SH "참고" 71 | detect_obj_init(3), detect_obj_free(3), detect_reset(3), detect_handledata(3), detect_destroy(3) 72 | 73 | -------------------------------------------------------------------------------- /man/ko/detect_obj_free.3: -------------------------------------------------------------------------------- 1 | .TH detect_obj_free 3 2019-08-01 "libchardet manuals" 2 | .\" Process with 3 | .\" nroff -man detect_obj_free.3 4 | .\" 2019-08-01 JoungKyun Kim 5 | 6 | .SH 이름 7 | detect_obj_free - detect_obj_init 의 return value memory 해제 8 | .SH 사용법 9 | .I #include <chardet.h> 10 | .br 11 | .I void detect_obj_free (DetectObj **); 12 | .SH 설명 13 | detect_obj_free() 함수는 detect_obj_init API 에 의하여 메모리가 할당된 DetectObj 14 | structure 의 메모리를 해제 한다. 15 | .SH 반환값 16 | 없음 17 | .PP 18 | .SH 예제 19 | .nf 20 | #include <chardet.h> 21 | 22 | int main (void) { 23 | DetectObj *obj; 24 | 25 | if ( (obj = detect_obj_init ()) == NULL ) { 26 | fprintf (stderr, "Memory Allocation failed\\n"); 27 | return CHARDET_MEM_ALLOCATED_FAIL; 28 | } 29 | 30 | switch (detect ("안녕하세요", &obj)) { 31 | case CHARDET_OUT_OF_MEMORY : 32 | fprintf (stderr, "On handle processing, occurred out of memory\\n"); 33 | detect_obj_free (&obj); 34 | return CHARDET_OUT_OF_MEMORY; 35 | case CHARDET_NULL_OBJECT : 36 | fprintf (stderr, 37 | "2nd argument of detect() must be allocated " 38 | "with detect_obj_init API\\n"); 39 | return CHARDET_NULL_OBJECT; 40 | } 41 | 42 | /* obj->bom 은 CHARDET_BOM_CHECK 상수 지원여부로 지원을 판단할 수 있습니다. */ 43 | printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); 44 | detect_obj_free (&obj); 45 | 46 | return 0; 47 | } 48 | .fi 49 | .SH 저자 50 | 김정균 51 | .SH 버그 리포트 52 | https://github.com/Joungkyun/libchardet/issues 53 | .SH 저작권 54 | Copyright (c) 2019 JoungKyun.Kim 55 | 56 | 이 프로그램은 MPL/GPL2/LGPL2.1 을 따르며, 사용시의 어떠한 문제에 대하여 보증하지 않는다. 
57 | .SH "참고" 58 | detect_obj_init(3) 59 | 60 | -------------------------------------------------------------------------------- /man/ko/detect_obj_init.3: -------------------------------------------------------------------------------- 1 | .TH detect_obj_init 3 2019-08-01 "libchardet manuals" 2 | .\" Process with 3 | .\" nroff -man detect_obj_init.3 4 | .\" 2019-08-01 JoungKyun Kim 5 | 6 | .SH 이름 7 | detect_obj_init - libchardet 의 결과 값 structure 초기화 8 | .SH 사용법 9 | .I #include <chardet.h> 10 | .br 11 | .I DetectObj * detect_obj_init (void); 12 | .SH 설명 13 | detect_obj_init() 함수는 detect API 또는 detect_handledata API 의 결과 값을 14 | 받을 DetectObj structure를 초기화 한다. 15 | .SH 반환값 16 | DetectObj structure 의 포인터를 반환하며, 메모리 할당 실패 시 NULL 을 반환한다. 반환된 값은 detect_obj_free API 를 이용하여 메모리 해제를 해 주어야 한다. 17 | .nf 18 | 19 | typedef struct DetectObject { 20 | char * encoding; 21 | float confidence; 22 | short bom; 23 | } DetectObj; 24 | .PP 25 | .SH 예제 26 | .nf 27 | #include <chardet.h> 28 | 29 | int main (void) { 30 | DetectObj *obj; 31 | 32 | if ( (obj = detect_obj_init ()) == NULL ) { 33 | fprintf (stderr, "Memory Allocation failed\\n"); 34 | return CHARDET_MEM_ALLOCATED_FAIL; 35 | } 36 | 37 | switch (detect ("안녕하세요", &obj)) { 38 | case CHARDET_OUT_OF_MEMORY : 39 | fprintf (stderr, "On handle processing, occurred out of memory\\n"); 40 | detect_obj_free (&obj); 41 | return CHARDET_OUT_OF_MEMORY; 42 | case CHARDET_NULL_OBJECT : 43 | fprintf (stderr, 44 | "2nd argument of detect() must be allocated " 45 | "with detect_obj_init API\\n"); 46 | return CHARDET_NULL_OBJECT; 47 | } 48 | 49 | /* obj->bom 은 CHARDET_BOM_CHECK 상수 지원여부로 지원을 판단할 수 있습니다. */ 50 | printf ("encoding: %s, confidence: %f, exists bom: %d\\n", obj->encoding, obj->confidence, obj->bom); 51 | detect_obj_free (&obj); 52 | 53 | return 0; 54 | } 55 | .fi 56 | .SH 저자 57 | 김정균 58 | .SH 버그 리포트 59 | https://github.com/Joungkyun/libchardet/issues 60 | .SH 저작권 61 | Copyright (c) 2019 JoungKyun.Kim 62 | 63 | 이 프로그램은 MPL/GPL2/LGPL2.1 을 따르며, 사용시의 어떠한 문제에 대하여 보증하지 않는다. 
64 | .SH "참고" 65 | detect_obj_free(3), detect(3), detect_handledata(3) 66 | 67 | -------------------------------------------------------------------------------- /man/ko/detect_r.3: -------------------------------------------------------------------------------- 1 | .so man3/detect.3 2 | -------------------------------------------------------------------------------- /man/ko/detect_reset.3: -------------------------------------------------------------------------------- 1 | .TH detect_reset 3 2019-08-01 "libchardet manuals" 2 | .\" Process with 3 | .\" nroff -man detect_reset.3 4 | .\" 2019-08-01 JoungKyun Kim 5 | 6 | .SH 이름 7 | detect_reset - chardet file handle reset 8 | .SH 사용법 9 | .I #include <chardet.h> 10 | .br 11 | .I void detect_reset (Detect **handle); 12 | .SH 설명 13 | chardet library 의 file handle 을 재 초기화 한다. 14 | .SH 반환값 15 | 반환 값 없음 16 | .PP 17 | .SH 예제 18 | .nf 19 | #include <chardet.h> 20 | 21 | int main (void) { 22 | Detect * d; 23 | DetectObj * obj; 24 | int i, arrayNum; 25 | char *str[] = { 26 | "this is ascii", 27 | "이건 euc-kr 입니다." 28 | }; 29 | 30 | arrayNum = sizeof (str) / sizeof (str[0]); 31 | 32 | if ( (d = detect_init ()) == NULL ) { 33 | fprintf (stderr, "chardet handle initialize failed\\n"); 34 | return CHARDET_MEM_ALLOCATED_FAIL; 35 | } 36 | 37 | for ( i=0; iencoding, obj->confidence, obj->bom); 59 | detect_obj_free (&obj); 60 | } 61 | detect_destroy (&d); 62 | 63 | return 0; 64 | } 65 | .fi 66 | .SH 저자 67 | 김정균 68 | .SH 버그 리포트 69 | https://github.com/Joungkyun/libchardet/issues 70 | .SH 저작권 71 | Copyright (c) 2019 JoungKyun.Kim 72 | 73 | 이 프로그램은 MPL/GPL2/LGPL2.1 을 따르며, 사용시의 어떠한 문제에 대하여 보증하지 않는다. 
74 | .SH "참고" 75 | detect_obj_init(3), detect_obj_free(3), detect_init(3), detect_handledata(3), detect_destroy(3) 76 | 77 | -------------------------------------------------------------------------------- /project/libchardet.cbp: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 107 | 108 | -------------------------------------------------------------------------------- /src/CharDistribution.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is Mozilla Communicator client code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. 
If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #include "CharDistribution.h" 41 | 42 | #include "tables/JISFreq.tab" 43 | #include "tables/Big5Freq.tab" 44 | #include "tables/EUCKRFreq.tab" 45 | #include "tables/EUCTWFreq.tab" 46 | #include "tables/GB2312Freq.tab" 47 | 48 | #define SURE_YES 0.99f 49 | #define SURE_NO 0.01f 50 | 51 | //return confidence base on received data 52 | float CharDistributionAnalysis::GetConfidence() 53 | { 54 | //if we didn't receive any character in our consideration range, return negative answer 55 | if (mTotalChars <= 0) 56 | return SURE_NO; 57 | 58 | if (mTotalChars != mFreqChars) { 59 | float r = mFreqChars / ((mTotalChars - mFreqChars) * mTypicalDistributionRatio); 60 | 61 | if (r < SURE_YES) 62 | return r; 63 | } 64 | //normalize confidence, (we don't want to be 100% sure) 65 | return SURE_YES; 66 | } 67 | 68 | EUCTWDistributionAnalysis::EUCTWDistributionAnalysis() 69 | { 70 | mCharToFreqOrder = EUCTWCharToFreqOrder; 71 | mTableSize = EUCTW_TABLE_SIZE; 72 | mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO; 73 | } 74 | 75 | EUCKRDistributionAnalysis::EUCKRDistributionAnalysis() 76 | { 77 | mCharToFreqOrder = EUCKRCharToFreqOrder; 78 | mTableSize = EUCKR_TABLE_SIZE; 79 | mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO; 80 | } 81 | 82 | GB2312DistributionAnalysis::GB2312DistributionAnalysis() 83 | { 84 | mCharToFreqOrder = GB2312CharToFreqOrder; 85 | mTableSize = GB2312_TABLE_SIZE; 86 
| mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO; 87 | } 88 | 89 | Big5DistributionAnalysis::Big5DistributionAnalysis() 90 | { 91 | mCharToFreqOrder = Big5CharToFreqOrder; 92 | mTableSize = BIG5_TABLE_SIZE; 93 | mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO; 94 | } 95 | 96 | SJISDistributionAnalysis::SJISDistributionAnalysis() 97 | { 98 | mCharToFreqOrder = JISCharToFreqOrder; 99 | mTableSize = JIS_TABLE_SIZE; 100 | mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO; 101 | } 102 | 103 | EUCJPDistributionAnalysis::EUCJPDistributionAnalysis() 104 | { 105 | mCharToFreqOrder = JISCharToFreqOrder; 106 | mTableSize = JIS_TABLE_SIZE; 107 | mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO; 108 | } 109 | 110 | -------------------------------------------------------------------------------- /src/JpCntx.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is Mozilla Communicator client code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 
23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #ifndef __JPCNTX_H__ 41 | #define __JPCNTX_H__ 42 | 43 | #define NUM_OF_CATEGORY 6 44 | 45 | #include "nscore.h" 46 | 47 | #define ENOUGH_REL_THRESHOLD 100 48 | #define MAX_REL_THRESHOLD 1000 49 | 50 | //hiragana frequency category table 51 | extern char jp2CharContext[83][83]; 52 | 53 | class JapaneseContextAnalysis 54 | { 55 | public: 56 | JapaneseContextAnalysis() {Reset(PR_FALSE);}; 57 | 58 | void HandleData(const char* aBuf, PRUint32 aLen); 59 | 60 | void HandleOneChar(const char* aStr, PRUint32 aCharLen) 61 | { 62 | PRInt32 order; 63 | 64 | //if we received enough data, stop here 65 | if (mTotalRel > MAX_REL_THRESHOLD) mDone = PR_TRUE; 66 | if (mDone) return; 67 | 68 | //Only 2-bytes characters are of our interest 69 | order = (aCharLen == 2) ? 
GetOrder(aStr) : -1; 70 | if (order != -1 && mLastCharOrder != -1) 71 | { 72 | mTotalRel++; 73 | //count this sequence to its category counter 74 | mRelSample[(int)jp2CharContext[mLastCharOrder][order]]++; 75 | } 76 | mLastCharOrder = order; 77 | }; 78 | 79 | float GetConfidence(void); 80 | void Reset(PRBool aIsPreferredLanguage); 81 | void SetOpion(){}; 82 | PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}; 83 | 84 | protected: 85 | virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0; 86 | virtual PRInt32 GetOrder(const char* str) = 0; 87 | 88 | //category counters, each interger counts sequence in its category 89 | PRUint32 mRelSample[NUM_OF_CATEGORY]; 90 | 91 | //total sequence received 92 | PRUint32 mTotalRel; 93 | 94 | //Number of sequences needed to trigger detection 95 | PRUint32 mDataThreshold; 96 | 97 | //The order of previous char 98 | PRInt32 mLastCharOrder; 99 | 100 | //if last byte in current buffer is not the last byte of a character, we 101 | //need to know how many byte to skip in next buffer. 
102 | PRUint32 mNeedToSkipCharNum; 103 | 104 | //If this flag is set to PR_TRUE, detection is done and conclusion has been made 105 | PRBool mDone; 106 | }; 107 | 108 | 109 | class SJISContextAnalysis : public JapaneseContextAnalysis 110 | { 111 | //SJISContextAnalysis(){}; 112 | protected: 113 | PRInt32 GetOrder(const char* str, PRUint32 *charLen); 114 | 115 | PRInt32 GetOrder(const char* str) 116 | { 117 | //We only interested in Hiragana, so first byte is '\202' 118 | if (*str == '\202' && 119 | (unsigned char)*(str+1) >= (unsigned char)0x9f && 120 | (unsigned char)*(str+1) <= (unsigned char)0xf1) 121 | return (unsigned char)*(str+1) - (unsigned char)0x9f; 122 | return -1; 123 | }; 124 | }; 125 | 126 | class EUCJPContextAnalysis : public JapaneseContextAnalysis 127 | { 128 | protected: 129 | PRInt32 GetOrder(const char* str, PRUint32 *charLen); 130 | PRInt32 GetOrder(const char* str) 131 | //We only interested in Hiragana, so first byte is '\244' 132 | { 133 | if (*str == '\244' && 134 | (unsigned char)*(str+1) >= (unsigned char)0xa1 && 135 | (unsigned char)*(str+1) <= (unsigned char)0xf3) 136 | return (unsigned char)*(str+1) - (unsigned char)0xa1; 137 | return -1; 138 | }; 139 | }; 140 | 141 | #endif /* __JPCNTX_H__ */ 142 | 143 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | # Version: MPL 1.1/GPL 2.0/LGPL 2.1 2 | # 3 | # The contents of this file are subject to the Mozilla Public License Version 4 | # 1.1 (the "License"); you may not use this file except in compliance with 5 | # the License. You may obtain a copy of the License at 6 | # http://www.mozilla.org/MPL/ 7 | # 8 | # Software distributed under the License is distributed on an "AS IS" basis, 9 | # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 10 | # for the specific language governing rights and limitations under the 11 | # License. 
12 | # 13 | # Mozilla's universal charset detector C/C++ Wrapping API 14 | # Writer(s) : 15 | # Detect class by John Gardiner Myers 16 | # C wrapping API by JoungKyun.Kim 17 | # 18 | # Alternatively, the contents of this file may be used under the terms of 19 | # either the GNU General Public License Version 2 or later (the "GPL"), or 20 | # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 21 | # in which case the provisions of the GPL or the LGPL are applicable instead 22 | # of those above. If you wish to allow use of your version of this file only 23 | # under the terms of either the GPL or the LGPL, and not to allow others to 24 | # use your version of this file under the terms of the MPL, indicate your 25 | # decision by deleting the provisions above and replace them with the notice 26 | # and other provisions required by the GPL or the LGPL. If you do not delete 27 | # the provisions above, a recipient may use your version of this file under 28 | # the terms of any one of the MPL, the GPL or the LGPL. 
29 | 30 | SONAME_MAJOR = @SONAME_MAJOR@ 31 | SONAME_MINOR = @SONAME_MINOR@ 32 | SONAME_REVISION = @SONAME_REVISION@ 33 | SONAME_INFO = @SONAME_INFO@ 34 | SONAME_VERSION = @SONAME_VERSION@ 35 | 36 | includedir = @includedir@/chardet 37 | 38 | include_HEADERS = nsUniversalDetector.h chardet.h \ 39 | ../include/nscore.h ../include/version.h \ 40 | ../chardet-config.h 41 | noinst_HEADERS = CharDistribution.h \ 42 | nsCodingStateMachine.h \ 43 | nsGB2312Prober.h \ 44 | nsSBCSGroupProber.h \ 45 | JpCntx.h \ 46 | nsEUCJPProber.h \ 47 | nsHebrewProber.h \ 48 | nsSBCharSetProber.h \ 49 | nsEUCKRProber.h \ 50 | nsLatin1Prober.h \ 51 | nsSJISProber.h \ 52 | nsBig5Prober.h \ 53 | nsEUCTWProber.h \ 54 | nsMBCSGroupProber.h \ 55 | nsUTF8Prober.h \ 56 | nsCharSetProber.h \ 57 | nsEscCharsetProber.h \ 58 | nsPkgInt.h \ 59 | ../include/prmem.h \ 60 | tables/Big5Freq.tab \ 61 | tables/EUCKRFreq.tab \ 62 | tables/EUCTWFreq.tab \ 63 | tables/GB2312Freq.tab \ 64 | tables/JISFreq.tab 65 | 66 | AM_CFLAGS = -Wall @CFLAGS@ 67 | AM_CXXFLAGS = -Wall -Wno-non-virtual-dtor -nostdinc++ -fno-rtti -fno-exceptions @CPPFLAGS@ 68 | AM_LDFLAGS = -version-info $(SONAME_INFO) @LDFLAGS@ @LIBS@ @MINGW_CYGWIN_DYNLIB@ 69 | 70 | lib_LTLIBRARIES = libchardet.la 71 | libchardet_la_SOURCES = CharDistribution.cpp JpCntx.cpp \ 72 | tables/LangBulgarianModel.cpp \ 73 | tables/LangCyrillicModel.cpp \ 74 | tables/LangGreekModel.cpp \ 75 | tables/LangHebrewModel.cpp \ 76 | tables/LangHungarianModel.cpp \ 77 | tables/LangThaiModel.cpp \ 78 | tables/LangArabicModel.cpp \ 79 | tables/LangDanishModel.cpp \ 80 | tables/LangEsperantoModel.cpp \ 81 | tables/LangFrenchModel.cpp \ 82 | tables/LangGermanModel.cpp \ 83 | tables/LangSpanishModel.cpp \ 84 | tables/LangTurkishModel.cpp \ 85 | tables/LangVietnameseModel.cpp \ 86 | nsBig5Prober.cpp \ 87 | nsCharSetProber.cpp nsEUCJPProber.cpp nsEUCKRProber.cpp \ 88 | nsEUCTWProber.cpp nsEscCharsetProber.cpp nsEscSM.cpp \ 89 | nsGB2312Prober.cpp nsHebrewProber.cpp nsLatin1Prober.cpp 
\ 90 | nsMBCSGroupProber.cpp nsMBCSSM.cpp nsSBCSGroupProber.cpp \ 91 | nsSBCharSetProber.cpp nsSJISProber.cpp nsUTF8Prober.cpp \ 92 | nsUniversalDetector.cpp chardet.cpp 93 | libchardet_la_CPPFLAGS = -I../include 94 | -------------------------------------------------------------------------------- /src/chardet.cpp: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * Mozilla's universal charset detector C/C++ Wrapping API 15 | * Writer(s) : 16 | * Detect class by John Gardiner Myers 17 | * C wrapping API by JoungKyun.Kim 18 | * 19 | * Alternatively, the contents of this file may be used under the terms of 20 | * either the GNU General Public License Version 2 or later (the "GPL"), or 21 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 22 | * in which case the provisions of the GPL or the LGPL are applicable instead 23 | * of those above. If you wish to allow use of your version of this file only 24 | * under the terms of either the GPL or the LGPL, and not to allow others to 25 | * use your version of this file under the terms of the MPL, indicate your 26 | * decision by deleting the provisions above and replace them with the notice 27 | * and other provisions required by the GPL or the LGPL. 
If you do not delete 28 | * the provisions above, a recipient may use your version of this file under 29 | * the terms of any one of the MPL, the GPL or the LGPL. 30 | * 31 | * ***** END LICENSE BLOCK ***** */ 32 | 33 | #include 34 | #include 35 | #include 36 | 37 | class Detector: public nsUniversalDetector { 38 | public: 39 | Detector () 40 | : nsUniversalDetector (NS_FILTER_ALL) {} 41 | virtual ~Detector () {} 42 | const char *getCharsetName () { return mDetectedCharset; } 43 | float getConfidence () { return mDetectedConfidence; } 44 | short getIsBOM () { return mDetectedIsBOM; } 45 | virtual void Reset () { this->nsUniversalDetector::Reset (); } 46 | protected: 47 | virtual void Report (const char* aCharset) { mDetectedCharset = aCharset; } 48 | }; 49 | 50 | typedef struct Detect_t { 51 | Detector *detect; 52 | } Detect; 53 | 54 | #include 55 | 56 | CHARDET_API char * detect_version (void) { 57 | return (char *) LIBCHARDET_VERSION; 58 | } 59 | 60 | CHARDET_API char * detect_uversion (void) { 61 | return (char *) LIBCHARDET_UVERSION; 62 | } 63 | 64 | CHARDET_API DetectObj * detect_obj_init (void) { 65 | DetectObj * obj; 66 | 67 | if ( (obj = (DetectObj *) PR_Malloc (sizeof (DetectObj))) == NULL ) 68 | return NULL; 69 | 70 | obj->encoding = NULL; 71 | obj->confidence = 0.0; 72 | obj->bom = 0; 73 | 74 | return obj; 75 | } 76 | 77 | CHARDET_API void detect_obj_free (DetectObj ** obj) { 78 | if ( *obj != NULL ) { 79 | PR_FREEIF ((*obj)->encoding); 80 | PR_FREEIF (*obj); 81 | } 82 | } 83 | 84 | CHARDET_API Detect * detect_init (void) { 85 | Detect *det = NULL; 86 | 87 | det = (Detect *) PR_Malloc (sizeof (Detect)); 88 | 89 | if ( det == NULL ) 90 | return NULL; 91 | 92 | det->detect = new Detector; 93 | return det; 94 | } 95 | 96 | CHARDET_API void detect_reset (Detect **det) { 97 | (*det)->detect->Reset (); 98 | } 99 | 100 | CHARDET_API void detect_dataend (Detect **det) { 101 | (*det)->detect->DataEnd (); 102 | } 103 | 104 | CHARDET_API short detect_handledata 
(Detect ** det, const char * buf, DetectObj ** obj) { 105 | return detect_handledata_r (det, buf, strlen (buf), obj); 106 | } 107 | /* Feeds buflen bytes of buf to the detector held by *det, calls DataEnd(), and stores a strdup()ed copy of the charset name, the confidence and the BOM flag in *obj. Returns CHARDET_SUCCESS, CHARDET_NO_RESULT (no charset reported), CHARDET_NULL_OBJECT (*obj is NULL) or CHARDET_OUT_OF_MEMORY. A previous (*obj)->encoding is overwritten without being freed. */ 108 | CHARDET_API short detect_handledata_r (Detect ** det, const char * buf, size_t buflen, DetectObj ** obj) { 109 | const char * ret; 110 | 111 | if ( (*det)->detect->HandleData (buf, buflen) == NS_ERROR_OUT_OF_MEMORY ) 112 | return CHARDET_OUT_OF_MEMORY; 113 | (*det)->detect->DataEnd (); 114 | 115 | ret = (*det)->detect->getCharsetName (); 116 | 117 | if ( ! ret ) 118 | return CHARDET_NO_RESULT; 119 | else if ( *obj == NULL ) 120 | return CHARDET_NULL_OBJECT; 121 | /* strdup() returns NULL when allocation fails; report that instead of returning CHARDET_SUCCESS with a NULL encoding */ 122 | if ( ((*obj)->encoding = (char *) strdup (ret)) == NULL ) return CHARDET_OUT_OF_MEMORY; 123 | (*obj)->confidence = (*det)->detect->getConfidence (); 124 | (*obj)->bom = (*det)->detect->getIsBOM (); 125 | 126 | return CHARDET_SUCCESS; 127 | } 128 | /* Deletes the Detector owned by *det, then releases *det itself with PR_FREEIF. */ 129 | CHARDET_API void detect_destroy (Detect **det) { 130 | delete (*det)->detect; 131 | PR_FREEIF (*det); 132 | } 133 | /* Wrapper for NUL-terminated input: forwards to detect_r with strlen (buf) as the length. */ 134 | CHARDET_API short detect (const char *buf, DetectObj ** obj) { 135 | return detect_r (buf, strlen (buf), obj); 136 | } 137 | /* One-shot detection on buflen bytes of buf using a temporary Detector that is deleted on every return path. Returns CHARDET_SUCCESS, CHARDET_NO_RESULT, CHARDET_NULL_OBJECT or CHARDET_OUT_OF_MEMORY. */ 138 | CHARDET_API short detect_r (const char *buf, size_t buflen, DetectObj ** obj) { 139 | Detector * det; 140 | const char * ret; 141 | 142 | det = new Detector; 143 | det->Reset (); 144 | if ( det->HandleData (buf, buflen) == NS_ERROR_OUT_OF_MEMORY ) { 145 | delete det; 146 | return CHARDET_OUT_OF_MEMORY; 147 | } 148 | det->DataEnd (); 149 | 150 | ret = det->getCharsetName (); 151 | 152 | if ( ! 
ret ) { 153 | delete det; 154 | return CHARDET_NO_RESULT; 155 | } else if ( *obj == NULL ) { 156 | delete det; 157 | return CHARDET_NULL_OBJECT; 158 | } 159 | /* strdup() returns NULL when allocation fails; release the temporary Detector and report it */ 160 | if ( ((*obj)->encoding = (char *) strdup (ret)) == NULL ) { delete det; return CHARDET_OUT_OF_MEMORY; } 161 | (*obj)->confidence = det->getConfidence (); 162 | (*obj)->bom = det->getIsBOM (); 163 | 164 | delete det; 165 | return CHARDET_SUCCESS; 166 | } 167 | 168 | /* 169 | * Local variables: 170 | * tab-width: 4 171 | * c-basic-offset: 4 172 | * End: 173 | * vim: noet sw=4 ts=4 fdm=marker 174 | * vim<600: noet sw=4 ts=4 175 | */ 176 | -------------------------------------------------------------------------------- /src/chardet.h: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * Mozilla's universal charset detector C/C++ Wrapping API 15 | * Writer(s) : 16 | * Detect class by John Gardiner Myers 17 | * C wrapping API by JoungKyun.Kim 18 | * 19 | * Alternatively, the contents of this file may be used under the terms of 20 | * either the GNU General Public License Version 2 or later (the "GPL"), or 21 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 22 | * in which case the provisions of the GPL or the LGPL are applicable instead 23 | * of those above. 
If you wish to allow use of your version of this file only 24 | * under the terms of either the GPL or the LGPL, and not to allow others to 25 | * use your version of this file under the terms of the MPL, indicate your 26 | * decision by deleting the provisions above and replace them with the notice 27 | * and other provisions required by the GPL or the LGPL. If you do not delete 28 | * the provisions above, a recipient may use your version of this file under 29 | * the terms of any one of the MPL, the GPL or the LGPL. 30 | * 31 | * ***** END LICENSE BLOCK ***** */ 32 | 33 | #ifndef CHARDET_H 34 | #define CHARDET_H 35 | 36 | /* 37 | #if defined _WIN32 || defined __CYGWIN__ 38 | #define CHARDET_API 39 | #else 40 | #if defined(__GNUC__) && __GNUC__ >= 4 41 | #define CHARDET_API __attribute__ ((visibility("default"))) 42 | #else 43 | #define CHARDET_API 44 | #endif 45 | #endif 46 | */ 47 | 48 | 49 | #ifdef HAVE_CONFIG_H 50 | #include 51 | #endif 52 | 53 | #if defined _WIN32 || defined __CYGWIN__ 54 | #ifdef HAVE_DLL_EXPORT 55 | #define CHARDET_API __declspec(dllexport) 56 | #else 57 | #define CHARDET_API __declspec(dllimport) 58 | #endif 59 | #else 60 | #ifdef HAVE_VISIBILITY 61 | #define CHARDET_API __attribute__ ((visibility("default"))) 62 | #else 63 | #define CHARDET_API 64 | #endif 65 | #endif 66 | 67 | #include 68 | 69 | #include 70 | #include 71 | 72 | #define CHARDET_OUT_OF_MEMORY -128 73 | #define CHARDET_MEM_ALLOCATED_FAIL -127 74 | 75 | #define CHARDET_SUCCESS 0 76 | #define CHARDET_NO_RESULT 1 77 | #define CHARDET_NULL_OBJECT 2 78 | 79 | // whether to support detect_r and detect_handledata_r API 80 | #define CHARDET_BINARY_SAFE 1 81 | 82 | // whether to support bom member of DetectObj structure 83 | #define CHARDET_BOM_CHECK 1 84 | 85 | #ifdef __cplusplus 86 | extern "C" { 87 | #endif 88 | typedef struct Detect_t Detect; 89 | 90 | typedef struct DetectObject { 91 | char * encoding; 92 | float confidence; 93 | short bom; 94 | } DetectObj; 95 | 96 | 
CHARDET_API char * detect_version (void); 97 | CHARDET_API char * detect_uversion (void); 98 | 99 | CHARDET_API DetectObj * detect_obj_init (void); 100 | CHARDET_API void detect_obj_free (DetectObj **); 101 | 102 | CHARDET_API Detect * detect_init (void); 103 | CHARDET_API void detect_reset (Detect **); 104 | CHARDET_API void detect_dataend (Detect **); 105 | CHARDET_API short detect_handledata (Detect **, const char *, DetectObj **); 106 | CHARDET_API short detect_handledata_r (Detect **, const char *, size_t, DetectObj **); 107 | CHARDET_API void detect_destroy (Detect **); 108 | CHARDET_API short detect (const char *, DetectObj **); 109 | CHARDET_API short detect_r (const char *, size_t, DetectObj **); 110 | #ifdef __cplusplus 111 | }; 112 | #endif 113 | 114 | #endif // close define CHARDET_H 115 | 116 | /* 117 | * Local variables: 118 | * tab-width: 4 119 | * c-basic-offset: 4 120 | * End: 121 | * vim: noet sw=4 ts=4 fdm=marker 122 | * vim<600: noet sw=4 ts=4 123 | */ 124 | -------------------------------------------------------------------------------- /src/nsBig5Prober.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 
18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #include "nsBig5Prober.h" 41 | 42 | void nsBig5Prober::Reset(void) 43 | { 44 | mCodingSM->Reset(); 45 | mState = eDetecting; 46 | mDistributionAnalyser.Reset(mIsPreferredLanguage); 47 | } 48 | 49 | nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen) 50 | { 51 | nsSMState codingState; 52 | 53 | for (PRUint32 i = 0; i < aLen; i++) 54 | { 55 | codingState = mCodingSM->NextState(aBuf[i]); 56 | if (codingState == eItsMe) 57 | { 58 | mState = eFoundIt; 59 | break; 60 | } 61 | if (codingState == eStart) 62 | { 63 | PRUint32 charLen = mCodingSM->GetCurrentCharLen(); 64 | 65 | if (i == 0) 66 | { 67 | mLastChar[1] = aBuf[0]; 68 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 69 | } 70 | else 71 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 72 | } 73 | } 74 | 75 | mLastChar[0] = aBuf[aLen-1]; 76 | 77 | if (mState == eDetecting) 78 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 79 | mState = eFoundIt; 80 | 81 | return mState; 82 | } 83 | 84 | float nsBig5Prober::GetConfidence(void) 85 | { 86 | float distribCf = mDistributionAnalyser.GetConfidence(); 87 | 88 | return (float)distribCf; 89 | } 90 | 91 | -------------------------------------------------------------------------------- /src/nsBig5Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #ifndef nsBig5Prober_h__ 41 | #define nsBig5Prober_h__ 42 | 43 | #include "nsCharSetProber.h" 44 | #include "nsCodingStateMachine.h" 45 | #include "CharDistribution.h" 46 | 47 | class nsBig5Prober: public nsCharSetProber { 48 | public: 49 | nsBig5Prober(PRBool aIsPreferredLanguage) 50 | :mIsPreferredLanguage(aIsPreferredLanguage) 51 | {mCodingSM = new nsCodingStateMachine(&Big5SMModel); 52 | Reset();} 53 | virtual ~nsBig5Prober(void){delete mCodingSM;}; 54 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 55 | const char* GetCharSetName() {return "Big5";}; 56 | nsProbingState GetState(void) {return mState;}; 57 | void Reset(void); 58 | float GetConfidence(void); 59 | void SetOpion() {}; 60 | 61 | protected: 62 | void GetDistribution(PRUint32 aCharLen, const char* aStr); 63 | 64 | nsCodingStateMachine* mCodingSM; 65 | nsProbingState mState; 66 | 67 | //Big5ContextAnalysis mContextAnalyser; 68 | Big5DistributionAnalysis mDistributionAnalyser; 69 | char mLastChar[2]; 70 | PRBool mIsPreferredLanguage; 71 | }; 72 | 73 | 74 | #endif /* nsBig5Prober_h__ */ 75 | 76 | -------------------------------------------------------------------------------- /src/nsCharSetProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is Mozilla Universal charset detector code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 2001 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * Shy Shalom 26 | * 27 | * Alternatively, the contents of this file may be used under the terms of 28 | * either the GNU General Public License Version 2 or later (the "GPL"), or 29 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 30 | * in which case the provisions of the GPL or the LGPL are applicable instead 31 | * of those above. If you wish to allow use of your version of this file only 32 | * under the terms of either the GPL or the LGPL, and not to allow others to 33 | * use your version of this file under the terms of the MPL, indicate your 34 | * decision by deleting the provisions above and replace them with the notice 35 | * and other provisions required by the GPL or the LGPL. If you do not delete 36 | * the provisions above, a recipient may use your version of this file under 37 | * the terms of any one of the MPL, the GPL or the LGPL. 
38 | * 39 | * ***** END LICENSE BLOCK ***** */ 40 | 41 | #include "nsCharSetProber.h" 42 | #include "prmem.h" 43 | 44 | //This filter applies to all scripts which do not use English characters 45 | PRBool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen) 46 | { 47 | char *newptr; 48 | char *prevPtr, *curPtr; 49 | 50 | PRBool meetMSB = PR_FALSE; 51 | newptr = *newBuf = (char*)PR_Malloc(aLen); 52 | if (!newptr) 53 | return PR_FALSE; 54 | 55 | for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) 56 | { 57 | if (*curPtr & 0x80) 58 | { 59 | meetMSB = PR_TRUE; 60 | } 61 | else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') 62 | { 63 | //current char is a symbol, most likely a punctuation. we treat it as segment delimiter 64 | if (meetMSB && curPtr > prevPtr) 65 | //this segment contains more than single symbol, and it has upper ASCII, we need to keep it 66 | { 67 | while (prevPtr < curPtr) *newptr++ = *prevPtr++; 68 | prevPtr++; 69 | *newptr++ = ' '; 70 | meetMSB = PR_FALSE; 71 | } 72 | else //ignore current segment. (either because it is just a symbol or just an English word) 73 | prevPtr = curPtr+1; 74 | } 75 | } 76 | if (meetMSB && curPtr > prevPtr) 77 | while (prevPtr < curPtr) *newptr++ = *prevPtr++; 78 | 79 | newLen = newptr - *newBuf; 80 | 81 | return PR_TRUE; 82 | } 83 | 84 | //This filter applies to all scripts which contain both English characters and upper ASCII characters. 
85 | PRBool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen) 86 | { 87 | //do filtering to reduce load to probers 88 | char *newptr; 89 | char *prevPtr, *curPtr; 90 | PRBool isInTag = PR_FALSE; 91 | 92 | newptr = *newBuf = (char*)PR_Malloc(aLen); 93 | if (!newptr) 94 | return PR_FALSE; 95 | 96 | for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) 97 | { 98 | if (*curPtr == '>') 99 | isInTag = PR_FALSE; 100 | else if (*curPtr == '<') 101 | isInTag = PR_TRUE; 102 | 103 | if (!(*curPtr & 0x80) && 104 | (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') ) 105 | { 106 | if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol 107 | // and it is not inside a tag, keep it. 108 | { 109 | while (prevPtr < curPtr) *newptr++ = *prevPtr++; 110 | prevPtr++; 111 | *newptr++ = ' '; 112 | } 113 | else 114 | prevPtr = curPtr+1; 115 | } 116 | } 117 | 118 | // If the current segment contains more than just a symbol 119 | // and it is not inside a tag then keep it. 120 | if (!isInTag) 121 | while (prevPtr < curPtr) 122 | *newptr++ = *prevPtr++; 123 | 124 | newLen = newptr - *newBuf; 125 | 126 | return PR_TRUE; 127 | } 128 | -------------------------------------------------------------------------------- /src/nsCharSetProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is Mozilla Universal charset detector code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 2001 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * Shy Shalom 26 | * 27 | * Alternatively, the contents of this file may be used under the terms of 28 | * either the GNU General Public License Version 2 or later (the "GPL"), or 29 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 30 | * in which case the provisions of the GPL or the LGPL are applicable instead 31 | * of those above. If you wish to allow use of your version of this file only 32 | * under the terms of either the GPL or the LGPL, and not to allow others to 33 | * use your version of this file under the terms of the MPL, indicate your 34 | * decision by deleting the provisions above and replace them with the notice 35 | * and other provisions required by the GPL or the LGPL. If you do not delete 36 | * the provisions above, a recipient may use your version of this file under 37 | * the terms of any one of the MPL, the GPL or the LGPL. 38 | * 39 | * ***** END LICENSE BLOCK ***** */ 40 | #ifndef nsCharSetProber_h__ 41 | #define nsCharSetProber_h__ 42 | 43 | #include "nscore.h" 44 | 45 | //#define DEBUG_chardet // Uncomment this for debug dump. 46 | 47 | typedef enum { 48 | eDetecting = 0, //We are still detecting, no sure answer yet, but caller can ask for confidence. 
49 | eFoundIt = 1, //That's a positive answer 50 | eNotMe = 2 //Negative answer 51 | } nsProbingState; 52 | 53 | #define SHORTCUT_THRESHOLD (float)0.95 54 | 55 | class nsCharSetProber { 56 | public: 57 | virtual ~nsCharSetProber() {}; 58 | virtual const char* GetCharSetName() = 0; 59 | virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0; 60 | virtual nsProbingState GetState(void) = 0; 61 | virtual void Reset(void) = 0; 62 | virtual float GetConfidence(void) = 0; 63 | virtual void SetOpion() = 0; 64 | 65 | #ifdef DEBUG_chardet 66 | virtual void DumpStatus() {}; 67 | #endif 68 | 69 | // Helper functions used in the Latin1 and Group probers. 70 | // both functions Allocate a new buffer for newBuf. This buffer should be 71 | // freed by the caller using PR_FREEIF. 72 | // Both functions return PR_FALSE in case of memory allocation failure. 73 | static PRBool FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen); 74 | static PRBool FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen); 75 | 76 | }; 77 | 78 | #endif /* nsCharSetProber_h__ */ 79 | -------------------------------------------------------------------------------- /src/nsCodingStateMachine.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | #ifndef nsCodingStateMachine_h__ 40 | #define nsCodingStateMachine_h__ 41 | 42 | #include "nsPkgInt.h" 43 | 44 | typedef enum { 45 | eStart = 0, 46 | eError = 1, 47 | eItsMe = 2 48 | } nsSMState; 49 | 50 | #define GETCLASS(c) GETFROMPCK(((unsigned char)(c)), mModel->classTable) 51 | 52 | //state machine model 53 | typedef struct 54 | { 55 | nsPkgInt classTable; 56 | PRUint32 classFactor; 57 | nsPkgInt stateTable; 58 | const PRUint32* charLenTable; 59 | const char* name; 60 | } SMModel; 61 | 62 | class nsCodingStateMachine { 63 | public: 64 | nsCodingStateMachine(SMModel* sm) : mModel(sm) { mCurrentState = eStart; } 65 | nsSMState NextState(char c){ 66 | //for each byte we get its class , if it is first byte, we also get byte length 67 | PRUint32 byteCls = GETCLASS(c); 68 | if (mCurrentState == eStart) 69 | { 70 | mCurrentBytePos = 0; 71 | mCurrentCharLen = mModel->charLenTable[byteCls]; 72 | } 73 | //from byte's class and stateTable, we get its next state 74 | mCurrentState=(nsSMState)GETFROMPCK(mCurrentState*(mModel->classFactor)+byteCls, 75 | mModel->stateTable); 76 | mCurrentBytePos++; 77 | return mCurrentState; 78 | }; 79 | PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}; 80 | void Reset(void) {mCurrentState = eStart;}; 81 | const char * GetCodingStateMachine() {return mModel->name;}; 82 | 83 | protected: 84 | nsSMState mCurrentState; 85 | PRUint32 mCurrentCharLen; 86 | PRUint32 mCurrentBytePos; 87 | 88 | SMModel *mModel; 89 | }; 90 | 91 | extern SMModel UTF8SMModel; 92 | extern SMModel Big5SMModel; 93 | extern SMModel EUCJPSMModel; 94 | extern SMModel EUCKRSMModel; 95 | extern SMModel EUCTWSMModel; 96 | extern SMModel GB18030SMModel; 97 | extern SMModel SJISSMModel; 98 | extern SMModel UCS2BESMModel; 99 | 100 | 101 | extern SMModel HZSMModel; 102 | extern SMModel ISO2022CNSMModel; 103 | extern SMModel ISO2022JPSMModel; 104 | extern SMModel ISO2022KRSMModel; 105 | 106 | #endif /* 
nsCodingStateMachine_h__ */ 107 | 108 | -------------------------------------------------------------------------------- /src/nsEUCJPProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. 
If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | // for japanese encoding, obeserve characteristic: 41 | // 1, kana character (or hankaku?) often have hight frequency of appereance 42 | // 2, kana character often exist in group 43 | // 3, certain combination of kana is never used in japanese language 44 | 45 | #include "nsEUCJPProber.h" 46 | 47 | void nsEUCJPProber::Reset(void) 48 | { 49 | mCodingSM->Reset(); 50 | mState = eDetecting; 51 | mContextAnalyser.Reset(mIsPreferredLanguage); 52 | mDistributionAnalyser.Reset(mIsPreferredLanguage); 53 | } 54 | 55 | nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen) 56 | { 57 | nsSMState codingState; 58 | 59 | for (PRUint32 i = 0; i < aLen; i++) 60 | { 61 | codingState = mCodingSM->NextState(aBuf[i]); 62 | if (codingState == eItsMe) 63 | { 64 | mState = eFoundIt; 65 | break; 66 | } 67 | if (codingState == eStart) 68 | { 69 | PRUint32 charLen = mCodingSM->GetCurrentCharLen(); 70 | 71 | if (i == 0) 72 | { 73 | mLastChar[1] = aBuf[0]; 74 | mContextAnalyser.HandleOneChar(mLastChar, charLen); 75 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 76 | } 77 | else 78 | { 79 | mContextAnalyser.HandleOneChar(aBuf+i-1, charLen); 80 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 81 | } 82 | } 83 | } 84 | 85 | mLastChar[0] = aBuf[aLen-1]; 86 | 87 | if (mState == eDetecting) 88 | if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 89 | mState = eFoundIt; 90 | 91 | return mState; 92 | } 93 | 94 | float nsEUCJPProber::GetConfidence(void) 95 | { 96 | float contxtCf = mContextAnalyser.GetConfidence(); 97 | float distribCf = mDistributionAnalyser.GetConfidence(); 98 | 99 | return (contxtCf > distribCf ? 
contxtCf : distribCf); 100 | } 101 | 102 | -------------------------------------------------------------------------------- /src/nsEUCJPProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. 
If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | // for S-JIS encoding, obeserve characteristic: 41 | // 1, kana character (or hankaku?) often have hight frequency of appereance 42 | // 2, kana character often exist in group 43 | // 3, certain combination of kana is never used in japanese language 44 | 45 | #ifndef nsEUCJPProber_h__ 46 | #define nsEUCJPProber_h__ 47 | 48 | #include "nsCharSetProber.h" 49 | #include "nsCodingStateMachine.h" 50 | #include "JpCntx.h" 51 | #include "CharDistribution.h" 52 | 53 | class nsEUCJPProber: public nsCharSetProber { 54 | public: 55 | nsEUCJPProber(PRBool aIsPreferredLanguage) 56 | :mIsPreferredLanguage(aIsPreferredLanguage) 57 | {mCodingSM = new nsCodingStateMachine(&EUCJPSMModel); 58 | Reset();} 59 | virtual ~nsEUCJPProber(void){delete mCodingSM;}; 60 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 61 | const char* GetCharSetName() {return "EUC-JP";}; 62 | nsProbingState GetState(void) {return mState;}; 63 | void Reset(void); 64 | float GetConfidence(void); 65 | void SetOpion() {}; 66 | 67 | protected: 68 | nsCodingStateMachine* mCodingSM; 69 | nsProbingState mState; 70 | 71 | EUCJPContextAnalysis mContextAnalyser; 72 | EUCJPDistributionAnalysis mDistributionAnalyser; 73 | 74 | char mLastChar[2]; 75 | PRBool mIsPreferredLanguage; 76 | }; 77 | 78 | 79 | #endif /* nsEUCJPProber_h__ */ 80 | 81 | -------------------------------------------------------------------------------- /src/nsEUCKRProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public 
License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #include "nsEUCKRProber.h" 41 | 42 | void nsEUCKRProber::Reset(void) 43 | { 44 | mCodingSM->Reset(); 45 | mState = eDetecting; 46 | mDistributionAnalyser.Reset(mIsPreferredLanguage); 47 | //mContextAnalyser.Reset(); 48 | } 49 | 50 | nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen) 51 | { 52 | nsSMState codingState; 53 | 54 | for (PRUint32 i = 0; i < aLen; i++) 55 | { 56 | codingState = mCodingSM->NextState(aBuf[i]); 57 | if (codingState == eItsMe) 58 | { 59 | mState = eFoundIt; 60 | break; 61 | } 62 | if (codingState == eStart) 63 | { 64 | PRUint32 charLen = mCodingSM->GetCurrentCharLen(); 65 | 66 | if (i == 0) 67 | { 68 | mLastChar[1] = aBuf[0]; 69 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 70 | } 71 | else 72 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 73 | } 74 | } 75 | 76 | mLastChar[0] = aBuf[aLen-1]; 77 | 78 | if (mState == eDetecting) 79 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 80 | mState = eFoundIt; 81 | // else 82 | // mDistributionAnalyser.HandleData(aBuf, aLen); 83 | 84 | return mState; 85 | } 86 | 87 | float nsEUCKRProber::GetConfidence(void) 88 | { 89 | float distribCf = mDistributionAnalyser.GetConfidence(); 90 | 91 | return (float)distribCf; 92 | } 93 | 94 | -------------------------------------------------------------------------------- /src/nsEUCKRProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #ifndef nsEUCKRProber_h__ 41 | #define nsEUCKRProber_h__ 42 | 43 | #include "nsCharSetProber.h" 44 | #include "nsCodingStateMachine.h" 45 | #include "CharDistribution.h" 46 | 47 | class nsEUCKRProber: public nsCharSetProber { 48 | public: 49 | nsEUCKRProber(PRBool aIsPreferredLanguage) 50 | :mIsPreferredLanguage(aIsPreferredLanguage) 51 | {mCodingSM = new nsCodingStateMachine(&EUCKRSMModel); 52 | Reset();} 53 | virtual ~nsEUCKRProber(void){delete mCodingSM;}; 54 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 55 | const char* GetCharSetName() {return "EUC-KR";}; 56 | nsProbingState GetState(void) {return mState;}; 57 | void Reset(void); 58 | float GetConfidence(void); 59 | void SetOpion() {}; 60 | 61 | protected: 62 | void GetDistribution(PRUint32 aCharLen, const char* aStr); 63 | 64 | nsCodingStateMachine* mCodingSM; 65 | nsProbingState mState; 66 | 67 | //EUCKRContextAnalysis mContextAnalyser; 68 | EUCKRDistributionAnalysis mDistributionAnalyser; 69 | char mLastChar[2]; 70 | PRBool mIsPreferredLanguage; 71 | }; 72 | 73 | 74 | #endif /* nsEUCKRProber_h__ */ 75 | 76 | -------------------------------------------------------------------------------- /src/nsEUCTWProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #include "nsEUCTWProber.h" 41 | 42 | void nsEUCTWProber::Reset(void) 43 | { 44 | mCodingSM->Reset(); 45 | mState = eDetecting; 46 | mDistributionAnalyser.Reset(mIsPreferredLanguage); 47 | //mContextAnalyser.Reset(); 48 | } 49 | 50 | nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen) 51 | { 52 | nsSMState codingState; 53 | 54 | for (PRUint32 i = 0; i < aLen; i++) 55 | { 56 | codingState = mCodingSM->NextState(aBuf[i]); 57 | if (codingState == eItsMe) 58 | { 59 | mState = eFoundIt; 60 | break; 61 | } 62 | if (codingState == eStart) 63 | { 64 | PRUint32 charLen = mCodingSM->GetCurrentCharLen(); 65 | 66 | if (i == 0) 67 | { 68 | mLastChar[1] = aBuf[0]; 69 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 70 | } 71 | else 72 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 73 | } 74 | } 75 | 76 | mLastChar[0] = aBuf[aLen-1]; 77 | 78 | if (mState == eDetecting) 79 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 80 | mState = eFoundIt; 81 | // else 82 | // mDistributionAnalyser.HandleData(aBuf, aLen); 83 | 84 | return mState; 85 | } 86 | 87 | float nsEUCTWProber::GetConfidence(void) 88 | { 89 | float distribCf = mDistributionAnalyser.GetConfidence(); 90 | 91 | return (float)distribCf; 92 | } 93 | 94 | -------------------------------------------------------------------------------- /src/nsEUCTWProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #ifndef nsEUCTWProber_h__ 41 | #define nsEUCTWProber_h__ 42 | 43 | #include "nsCharSetProber.h" 44 | #include "nsCodingStateMachine.h" 45 | #include "CharDistribution.h" 46 | 47 | class nsEUCTWProber: public nsCharSetProber { 48 | public: 49 | nsEUCTWProber(PRBool aIsPreferredLanguage) 50 | :mIsPreferredLanguage(aIsPreferredLanguage) 51 | {mCodingSM = new nsCodingStateMachine(&EUCTWSMModel); 52 | Reset();} 53 | virtual ~nsEUCTWProber(void){delete mCodingSM;}; 54 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 55 | const char* GetCharSetName() {return "EUC-TW";}; 56 | nsProbingState GetState(void) {return mState;}; 57 | void Reset(void); 58 | float GetConfidence(void); 59 | void SetOpion() {}; 60 | 61 | protected: 62 | void GetDistribution(PRUint32 aCharLen, const char* aStr); 63 | 64 | nsCodingStateMachine* mCodingSM; 65 | nsProbingState mState; 66 | 67 | //EUCTWContextAnalysis mContextAnalyser; 68 | EUCTWDistributionAnalysis mDistributionAnalyser; 69 | char mLastChar[2]; 70 | PRBool mIsPreferredLanguage; 71 | }; 72 | 73 | 74 | #endif /* nsEUCTWProber_h__ */ 75 | 76 | -------------------------------------------------------------------------------- /src/nsEscCharsetProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | 41 | #include "nsEscCharsetProber.h" 42 | #include "nsUniversalDetector.h" 43 | 44 | nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter) 45 | { 46 | for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++) 47 | mCodingSM[i] = nsnull; 48 | if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED) 49 | { 50 | mCodingSM[0] = new nsCodingStateMachine(&HZSMModel); 51 | mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel); 52 | } 53 | if (aLanguageFilter & NS_FILTER_JAPANESE) 54 | mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel); 55 | if (aLanguageFilter & NS_FILTER_KOREAN) 56 | mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel); 57 | mActiveSM = NUM_OF_ESC_CHARSETS; 58 | mState = eDetecting; 59 | mDetectedCharset = nsnull; 60 | } 61 | 62 | nsEscCharSetProber::~nsEscCharSetProber(void) 63 | { 64 | for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++) 65 | delete mCodingSM[i]; 66 | } 67 | 68 | void nsEscCharSetProber::Reset(void) 69 | { 70 | mState = eDetecting; 71 | for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++) 72 | if (mCodingSM[i]) mCodingSM[i]->Reset(); 73 | mActiveSM = NUM_OF_ESC_CHARSETS; 74 | mDetectedCharset = nsnull; 75 | } 76 | 77 | nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen) 78 | { 79 | nsSMState codingState; 80 | PRInt32 j; 81 | PRUint32 i; 82 | 83 | for ( i = 0; i < aLen && mState == eDetecting; i++) 84 | { 85 | for (j = mActiveSM-1; j>= 0; j--) 86 | { 87 | if (mCodingSM[j]) 88 | { 89 | codingState = mCodingSM[j]->NextState(aBuf[i]); 90 | if (codingState == eItsMe) 91 | { 92 | mState = eFoundIt; 93 | mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); 94 | return mState; 95 | } 96 | } 97 | } 98 | } 99 | 100 | return mState; 101 | } 102 | 103 | -------------------------------------------------------------------------------- /src/nsEscCharsetProber.h: -------------------------------------------------------------------------------- 1 | /* 
-*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #ifndef nsEscCharSetProber_h__ 41 | #define nsEscCharSetProber_h__ 42 | 43 | #include "nsCharSetProber.h" 44 | #include "nsCodingStateMachine.h" 45 | 46 | #define NUM_OF_ESC_CHARSETS 4 47 | 48 | class nsEscCharSetProber: public nsCharSetProber { 49 | public: 50 | nsEscCharSetProber(PRUint32 aLanguageFilter); 51 | virtual ~nsEscCharSetProber(void); 52 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 53 | const char* GetCharSetName() {return mDetectedCharset;}; 54 | nsProbingState GetState(void) {return mState;}; 55 | void Reset(void); 56 | float GetConfidence(void){return (float)0.99;}; 57 | void SetOpion() {}; 58 | 59 | protected: 60 | void GetDistribution(PRUint32 aCharLen, const char* aStr); 61 | 62 | nsCodingStateMachine* mCodingSM[NUM_OF_ESC_CHARSETS] ; 63 | PRUint32 mActiveSM; 64 | nsProbingState mState; 65 | const char * mDetectedCharset; 66 | }; 67 | 68 | #endif /* nsEscCharSetProber_h__ */ 69 | 70 | -------------------------------------------------------------------------------- /src/nsGB2312Prober.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 
18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | // for S-JIS encoding, observe characteristic: 41 | // 1, kana character (or hankaku?) 
often have hight frequency of appereance 42 | // 2, kana character often exist in group 43 | // 3, certain combination of kana is never used in japanese language 44 | 45 | #include "nsGB2312Prober.h" 46 | 47 | void nsGB18030Prober::Reset(void) 48 | { 49 | mCodingSM->Reset(); 50 | mState = eDetecting; 51 | mDistributionAnalyser.Reset(mIsPreferredLanguage); 52 | //mContextAnalyser.Reset(); 53 | } 54 | 55 | nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen) 56 | { 57 | nsSMState codingState; 58 | 59 | for (PRUint32 i = 0; i < aLen; i++) 60 | { 61 | codingState = mCodingSM->NextState(aBuf[i]); 62 | if (codingState == eItsMe) 63 | { 64 | mState = eFoundIt; 65 | break; 66 | } 67 | if (codingState == eStart) 68 | { 69 | PRUint32 charLen = mCodingSM->GetCurrentCharLen(); 70 | 71 | if (i == 0) 72 | { 73 | mLastChar[1] = aBuf[0]; 74 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 75 | } 76 | else 77 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 78 | } 79 | } 80 | 81 | mLastChar[0] = aBuf[aLen-1]; 82 | 83 | if (mState == eDetecting) 84 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 85 | mState = eFoundIt; 86 | // else 87 | // mDistributionAnalyser.HandleData(aBuf, aLen); 88 | 89 | return mState; 90 | } 91 | 92 | float nsGB18030Prober::GetConfidence(void) 93 | { 94 | float distribCf = mDistributionAnalyser.GetConfidence(); 95 | 96 | return (float)distribCf; 97 | } 98 | 99 | -------------------------------------------------------------------------------- /src/nsGB2312Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file 
except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #ifndef nsGB2312Prober_h__ 41 | #define nsGB2312Prober_h__ 42 | 43 | #include "nsCharSetProber.h" 44 | #include "nsCodingStateMachine.h" 45 | #include "CharDistribution.h" 46 | 47 | // We use gb18030 to replace gb2312, because 18030 is a superset. 
48 | 49 | class nsGB18030Prober: public nsCharSetProber { 50 | public: 51 | nsGB18030Prober(PRBool aIsPreferredLanguage) 52 | :mIsPreferredLanguage(aIsPreferredLanguage) 53 | {mCodingSM = new nsCodingStateMachine(&GB18030SMModel); 54 | Reset();} 55 | virtual ~nsGB18030Prober(void){delete mCodingSM;}; 56 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 57 | const char* GetCharSetName() {return "GB18030";}; 58 | nsProbingState GetState(void) {return mState;}; 59 | void Reset(void); 60 | float GetConfidence(void); 61 | void SetOpion() {}; 62 | 63 | protected: 64 | void GetDistribution(PRUint32 aCharLen, const char* aStr); 65 | 66 | nsCodingStateMachine* mCodingSM; 67 | nsProbingState mState; 68 | 69 | //GB2312ContextAnalysis mContextAnalyser; 70 | GB2312DistributionAnalysis mDistributionAnalyser; 71 | char mLastChar[2]; 72 | PRBool mIsPreferredLanguage; 73 | }; 74 | 75 | 76 | #endif /* nsGB2312Prober_h__ */ 77 | -------------------------------------------------------------------------------- /src/nsLatin1Prober.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is Mozilla Universal charset detector code. 
18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 2001 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * Shy Shalom 26 | * 27 | * Alternatively, the contents of this file may be used under the terms of 28 | * either the GNU General Public License Version 2 or later (the "GPL"), or 29 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 30 | * in which case the provisions of the GPL or the LGPL are applicable instead 31 | * of those above. If you wish to allow use of your version of this file only 32 | * under the terms of either the GPL or the LGPL, and not to allow others to 33 | * use your version of this file under the terms of the MPL, indicate your 34 | * decision by deleting the provisions above and replace them with the notice 35 | * and other provisions required by the GPL or the LGPL. If you do not delete 36 | * the provisions above, a recipient may use your version of this file under 37 | * the terms of any one of the MPL, the GPL or the LGPL. 
38 | * 39 | * ***** END LICENSE BLOCK ***** */ 40 | 41 | #include "nsLatin1Prober.h" 42 | #include "prmem.h" 43 | #include 44 | 45 | #define UDF 0 // undefined 46 | #define OTH 1 //other 47 | #define ASC 2 // ascii capital letter 48 | #define ASS 3 // ascii small letter 49 | #define ACV 4 // accent capital vowel 50 | #define ACO 5 // accent capital other 51 | #define ASV 6 // accent small vowel 52 | #define ASO 7 // accent small other 53 | #define CLASS_NUM 8 // total classes 54 | 55 | static unsigned char Latin1_CharToClass[] = 56 | { 57 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07 58 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F 59 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17 60 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F 61 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27 62 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F 63 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37 64 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F 65 | OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47 66 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F 67 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57 68 | ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F 69 | OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67 70 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F 71 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77 72 | ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F 73 | OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87 74 | OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F 75 | UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97 76 | OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F 77 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7 78 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF 79 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7 80 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF 81 | ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7 82 | ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF 
83 | ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7 84 | ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF 85 | ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7 86 | ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF 87 | ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7 88 | ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF 89 | }; 90 | 91 | 92 | /* 0 : illegal 93 | 1 : very unlikely 94 | 2 : normal 95 | 3 : very likely 96 | */ 97 | static unsigned char Latin1ClassModel[] = 98 | { 99 | /* UDF OTH ASC ASS ACV ACO ASV ASO */ 100 | /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0, 101 | /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3, 102 | /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3, 103 | /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3, 104 | /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2, 105 | /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3, 106 | /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3, 107 | /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3, 108 | }; 109 | 110 | void nsLatin1Prober::Reset(void) 111 | { 112 | mState = eDetecting; 113 | mLastCharClass = OTH; 114 | for (int i = 0; i < FREQ_CAT_NUM; i++) 115 | mFreqCounter[i] = 0; 116 | } 117 | 118 | 119 | nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen) 120 | { 121 | char *newBuf1 = 0; 122 | PRUint32 newLen1 = 0; 123 | 124 | if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) { 125 | newBuf1 = (char*)aBuf; 126 | newLen1 = aLen; 127 | } 128 | 129 | unsigned char charClass; 130 | unsigned char freq; 131 | for (PRUint32 i = 0; i < newLen1; i++) 132 | { 133 | charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]]; 134 | freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass]; 135 | if (freq == 0) { 136 | mState = eNotMe; 137 | break; 138 | } 139 | mFreqCounter[freq]++; 140 | mLastCharClass = charClass; 141 | } 142 | 143 | if (newBuf1 != aBuf) 144 | PR_FREEIF(newBuf1); 145 | 146 | return mState; 147 | } 148 | 149 | float nsLatin1Prober::GetConfidence(void) 150 | { 151 | if (mState == eNotMe) 152 | return 0.01f; 153 | 154 | float confidence; 155 | PRUint32 total = 0; 156 | for (PRInt32 i = 
0; i < FREQ_CAT_NUM; i++) 157 | total += mFreqCounter[i]; 158 | 159 | if(!total) 160 | confidence = 0.0f; 161 | else 162 | { 163 | confidence = mFreqCounter[3]*1.0f / total; 164 | confidence -= mFreqCounter[1]*20.0f/total; 165 | } 166 | 167 | if (confidence < 0.0f) 168 | confidence = 0.0f; 169 | 170 | // lower the confidence of latin1 so that other more accurate detector 171 | // can take priority. 172 | confidence *= 0.50f; 173 | 174 | return confidence; 175 | } 176 | 177 | #ifdef DEBUG_chardet 178 | void nsLatin1Prober::DumpStatus() 179 | { 180 | printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); 181 | } 182 | #endif 183 | 184 | 185 | -------------------------------------------------------------------------------- /src/nsLatin1Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is Mozilla Universal charset detector code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 2001 22 | * the Initial Developer. All Rights Reserved. 
23 | * 24 | * Contributor(s): 25 | * Shy Shalom 26 | * 27 | * Alternatively, the contents of this file may be used under the terms of 28 | * either the GNU General Public License Version 2 or later (the "GPL"), or 29 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 30 | * in which case the provisions of the GPL or the LGPL are applicable instead 31 | * of those above. If you wish to allow use of your version of this file only 32 | * under the terms of either the GPL or the LGPL, and not to allow others to 33 | * use your version of this file under the terms of the MPL, indicate your 34 | * decision by deleting the provisions above and replace them with the notice 35 | * and other provisions required by the GPL or the LGPL. If you do not delete 36 | * the provisions above, a recipient may use your version of this file under 37 | * the terms of any one of the MPL, the GPL or the LGPL. 38 | * 39 | * ***** END LICENSE BLOCK ***** */ 40 | 41 | #ifndef nsLatin1Prober_h__ 42 | #define nsLatin1Prober_h__ 43 | 44 | #include "nsCharSetProber.h" 45 | 46 | #define FREQ_CAT_NUM 4 47 | 48 | class nsLatin1Prober: public nsCharSetProber { 49 | public: 50 | nsLatin1Prober(void){Reset();}; 51 | virtual ~nsLatin1Prober(void){}; 52 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 53 | const char* GetCharSetName() {return "windows-1252";}; 54 | nsProbingState GetState(void) {return mState;}; 55 | void Reset(void); 56 | float GetConfidence(void); 57 | void SetOpion() {}; 58 | 59 | #ifdef DEBUG_chardet 60 | virtual void DumpStatus(); 61 | #endif 62 | 63 | protected: 64 | 65 | nsProbingState mState; 66 | char mLastCharClass; 67 | PRUint32 mFreqCounter[FREQ_CAT_NUM]; 68 | }; 69 | 70 | 71 | #endif /* nsLatin1Prober_h__ */ 72 | 73 | -------------------------------------------------------------------------------- /src/nsMBCSGroupProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; 
indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * Proofpoint, Inc. 26 | * 27 | * Alternatively, the contents of this file may be used under the terms of 28 | * either the GNU General Public License Version 2 or later (the "GPL"), or 29 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 30 | * in which case the provisions of the GPL or the LGPL are applicable instead 31 | * of those above. If you wish to allow use of your version of this file only 32 | * under the terms of either the GPL or the LGPL, and not to allow others to 33 | * use your version of this file under the terms of the MPL, indicate your 34 | * decision by deleting the provisions above and replace them with the notice 35 | * and other provisions required by the GPL or the LGPL. If you do not delete 36 | * the provisions above, a recipient may use your version of this file under 37 | * the terms of any one of the MPL, the GPL or the LGPL. 
38 | * 39 | * ***** END LICENSE BLOCK ***** */ 40 | 41 | #ifndef nsMBCSGroupProber_h__ 42 | #define nsMBCSGroupProber_h__ 43 | 44 | #include "nsSJISProber.h" 45 | #include "nsUTF8Prober.h" 46 | #include "nsEUCJPProber.h" 47 | #include "nsGB2312Prober.h" 48 | #include "nsEUCKRProber.h" 49 | #include "nsBig5Prober.h" 50 | #include "nsEUCTWProber.h" 51 | 52 | #define NUM_OF_PROBERS 7 53 | 54 | class nsMBCSGroupProber: public nsCharSetProber { 55 | public: 56 | nsMBCSGroupProber(); 57 | nsMBCSGroupProber(PRUint32 aLanguageFilter); 58 | virtual ~nsMBCSGroupProber(); 59 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 60 | const char* GetCharSetName(); 61 | nsProbingState GetState(void) {return mState;}; 62 | void Reset(void); 63 | float GetConfidence(void); 64 | void SetOpion() {}; 65 | 66 | #ifdef DEBUG_chardet 67 | void DumpStatus(); 68 | #endif 69 | #ifdef DEBUG_jgmyers 70 | void GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset); 71 | #endif 72 | 73 | protected: 74 | nsProbingState mState; 75 | nsCharSetProber* mProbers[NUM_OF_PROBERS]; 76 | PRBool mIsActive[NUM_OF_PROBERS]; 77 | PRInt32 mBestGuess; 78 | PRUint32 mActiveNum; 79 | PRUint32 mKeepNext; 80 | }; 81 | 82 | #endif /* nsMBCSGroupProber_h__ */ 83 | 84 | -------------------------------------------------------------------------------- /src/nsPkgInt.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #ifndef nsPkgInt_h__ 41 | #define nsPkgInt_h__ 42 | #include "nscore.h" 43 | 44 | typedef enum { 45 | eIdxSft4bits = 3, 46 | eIdxSft8bits = 2, 47 | eIdxSft16bits = 1 48 | } nsIdxSft; 49 | 50 | typedef enum { 51 | eSftMsk4bits = 7, 52 | eSftMsk8bits = 3, 53 | eSftMsk16bits = 1 54 | } nsSftMsk; 55 | 56 | typedef enum { 57 | eBitSft4bits = 2, 58 | eBitSft8bits = 3, 59 | eBitSft16bits = 4 60 | } nsBitSft; 61 | 62 | typedef enum { 63 | eUnitMsk4bits = 0x0000000FL, 64 | eUnitMsk8bits = 0x000000FFL, 65 | eUnitMsk16bits = 0x0000FFFFL 66 | } nsUnitMsk; 67 | 68 | typedef struct nsPkgInt { 69 | nsIdxSft idxsft; 70 | nsSftMsk sftmsk; 71 | nsBitSft bitsft; 72 | nsUnitMsk unitmsk; 73 | PRUint32 *data; 74 | } nsPkgInt; 75 | 76 | 77 | #define PCK16BITS(a,b) ((PRUint32)(((b) << 16) | (a))) 78 | 79 | #define PCK8BITS(a,b,c,d) PCK16BITS( ((PRUint32)(((b) << 8) | (a))), \ 80 | ((PRUint32)(((d) << 8) | (c)))) 81 | 82 | #define PCK4BITS(a,b,c,d,e,f,g,h) PCK8BITS( ((PRUint32)(((b) << 4) | (a))), \ 83 | ((PRUint32)(((d) << 4) | (c))), \ 84 | ((PRUint32)(((f) << 4) | (e))), \ 85 | ((PRUint32)(((h) << 4) | (g))) ) 86 | 87 | #define GETFROMPCK(i, c) \ 88 | (((((c).data)[(i)>>(c).idxsft])>>(((i)&(c).sftmsk)<<(c).bitsft))&(c).unitmsk) 89 | 90 | #endif /* nsPkgInt_h__ */ 91 | 92 | -------------------------------------------------------------------------------- /src/nsSBCSGroupProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is Mozilla Universal charset detector code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 2001 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * Shy Shalom 26 | * 27 | * Alternatively, the contents of this file may be used under the terms of 28 | * either the GNU General Public License Version 2 or later (the "GPL"), or 29 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 30 | * in which case the provisions of the GPL or the LGPL are applicable instead 31 | * of those above. If you wish to allow use of your version of this file only 32 | * under the terms of either the GPL or the LGPL, and not to allow others to 33 | * use your version of this file under the terms of the MPL, indicate your 34 | * decision by deleting the provisions above and replace them with the notice 35 | * and other provisions required by the GPL or the LGPL. If you do not delete 36 | * the provisions above, a recipient may use your version of this file under 37 | * the terms of any one of the MPL, the GPL or the LGPL. 
38 | * 39 | * ***** END LICENSE BLOCK ***** */ 40 | 41 | #ifndef nsSBCSGroupProber_h__ 42 | #define nsSBCSGroupProber_h__ 43 | 44 | 45 | #define NUM_OF_SBCS_PROBERS 35 46 | 47 | class nsCharSetProber; 48 | class nsSBCSGroupProber: public nsCharSetProber { 49 | public: 50 | nsSBCSGroupProber(); 51 | virtual ~nsSBCSGroupProber(); 52 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 53 | const char* GetCharSetName(); 54 | nsProbingState GetState(void) {return mState;}; 55 | void Reset(void); 56 | float GetConfidence(void); 57 | void SetOpion() {}; 58 | 59 | #ifdef DEBUG_chardet 60 | void DumpStatus(); 61 | #endif 62 | 63 | protected: 64 | nsProbingState mState; 65 | nsCharSetProber* mProbers[NUM_OF_SBCS_PROBERS]; 66 | PRBool mIsActive[NUM_OF_SBCS_PROBERS]; 67 | PRInt32 mBestGuess; 68 | PRUint32 mActiveNum; 69 | }; 70 | 71 | #endif /* nsSBCSGroupProber_h__ */ 72 | 73 | -------------------------------------------------------------------------------- /src/nsSBCharSetProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is Mozilla Universal charset detector code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 
21 | * Portions created by the Initial Developer are Copyright (C) 2001 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * Shy Shalom 26 | * 27 | * Alternatively, the contents of this file may be used under the terms of 28 | * either the GNU General Public License Version 2 or later (the "GPL"), or 29 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 30 | * in which case the provisions of the GPL or the LGPL are applicable instead 31 | * of those above. If you wish to allow use of your version of this file only 32 | * under the terms of either the GPL or the LGPL, and not to allow others to 33 | * use your version of this file under the terms of the MPL, indicate your 34 | * decision by deleting the provisions above and replace them with the notice 35 | * and other provisions required by the GPL or the LGPL. If you do not delete 36 | * the provisions above, a recipient may use your version of this file under 37 | * the terms of any one of the MPL, the GPL or the LGPL. 
38 | * 39 | * ***** END LICENSE BLOCK ***** */ 40 | #include 41 | #include "nsSBCharSetProber.h" 42 | 43 | nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen) 44 | { 45 | unsigned char order; 46 | 47 | for (PRUint32 i = 0; i < aLen; i++) 48 | { 49 | order = mModel->charToOrderMap[(unsigned char)aBuf[i]]; 50 | 51 | if (order < SYMBOL_CAT_ORDER) 52 | mTotalChar++; 53 | else if (order == ILL) 54 | { 55 | mState = eNotMe; 56 | break; 57 | } 58 | else if (order == CTR) 59 | mCtrlChar++; 60 | 61 | if (order < mModel->freqCharCount) 62 | { 63 | mFreqChar++; 64 | 65 | if (mLastOrder < mModel->freqCharCount) 66 | { 67 | mTotalSeqs++; 68 | if (!mReversed) 69 | ++(mSeqCounters[(int)mModel->precedenceMatrix[mLastOrder*mModel->freqCharCount+order]]); 70 | else // reverse the order of the letters in the lookup 71 | ++(mSeqCounters[(int)mModel->precedenceMatrix[order*mModel->freqCharCount+mLastOrder]]); 72 | } 73 | } 74 | mLastOrder = order; 75 | } 76 | 77 | if (mState == eDetecting) 78 | if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD) 79 | { 80 | float cf = GetConfidence(); 81 | if (cf > POSITIVE_SHORTCUT_THRESHOLD) 82 | mState = eFoundIt; 83 | else if (cf < NEGATIVE_SHORTCUT_THRESHOLD) 84 | mState = eNotMe; 85 | } 86 | 87 | return mState; 88 | } 89 | 90 | void nsSingleByteCharSetProber::Reset(void) 91 | { 92 | mState = eDetecting; 93 | mLastOrder = 255; 94 | for (PRUint32 i = 0; i < NUMBER_OF_SEQ_CAT; i++) 95 | mSeqCounters[i] = 0; 96 | mTotalSeqs = 0; 97 | mTotalChar = 0; 98 | mCtrlChar = 0; 99 | mFreqChar = 0; 100 | } 101 | 102 | //#define NEGATIVE_APPROACH 1 103 | 104 | float nsSingleByteCharSetProber::GetConfidence(void) 105 | { 106 | #ifdef NEGATIVE_APPROACH 107 | if (mTotalSeqs > 0) 108 | if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 ) 109 | return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar; 110 | return (float)0.01; 111 | #else //POSITIVE_APPROACH 112 | float r; 113 | 114 | if (mTotalSeqs > 0) { 
115 | r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio; 116 | r = r * (mSeqCounters[POSITIVE_CAT] + (float) mSeqCounters[PROBABLE_CAT] / 4) / mTotalChar; 117 | r = r * (mTotalChar-mCtrlChar) / mTotalChar; 118 | r = r * mFreqChar / mTotalChar; 119 | if (r >= (float)1.00) 120 | r = (float)0.99; 121 | return r; 122 | } 123 | return (float)0.01; 124 | #endif 125 | } 126 | 127 | const char* nsSingleByteCharSetProber::GetCharSetName() 128 | { 129 | if (!mNameProber) 130 | return mModel->charsetName; 131 | return mNameProber->GetCharSetName(); 132 | } 133 | 134 | #ifdef DEBUG_chardet 135 | void nsSingleByteCharSetProber::DumpStatus() 136 | { 137 | printf(" SBCS: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); 138 | } 139 | #endif 140 | -------------------------------------------------------------------------------- /src/nsSBCharSetProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is Mozilla Universal charset detector code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 2001 22 | * the Initial Developer. All Rights Reserved. 
23 | * 24 | * Contributor(s): 25 | * Shy Shalom 26 | * 27 | * Alternatively, the contents of this file may be used under the terms of 28 | * either the GNU General Public License Version 2 or later (the "GPL"), or 29 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 30 | * in which case the provisions of the GPL or the LGPL are applicable instead 31 | * of those above. If you wish to allow use of your version of this file only 32 | * under the terms of either the GPL or the LGPL, and not to allow others to 33 | * use your version of this file under the terms of the MPL, indicate your 34 | * decision by deleting the provisions above and replace them with the notice 35 | * and other provisions required by the GPL or the LGPL. If you do not delete 36 | * the provisions above, a recipient may use your version of this file under 37 | * the terms of any one of the MPL, the GPL or the LGPL. 38 | * 39 | * ***** END LICENSE BLOCK ***** */ 40 | #ifndef nsSingleByteCharSetProber_h__ 41 | #define nsSingleByteCharSetProber_h__ 42 | 43 | #include "nsCharSetProber.h" 44 | 45 | /** Codepoints **/ 46 | 47 | /* Illegal codepoints.*/ 48 | #define ILL 255 49 | /* Control character. */ 50 | #define CTR 254 51 | /* Symbols and punctuation that does not belong to words. */ 52 | #define SYM 253 53 | /* Return/Line feeds. */ 54 | #define RET 252 55 | /* Numbers 0-9. 
*/ 56 | #define NUM 251 57 | 58 | #define SB_ENOUGH_REL_THRESHOLD 1024 59 | #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 60 | #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 61 | #define SYMBOL_CAT_ORDER 250 62 | 63 | #define NUMBER_OF_SEQ_CAT 4 64 | #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1) 65 | #define PROBABLE_CAT (NUMBER_OF_SEQ_CAT-2) 66 | #define NEUTRAL_CAT (NUMBER_OF_SEQ_CAT-3) 67 | #define NEGATIVE_CAT 0 68 | 69 | typedef struct 70 | { 71 | unsigned char *charToOrderMap; // [256] table use to find a char's order 72 | char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency 73 | int freqCharCount; // The count of frequent characters. 74 | float mTypicalPositiveRatio; // = freqSeqs / totalSeqs 75 | PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) 76 | const char* charsetName; 77 | } SequenceModel; 78 | 79 | 80 | class nsSingleByteCharSetProber : public nsCharSetProber{ 81 | public: 82 | nsSingleByteCharSetProber(SequenceModel *model) 83 | :mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); } 84 | nsSingleByteCharSetProber(SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber) 85 | :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } 86 | 87 | virtual const char* GetCharSetName(); 88 | virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 89 | virtual nsProbingState GetState(void) {return mState;}; 90 | virtual void Reset(void); 91 | virtual float GetConfidence(void); 92 | virtual void SetOpion() {}; 93 | 94 | // This feature is not implemented yet. any current language model 95 | // contain this parameter as PR_FALSE. No one is looking at this 96 | // parameter or calling this method. 97 | // Moreover, the nsSBCSGroupProber which calls the HandleData of this 98 | // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid 99 | // of the English letters. 
100 | PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;}; // (not implemented) 101 | 102 | #ifdef DEBUG_chardet 103 | virtual void DumpStatus(); 104 | #endif 105 | 106 | protected: 107 | nsProbingState mState; 108 | const SequenceModel *mModel; 109 | const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup 110 | 111 | //char order of last character 112 | unsigned char mLastOrder; 113 | 114 | PRUint32 mTotalSeqs; 115 | PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT]; 116 | 117 | PRUint32 mTotalChar; 118 | PRUint32 mCtrlChar; 119 | //characters that fall in our sampling range 120 | PRUint32 mFreqChar; 121 | 122 | // Optional auxiliary prober for name decision. created and destroyed by the GroupProber 123 | nsCharSetProber* mNameProber; 124 | 125 | }; 126 | 127 | extern SequenceModel Koi8r_CyrillicModel; 128 | extern SequenceModel Win1251_CyrillicModel; 129 | extern SequenceModel Latin5_CyrillicModel; 130 | extern SequenceModel MacCyrillicModel; 131 | extern SequenceModel Ibm866_CyrillicModel; 132 | extern SequenceModel Ibm855_CyrillicModel; 133 | extern SequenceModel Latin7_GreekModel; 134 | extern SequenceModel Win1253_GreekModel; 135 | extern SequenceModel Latin5_BulgarianModel; 136 | extern SequenceModel Win1251_BulgarianModel; 137 | extern SequenceModel Latin2_HungarianModel; 138 | extern SequenceModel Win1250_HungarianModel; 139 | extern SequenceModel TIS620_ThaiModel; 140 | extern SequenceModel Latin11_ThaiModel; 141 | extern SequenceModel Win1255_HebrewModel; 142 | extern SequenceModel Latin6_ArabicModel; 143 | extern SequenceModel Windows1256_ArabicModel; 144 | extern SequenceModel Latin1_DanishModel; 145 | extern SequenceModel Latin15_DanishModel; 146 | extern SequenceModel Windows1252_DanishModel; 147 | extern SequenceModel Latin3_EsperantoModel; 148 | extern SequenceModel Latin1_FrenchModel; 149 | extern SequenceModel Latin15_FrenchModel; 150 | extern SequenceModel Windows1252_FrenchModel; 151 | extern SequenceModel 
Latin1_GermanModel; 152 | extern SequenceModel Windows1252_GermanModel; 153 | extern SequenceModel Latin1_SpanishModel; 154 | extern SequenceModel Latin15_SpanishModel; 155 | extern SequenceModel Windows1252_SpanishModel; 156 | extern SequenceModel Latin3_TurkishModel; 157 | extern SequenceModel Latin9_TurkishModel; 158 | extern SequenceModel Viscii_VietnameseModel; 159 | extern SequenceModel Windows1258_VietnameseModel; 160 | 161 | #endif /* nsSingleByteCharSetProber_h__ */ 162 | 163 | -------------------------------------------------------------------------------- /src/nsSJISProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 
23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | // for S-JIS encoding, obeserve characteristic: 41 | // 1, kana character (or hankaku?) 
often have hight frequency of appereance 42 | // 2, kana character often exist in group 43 | // 3, certain combination of kana is never used in japanese language 44 | 45 | #include "nsSJISProber.h" 46 | 47 | void nsSJISProber::Reset(void) 48 | { 49 | mCodingSM->Reset(); 50 | mState = eDetecting; 51 | mContextAnalyser.Reset(mIsPreferredLanguage); 52 | mDistributionAnalyser.Reset(mIsPreferredLanguage); 53 | } 54 | 55 | nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen) 56 | { 57 | nsSMState codingState; 58 | 59 | for (PRUint32 i = 0; i < aLen; i++) 60 | { 61 | codingState = mCodingSM->NextState(aBuf[i]); 62 | if (codingState == eItsMe) 63 | { 64 | mState = eFoundIt; 65 | break; 66 | } 67 | if (codingState == eStart) 68 | { 69 | PRUint32 charLen = mCodingSM->GetCurrentCharLen(); 70 | if (i == 0) 71 | { 72 | mLastChar[1] = aBuf[0]; 73 | mContextAnalyser.HandleOneChar(mLastChar+2-charLen, charLen); 74 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 75 | } 76 | else 77 | { 78 | mContextAnalyser.HandleOneChar(aBuf+i+1-charLen, charLen); 79 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 80 | } 81 | } 82 | } 83 | 84 | mLastChar[0] = aBuf[aLen-1]; 85 | 86 | if (mState == eDetecting) 87 | if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 88 | mState = eFoundIt; 89 | 90 | return mState; 91 | } 92 | 93 | float nsSJISProber::GetConfidence(void) 94 | { 95 | float contxtCf = mContextAnalyser.GetConfidence(); 96 | float distribCf = mDistributionAnalyser.GetConfidence(); 97 | 98 | return (contxtCf > distribCf ? 
contxtCf : distribCf); 99 | } 100 | 101 | -------------------------------------------------------------------------------- /src/nsSJISProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. 
If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | // for S-JIS encoding, obeserve characteristic: 41 | // 1, kana character (or hankaku?) often have hight frequency of appereance 42 | // 2, kana character often exist in group 43 | // 3, certain combination of kana is never used in japanese language 44 | 45 | #ifndef nsSJISProber_h__ 46 | #define nsSJISProber_h__ 47 | 48 | #include "nsCharSetProber.h" 49 | #include "nsCodingStateMachine.h" 50 | #include "JpCntx.h" 51 | #include "CharDistribution.h" 52 | 53 | 54 | class nsSJISProber: public nsCharSetProber { 55 | public: 56 | nsSJISProber(PRBool aIsPreferredLanguage) 57 | :mIsPreferredLanguage(aIsPreferredLanguage) 58 | {mCodingSM = new nsCodingStateMachine(&SJISSMModel); 59 | Reset();} 60 | virtual ~nsSJISProber(void){delete mCodingSM;}; 61 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 62 | const char* GetCharSetName() {return "Shift_JIS";}; 63 | nsProbingState GetState(void) {return mState;}; 64 | void Reset(void); 65 | float GetConfidence(void); 66 | void SetOpion() {}; 67 | 68 | protected: 69 | nsCodingStateMachine* mCodingSM; 70 | nsProbingState mState; 71 | 72 | SJISContextAnalysis mContextAnalyser; 73 | SJISDistributionAnalysis mDistributionAnalyser; 74 | 75 | char mLastChar[2]; 76 | PRBool mIsPreferredLanguage; 77 | }; 78 | 79 | 80 | #endif /* nsSJISProber_h__ */ 81 | 82 | -------------------------------------------------------------------------------- /src/nsUTF8Prober.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public 
License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #include "nsUTF8Prober.h" 41 | 42 | void nsUTF8Prober::Reset(void) 43 | { 44 | mCodingSM->Reset(); 45 | mNumOfMBChar = 0; 46 | mState = eDetecting; 47 | } 48 | 49 | nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen) 50 | { 51 | nsSMState codingState; 52 | 53 | for (PRUint32 i = 0; i < aLen; i++) 54 | { 55 | codingState = mCodingSM->NextState(aBuf[i]); 56 | if (codingState == eItsMe) 57 | { 58 | mState = eFoundIt; 59 | break; 60 | } 61 | if (codingState == eStart) 62 | { 63 | if (mCodingSM->GetCurrentCharLen() >= 2) 64 | mNumOfMBChar++; 65 | } 66 | } 67 | 68 | if (mState == eDetecting) 69 | if (GetConfidence() > SHORTCUT_THRESHOLD) 70 | mState = eFoundIt; 71 | return mState; 72 | } 73 | 74 | #define ONE_CHAR_PROB (float)0.50 75 | 76 | float nsUTF8Prober::GetConfidence(void) 77 | { 78 | float unlike = (float)0.99; 79 | 80 | if (mNumOfMBChar < 6) 81 | { 82 | for (PRUint32 i = 0; i < mNumOfMBChar; i++) 83 | unlike *= ONE_CHAR_PROB; 84 | return (float)1.0 - unlike; 85 | } 86 | else 87 | return (float)0.99; 88 | } 89 | 90 | -------------------------------------------------------------------------------- /src/nsUTF8Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is mozilla.org code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | #ifndef nsUTF8Prober_h__ 41 | #define nsUTF8Prober_h__ 42 | 43 | #include "nsCharSetProber.h" 44 | #include "nsCodingStateMachine.h" 45 | 46 | class nsUTF8Prober: public nsCharSetProber { 47 | public: 48 | nsUTF8Prober(){mNumOfMBChar = 0; 49 | mCodingSM = new nsCodingStateMachine(&UTF8SMModel); 50 | Reset(); }; 51 | virtual ~nsUTF8Prober(){delete mCodingSM;}; 52 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 53 | const char* GetCharSetName() {return "UTF-8";}; 54 | nsProbingState GetState(void) {return mState;}; 55 | void Reset(void); 56 | float GetConfidence(void); 57 | void SetOpion() {}; 58 | 59 | protected: 60 | nsCodingStateMachine* mCodingSM; 61 | nsProbingState mState; 62 | PRUint32 mNumOfMBChar; 63 | }; 64 | 65 | #endif /* nsUTF8Prober_h__ */ 66 | 67 | -------------------------------------------------------------------------------- /src/nsUniversalDetector.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 | * vim: et sw=2 ts=2 fdm=marker 3 | */ 4 | /* ***** BEGIN LICENSE BLOCK ***** 5 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is Mozilla Communicator client code. 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Netscape Communications Corporation. 
21 | * Portions created by the Initial Developer are Copyright (C) 1998 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * JoungKyun.Kim 26 | * - Add mDetectedConfidence 27 | * - Add mDetectedIsBOM 28 | * 29 | * Alternatively, the contents of this file may be used under the terms of 30 | * either the GNU General Public License Version 2 or later (the "GPL"), or 31 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 32 | * in which case the provisions of the GPL or the LGPL are applicable instead 33 | * of those above. If you wish to allow use of your version of this file only 34 | * under the terms of either the GPL or the LGPL, and not to allow others to 35 | * use your version of this file under the terms of the MPL, indicate your 36 | * decision by deleting the provisions above and replace them with the notice 37 | * and other provisions required by the GPL or the LGPL. If you do not delete 38 | * the provisions above, a recipient may use your version of this file under 39 | * the terms of any one of the MPL, the GPL or the LGPL. 
40 | * 41 | * ***** END LICENSE BLOCK ***** */ 42 | 43 | #ifndef nsUniversalDetector_h__ 44 | #define nsUniversalDetector_h__ 45 | 46 | class nsCharSetProber; 47 | 48 | #define NUM_OF_CHARSET_PROBERS 3 49 | 50 | typedef enum { 51 | ePureAscii = 0, 52 | eEscAscii = 1, 53 | eHighbyte = 2 54 | } nsInputState; 55 | 56 | #define NS_FILTER_CHINESE_SIMPLIFIED 0x01 57 | #define NS_FILTER_CHINESE_TRADITIONAL 0x02 58 | #define NS_FILTER_JAPANESE 0x04 59 | #define NS_FILTER_KOREAN 0x08 60 | #define NS_FILTER_NON_CJK 0x10 61 | #define NS_FILTER_ALL 0x1F 62 | #define NS_FILTER_CHINESE (NS_FILTER_CHINESE_SIMPLIFIED | \ 63 | NS_FILTER_CHINESE_TRADITIONAL) 64 | #define NS_FILTER_CJK (NS_FILTER_CHINESE_SIMPLIFIED | \ 65 | NS_FILTER_CHINESE_TRADITIONAL | \ 66 | NS_FILTER_JAPANESE | \ 67 | NS_FILTER_KOREAN) 68 | 69 | class nsUniversalDetector { 70 | public: 71 | nsUniversalDetector(PRUint32 aLanguageFilter); 72 | virtual ~nsUniversalDetector(); 73 | virtual nsresult HandleData(const char* aBuf, PRUint32 aLen); 74 | virtual void DataEnd(void); 75 | 76 | protected: 77 | virtual void Report(const char* aCharset) = 0; 78 | virtual void Reset(); 79 | nsInputState mInputState; 80 | PRBool mNbspFound; 81 | PRBool mDone; 82 | PRBool mInTag; 83 | PRBool mStart; 84 | PRBool mGotData; 85 | char mLastChar; 86 | const char * mDetectedCharset; 87 | float mDetectedConfidence; 88 | short mDetectedIsBOM; 89 | PRInt32 mBestGuess; 90 | PRUint32 mLanguageFilter; 91 | 92 | nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS]; 93 | nsCharSetProber *mEscCharSetProber; 94 | }; 95 | 96 | #endif 97 | 98 | -------------------------------------------------------------------------------- /src/tables/LangEsperantoModel.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are 
subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Communicator client code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * Jehan 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 
36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | #include "nsSBCharSetProber.h" 40 | 41 | /********* Language model for: Esperanto *********/ 42 | 43 | /** 44 | * Generated by BuildLangModel.py 45 | * On: 2016-05-04 10:45:46.721306 46 | **/ 47 | 48 | /* Character Mapping Table: 49 | * ILL: illegal character. 50 | * CTR: control character specific to the charset. 51 | * RET: carriage/return. 52 | * SYM: symbol (punctuation) that does not belong to word. 53 | * NUM: 0 - 9. 54 | * 55 | * Other characters are ordered by probabilities 56 | * (0 is the most common character in the language). 57 | * 58 | * Orders are generic to a language. So the codepoint with order X in 59 | * CHARSET1 maps to the same character as the codepoint with the same 60 | * order X in CHARSET2 for the same language. 61 | * As such, it is possible to get missing order. For instance the 62 | * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 63 | * even though they are both used for French. Same for the euro sign. 
64 | */ 65 | static unsigned char Latin3CharToOrderMap[] = 66 | { 67 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 69 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 70 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 71 | SYM, 0, 19, 17, 10, 2, 18, 15, 20, 3, 11, 9, 6, 13, 4, 1, /* 4X */ 72 | 14, 34, 5, 8, 7, 12, 16, 26, 32, 25, 21,SYM,SYM,SYM,SYM,SYM, /* 5X */ 73 | SYM, 0, 19, 17, 10, 2, 18, 15, 20, 3, 11, 9, 6, 13, 4, 1, /* 6X */ 74 | 14, 34, 5, 8, 7, 12, 16, 26, 32, 25, 21,SYM,SYM,SYM,SYM,CTR, /* 7X */ 75 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ 76 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ 77 | SYM, 55,SYM,SYM,SYM,ILL, 30,SYM,SYM, 56, 53, 57, 28,SYM,ILL, 43, /* AX */ 78 | SYM, 58,SYM,SYM,SYM,SYM, 30,SYM,SYM, 59, 53, 60, 28,SYM,ILL, 43, /* BX */ 79 | 48, 29, 45,ILL, 39, 61, 24, 42, 46, 31, 47, 50, 54, 37, 44, 51, /* CX */ 80 | ILL, 40, 62, 33, 49, 52, 35,SYM, 22, 63, 38, 64, 36, 23, 27, 41, /* DX */ 81 | 48, 29, 45,ILL, 39, 65, 24, 42, 46, 31, 47, 50, 54, 37, 44, 51, /* EX */ 82 | ILL, 40, 66, 33, 49, 52, 35,SYM, 22, 67, 38, 68, 36, 23, 27,SYM, /* FX */ 83 | }; 84 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 85 | 86 | 87 | /* Model Table: 88 | * Total sequences: 966 89 | * First 512 sequences: 0.9949447894488864 90 | * Next 512 sequences (512-1024): 0.005055210551113598 91 | * Rest: -2.2551405187698492e-17 92 | * Negative sequences: TODO 93 | */ 94 | static char EsperantoLangModel[] = 95 | { 96 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3, 97 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,0,2, 98 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3, 99 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,3,0,2,3, 100 | 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,2,2,2,2,2, 101 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,3,2, 102 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,3,2,2,0,3,2, 103 | 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,0,0,2,3,2,2,0,2,0, 104 | 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,0,0,2,3,2,0,0,3,0, 105 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,3,0,2,2,2,2,2,0,3,0, 106 | 3,3,3,3,2,3,3,3,3,2,3,2,3,3,3,3,3,2,2,2,3,3,2,0,2,3,3,0,2,2,0, 107 | 3,3,3,3,3,3,3,3,2,2,2,0,3,3,3,0,2,2,2,3,2,2,2,0,2,2,0,2,0,3,0, 108 | 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,3,2,2,3,0,0,2, 109 | 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,2,2,3,3,0,2,0,0,2,3,0,2,0,3,0, 110 | 3,3,3,3,2,3,3,3,3,2,2,2,3,2,3,0,0,2,2,2,3,2,0,0,2,2,0,2,0,3,0, 111 | 3,3,3,3,3,3,3,2,3,2,3,2,3,3,2,2,3,2,2,2,3,0,0,0,0,3,2,2,0,3,0, 112 | 3,3,3,3,2,3,2,2,2,2,3,3,3,0,2,2,0,2,2,2,2,2,0,0,0,2,0,0,0,3,0, 113 | 3,3,3,3,2,3,3,3,3,3,2,0,3,2,2,2,2,3,2,3,3,3,0,0,0,2,0,0,0,2,2, 114 | 3,3,3,3,2,3,3,3,2,2,2,2,3,3,2,3,2,2,3,2,2,0,0,0,0,2,2,0,0,2,0, 115 | 3,3,3,3,3,3,3,3,3,2,3,3,3,2,2,2,2,2,0,2,2,2,0,0,0,3,0,3,0,2,0, 116 | 3,3,3,3,3,3,3,3,2,0,0,2,3,3,2,2,0,2,2,2,2,0,0,0,0,2,2,0,0,3,0, 117 | 3,3,3,3,3,2,3,3,3,2,2,2,3,2,2,2,2,2,2,3,0,3,2,0,0,3,2,0,0,2,0, 118 | 3,3,3,3,0,0,3,2,0,2,2,2,3,2,2,3,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0, 119 | 3,2,3,3,2,3,3,3,3,3,3,2,2,2,2,3,2,0,2,2,0,3,0,0,2,0,0,2,0,0,0, 120 | 3,3,3,3,0,2,2,0,0,2,2,2,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 121 | 3,3,3,3,3,2,3,2,3,2,3,2,2,2,2,2,2,2,0,2,2,0,0,0,0,2,2,0,0,2,0, 122 | 3,3,3,3,3,2,2,2,3,0,0,0,0,2,0,2,0,2,0,0,2,2,0,0,0,2,3,0,0,0,0, 123 | 3,3,3,3,2,2,3,3,0,2,2,0,3,2,2,0,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0, 124 | 3,3,2,2,0,2,0,0,0,2,2,0,3,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0, 125 | 0,0,0,0,3,3,3,3,3,2,3,0,0,2,2,2,2,3,0,2,2,3,0,0,0,0,0,0,0,0,0, 126 | 3,3,3,3,2,2,0,2,2,2,2,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0, 127 | }; 128 | 129 | 130 | SequenceModel Latin3_EsperantoModel = 131 | { 132 | Latin3CharToOrderMap, 133 | EsperantoLangModel, 134 | 31, 135 | 
(float)0.9949447894488864, 136 | PR_FALSE, 137 | "ISO-8859-3" 138 | }; 139 | -------------------------------------------------------------------------------- /test/Makefile.am: -------------------------------------------------------------------------------- 1 | # Version: MPL 1.1/GPL 2.0/LGPL 2.1 2 | # 3 | # The contents of this file are subject to the Mozilla Public License Version 4 | # 1.1 (the "License"); you may not use this file except in compliance with 5 | # the License. You may obtain a copy of the License at 6 | # http://www.mozilla.org/MPL/ 7 | # 8 | # Software distributed under the License is distributed on an "AS IS" basis, 9 | # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 10 | # for the specific language governing rights and limitations under the 11 | # License. 12 | # 13 | # Mozilla's universal charset detector C/C++ Wrapping API 14 | # Writer(s) : 15 | # Detect class by John Gardiner Myers 16 | # C wrapping API by JoungKyun.Kim 17 | # 18 | # Alternatively, the contents of this file may be used under the terms of 19 | # either the GNU General Public License Version 2 or later (the "GPL"), or 20 | # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 21 | # in which case the provisions of the GPL or the LGPL are applicable instead 22 | # of those above. If you wish to allow use of your version of this file only 23 | # under the terms of either the GPL or the LGPL, and not to allow others to 24 | # use your version of this file under the terms of the MPL, indicate your 25 | # decision by deleting the provisions above and replace them with the notice 26 | # and other provisions required by the GPL or the LGPL. If you do not delete 27 | # the provisions above, a recipient may use your version of this file under 28 | # the terms of any one of the MPL, the GPL or the LGPL. 
29 | 30 | AM_CFLAGS = -Wall 31 | AM_CPPFLAGS = -I$(top_srcdir)/src -I$(top_srcdir)/include 32 | LDADD = $(top_srcdir)/src/libchardet.la 33 | 34 | ctests = bom-test sample sample1 35 | TESTS = $(ctests) 36 | check_PROGRAMS = $(ctests) 37 | TESTS_ENVIRONMENT = $(VALGRIND) 38 | 39 | SAMPLESOURCES = bom-test sample sample1 40 | 41 | byhand: 42 | for bins in $(SAMPLESOURCES); \ 43 | do \ 44 | $(CC) -I$(top_srcdir)/src -I$(top_srcdir)/include -L$(top_srcdir)/src/.libs -o $$bins $$bins.c -lchardet; \ 45 | LD_LIBRARY_PATH=$(top_srcdir)/src/.libs ./$$bins 1; \ 46 | done 47 | 48 | CLEANFILES = $(SAMPLESOURCES) 49 | 50 | -------------------------------------------------------------------------------- /test/bom-test.c: --------------------------------------------------------------------------------
/*
 * sample code with libchardet
 * author: JoungKyun.Kim
 */
/*
 * NOTE(review): the names of the three angle-bracket headers originally
 * included here were not recoverable from the copy under review.
 * <chardet.h>, <sys/types.h> and <sys/stat.h> are assumed from the
 * identifiers used below -- confirm against the repository.  The
 * <stdio.h>/<stdlib.h>/<string.h> includes are listed explicitly so the test
 * does not depend on chardet.h pulling them in.
 */
#include <chardet.h>
//#include "../src/chardet.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>

#ifdef CHARDET_BINARY_SAFE
	#define detect_api(x,y) detect_r(x, strlen(x), y)
#else
	#define detect_api(x,y) detect(x, y)
#endif

/* Length in bytes of the UTF-8 byte order mark (EF BB BF). */
#define UTF8_BOM_LEN 3

/*
 * Runs the detector over "utf-8-bom.txt" and "utf-8.txt" (looked up in the
 * current directory).
 *
 * With any command line argument the result for each file is printed.
 * Without arguments the run is a self-check: it fails unless the detected
 * encoding is "UTF-8", the confidence is at least 0.6 and the BOM flag is set
 * for the first file only.
 *
 * Returns 0 on success, 1 when a check fails or a file that exists cannot be
 * opened, and CHARDET_OUT_OF_MEMORY when an allocation fails.  A file that
 * cannot be stat()ed is reported and skipped, as before.
 */
int main (int argc, char **argv) {
	DetectObj *obj;
	FILE *fp;
	struct stat sb;
	char *buf;
	size_t n, i;
	int expect_bom, ret = 0;
	char *f[2] = { "utf-8-bom.txt", "utf-8.txt" };

	for ( i=0; i<2; i++ ) {
		if ( stat (f[i], &sb) != 0 ) {
			fprintf (stderr, "file %s not found.", f[i]);
			continue;
		}

		/* the 4 spare bytes stay zero, so buf is always NUL terminated */
		buf = malloc (sizeof (char) * (sb.st_size + 4));
		if ( buf == NULL ) {
			fprintf (stderr, "memory allocation failed for %s\n", f[i]);
			return CHARDET_OUT_OF_MEMORY;
		}
		memset (buf, 0, sb.st_size + 4);

		fp = fopen (f[i], "r");
		if ( fp == NULL ) {
			fprintf (stderr, "file %s could not be opened.\n", f[i]);
			free (buf);
			return 1;
		}
		n = fread (buf, sizeof (char), sb.st_size, fp);
		fclose (fp);

		obj = detect_obj_init ();
		if ( obj == NULL ) {
			fprintf (stderr, "On attemped detector, memory allocation failed\n");
			free (buf);
			return CHARDET_OUT_OF_MEMORY;
		}

		if ( detect_api (buf, &obj) == CHARDET_OUT_OF_MEMORY )
		{
			fprintf (stderr, "On handle processing, occured out of memory\n");
			/* NOTE(review): obj is not released here because the state the
			 * detector leaves it in on this error is not visible from this
			 * file -- confirm whether detect_obj_free() is safe to call. */
			free (buf);
			return CHARDET_OUT_OF_MEMORY;
		}

		/* drop one trailing newline; n is 0 for an empty file */
		if ( n > 0 && buf[n-1] == '\n' ) {
			buf[n-1] = '\0';
			n--;
		}

		/* strip the BOM so that only the text itself is printed */
		if ( obj->bom == 1 && n >= UTF8_BOM_LEN ) {
			memmove (buf, buf + UTF8_BOM_LEN, n - UTF8_BOM_LEN);
			n -= UTF8_BOM_LEN;
			buf[n] = '\0';
		}

		if ( argc > 1 ) {
			printf ("## Charset: %s, Confidence: %f, BOM: %d => %s\n",
				obj->encoding ? obj->encoding : "(null)",
				obj->confidence, obj->bom, buf);
		} else {
			/* only the first file carries a BOM */
			expect_bom = i ? 0 : 1;
			/* any single mismatch is a failure */
			if ( obj->encoding == NULL
				|| strcmp (obj->encoding, "UTF-8") != 0
				|| obj->confidence < 0.6
				|| expect_bom != obj->bom ) {
				ret = 1;
			}
		}

		free (buf);
		detect_obj_free (&obj);

		if ( ret != 0 )
			return 1;
	}
	return 0;
}
-------------------------------------------------------------------------------- /test/sample.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joungkyun/libchardet/dcdcdfc6a207eb98196b277d72c967beb36cb250/test/sample.c -------------------------------------------------------------------------------- /test/sample1.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joungkyun/libchardet/dcdcdfc6a207eb98196b277d72c967beb36cb250/test/sample1.c -------------------------------------------------------------------------------- /test/utf-8-bom.txt: -------------------------------------------------------------------------------- 1 | utf-8 bom 테스트 입니다. bom 을 잘 발견할 수 있을까요? 2 | -------------------------------------------------------------------------------- /test/utf-8.txt: -------------------------------------------------------------------------------- 1 | utf-8 bom 테스트 입니다. bom 이 없는데 구분을 잘 할까요? 2 | -------------------------------------------------------------------------------- /tools/ar-lib: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | # Wrapper for Microsoft lib.exe 3 | 4 | me=ar-lib 5 | scriptversion=2012-03-01.08; # UTC 6 | 7 | # Copyright (C) 2010-2018 Free Software Foundation, Inc. 8 | # Written by Peter Rosin . 9 | # 10 | # This program is free software; you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation; either version 2, or (at your option) 13 | # any later version. 14 | # 15 | # This program is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | # 20 | # You should have received a copy of the GNU General Public License 21 | # along with this program. If not, see . 22 | 23 | # As a special exception to the GNU General Public License, if you 24 | # distribute this file as part of a program that contains a 25 | # configuration script generated by Autoconf, you may include it under 26 | # the same distribution terms that you use for the rest of that program. 27 | 28 | # This file is maintained in Automake, please report 29 | # bugs to or send patches to 30 | # . 31 | 32 | 33 | # func_error message 34 | func_error () 35 | { 36 | echo "$me: $1" 1>&2 37 | exit 1 38 | } 39 | 40 | file_conv= 41 | 42 | # func_file_conv build_file 43 | # Convert a $build file to $host form and store it in $file 44 | # Currently only supports Windows hosts. 
45 | func_file_conv () 46 | { 47 | file=$1 48 | case $file in 49 | / | /[!/]*) # absolute file, and not a UNC file 50 | if test -z "$file_conv"; then 51 | # lazily determine how to convert abs files 52 | case `uname -s` in 53 | MINGW*) 54 | file_conv=mingw 55 | ;; 56 | CYGWIN*) 57 | file_conv=cygwin 58 | ;; 59 | *) 60 | file_conv=wine 61 | ;; 62 | esac 63 | fi 64 | case $file_conv in 65 | mingw) 66 | file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` 67 | ;; 68 | cygwin) 69 | file=`cygpath -m "$file" || echo "$file"` 70 | ;; 71 | wine) 72 | file=`winepath -w "$file" || echo "$file"` 73 | ;; 74 | esac 75 | ;; 76 | esac 77 | } 78 | 79 | # func_at_file at_file operation archive 80 | # Iterate over all members in AT_FILE performing OPERATION on ARCHIVE 81 | # for each of them. 82 | # When interpreting the content of the @FILE, do NOT use func_file_conv, 83 | # since the user would need to supply preconverted file names to 84 | # binutils ar, at least for MinGW. 85 | func_at_file () 86 | { 87 | operation=$2 88 | archive=$3 89 | at_file_contents=`cat "$1"` 90 | eval set x "$at_file_contents" 91 | shift 92 | 93 | for member 94 | do 95 | $AR -NOLOGO $operation:"$member" "$archive" || exit $? 96 | done 97 | } 98 | 99 | case $1 in 100 | '') 101 | func_error "no command. Try '$0 --help' for more information." 102 | ;; 103 | -h | --h*) 104 | cat <. 20 | 21 | # As a special exception to the GNU General Public License, if you 22 | # distribute this file as part of a program that contains a 23 | # configuration script generated by Autoconf, you may include it under 24 | # the same distribution terms that you use for the rest of that program. 25 | 26 | # This file is maintained in Automake, please report 27 | # bugs to or send patches to 28 | # . 29 | 30 | # Make unconditional expansion of undefined variables an error. This 31 | # helps a lot in preventing typo-related bugs. 
32 | set -u 33 | 34 | usage_error () 35 | { 36 | echo "$0: $*" >&2 37 | print_usage >&2 38 | exit 2 39 | } 40 | 41 | print_usage () 42 | { 43 | cat <$log_file 2>&1 108 | estatus=$? 109 | 110 | if test $enable_hard_errors = no && test $estatus -eq 99; then 111 | tweaked_estatus=1 112 | else 113 | tweaked_estatus=$estatus 114 | fi 115 | 116 | case $tweaked_estatus:$expect_failure in 117 | 0:yes) col=$red res=XPASS recheck=yes gcopy=yes;; 118 | 0:*) col=$grn res=PASS recheck=no gcopy=no;; 119 | 77:*) col=$blu res=SKIP recheck=no gcopy=yes;; 120 | 99:*) col=$mgn res=ERROR recheck=yes gcopy=yes;; 121 | *:yes) col=$lgn res=XFAIL recheck=no gcopy=yes;; 122 | *:*) col=$red res=FAIL recheck=yes gcopy=yes;; 123 | esac 124 | 125 | # Report the test outcome and exit status in the logs, so that one can 126 | # know whether the test passed or failed simply by looking at the '.log' 127 | # file, without the need of also peaking into the corresponding '.trs' 128 | # file (automake bug#11814). 129 | echo "$res $test_name (exit status: $estatus)" >>$log_file 130 | 131 | # Report outcome to console. 132 | echo "${col}${res}${std}: $test_name" 133 | 134 | # Register the test result, and other relevant metadata. 135 | echo ":test-result: $res" > $trs_file 136 | echo ":global-test-result: $res" >> $trs_file 137 | echo ":recheck: $recheck" >> $trs_file 138 | echo ":copy-in-global-log: $gcopy" >> $trs_file 139 | 140 | # Local Variables: 141 | # mode: shell-script 142 | # sh-indentation: 2 143 | # eval: (add-hook 'before-save-hook 'time-stamp) 144 | # time-stamp-start: "scriptversion=" 145 | # time-stamp-format: "%:y-%02m-%02d.%02H" 146 | # time-stamp-time-zone: "UTC0" 147 | # time-stamp-end: "; # UTC" 148 | # End: 149 | --------------------------------------------------------------------------------