├── .github └── workflows │ └── codeql-analysis.yml ├── DetectorTest.m ├── English.lproj └── InfoPlist.strings ├── Info.plist ├── LICENSE ├── Makefile.common ├── Makefile.freebsd ├── Makefile.legacy ├── Makefile.linux ├── Makefile.windows ├── README.md ├── UniversalDetector.h ├── UniversalDetector.m ├── UniversalDetector.xcodeproj └── project.pbxproj ├── UniversalDetector_Prefix.pch ├── WrappedUniversalDetector.cpp ├── WrappedUniversalDetector.h ├── scan.pl └── universalchardet ├── .cvsignore ├── Big5Freq.tab ├── CharDistribution.cpp ├── CharDistribution.h ├── EUCKRFreq.tab ├── EUCTWFreq.tab ├── GB2312Freq.tab ├── JISFreq.tab ├── JpCntx.cpp ├── JpCntx.h ├── LangBulgarianModel.cpp ├── LangCyrillicModel.cpp ├── LangGreekModel.cpp ├── LangHebrewModel.cpp ├── LangHungarianModel.cpp ├── LangThaiModel.cpp ├── kludge.c ├── nsBig5Prober.cpp ├── nsBig5Prober.h ├── nsCharSetProber.cpp ├── nsCharSetProber.h ├── nsCodingStateMachine.h ├── nsEUCJPProber.cpp ├── nsEUCJPProber.h ├── nsEUCKRProber.cpp ├── nsEUCKRProber.h ├── nsEUCTWProber.cpp ├── nsEUCTWProber.h ├── nsError.h ├── nsEscCharsetProber.cpp ├── nsEscCharsetProber.h ├── nsEscSM.cpp ├── nsGB2312Prober.cpp ├── nsGB2312Prober.h ├── nsHebrewProber.cpp ├── nsHebrewProber.h ├── nsLatin1Prober.cpp ├── nsLatin1Prober.h ├── nsMBCSGroupProber.cpp ├── nsMBCSGroupProber.h ├── nsMBCSSM.cpp ├── nsPkgInt.h ├── nsSBCSGroupProber.cpp ├── nsSBCSGroupProber.h ├── nsSBCharSetProber.cpp ├── nsSBCharSetProber.h ├── nsSJISProber.cpp ├── nsSJISProber.h ├── nsUTF8Prober.cpp ├── nsUTF8Prober.h ├── nsUniversalDetector.cpp ├── nsUniversalDetector.h ├── nscore.h ├── prcpucfg.h ├── prmem.h ├── protypes.h └── prtypes.h

/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # Runs CodeQL static analysis on the C/C++ sources for every push and pull
2 | # request that targets master.
3 | name: "CodeQL"
4 |
5 | on:
6 |   push:
7 |     branches: [ "master" ]
8 |   pull_request:
9 |     # The branches below must be a subset of the branches above
10 |     branches: [ "master" ]
11 |
12 | jobs:
13 |   analyze:
14 |     name: Analyze
15 |     runs-on: ubuntu-latest
16 |     permissions:
17 |       actions: read
18 |       contents: read
19 |       security-events: write
20 |
21 |     strategy:
22 |       fail-fast: false
23 |       matrix:
24 |         language: [ 'cpp' ]
25 |
26 |     steps:
27 |     - name: Checkout repository
28 |       uses: actions/checkout@v4
29 |
30 |     # Initializes the CodeQL tools for scanning.
31 |     - name: Initialize CodeQL
32 |       uses: github/codeql-action/init@v3
33 |       with:
34 |         languages: ${{ matrix.language }}
35 |
36 |     # Refresh the package index first so the install does not fail on a stale
37 |     # mirror listing, and pass -y so apt never waits for confirmation.
38 |     - name: Install libs
39 |       run: |
40 |         sudo apt-get update
41 |         sudo apt-get install -y build-essential libgnustep-base-dev libz-dev libbz2-dev libssl-dev libicu-dev libwavpack-dev
42 |
43 |     # CodeQL observes this build to extract the compiled sources.
44 |     - name: Build project
45 |       run: make -f Makefile.linux
46 |
47 |     - name: Perform CodeQL Analysis
48 |       uses: github/codeql-action/analyze@v3
49 |
--------------------------------------------------------------------------------
/DetectorTest.m:
--------------------------------------------------------------------------------
1 | /*
2 |  * DetectorTest.m
3 |  *
4 |  * Copyright (c) 2017-present, MacPaw Inc. All rights reserved.
5 |  *
6 |  * This library is free software; you can redistribute it and/or
7 |  * modify it under the terms of the GNU Lesser General Public
8 |  * License as published by the Free Software Foundation; either
9 |  * version 2.1 of the License, or (at your option) any later version.
10 |  *
11 |  * This library is distributed in the hope that it will be useful,
12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 |  * Lesser General Public License for more details.
15 | * 16 | * You should have received a copy of the GNU Lesser General Public 17 | * License along with this library; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 19 | * MA 02110-1301 USA 20 | */ 21 | #import 22 | #import "UniversalDetector.h" 23 | 24 | int main(int argc,char **argv) 25 | { 26 | NSAutoreleasePool *pool=[[NSAutoreleasePool alloc] init]; 27 | 28 | UniversalDetector *detector=[UniversalDetector detector]; 29 | 30 | for(int i=1;i 2 | 3 | 4 | 5 | CFBundleDevelopmentRegion 6 | English 7 | CFBundleExecutable 8 | ${EXECUTABLE_NAME} 9 | CFBundleIconFile 10 | 11 | CFBundleIdentifier 12 | $(PRODUCT_BUNDLE_IDENTIFIER) 13 | CFBundleInfoDictionaryVersion 14 | 6.0 15 | CFBundleName 16 | ${PRODUCT_NAME} 17 | CFBundlePackageType 18 | FMWK 19 | CFBundleSignature 20 | ???? 21 | CFBundleVersion 22 | 1.1 23 | NSPrincipalClass 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /Makefile.common: -------------------------------------------------------------------------------- 1 | # Source files 2 | 3 | LIBRARY_OBJC_FILES = UniversalDetector.m 4 | 5 | LIBRARY_C_FILES = universalchardet/kludge.c 6 | 7 | LIBRARY_CXX_FILES = WrappedUniversalDetector.cpp \ 8 | universalchardet/CharDistribution.cpp \ 9 | universalchardet/JpCntx.cpp \ 10 | universalchardet/LangBulgarianModel.cpp \ 11 | universalchardet/LangCyrillicModel.cpp \ 12 | universalchardet/LangGreekModel.cpp \ 13 | universalchardet/LangHebrewModel.cpp \ 14 | universalchardet/LangHungarianModel.cpp \ 15 | universalchardet/LangThaiModel.cpp \ 16 | universalchardet/nsBig5Prober.cpp \ 17 | universalchardet/nsCharSetProber.cpp \ 18 | universalchardet/nsEscCharsetProber.cpp \ 19 | universalchardet/nsEscSM.cpp \ 20 | universalchardet/nsEUCJPProber.cpp \ 21 | universalchardet/nsEUCKRProber.cpp \ 22 | universalchardet/nsEUCTWProber.cpp \ 23 | universalchardet/nsGB2312Prober.cpp \ 24 | universalchardet/nsHebrewProber.cpp \ 25 
| universalchardet/nsLatin1Prober.cpp \ 26 | universalchardet/nsMBCSGroupProber.cpp \ 27 | universalchardet/nsMBCSSM.cpp \ 28 | universalchardet/nsSBCharSetProber.cpp \ 29 | universalchardet/nsSBCSGroupProber.cpp \ 30 | universalchardet/nsSJISProber.cpp \ 31 | universalchardet/nsUniversalDetector.cpp \ 32 | universalchardet/nsUTF8Prober.cpp 33 | 34 | DETECTORTEST_OBJC_FILES = DetectorTest.m 35 | 36 | # Object files 37 | 38 | LIBRARY_OBJS = $(LIBRARY_OBJC_FILES:%.m=$(BUILD_DIR)/%.o) \ 39 | $(LIBRARY_C_FILES:%.c=$(BUILD_DIR)/%.o) \ 40 | $(LIBRARY_CXX_FILES:%.cpp=$(BUILD_DIR)/%.o) 41 | 42 | DETECTORTEST_OBJS = $(DETECTORTEST_OBJC_FILES:%.m=$(BUILD_DIR)/%.o) 43 | 44 | OBJS = $(LIBRARY_OBJS) \ 45 | $(DETECTORTEST_OBJS) 46 | 47 | # Other files 48 | 49 | EXECUTABLE_FILES = DetectorTest 50 | 51 | LIBRARY_FILES = libUniversalDetector.a 52 | -------------------------------------------------------------------------------- /Makefile.freebsd: -------------------------------------------------------------------------------- 1 | # Compilers and linkers 2 | 3 | OBJCC = clang 4 | CC = clang 5 | CXX = clang++ 6 | LD = $(CXX) 7 | AR = ar 8 | 9 | # Options for compilation 10 | 11 | GNUSTEP_OPTS = -DGNUSTEP \ 12 | -DGNU_RUNTIME=1 \ 13 | -D_NATIVE_OBJC_EXCEPTIONS \ 14 | -fgnu-runtime \ 15 | -fexceptions \ 16 | -fobjc-exceptions \ 17 | -fconstant-string-class=NSConstantString 18 | 19 | GCC_OPTS = -O2 \ 20 | -Wno-import \ 21 | -Wno-multichar \ 22 | -g \ 23 | -D_FILE_OFFSET_BITS=64 \ 24 | -isystem /usr/local/GNUstep/System/Library/Headers \ 25 | -I/usr/local/include 26 | 27 | OBJC_OPTS = -std=gnu99 28 | 29 | C_OPTS = -std=gnu99 30 | 31 | CXX_OPTS = 32 | 33 | AUTODEPENDENCY_CFLAGS=-MMD -MF$(@:.o=.d) -MT$@ 34 | 35 | ALL_OBJCFLAGS = $(OBJC_OPTS) $(GCC_OPTS) $(GNUSTEP_OPTS) $(OBJCFLAGS) 36 | ALL_CFLAGS = $(C_OPTS) $(GCC_OPTS) $(CFLAGS) 37 | ALL_CXXFLAGS = $(CXX_OPTS) $(GCC_OPTS) $(CXXFLAGS) 38 | 39 | # Options for linking 40 | 41 | LIBS = -Wl,--no-whole-archive \ 42 | -lgnustep-base \ 43 | 
-lobjc \ 44 | -lm 45 | 46 | ALL_LDFLAGS = -Wl,--whole-archive \ 47 | -fexceptions \ 48 | -fgnu-runtime \ 49 | -L/usr/local/GNUstep/System/Library/Libraries \ 50 | -L/usr/local/lib \ 51 | $(LDFLAGS) 52 | 53 | # Paths 54 | 55 | SOURCE_DIR = . 56 | BUILD_DIR = Build 57 | 58 | # Files 59 | 60 | include Makefile.common 61 | 62 | # Compilation rules 63 | 64 | all: $(EXECUTABLE_FILES) $(LIBRARY_FILES) 65 | 66 | DetectorTest: $(DETECTORTEST_OBJS) libUniversalDetector.a 67 | $(LD) $(ALL_LDFLAGS) -o $@ $^ $(LIBS) 68 | 69 | libUniversalDetector.a: $(LIBRARY_OBJS) 70 | rm -f $@ 71 | $(AR) rcs $@ $^ 72 | 73 | clean: 74 | rm -rf $(BUILD_DIR) $(EXECUTABLE_FILES) $(LIBRARY_FILES) 75 | 76 | # Suffix rules 77 | 78 | .SUFFIXES: .o .c .m .cpp 79 | 80 | $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.c 81 | @mkdir -p $(dir $@) 82 | $(CC) $(ALL_CFLAGS) $(AUTODEPENDENCY_CFLAGS) -c $< -o $@ 83 | 84 | $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.m 85 | @mkdir -p $(dir $@) 86 | $(OBJCC) $(ALL_OBJCFLAGS) $(AUTODEPENDENCY_CFLAGS) -c $< -o $@ 87 | 88 | $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.cpp 89 | @mkdir -p $(dir $@) 90 | $(CXX) $(ALL_CXXFLAGS) $(AUTODEPENDENCY_CFLAGS) -c $< -o $@ 91 | 92 | -include $(OBJS:.o=.d) 93 | -------------------------------------------------------------------------------- /Makefile.legacy: -------------------------------------------------------------------------------- 1 | # Compilers and linkers 2 | 3 | OBJCC_I386 = /Xcode3/usr/bin/gcc-4.2 -arch i386 4 | OBJCC_PPC = /Xcode3/usr/bin/gcc-4.0 -arch ppc 5 | CC_I386 = /Xcode3/usr/bin/gcc-4.2 -arch i386 6 | CC_PPC = /Xcode3/usr/bin/gcc-4.0 -arch ppc 7 | CXX_I386 = /Xcode3/usr/bin/g++-4.2 -arch i386 8 | CXX_PPC = /Xcode3/usr/bin/g++-4.0 -arch ppc 9 | LD = /Xcode3/usr/bin/gcc-4.0 10 | AR = ar 11 | SDK = -isysroot /Xcode3/SDKs/MacOSX10.5.sdk 12 | 13 | # Options for compilation 14 | 15 | GCC_OPTS = -O2 \ 16 | -Wno-import \ 17 | -Wno-multichar \ 18 | -mmacosx-version-min=10.3.9 19 | 20 | OBJC_OPTS = -std=gnu99 \ 21 | $(SDK) 22 | 23 | C_OPTS = -std=gnu99 \ 
24 | $(SDK) 25 | 26 | CXX_OPTS = $(SDK) 27 | 28 | AUTODEPENDENCY_CFLAGS=-MMD -MF$(@:.o=.d) -MT$@ 29 | 30 | ALL_OBJCFLAGS = $(OBJC_OPTS) $(GCC_OPTS) $(OBJCFLAGS) 31 | ALL_CFLAGS = $(C_OPTS) $(GCC_OPTS) $(CFLAGS) 32 | ALL_CXXFLAGS = $(CXX_OPTS) $(GCC_OPTS) $(CXXFLAGS) 33 | 34 | # Options for linking 35 | 36 | LIBS = -framework Foundation \ 37 | -lobjc \ 38 | -lstdc++ \ 39 | -lm 40 | 41 | ALL_LDFLAGS = -arch i386 -arch ppc \ 42 | -mmacosx-version-min=10.3.9 \ 43 | $(SDK) \ 44 | $(LDFLAGS) 45 | 46 | # Paths 47 | 48 | SOURCE_DIR = . 49 | BUILD_DIR = Build 50 | 51 | # Files 52 | 53 | include Makefile.common 54 | 55 | # Framework 56 | 57 | FRAMEWORK_NAME = UniversalDetector 58 | 59 | FRAMEWORK_FILES = $(FRAMEWORK_NAME).framework 60 | 61 | FRAMEWORK_HEADER_FILES = UniversalDetector.h 62 | 63 | FRAMEWORK_LDFLAGS = -arch i386 -arch ppc \ 64 | -mmacosx-version-min=10.3.9 \ 65 | -dynamiclib \ 66 | -install_name @executable_path/../Frameworks/$(FRAMEWORK_NAME).framework/$(FRAMEWORK_NAME) \ 67 | $(SDK) \ 68 | $(LDFLAGS) 69 | 70 | # Compilation rules 71 | 72 | all: $(FRAMEWORK_FILES) $(EXECUTABLE_FILES) $(LIBRARY_FILES) 73 | 74 | $(FRAMEWORK_NAME).framework: \ 75 | $(FRAMEWORK_NAME).framework/Versions/A/$(FRAMEWORK_NAME) \ 76 | $(FRAMEWORK_NAME).framework/Versions/A/Resources/Info.plist \ 77 | $(FRAMEWORK_NAME).framework/Versions/A/Resources/en.lproj/InfoPlist.strings \ 78 | $(FRAMEWORK_HEADER_FILES:%=$(FRAMEWORK_NAME).framework/Versions/A/Headers/%) 79 | ln -shf A $(FRAMEWORK_NAME).framework/Versions/Current 80 | ln -shf Versions/Current/$(FRAMEWORK_NAME) $(FRAMEWORK_NAME).framework/$(FRAMEWORK_NAME) 81 | ln -shf Versions/Current/Resources $(FRAMEWORK_NAME).framework/Resources 82 | ln -shf Versions/Current/Headers $(FRAMEWORK_NAME).framework/Headers 83 | 84 | $(FRAMEWORK_NAME).framework/Versions/A/$(FRAMEWORK_NAME): $(LIBRARY_OBJS) 85 | @mkdir -p $(dir $@) 86 | $(LD) $(FRAMEWORK_LDFLAGS) -o $@ $^ $(LIBS) 87 | 88 | $(FRAMEWORK_NAME).framework/Versions/A/Resources/Info.plist: 
Info.plist 89 | @mkdir -p $(dir $@) 90 | cat $^ |\ 91 | perl -pe 's/\$$\{EXECUTABLE_NAME\}/$(FRAMEWORK_NAME)/sge' |\ 92 | perl -pe 's/\$$\{PRODUCT_NAME\}/$(FRAMEWORK_NAME)/sge' |\ 93 | cat >$@ 94 | 95 | $(FRAMEWORK_NAME).framework/Versions/A/Resources/en.lproj/InfoPlist.strings: English.lproj/InfoPlist.strings 96 | @mkdir -p $(dir $@) 97 | cp $^ $@ 98 | 99 | $(FRAMEWORK_NAME).framework/Versions/A/Headers/%: % 100 | @mkdir -p $(dir $@) 101 | cp $^ $@ 102 | 103 | libUniversalDetector.a: $(LIBRARY_OBJS) 104 | rm -f $@ 105 | $(AR) rcs $@ $^ 106 | 107 | DetectorTest: $(DETECTORTEST_OBJS) libUniversalDetector.a 108 | $(LD) $(ALL_LDFLAGS) -o $@ $^ $(LIBS) 109 | 110 | clean: 111 | rm -rf $(BUILD_DIR) $(FRAMEWORK_FILES) $(EXECUTABLE_FILES) $(LIBRARY_FILES) 112 | 113 | # Suffix rules 114 | 115 | .SUFFIXES: .o .c .m .cpp 116 | 117 | $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.c 118 | @mkdir -p $(dir $@) 119 | $(CC_I386) $(ALL_CFLAGS) $(AUTODEPENDENCY_CFLAGS) -c $< -o $@.i386 120 | $(CC_PPC) $(ALL_CFLAGS) -c $< -o $@.ppc 121 | lipo -create $@.i386 $@.ppc -output $@ 122 | 123 | $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.m 124 | @mkdir -p $(dir $@) 125 | $(OBJCC_I386) $(ALL_OBJCFLAGS) $(AUTODEPENDENCY_CFLAGS) -c $< -o $@.i386 126 | $(OBJCC_PPC) $(ALL_OBJCFLAGS) -c $< -o $@.ppc 127 | lipo -create $@.i386 $@.ppc -output $@ 128 | 129 | $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.cpp 130 | @mkdir -p $(dir $@) 131 | $(CXX_I386) $(ALL_CXXFLAGS) $(AUTODEPENDENCY_CFLAGS) -c $< -o $@.i386 132 | $(CXX_PPC) $(ALL_CXXFLAGS) -c $< -o $@.ppc 133 | lipo -create $@.i386 $@.ppc -output $@ 134 | 135 | -include $(OBJS:.o=.d) 136 | -------------------------------------------------------------------------------- /Makefile.linux: -------------------------------------------------------------------------------- 1 | # Compilers and linkers 2 | 3 | OBJCC = gcc 4 | CC = gcc 5 | CXX = g++ 6 | LD = $(CXX) 7 | AR = ar 8 | 9 | # Options for compilation 10 | 11 | GNUSTEP_OPTS = -DGNUSTEP \ 12 | -DGNU_RUNTIME=1 \ 13 | 
-D_NATIVE_OBJC_EXCEPTIONS \ 14 | -fgnu-runtime \ 15 | -fexceptions \ 16 | -fobjc-exceptions \ 17 | -fconstant-string-class=NSConstantString 18 | 19 | GCC_OPTS = -O2 \ 20 | -Wno-import \ 21 | -Wno-multichar \ 22 | -g \ 23 | -D_FILE_OFFSET_BITS=64 \ 24 | -isystem /usr/include/GNUstep 25 | 26 | OBJC_OPTS = -std=gnu99 27 | 28 | C_OPTS = -std=gnu99 29 | 30 | CXX_OPTS = 31 | 32 | AUTODEPENDENCY_CFLAGS=-MMD -MF$(@:.o=.d) -MT$@ 33 | 34 | ALL_OBJCFLAGS = $(OBJC_OPTS) $(GCC_OPTS) $(GNUSTEP_OPTS) $(OBJCFLAGS) 35 | ALL_CFLAGS = $(C_OPTS) $(GCC_OPTS) $(CFLAGS) 36 | ALL_CXXFLAGS = $(CXX_OPTS) $(GCC_OPTS) $(CXXFLAGS) 37 | 38 | # Options for linking 39 | 40 | LIBS = -Wl,--no-whole-archive \ 41 | -lgnustep-base \ 42 | -lobjc \ 43 | -lm 44 | 45 | ALL_LDFLAGS = -Wl,--whole-archive \ 46 | -fexceptions \ 47 | -fgnu-runtime \ 48 | $(LDFLAGS) 49 | 50 | # Paths 51 | 52 | SOURCE_DIR = . 53 | BUILD_DIR = Build 54 | 55 | # Files 56 | 57 | include Makefile.common 58 | 59 | # Compilation rules 60 | 61 | all: $(EXECUTABLE_FILES) $(LIBRARY_FILES) 62 | 63 | DetectorTest: $(DETECTORTEST_OBJS) libUniversalDetector.a 64 | $(LD) $(ALL_LDFLAGS) -o $@ $^ $(LIBS) 65 | 66 | libUniversalDetector.a: $(LIBRARY_OBJS) 67 | rm -f $@ 68 | $(AR) rcs $@ $^ 69 | 70 | clean: 71 | rm -rf $(BUILD_DIR) $(EXECUTABLE_FILES) $(LIBRARY_FILES) 72 | 73 | # Suffix rules 74 | 75 | .SUFFIXES: .o .c .m .cpp 76 | 77 | $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.c 78 | @mkdir -p $(dir $@) 79 | $(CC) $(ALL_CFLAGS) $(AUTODEPENDENCY_CFLAGS) -c $< -o $@ 80 | 81 | $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.m 82 | @mkdir -p $(dir $@) 83 | $(OBJCC) $(ALL_OBJCFLAGS) $(AUTODEPENDENCY_CFLAGS) -c $< -o $@ 84 | 85 | $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.cpp 86 | @mkdir -p $(dir $@) 87 | $(CXX) $(ALL_CXXFLAGS) $(AUTODEPENDENCY_CFLAGS) -c $< -o $@ 88 | 89 | -include $(OBJS:.o=.d) 90 | -------------------------------------------------------------------------------- /Makefile.windows: -------------------------------------------------------------------------------- 1 | 
# Compilers and linkers 2 | 3 | OBJCC = gcc 4 | CC = gcc 5 | CXX = g++ 6 | LD = gcc 7 | AR = ar 8 | 9 | # Options for compilation 10 | 11 | GNUSTEP_OPTS = -DGNUSTEP \ 12 | -DGNU_RUNTIME=1 \ 13 | -D_NATIVE_OBJC_EXCEPTIONS \ 14 | -fgnu-runtime \ 15 | -fexceptions \ 16 | -fobjc-exceptions \ 17 | -fconstant-string-class=NSConstantString 18 | 19 | GCC_OPTS = -O2 \ 20 | -Wno-import \ 21 | -Wno-multichar \ 22 | -g \ 23 | -D_FILE_OFFSET_BITS=64 \ 24 | -isystem C:\GNUstep\GNUstep\System\Library\Headers 25 | 26 | OBJC_OPTS = -std=gnu99 27 | 28 | C_OPTS = -std=gnu99 29 | 30 | CXX_OPTS = 31 | 32 | AUTODEPENDENCY_CFLAGS=-MMD -MF$(@:.o=.d) -MT$@ 33 | 34 | ALL_OBJCFLAGS = $(OBJC_OPTS) $(GCC_OPTS) $(GNUSTEP_OPTS) $(OBJCFLAGS) 35 | ALL_CFLAGS = $(C_OPTS) $(GCC_OPTS) $(CFLAGS) 36 | ALL_CXXFLAGS = $(CXX_OPTS) $(GCC_OPTS) $(CXXFLAGS) 37 | 38 | # Options for linking 39 | 40 | LIBS = -Wl,--no-whole-archive \ 41 | -lgnustep-base \ 42 | -lobjc \ 43 | -lstdc++ \ 44 | -lm 45 | 46 | ALL_LDFLAGS = -Wl,--whole-archive \ 47 | -fexceptions \ 48 | -fgnu-runtime \ 49 | -LC:\GNUstep\GNUstep\System\Library\Libraries \ 50 | $(LDFLAGS) 51 | 52 | # Paths 53 | 54 | SOURCE_DIR = . 
55 | BUILD_DIR = Build 56 | 57 | # Files 58 | 59 | include Makefile.common 60 | 61 | # Compilation rules 62 | 63 | all: $(EXECUTABLE_FILES) $(LIBRARY_FILES) 64 | 65 | DetectorTest: $(DETECTORTEST_OBJS) libUniversalDetector.a 66 | $(LD) $(ALL_LDFLAGS) -o $@ $^ $(LIBS) 67 | 68 | libUniversalDetector.a: $(LIBRARY_OBJS) 69 | rm -f $@ 70 | $(AR) rcs $@ $^ 71 | 72 | clean: 73 | rm -rf $(BUILD_DIR) $(EXECUTABLE_FILES) $(LIBRARY_FILES) 74 | 75 | # Suffix rules 76 | 77 | .SUFFIXES: .o .c .m .cpp 78 | 79 | $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.c 80 | @mkdir -p $(dir $@) 81 | $(CC) $(ALL_CFLAGS) $(AUTODEPENDENCY_CFLAGS) -c $< -o $@ 82 | 83 | $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.m 84 | @mkdir -p $(dir $@) 85 | $(OBJCC) $(ALL_OBJCFLAGS) $(AUTODEPENDENCY_CFLAGS) -c $< -o $@ 86 | 87 | $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.cpp 88 | @mkdir -p $(dir $@) 89 | $(CXX) $(ALL_CXXFLAGS) $(AUTODEPENDENCY_CFLAGS) -c $< -o $@ 90 | 91 | -include $(OBJS:.o=.d) 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Universal Detector is a library for character set autodetection. 2 | 3 | * This is an Objective-C wrapper for [universalchardet](https://www-archive.mozilla.org/projects/intl/detectorsrc.html) by Mozilla 4 | * It is used by [XADMaster](https://github.com/MacPaw/XADMaster) library for filenames encoding detection 5 | * Originally developed by [Dag Ågren](https://github.com/DagAgren) 6 | 7 | # Usages 8 | 9 | - [The Unarchiver](https://theunarchiver.com/) application. 10 | 11 | # License 12 | 13 | This Universal Detector Library is distributed under the [LGPL 2.1](https://www.gnu.org/licenses/lgpl-2.1.html) license. Please read LICENSE for information on the software availability and distribution. 
14 |
--------------------------------------------------------------------------------
/UniversalDetector.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * UniversalDetector.h
3 |  *
4 |  * Copyright (c) 2017-present, MacPaw Inc. All rights reserved.
5 |  *
6 |  * This library is free software; you can redistribute it and/or
7 |  * modify it under the terms of the GNU Lesser General Public
8 |  * License as published by the Free Software Foundation; either
9 |  * version 2.1 of the License, or (at your option) any later version.
10 |  *
11 |  * This library is distributed in the hope that it will be useful,
12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 |  * Lesser General Public License for more details.
15 |  *
16 |  * You should have received a copy of the GNU Lesser General Public
17 |  * License along with this library; if not, write to the Free Software
18 |  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 |  * MA 02110-1301 USA
20 |  */
21 | #import <Foundation/Foundation.h>
22 |
23 | // Objective-C front end for the universalchardet charset detector.
24 | // Feed bytes in with the analyze methods, then ask for the guess.
25 | @interface UniversalDetector:NSObject
26 | {
27 | 	void *detector;          // Opaque handle obtained from AllocUniversalDetector().
28 | 	NSString *charset;       // Cached name last produced by -MIMECharset.
29 | 	float confidence;        // Confidence written by the last charset query.
30 | 	const char *lastcstring; // C string the cached charset was created from.
31 | }
32 |
33 | // Returns a new autoreleased detector.
34 | +(UniversalDetector *)detector;
35 | // Returns the list of charset names known to the detector.
36 | +(NSArray *)possibleMIMECharsets;
37 |
38 | -(id)init;
39 | -(void)dealloc;
40 |
41 | // Feed more input. May be called repeatedly to analyze data piece by piece.
42 | -(void)analyzeData:(NSData *)data;
43 | -(void)analyzeBytes:(const char *)data length:(int)len;
44 | // Resets the wrapped detector.
45 | -(void)reset;
46 |
47 | // YES once the detector has settled on a definite charset.
48 | -(BOOL)done;
49 | // Name of the current best guess, or nil when there is none.
50 | // "CP949" is reported in place of "EUC-KR".
51 | -(NSString *)MIMECharset;
52 | // Confidence reported by the detector for its guess: 1 for a definite
53 | // detection, 0 when nothing has been analyzed or the input was plain ASCII.
54 | -(float)confidence;
55 |
56 | #ifdef __APPLE__
57 | // NSStringEncoding matching -MIMECharset, or 0 when there is no guess or
58 | // CoreFoundation does not recognize the charset name.
59 | -(NSStringEncoding)encoding;
60 | #endif
61 |
62 | @end
63 |
--------------------------------------------------------------------------------
/UniversalDetector.m:
--------------------------------------------------------------------------------
1 | /*
2 |  * UniversalDetector.m
3 |  *
4 |  * Copyright (c) 2017-present, MacPaw Inc. All rights reserved.
5 |  *
6 |  * This library is free software; you can redistribute it and/or
7 |  * modify it under the terms of the GNU Lesser General Public
8 |  * License as published by the Free Software Foundation; either
9 |  * version 2.1 of the License, or (at your option) any later version.
10 |  *
11 |  * This library is distributed in the hope that it will be useful,
12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 |  * Lesser General Public License for more details.
15 |  *
16 |  * You should have received a copy of the GNU Lesser General Public
17 |  * License along with this library; if not, write to the Free Software
18 |  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 |  * MA 02110-1301 USA
20 |  */
21 | #import "UniversalDetector.h"
22 | #import "WrappedUniversalDetector.h"
23 |
24 | #include <limits.h>
25 | #include <string.h>
26 |
27 | @implementation UniversalDetector
28 |
29 | // Convenience constructor: a freshly initialized, autoreleased detector.
30 | +(UniversalDetector *)detector
31 | {
32 | 	return [[self new] autorelease];
33 | }
34 |
35 | // Charset names known to the detector. The array is created on first use and
36 | // the same instance is returned by every later call.
37 | +(NSArray *)possibleMIMECharsets
38 | {
39 | 	static NSArray *array=nil;
40 |
41 | 	if(!array) array=[[NSArray alloc] initWithObjects:
42 | 	@"UTF-8",@"UTF-16BE",@"UTF-16LE",@"UTF-32BE",@"UTF-32LE",
43 | 	@"ISO-8859-2",@"ISO-8859-5",@"ISO-8859-7",@"ISO-8859-8",@"ISO-8859-8-I",
44 | 	@"windows-1250",@"windows-1251",@"windows-1252",@"windows-1253",@"windows-1255",
45 | 	@"KOI8-R",@"Shift_JIS",@"EUC-JP",@"EUC-KR"/* actually CP949 */,@"x-euc-tw",
46 | 	@"ISO-2022-JP",@"ISO-2022-CN",@"ISO-2022-KR",
47 | 	@"Big5",@"GB2312",@"HZ-GB-2312",@"gb18030",@"GB18030",
48 | 	@"IBM855",@"IBM866",@"TIS-620",@"X-ISO-10646-UCS-4-2143",@"X-ISO-10646-UCS-4-3412",
49 | 	@"x-mac-cyrillic",@"x-mac-hebrew",
50 | 	nil];
51 |
52 | 	return array;
53 | }
54 |
55 | // Creates the wrapped C++ detector and starts with an empty result cache.
56 | -(id)init
57 | {
58 | 	if((self=[super init]))
59 | 	{
60 | 		detector=AllocUniversalDetector();
61 | 		charset=nil;
62 | 		confidence=0;
63 | 		lastcstring=NULL;
64 | 	}
65 | 	return self;
66 | }
67 |
68 | // Destroys the wrapped detector and releases the cached charset name.
69 | -(void)dealloc
70 | {
71 | 	FreeUniversalDetector(detector);
72 | 	[charset release];
73 | 	[super dealloc];
74 | }
75 |
76 | // Feeds the complete contents of data to the detector.
77 | -(void)analyzeData:(NSData *)data
78 | {
79 | 	// -analyzeBytes:length: takes an int, so a buffer longer than INT_MAX is
80 | 	// handed over in pieces instead of having its length truncated by a cast.
81 | 	const char *bytes=(const char *)[data bytes];
82 | 	size_t remaining=[data length];
83 |
84 | 	while(remaining>0)
85 | 	{
86 | 		int chunk=remaining>(size_t)INT_MAX?INT_MAX:(int)remaining;
87 | 		[self analyzeBytes:bytes length:chunk];
88 | 		bytes+=chunk;
89 | 		remaining-=(size_t)chunk;
90 | 	}
91 | }
92 |
93 | // Feeds len bytes starting at data to the detector.
94 | -(void)analyzeBytes:(const char *)data length:(int)len
95 | {
96 | 	// Ignore empty or invalid input rather than handing a NULL pointer or a
97 | 	// non-positive length to the C++ detector.
98 | 	if(!data||len<=0) return;
99 | 	UniversalDetectorHandleData(detector,data,len);
100 | }
101 |
102 | // Resets the wrapped detector.
103 | -(void)reset { UniversalDetectorReset(detector); }
104 |
105 | // YES once the wrapped detector has settled on a definite charset.
106 | -(BOOL)done { return UniversalDetectorDone(detector)!=0; }
107 |
108 | // Returns the name of the current best guess, or nil when the detector has
109 | // none. Also refreshes the confidence ivar as a side effect. The returned
110 | // string is owned by the receiver and is replaced when the guess changes.
111 | -(NSString *)MIMECharset
112 | {
113 | 	const char *cstr=UniversalDetectorCharset(detector,&confidence);
114 | 	if(!cstr) return nil;
115 |
116 | 	// nsUniversalDetector detects CP949 but returns "EUC-KR" because CP949
117 | 	// lacks an IANA name. Kludge the name to make sure decoding succeeds.
118 | 	if(strcmp(cstr,"EUC-KR")==0) cstr="CP949";
119 |
120 | 	// Only rebuild the NSString when the detector hands back a different
121 | 	// C string than last time (pointer comparison, not a string compare).
122 | 	if(cstr!=lastcstring)
123 | 	{
124 | 		[charset release];
125 | 		charset=[[NSString alloc] initWithUTF8String:cstr];
126 | 		lastcstring=cstr;
127 | 	}
128 |
129 | 	return charset;
130 | }
131 |
132 | // Returns the confidence that belongs to the current guess.
133 | -(float)confidence
134 | {
135 | 	// Always re-query the detector. The confidence ivar is only written by
136 | 	// -MIMECharset, so relying on a cached charset would return a stale value
137 | 	// after more data has been analyzed or after -reset.
138 | 	[self MIMECharset];
139 | 	return confidence;
140 | }
141 |
142 | #ifdef __APPLE__
143 | // Maps the detected charset name to an NSStringEncoding. Returns 0 when there
144 | // is no guess or CoreFoundation does not know the name.
145 | -(NSStringEncoding)encoding
146 | {
147 | 	NSString *mimecharset=[self MIMECharset];
148 | 	if(!mimecharset) return 0;
149 |
150 | 	CFStringEncoding cfenc=CFStringConvertIANACharSetNameToEncoding((CFStringRef)mimecharset);
151 | 	if(cfenc==kCFStringEncodingInvalidId) return 0;
152 |
153 | 	return CFStringConvertEncodingToNSStringEncoding(cfenc);
154 | }
155 |
156 | #endif
157 |
158 | @end
159 |
--------------------------------------------------------------------------------
/UniversalDetector_Prefix.pch:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MacPaw/universal-detector/4eb832d999628edcd3d134e46bd35357c8c99a85/UniversalDetector_Prefix.pch
--------------------------------------------------------------------------------
/WrappedUniversalDetector.cpp:
-------------------------------------------------------------------------------- 1 | /* 2 | * WrappedUniversalDetector.cpp 3 | * 4 | * Copyright (c) 2017-present, MacPaw Inc. All rights reserved. 5 | * 6 | * This library is free software; you can redistribute it and/or 7 | * modify it under the terms of the GNU Lesser General Public 8 | * License as published by the Free Software Foundation; either 9 | * version 2.1 of the License, or (at your option) any later version. 10 | * 11 | * This library is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * Lesser General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU Lesser General Public 17 | * License along with this library; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 19 | * MA 02110-1301 USA 20 | */ 21 | #include "WrappedUniversalDetector.h" 22 | 23 | #include "universalchardet/nscore.h" 24 | #include "universalchardet/nsUniversalDetector.h" 25 | #include "universalchardet/nsCharSetProber.h" 26 | 27 | 28 | class wrappedUniversalDetector:public nsUniversalDetector 29 | { 30 | public: 31 | wrappedUniversalDetector():nsUniversalDetector(NS_FILTER_ALL) {} 32 | 33 | void Report(const char* aCharset) {} 34 | 35 | const char *charset(float &confidence) 36 | { 37 | if(!mGotData) 38 | { 39 | confidence=0; 40 | return 0; 41 | } 42 | 43 | if(mDetectedCharset) 44 | { 45 | confidence=1; 46 | return mDetectedCharset; 47 | } 48 | 49 | switch(mInputState) 50 | { 51 | case eHighbyte: 52 | { 53 | float proberConfidence; 54 | float maxProberConfidence = (float)0.0; 55 | PRInt32 maxProber = 0; 56 | 57 | for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 58 | { 59 | proberConfidence = mCharSetProbers[i]->GetConfidence(); 60 | if (proberConfidence > maxProberConfidence) 61 | { 62 | 
maxProberConfidence = proberConfidence; 63 | maxProber = i; 64 | } 65 | } 66 | 67 | confidence=maxProberConfidence; 68 | return mCharSetProbers[maxProber]->GetCharSetName(); 69 | } 70 | break; 71 | 72 | case ePureAscii: 73 | confidence=0; 74 | return "US-ASCII"; 75 | 76 | default: 77 | break; 78 | } 79 | 80 | confidence=0; 81 | return 0; 82 | } 83 | 84 | bool done() 85 | { 86 | if(mDetectedCharset) return true; 87 | return false; 88 | } 89 | 90 | void reset() { Reset(); } 91 | }; 92 | 93 | 94 | 95 | extern "C" { 96 | 97 | void *AllocUniversalDetector() 98 | { 99 | return (void *)new wrappedUniversalDetector; 100 | } 101 | 102 | void FreeUniversalDetector(void *detectorptr) 103 | { 104 | delete (wrappedUniversalDetector *)detectorptr; 105 | } 106 | 107 | void UniversalDetectorHandleData(void *detectorptr,const char *data,int length) 108 | { 109 | if(length==0) return; // There seems to be a bug in UniversalDetector that accesses beyond the end of 0-length buffers. 110 | wrappedUniversalDetector *detector=(wrappedUniversalDetector *)detectorptr; 111 | if(detector->done()) return; 112 | detector->HandleData(data,length); 113 | } 114 | 115 | void UniversalDetectorReset(void *detectorptr) 116 | { 117 | wrappedUniversalDetector *detector=(wrappedUniversalDetector *)detectorptr; 118 | detector->reset(); 119 | } 120 | 121 | int UniversalDetectorDone(void *detectorptr) 122 | { 123 | wrappedUniversalDetector *detector=(wrappedUniversalDetector *)detectorptr; 124 | return detector->done()?1:0; 125 | } 126 | 127 | const char *UniversalDetectorCharset(void *detectorptr,float *confidence) 128 | { 129 | wrappedUniversalDetector *detector=(wrappedUniversalDetector *)detectorptr; 130 | return detector->charset(*confidence); 131 | } 132 | 133 | } 134 | -------------------------------------------------------------------------------- /WrappedUniversalDetector.h: -------------------------------------------------------------------------------- 1 | /* 2 | * WrappedUniversalDetector.h 3 | 
* 4 | * Copyright (c) 2017-present, MacPaw Inc. All rights reserved. 5 | * 6 | * This library is free software; you can redistribute it and/or 7 | * modify it under the terms of the GNU Lesser General Public 8 | * License as published by the Free Software Foundation; either 9 | * version 2.1 of the License, or (at your option) any later version. 10 | * 11 | * This library is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * Lesser General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU Lesser General Public 17 | * License along with this library; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 19 | * MA 02110-1301 USA 20 | */ 21 | #ifndef __WrappedUniversalDetector_h__ 22 | #define __WrappedUniversalDetector_h__ 23 | 24 | #ifdef __cplusplus 25 | extern "C" { 26 | #endif 27 | 28 | void *AllocUniversalDetector(); 29 | void FreeUniversalDetector(void *detectorptr); 30 | void UniversalDetectorHandleData(void *detectorptr,const char *data,int length); 31 | void UniversalDetectorReset(void *detectorptr); 32 | int UniversalDetectorDone(void *detectorptr); 33 | const char *UniversalDetectorCharset(void *detectorptr,float *confidence); 34 | 35 | 36 | #ifdef __cplusplus 37 | } 38 | #endif 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /scan.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | 5 | my %charsets; 6 | 7 | for(@ARGV) 8 | { 9 | open FILE,$_ or die; 10 | $_=do {local $/; }; 11 | 12 | # $charsets{$1}=1 while(/SequenceModel.*?=.*?\{[^}"]+"([^"]*)"[^}]+\}/gs); 13 | $charsets{$1}=1 while(/"([A-Za-z0-9_\-]+)"/g); 14 | } 15 | 16 | print join "\n",sort keys %charsets; 17 | print "\n"; 
-------------------------------------------------------------------------------- /universalchardet/.cvsignore: -------------------------------------------------------------------------------- 1 | Makefile 2 | -------------------------------------------------------------------------------- /universalchardet/CharDistribution.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Communicator client code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. 
If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #include "CharDistribution.h" 39 | 40 | #include "JISFreq.tab" 41 | #include "Big5Freq.tab" 42 | #include "EUCKRFreq.tab" 43 | #include "EUCTWFreq.tab" 44 | #include "GB2312Freq.tab" 45 | //upper and lower bounds of the confidence returned by GetConfidence() 46 | #define SURE_YES 0.99f 47 | #define SURE_NO 0.01f 48 | //GetConfidence() answers SURE_NO unless more than this many frequent characters were seen 49 | #define MINIMUM_DATA_THRESHOLD 4 50 | 51 | //return confidence based on received data 52 | float CharDistributionAnalysis::GetConfidence() 53 | { 54 | //if we didn't receive any character in our consideration range, or the 55 | // number of frequent characters does not exceed the minimum threshold, return 56 | // negative answer 57 | if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD) 58 | return SURE_NO; 59 | //ratio of frequent to non-frequent characters, scaled by the typical ratio; skipped when every character was frequent (the divisor would be zero) 60 | if (mTotalChars != mFreqChars) { 61 | float r = mFreqChars / ((mTotalChars - mFreqChars) * mTypicalDistributionRatio); 62 | 63 | if (r < SURE_YES) 64 | return r; 65 | } 66 | //normalize confidence, (we don't want to be 100% sure) 67 | return SURE_YES; 68 | } 69 | //Each constructor below points its analyser at a frequency table, records that table's element count, and sets the typical distribution ratio 70 | EUCTWDistributionAnalysis::EUCTWDistributionAnalysis() 71 | { 72 | mCharToFreqOrder = EUCTWCharToFreqOrder; 73 | mTableSize = sizeof(EUCTWCharToFreqOrder)/sizeof(EUCTWCharToFreqOrder[0]); 74 | mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO; 75 | } 76 | 77 | EUCKRDistributionAnalysis::EUCKRDistributionAnalysis() 78 | { 79 | mCharToFreqOrder = EUCKRCharToFreqOrder; 80 | mTableSize = sizeof(EUCKRCharToFreqOrder)/sizeof(EUCKRCharToFreqOrder[0]); 81 | 
mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO; 82 | } 83 | 84 | GB2312DistributionAnalysis::GB2312DistributionAnalysis() 85 | { 86 | mCharToFreqOrder = GB2312CharToFreqOrder; 87 | mTableSize = sizeof(GB2312CharToFreqOrder)/sizeof(GB2312CharToFreqOrder[0]); 88 | mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO; 89 | } 90 | 91 | Big5DistributionAnalysis::Big5DistributionAnalysis() 92 | { 93 | mCharToFreqOrder = Big5CharToFreqOrder; 94 | mTableSize = sizeof(Big5CharToFreqOrder)/sizeof(Big5CharToFreqOrder[0]); 95 | mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO; 96 | } 97 | //the SJIS and EUC-JP analysers share the JIS frequency table and ratio 98 | SJISDistributionAnalysis::SJISDistributionAnalysis() 99 | { 100 | mCharToFreqOrder = JISCharToFreqOrder; 101 | mTableSize = sizeof(JISCharToFreqOrder)/sizeof(JISCharToFreqOrder[0]); 102 | mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO; 103 | } 104 | 105 | EUCJPDistributionAnalysis::EUCJPDistributionAnalysis() 106 | { 107 | mCharToFreqOrder = JISCharToFreqOrder; 108 | mTableSize = sizeof(JISCharToFreqOrder)/sizeof(JISCharToFreqOrder[0]); 109 | mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO; 110 | } 111 | 112 | -------------------------------------------------------------------------------- /universalchardet/CharDistribution.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Communicator client code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #ifndef CharDistribution_h__ 39 | #define CharDistribution_h__ 40 | 41 | #include "nscore.h" 42 | 43 | #define ENOUGH_DATA_THRESHOLD 1024 44 | //Base analyser: counts the 2-byte characters for which GetOrder() yields a non-negative order, and how many of those have a frequency order below 512 45 | class CharDistributionAnalysis 46 | { 47 | public: 48 | CharDistributionAnalysis() {Reset();} 49 | 50 | //feed a block of data and do distribution analysis (the body is empty in this class) 51 | void HandleData(const char* aBuf, PRUint32 aLen) {} 52 | 53 | //Feed a character with known length 54 | void HandleOneChar(const char* aStr, PRUint32 aCharLen) 55 | { 56 | PRInt32 order; 57 | 58 | //we only care about 2-byte characters in our distribution analysis 59 | order = (aCharLen == 2) ? 
GetOrder(aStr) : -1; 60 | 61 | if (order >= 0) 62 | { 63 | mTotalChars++; 64 | //order is valid 65 | if ((PRUint32)order < mTableSize) 66 | { 67 | if (512 > mCharToFreqOrder[order]) 68 | mFreqChars++; 69 | } 70 | } 71 | } 72 | 73 | //return confidence based on existing data 74 | float GetConfidence(); 75 | 76 | //Reset analyser, clear any state 77 | void Reset(void) 78 | { 79 | mDone = PR_FALSE; 80 | mTotalChars = 0; 81 | mFreqChars = 0; 82 | } 83 | 84 | //This function is for future extension. Caller can use this function to control 85 | //analyser's behavior 86 | void SetOpion(){} 87 | 88 | //It is not necessary to receive all data to draw a conclusion. For charset detection, 89 | // a certain amount of data is enough 90 | PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;} 91 | 92 | protected: 93 | //we do not handle a character based on its original encoding string, but 94 | //convert this encoding string to a number, here called order. 95 | //This allows multiple encodings of a language to share one frequency table 96 | virtual PRInt32 GetOrder(const char* str) {return -1;} 97 | 98 | //If this flag is set to PR_TRUE, detection is done and conclusion has been made 99 | PRBool mDone; 100 | 101 | //The number of characters whose frequency order is less than 512 102 | PRUint32 mFreqChars; 103 | 104 | //Total characters encountered. 105 | PRUint32 mTotalChars; 106 | 107 | //Mapping table to get frequency order from char order (get from GetOrder()) 108 | const PRInt16 *mCharToFreqOrder; 109 | 110 | //Size of above table 111 | PRUint32 mTableSize; 112 | 113 | //This is a constant value that varies from language to language, it is used in 114 | //calculating confidence. See my paper for further detail. 
115 | float mTypicalDistributionRatio; 116 | }; 117 | 118 | //Each subclass below supplies GetOrder() for one encoding; its constructor is defined in CharDistribution.cpp 119 | class EUCTWDistributionAnalysis: public CharDistributionAnalysis 120 | { 121 | public: 122 | EUCTWDistributionAnalysis(); 123 | protected: 124 | 125 | //for euc-TW encoding, we are interested 126 | // first byte range: 0xc4 -- 0xfe 127 | // second byte range: 0xa1 -- 0xfe 128 | //no validation needed here. State machine has done that 129 | PRInt32 GetOrder(const char* str) 130 | { if ((unsigned char)*str >= (unsigned char)0xc4) 131 | return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1; 132 | else 133 | return -1; 134 | } 135 | }; 136 | 137 | 138 | class EUCKRDistributionAnalysis : public CharDistributionAnalysis 139 | { 140 | public: 141 | EUCKRDistributionAnalysis(); 142 | protected: 143 | //for euc-KR encoding, we are interested 144 | // first byte range: 0xb0 -- 0xfe 145 | // second byte range: 0xa1 -- 0xfe 146 | //no validation needed here. State machine has done that 147 | PRInt32 GetOrder(const char* str) 148 | { if ((unsigned char)*str >= (unsigned char)0xb0) 149 | return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 150 | else 151 | return -1; 152 | } 153 | }; 154 | 155 | class GB2312DistributionAnalysis : public CharDistributionAnalysis 156 | { 157 | public: 158 | GB2312DistributionAnalysis(); 159 | protected: 160 | //for GB2312 encoding, we are interested 161 | // first byte range: 0xb0 -- 0xfe 162 | // second byte range: 0xa1 -- 0xfe 163 | //no validation needed here. 
State machine has done that 164 | PRInt32 GetOrder(const char* str) 165 | { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) 166 | return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 167 | else 168 | return -1; 169 | } 170 | }; 171 | 172 | 173 | class Big5DistributionAnalysis : public CharDistributionAnalysis 174 | { 175 | public: 176 | Big5DistributionAnalysis(); 177 | protected: 178 | //for big5 encoding, we are interested 179 | // first byte range: 0xa4 -- 0xfe 180 | // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe 181 | //no validation needed here. State machine has done that 182 | PRInt32 GetOrder(const char* str) 183 | { if ((unsigned char)*str >= (unsigned char)0xa4) 184 | if ((unsigned char)str[1] >= (unsigned char)0xa1) 185 | return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63; 186 | else 187 | return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; 188 | else 189 | return -1; 190 | } 191 | }; 192 | 193 | class SJISDistributionAnalysis : public CharDistributionAnalysis 194 | { 195 | public: 196 | SJISDistributionAnalysis(); 197 | protected: 198 | //for sjis encoding, we are interested 199 | // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe (NOTE(review): GetOrder below only accepts 0xe0 -- 0xef for the second range) 200 | // second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe 201 | //no validation needed here. 
State machine has done that 202 | PRInt32 GetOrder(const char* str) 203 | { 204 | PRInt32 order; 205 | if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) 206 | order = 188 * ((unsigned char)str[0]-(unsigned char)0x81); 207 | else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) 208 | order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31); 209 | else 210 | return -1; 211 | order += (unsigned char)*(str+1) - 0x40; 212 | if ((unsigned char)str[1] > (unsigned char)0x7f) 213 | order--; 214 | return order; 215 | } 216 | }; 217 | //EUC-JP analyser; its constructor in CharDistribution.cpp uses the same JIS frequency table as the SJIS analyser 218 | class EUCJPDistributionAnalysis : public CharDistributionAnalysis 219 | { 220 | public: 221 | EUCJPDistributionAnalysis(); 222 | protected: 223 | //for euc-JP encoding, we are interested 224 | // first byte range: 0xa0 -- 0xfe 225 | // second byte range: 0xa1 -- 0xfe 226 | //no validation needed here. State machine has done that 227 | PRInt32 GetOrder(const char* str) 228 | { if ((unsigned char)*str >= (unsigned char)0xa0) 229 | return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; 230 | else 231 | return -1; 232 | } 233 | }; 234 | 235 | #endif //CharDistribution_h__ 236 | 237 | -------------------------------------------------------------------------------- /universalchardet/JpCntx.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Communicator client code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #ifndef __JPCNTX_H__ 39 | #define __JPCNTX_H__ 40 | 41 | #define NUM_OF_CATEGORY 6 42 | 43 | #include "nscore.h" 44 | 45 | #define ENOUGH_REL_THRESHOLD 100 46 | #define MAX_REL_THRESHOLD 1000 47 | 48 | //hiragana frequency category table 49 | extern char jp2CharContext[83][83]; 50 | //Counts pairs of consecutive 2-byte characters, bucketed by the category jp2CharContext assigns to the pair of orders 51 | class JapaneseContextAnalysis 52 | { 53 | public: 54 | JapaneseContextAnalysis() {Reset();} 55 | 56 | void HandleData(const char* aBuf, PRUint32 aLen); 57 | //Feed one character of known length; once more than MAX_REL_THRESHOLD pairs have been counted the analyser marks itself done and ignores further input 58 | void HandleOneChar(const char* aStr, PRUint32 aCharLen) 59 | { 60 | PRInt32 order; 61 | 62 | //if we received enough data, stop here 63 | if (mTotalRel > MAX_REL_THRESHOLD) mDone = PR_TRUE; 64 | if (mDone) return; 65 | 66 | //Only 2-byte characters are of interest to us 67 | order = (aCharLen == 2) ? GetOrder(aStr) : -1; 68 | if (order != -1 && mLastCharOrder != -1) 69 | { 70 | mTotalRel++; 71 | //count this sequence to its category counter 72 | mRelSample[jp2CharContext[mLastCharOrder][order]]++; 73 | } 74 | mLastCharOrder = order; 75 | } 76 | 77 | float GetConfidence(); 78 | void Reset(void); 79 | void SetOpion(){} 80 | PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;} 81 | 82 | protected: 83 | virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0; 84 | virtual PRInt32 GetOrder(const char* str) = 0; 85 | 86 | //category counters, each integer counts sequences in its category 87 | PRUint32 mRelSample[NUM_OF_CATEGORY]; 88 | 89 | //total sequences received 90 | PRUint32 mTotalRel; 91 | 92 | //The order of previous char 93 | PRInt32 mLastCharOrder; 94 | 95 | //if last byte in current buffer is not the last byte of a character, we 96 | //need to know how many bytes to skip in next buffer. 
97 | PRUint32 mNeedToSkipCharNum; 98 | 99 | //If this flag is set to PR_TRUE, detection is done and conclusion has been made 100 | PRBool mDone; 101 | }; 102 | 103 | 104 | class SJISContextAnalysis : public JapaneseContextAnalysis 105 | { 106 | //SJISContextAnalysis(){}; 107 | protected: 108 | PRInt32 GetOrder(const char* str, PRUint32 *charLen); 109 | 110 | PRInt32 GetOrder(const char* str) 111 | { 112 | //We are only interested in Hiragana, so first byte is '\202' 113 | if (*str == '\202' && 114 | (unsigned char)*(str+1) >= (unsigned char)0x9f && 115 | (unsigned char)*(str+1) <= (unsigned char)0xf1) 116 | return (unsigned char)*(str+1) - (unsigned char)0x9f; 117 | return -1; 118 | } 119 | }; 120 | 121 | class EUCJPContextAnalysis : public JapaneseContextAnalysis 122 | { 123 | protected: 124 | PRInt32 GetOrder(const char* str, PRUint32 *charLen); 125 | PRInt32 GetOrder(const char* str) 126 | //We are only interested in Hiragana, so first byte is '\244' 127 | { 128 | if (*str == '\244' && 129 | (unsigned char)*(str+1) >= (unsigned char)0xa1 && 130 | (unsigned char)*(str+1) <= (unsigned char)0xf3) 131 | return (unsigned char)*(str+1) - (unsigned char)0xa1; 132 | return -1; 133 | } 134 | }; 135 | 136 | #endif /* __JPCNTX_H__ */ 137 | 138 | -------------------------------------------------------------------------------- /universalchardet/kludge.c: -------------------------------------------------------------------------------- 1 | #include "prmem.h" // nose core nose core nose core nose core 2 | // Thin wrappers that forward the PR_* allocation functions to malloc/calloc/realloc/free 3 | PR_IMPLEMENT(void *) PR_Malloc(PRUint32 size) 4 | { 5 | return malloc(size); 6 | } 7 | 8 | PR_IMPLEMENT(void *) PR_Calloc(PRUint32 nelem, PRUint32 elsize) 9 | { 10 | return calloc(nelem, elsize); 11 | } 12 | 13 | PR_IMPLEMENT(void *) PR_Realloc(void *ptr, PRUint32 size) 14 | { 15 | return realloc(ptr, size); 16 | } 17 | 18 | PR_IMPLEMENT(void) PR_Free(void *ptr) 19 | { 20 | free(ptr); 21 | } 22 | 
-------------------------------------------------------------------------------- 
/universalchardet/nsBig5Prober.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #include "nsBig5Prober.h" 39 | //Return the prober to its initial state: state machine reset, eDetecting, distribution counters cleared 40 | void nsBig5Prober::Reset(void) 41 | { 42 | mCodingSM->Reset(); 43 | mState = eDetecting; 44 | mDistributionAnalyser.Reset(); 45 | } 46 | //Run aLen bytes of aBuf through the coding state machine; each time it returns to eStart a character is complete and is handed to the distribution analyser. Returns the resulting mState. 47 | nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen) 48 | { 49 | nsSMState codingState; 50 | 51 | for (PRUint32 i = 0; i < aLen; i++) 52 | { 53 | codingState = mCodingSM->NextState(aBuf[i]); 54 | if (codingState == eItsMe) 55 | { 56 | mState = eFoundIt; 57 | break; 58 | } 59 | if (codingState == eStart) 60 | { 61 | PRUint32 charLen = mCodingSM->GetCurrentCharLen(); 62 | //a character that ends on the first byte of this buffer began in the previous one: pair that byte with the saved mLastChar[0] 63 | if (i == 0) 64 | { 65 | mLastChar[1] = aBuf[0]; 66 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 67 | } 68 | else 69 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 70 | } 71 | } 72 | //remember the final byte for the next call; an empty buffer has none, and aBuf[aLen-1] would index out of bounds because aLen is unsigned 73 | if (aLen > 0) mLastChar[0] = aBuf[aLen-1]; 74 | 75 | if (mState == eDetecting) 76 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 77 | mState = eFoundIt; 78 | 79 | return mState; 80 | } 81 | //Confidence is exactly what the distribution analyser reports 82 | float nsBig5Prober::GetConfidence(void) 83 | { 84 | float distribCf = mDistributionAnalyser.GetConfidence(); 85 | 86 | return (float)distribCf; 87 | } 88 | 89 | -------------------------------------------------------------------------------- /universalchardet/nsBig5Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #ifndef nsBig5Prober_h__ 39 | #define nsBig5Prober_h__ 40 | 41 | #include "nsCharSetProber.h" 42 | #include "nsCodingStateMachine.h" 43 | #include "CharDistribution.h" 44 | //Prober reporting charset "Big5": a coding state machine built from Big5SMModel plus a Big5 character-distribution analyser 45 | class nsBig5Prober: public nsCharSetProber { 46 | public: 47 | nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel); 48 | Reset();} 49 | virtual ~nsBig5Prober(void){delete mCodingSM;} 50 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 51 | const char* GetCharSetName() {return "Big5";} 52 | nsProbingState GetState(void) {return mState;} 53 | void Reset(void); 54 | float GetConfidence(void); 55 | void SetOpion() {} 56 | 57 | protected: 58 | void GetDistribution(PRUint32 aCharLen, const char* aStr); 59 | //state machine allocated in the constructor and deleted in the destructor 60 | nsCodingStateMachine* mCodingSM; 61 | nsProbingState mState; 62 | 63 | //Big5ContextAnalysis mContextAnalyser; 64 | Big5DistributionAnalysis mDistributionAnalyser; 65 | char mLastChar[2]; 66 | //mLastChar[0] is set by HandleData to the final byte of the buffer it was given; mLastChar[1] receives the first byte of the next buffer 67 | }; 68 | 69 | 70 | #endif /* nsBig5Prober_h__ */ 71 | 72 | -------------------------------------------------------------------------------- /universalchardet/nsCharSetProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Universal charset detector code. 
16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 2001 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * Shy Shalom 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | #include "nsCharSetProber.h" 40 | #include "prmem.h" 41 | //Both filters below allocate aLen bytes for *newBuf; the output never exceeds the input because each kept segment is followed by one space written in place of its delimiter 42 | //This filter applies to all scripts which do not use English characters 43 | PRBool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen) 44 | { 45 | char *newptr; 46 | char *prevPtr, *curPtr; 47 | //NOTE(review): for aLen == 0 this requests a zero-byte allocation; if PR_Malloc returns NULL for that, the function reports failure - confirm callers never pass an empty buffer 48 | PRBool meetMSB = PR_FALSE; 49 | newptr = *newBuf = (char*)PR_Malloc(aLen); 50 | if (!newptr) 51 | return PR_FALSE; 52 | //meetMSB records whether the current segment contains a byte with the high bit set 53 | for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) 54 | { 55 | if (*curPtr & 0x80) 56 | { 57 | meetMSB = PR_TRUE; 58 | } 59 | else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') 60 | { 61 | //current char is a symbol, most likely a punctuation. 
we treat it as segment delimiter 62 | if (meetMSB && curPtr > prevPtr) 63 | //this segment contains more than a single symbol, and it has upper ASCII, we need to keep it 64 | { 65 | while (prevPtr < curPtr) *newptr++ = *prevPtr++; 66 | prevPtr++; 67 | *newptr++ = ' '; 68 | meetMSB = PR_FALSE; 69 | } 70 | else //ignore current segment. (either because it is just a symbol or just an English word) 71 | prevPtr = curPtr+1; 72 | } 73 | } 74 | if (meetMSB && curPtr > prevPtr) 75 | while (prevPtr < curPtr) *newptr++ = *prevPtr++; 76 | //newLen is the number of bytes written to *newBuf 77 | newLen = (PRUint32)(newptr - *newBuf); 78 | 79 | return PR_TRUE; 80 | } 81 | 82 | //This filter applies to all scripts which contain both English characters and upper ASCII characters. 83 | PRBool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen) 84 | { 85 | //do filtering to reduce load to probers 86 | char *newptr; 87 | char *prevPtr, *curPtr; 88 | PRBool isInTag = PR_FALSE; 89 | 90 | newptr = *newBuf = (char*)PR_Malloc(aLen); 91 | if (!newptr) 92 | return PR_FALSE; 93 | 94 | for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) 95 | { 96 | if (*curPtr == '>') 97 | isInTag = PR_FALSE; 98 | else if (*curPtr == '<') 99 | isInTag = PR_TRUE; 100 | //NOTE(review): isInTag is already cleared when the '>' itself is handled as a delimiter below, so the segment that ends at '>' is kept - confirm this is intended 101 | if (!(*curPtr & 0x80) && 102 | (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') ) 103 | { 104 | if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol 105 | // and it is not inside a tag, keep it. 106 | { 107 | while (prevPtr < curPtr) *newptr++ = *prevPtr++; 108 | prevPtr++; 109 | *newptr++ = ' '; 110 | } 111 | else 112 | prevPtr = curPtr+1; 113 | } 114 | } 115 | 116 | // If the current segment contains more than just a symbol 117 | // and it is not inside a tag then keep it. 
118 | if (!isInTag) 119 | while (prevPtr < curPtr) 120 | *newptr++ = *prevPtr++; 121 | //newLen is the number of bytes written to *newBuf 122 | newLen = (PRUint32)(newptr - *newBuf); 123 | 124 | return PR_TRUE; 125 | } 126 | -------------------------------------------------------------------------------- /universalchardet/nsCharSetProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Universal charset detector code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 2001 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * Shy Shalom 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. 
If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | #ifndef nsCharSetProber_h__ 39 | #define nsCharSetProber_h__ 40 | 41 | #include "nscore.h" 42 | 43 | //#define DEBUG_chardet // Uncomment this for debug dump. 44 | 45 | typedef enum { 46 | eDetecting = 0, //We are still detecting, no sure answer yet, but caller can ask for confidence. 47 | eFoundIt = 1, //That's a positive answer 48 | eNotMe = 2 //Negative answer 49 | } nsProbingState; 50 | //Confidence cut-off for an early positive answer, e.g. nsBig5Prober::HandleData switches to eFoundIt once it has enough data and its confidence exceeds this 51 | #define SHORTCUT_THRESHOLD (float)0.95 52 | //Abstract interface implemented by each charset prober 53 | class nsCharSetProber { 54 | public: 55 | virtual ~nsCharSetProber() {} 56 | virtual const char* GetCharSetName() = 0; 57 | virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0; 58 | virtual nsProbingState GetState(void) = 0; 59 | virtual void Reset(void) = 0; 60 | virtual float GetConfidence(void) = 0; 61 | virtual void SetOpion() = 0; 62 | 63 | #ifdef DEBUG_chardet 64 | virtual void DumpStatus() {}; 65 | #endif 66 | 67 | // Helper functions used in the Latin1 and Group probers. 68 | // Both functions allocate a new buffer for newBuf. This buffer should be 69 | // freed by the caller using PR_FREEIF. 70 | // Both functions return PR_FALSE in case of memory allocation failure. 
71 | static PRBool FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen); 72 | static PRBool FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen); 73 | 74 | }; 75 | 76 | #endif /* nsCharSetProber_h__ */ 77 | -------------------------------------------------------------------------------- /universalchardet/nsCodingStateMachine.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. 
If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | #ifndef nsCodingStateMachine_h__ 38 | #define nsCodingStateMachine_h__ 39 | 40 | #include "nsPkgInt.h" 41 | 42 | typedef enum { 43 | eStart = 0, 44 | eError = 1, 45 | eItsMe = 2 46 | } nsSMState; 47 | 48 | #define GETCLASS(c) GETFROMPCK(((unsigned char)(c)), mModel->classTable) 49 | 50 | //state machine model 51 | typedef struct 52 | { 53 | nsPkgInt classTable; 54 | PRUint32 classFactor; 55 | nsPkgInt stateTable; 56 | const PRUint32* charLenTable; 57 | const char* name; 58 | } SMModel; 59 | 60 | class nsCodingStateMachine { 61 | public: 62 | nsCodingStateMachine(SMModel* sm){ 63 | mCurrentState = eStart; 64 | mModel = sm; 65 | } 66 | nsSMState NextState(char c){ 67 | //for each byte we get its class , if it is first byte, we also get byte length 68 | PRUint32 byteCls = GETCLASS(c); 69 | if (mCurrentState == eStart) 70 | { 71 | mCurrentBytePos = 0; 72 | mCurrentCharLen = mModel->charLenTable[byteCls]; 73 | } 74 | //from byte's class and stateTable, we get its next state 75 | mCurrentState=(nsSMState)GETFROMPCK(mCurrentState*(mModel->classFactor)+byteCls, 76 | mModel->stateTable); 77 | mCurrentBytePos++; 78 | return mCurrentState; 79 | } 80 | PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;} 81 | void Reset(void) {mCurrentState = eStart;} 82 | const char * GetCodingStateMachine() {return mModel->name;} 83 | 84 | protected: 85 | nsSMState mCurrentState; 86 | PRUint32 mCurrentCharLen; 87 | 
PRUint32 mCurrentBytePos; 88 | 89 | SMModel *mModel; 90 | }; 91 | 92 | extern SMModel UTF8SMModel; 93 | extern SMModel Big5SMModel; 94 | extern SMModel EUCJPSMModel; 95 | extern SMModel EUCKRSMModel; 96 | extern SMModel EUCTWSMModel; 97 | extern SMModel GB18030SMModel; 98 | extern SMModel SJISSMModel; 99 | extern SMModel UCS2BESMModel; 100 | 101 | 102 | extern SMModel HZSMModel; 103 | extern SMModel ISO2022CNSMModel; 104 | extern SMModel ISO2022JPSMModel; 105 | extern SMModel ISO2022KRSMModel; 106 | 107 | #endif /* nsCodingStateMachine_h__ */ 108 | 109 | -------------------------------------------------------------------------------- /universalchardet/nsEUCJPProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 
21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | // for japanese encoding, obeserve characteristic: 39 | // 1, kana character (or hankaku?) 
often have hight frequency of appereance 40 | // 2, kana character often exist in group 41 | // 3, certain combination of kana is never used in japanese language 42 | 43 | #include "nsEUCJPProber.h" 44 | 45 | void nsEUCJPProber::Reset(void) 46 | { 47 | mCodingSM->Reset(); 48 | mState = eDetecting; 49 | mContextAnalyser.Reset(); 50 | mDistributionAnalyser.Reset(); 51 | } 52 | 53 | nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen) 54 | { 55 | nsSMState codingState; 56 | 57 | for (PRUint32 i = 0; i < aLen; i++) 58 | { 59 | codingState = mCodingSM->NextState(aBuf[i]); 60 | if (codingState == eItsMe) 61 | { 62 | mState = eFoundIt; 63 | break; 64 | } 65 | if (codingState == eStart) 66 | { 67 | PRUint32 charLen = mCodingSM->GetCurrentCharLen(); 68 | 69 | if (i == 0) 70 | { 71 | mLastChar[1] = aBuf[0]; 72 | mContextAnalyser.HandleOneChar(mLastChar, charLen); 73 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 74 | } 75 | else 76 | { 77 | mContextAnalyser.HandleOneChar(aBuf+i-1, charLen); 78 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 79 | } 80 | } 81 | } 82 | 83 | mLastChar[0] = aBuf[aLen-1]; 84 | 85 | if (mState == eDetecting) 86 | if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 87 | mState = eFoundIt; 88 | 89 | return mState; 90 | } 91 | 92 | float nsEUCJPProber::GetConfidence(void) 93 | { 94 | float contxtCf = mContextAnalyser.GetConfidence(); 95 | float distribCf = mDistributionAnalyser.GetConfidence(); 96 | 97 | return (contxtCf > distribCf ? 
contxtCf : distribCf); 98 | } 99 | 100 | -------------------------------------------------------------------------------- /universalchardet/nsEUCJPProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. 
If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | // for S-JIS encoding, obeserve characteristic: 39 | // 1, kana character (or hankaku?) often have hight frequency of appereance 40 | // 2, kana character often exist in group 41 | // 3, certain combination of kana is never used in japanese language 42 | 43 | #ifndef nsEUCJPProber_h__ 44 | #define nsEUCJPProber_h__ 45 | 46 | #include "nsCharSetProber.h" 47 | #include "nsCodingStateMachine.h" 48 | #include "JpCntx.h" 49 | #include "CharDistribution.h" 50 | 51 | class nsEUCJPProber: public nsCharSetProber { 52 | public: 53 | nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel); 54 | Reset();} 55 | virtual ~nsEUCJPProber(void){delete mCodingSM;} 56 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 57 | const char* GetCharSetName() {return "EUC-JP";} 58 | nsProbingState GetState(void) {return mState;} 59 | void Reset(void); 60 | float GetConfidence(void); 61 | void SetOpion() {} 62 | 63 | protected: 64 | nsCodingStateMachine* mCodingSM; 65 | nsProbingState mState; 66 | 67 | EUCJPContextAnalysis mContextAnalyser; 68 | EUCJPDistributionAnalysis mDistributionAnalyser; 69 | 70 | char mLastChar[2]; 71 | }; 72 | 73 | 74 | #endif /* nsEUCJPProber_h__ */ 75 | 76 | -------------------------------------------------------------------------------- /universalchardet/nsEUCKRProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #include "nsEUCKRProber.h" 39 | 40 | void nsEUCKRProber::Reset(void) 41 | { 42 | mCodingSM->Reset(); 43 | mState = eDetecting; 44 | mDistributionAnalyser.Reset(); 45 | //mContextAnalyser.Reset(); 46 | } 47 | 48 | nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen) 49 | { 50 | nsSMState codingState; 51 | 52 | for (PRUint32 i = 0; i < aLen; i++) 53 | { 54 | codingState = mCodingSM->NextState(aBuf[i]); 55 | if (codingState == eItsMe) 56 | { 57 | mState = eFoundIt; 58 | break; 59 | } 60 | if (codingState == eStart) 61 | { 62 | PRUint32 charLen = mCodingSM->GetCurrentCharLen(); 63 | 64 | if (i == 0) 65 | { 66 | mLastChar[1] = aBuf[0]; 67 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 68 | } 69 | else 70 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 71 | } 72 | } 73 | 74 | mLastChar[0] = aBuf[aLen-1]; 75 | 76 | if (mState == eDetecting) 77 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 78 | mState = eFoundIt; 79 | // else 80 | // mDistributionAnalyser.HandleData(aBuf, aLen); 81 | 82 | return mState; 83 | } 84 | 85 | float nsEUCKRProber::GetConfidence(void) 86 | { 87 | float distribCf = mDistributionAnalyser.GetConfidence(); 88 | 89 | return (float)distribCf; 90 | } 91 | 92 | -------------------------------------------------------------------------------- /universalchardet/nsEUCKRProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #ifndef nsEUCKRProber_h__ 39 | #define nsEUCKRProber_h__ 40 | 41 | #include "nsCharSetProber.h" 42 | #include "nsCodingStateMachine.h" 43 | #include "CharDistribution.h" 44 | 45 | class nsEUCKRProber: public nsCharSetProber { 46 | public: 47 | nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel); 48 | Reset();} 49 | virtual ~nsEUCKRProber(void){delete mCodingSM;} 50 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 51 | const char* GetCharSetName() {return "EUC-KR";} 52 | nsProbingState GetState(void) {return mState;} 53 | void Reset(void); 54 | float GetConfidence(void); 55 | void SetOpion() {} 56 | 57 | protected: 58 | void GetDistribution(PRUint32 aCharLen, const char* aStr); 59 | 60 | nsCodingStateMachine* mCodingSM; 61 | nsProbingState mState; 62 | 63 | //EUCKRContextAnalysis mContextAnalyser; 64 | EUCKRDistributionAnalysis mDistributionAnalyser; 65 | char mLastChar[2]; 66 | 67 | }; 68 | 69 | 70 | #endif /* nsEUCKRProber_h__ */ 71 | 72 | -------------------------------------------------------------------------------- /universalchardet/nsEUCTWProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 
16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #include "nsEUCTWProber.h" 39 | 40 | void nsEUCTWProber::Reset(void) 41 | { 42 | mCodingSM->Reset(); 43 | mState = eDetecting; 44 | mDistributionAnalyser.Reset(); 45 | //mContextAnalyser.Reset(); 46 | } 47 | 48 | nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen) 49 | { 50 | nsSMState codingState; 51 | 52 | for (PRUint32 i = 0; i < aLen; i++) 53 | { 54 | codingState = mCodingSM->NextState(aBuf[i]); 55 | if (codingState == eItsMe) 56 | { 57 | mState = eFoundIt; 58 | break; 59 | } 60 | if (codingState == eStart) 61 | { 62 | PRUint32 charLen = mCodingSM->GetCurrentCharLen(); 63 | 64 | if (i == 0) 65 | { 66 | mLastChar[1] = aBuf[0]; 67 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 68 | } 69 | else 70 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 71 | } 72 | } 73 | 74 | mLastChar[0] = aBuf[aLen-1]; 75 | 76 | if (mState == eDetecting) 77 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 78 | mState = eFoundIt; 79 | // else 80 | // mDistributionAnalyser.HandleData(aBuf, aLen); 81 | 82 | return mState; 83 | } 84 | 85 | float nsEUCTWProber::GetConfidence(void) 86 | { 87 | float distribCf = mDistributionAnalyser.GetConfidence(); 88 | 89 | return (float)distribCf; 90 | } 91 | 92 | -------------------------------------------------------------------------------- /universalchardet/nsEUCTWProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #ifndef nsEUCTWProber_h__ 39 | #define nsEUCTWProber_h__ 40 | 41 | #include "nsCharSetProber.h" 42 | #include "nsCodingStateMachine.h" 43 | #include "CharDistribution.h" 44 | 45 | class nsEUCTWProber: public nsCharSetProber { 46 | public: 47 | nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel); 48 | Reset();} 49 | virtual ~nsEUCTWProber(void){delete mCodingSM;} 50 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 51 | const char* GetCharSetName() {return "x-euc-tw";} 52 | nsProbingState GetState(void) {return mState;} 53 | void Reset(void); 54 | float GetConfidence(void); 55 | void SetOpion() {} 56 | 57 | protected: 58 | void GetDistribution(PRUint32 aCharLen, const char* aStr); 59 | 60 | nsCodingStateMachine* mCodingSM; 61 | nsProbingState mState; 62 | 63 | //EUCTWContextAnalysis mContextAnalyser; 64 | EUCTWDistributionAnalysis mDistributionAnalyser; 65 | char mLastChar[2]; 66 | 67 | }; 68 | 69 | 70 | #endif /* nsEUCTWProber_h__ */ 71 | 72 | -------------------------------------------------------------------------------- /universalchardet/nsEscCharsetProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 
16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | 39 | #include "nsEscCharsetProber.h" 40 | #include "nsUniversalDetector.h" 41 | 42 | nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter) 43 | { 44 | for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++) 45 | mCodingSM[i] = nsnull; 46 | if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED) 47 | { 48 | mCodingSM[0] = new nsCodingStateMachine(&HZSMModel); 49 | mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel); 50 | } 51 | if (aLanguageFilter & NS_FILTER_JAPANESE) 52 | mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel); 53 | if (aLanguageFilter & NS_FILTER_KOREAN) 54 | mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel); 55 | mActiveSM = NUM_OF_ESC_CHARSETS; 56 | mState = eDetecting; 57 | mDetectedCharset = nsnull; 58 | } 59 | 60 | nsEscCharSetProber::~nsEscCharSetProber(void) 61 | { 62 | for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++) 63 | delete mCodingSM[i]; 64 | } 65 | 66 | void nsEscCharSetProber::Reset(void) 67 | { 68 | mState = eDetecting; 69 | for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++) 70 | if (mCodingSM[i]) 71 | mCodingSM[i]->Reset(); 72 | mActiveSM = NUM_OF_ESC_CHARSETS; 73 | mDetectedCharset = nsnull; 74 | } 75 | 76 | nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen) 77 | { 78 | nsSMState codingState; 79 | PRInt32 j; 80 | PRUint32 i; 81 | 82 | for ( i = 0; i < aLen && mState == eDetecting; i++) 83 | { 84 | for (j = mActiveSM-1; j>= 0; j--) 85 | { 86 | if (mCodingSM[j]) 87 | { 88 | codingState = mCodingSM[j]->NextState(aBuf[i]); 89 | if (codingState == eItsMe) 90 | { 91 | mState = eFoundIt; 92 | mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); 93 | return mState; 94 | } 95 | } 96 | } 97 | } 98 | 99 | return mState; 100 | } 101 | 102 | -------------------------------------------------------------------------------- /universalchardet/nsEscCharsetProber.h: 
-------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #ifndef nsEscCharSetProber_h__ 39 | #define nsEscCharSetProber_h__ 40 | 41 | #include "nsCharSetProber.h" 42 | #include "nsCodingStateMachine.h" 43 | 44 | #define NUM_OF_ESC_CHARSETS 4 45 | 46 | class nsEscCharSetProber: public nsCharSetProber { 47 | public: 48 | nsEscCharSetProber(PRUint32 aLanguageFilter); 49 | virtual ~nsEscCharSetProber(void); 50 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 51 | const char* GetCharSetName() {return mDetectedCharset;} 52 | nsProbingState GetState(void) {return mState;} 53 | void Reset(void); 54 | float GetConfidence(void){return (float)0.99;} 55 | void SetOpion() {} 56 | 57 | protected: 58 | void GetDistribution(PRUint32 aCharLen, const char* aStr); 59 | 60 | nsCodingStateMachine* mCodingSM[NUM_OF_ESC_CHARSETS] ; 61 | PRUint32 mActiveSM; 62 | nsProbingState mState; 63 | const char * mDetectedCharset; 64 | }; 65 | 66 | #endif /* nsEscCharSetProber_h__ */ 67 | 68 | -------------------------------------------------------------------------------- /universalchardet/nsEscSM.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 
16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | #include "nsCodingStateMachine.h" 38 | 39 | static PRUint32 HZ_cls[ 256 / 8 ] = { 40 | PCK4BITS(1,0,0,0,0,0,0,0), // 00 - 07 41 | PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f 42 | PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 43 | PCK4BITS(0,0,0,1,0,0,0,0), // 18 - 1f 44 | PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27 45 | PCK4BITS(0,0,0,0,0,0,0,0), // 28 - 2f 46 | PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37 47 | PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f 48 | PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47 49 | PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f 50 | PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57 51 | PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f 52 | PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67 53 | PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f 54 | PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77 55 | PCK4BITS(0,0,0,4,0,5,2,0), // 78 - 7f 56 | PCK4BITS(1,1,1,1,1,1,1,1), // 80 - 87 57 | PCK4BITS(1,1,1,1,1,1,1,1), // 88 - 8f 58 | PCK4BITS(1,1,1,1,1,1,1,1), // 90 - 97 59 | PCK4BITS(1,1,1,1,1,1,1,1), // 98 - 9f 60 | PCK4BITS(1,1,1,1,1,1,1,1), // a0 - a7 61 | PCK4BITS(1,1,1,1,1,1,1,1), // a8 - af 62 | PCK4BITS(1,1,1,1,1,1,1,1), // b0 - b7 63 | PCK4BITS(1,1,1,1,1,1,1,1), // b8 - bf 64 | PCK4BITS(1,1,1,1,1,1,1,1), // c0 - c7 65 | PCK4BITS(1,1,1,1,1,1,1,1), // c8 - cf 66 | PCK4BITS(1,1,1,1,1,1,1,1), // d0 - d7 67 | PCK4BITS(1,1,1,1,1,1,1,1), // d8 - df 68 | PCK4BITS(1,1,1,1,1,1,1,1), // e0 - e7 69 | PCK4BITS(1,1,1,1,1,1,1,1), // e8 - ef 70 | PCK4BITS(1,1,1,1,1,1,1,1), // f0 - f7 71 | PCK4BITS(1,1,1,1,1,1,1,1) // f8 - ff 72 | }; 73 | 74 | 75 | static PRUint32 HZ_st [ 6] = { 76 | PCK4BITS(eStart,eError, 3,eStart,eStart,eStart,eError,eError),//00-07 77 | PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f 78 | PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError),//10-17 79 | PCK4BITS( 5,eError, 6,eError, 5, 5, 4,eError),//18-1f 80 | PCK4BITS( 4,eError, 4, 4, 4,eError, 4,eError),//20-27 81 | PCK4BITS( 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f 82 | }; 83 | 
84 | static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0}; 85 | 86 | SMModel HZSMModel = { 87 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls }, 88 | 6, 89 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st }, 90 | HZCharLenTable, 91 | "HZ-GB-2312", 92 | }; 93 | 94 | 95 | static PRUint32 ISO2022CN_cls [ 256 / 8 ] = { 96 | PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 97 | PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f 98 | PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 99 | PCK4BITS(0,0,0,1,0,0,0,0), // 18 - 1f 100 | PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27 101 | PCK4BITS(0,3,0,0,0,0,0,0), // 28 - 2f 102 | PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37 103 | PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f 104 | PCK4BITS(0,0,0,4,0,0,0,0), // 40 - 47 105 | PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f 106 | PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57 107 | PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f 108 | PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67 109 | PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f 110 | PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77 111 | PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f 112 | PCK4BITS(2,2,2,2,2,2,2,2), // 80 - 87 113 | PCK4BITS(2,2,2,2,2,2,2,2), // 88 - 8f 114 | PCK4BITS(2,2,2,2,2,2,2,2), // 90 - 97 115 | PCK4BITS(2,2,2,2,2,2,2,2), // 98 - 9f 116 | PCK4BITS(2,2,2,2,2,2,2,2), // a0 - a7 117 | PCK4BITS(2,2,2,2,2,2,2,2), // a8 - af 118 | PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 119 | PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf 120 | PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 121 | PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf 122 | PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 123 | PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df 124 | PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7 125 | PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef 126 | PCK4BITS(2,2,2,2,2,2,2,2), // f0 - f7 127 | PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff 128 | }; 129 | 130 | 131 | static PRUint32 ISO2022CN_st [ 8] = { 132 | PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07 133 | PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError),//08-0f 134 | 
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//10-17 135 | PCK4BITS(eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError),//18-1f 136 | PCK4BITS(eError,eError,eError,eItsMe,eError,eError,eError,eError),//20-27 137 | PCK4BITS( 5, 6,eError,eError,eError,eError,eError,eError),//28-2f 138 | PCK4BITS(eError,eError,eError,eItsMe,eError,eError,eError,eError),//30-37 139 | PCK4BITS(eError,eError,eError,eError,eError,eItsMe,eError,eStart) //38-3f 140 | }; 141 | 142 | static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 143 | 144 | SMModel ISO2022CNSMModel = { 145 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls }, 146 | 9, 147 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st }, 148 | ISO2022CNCharLenTable, 149 | "ISO-2022-CN", 150 | }; 151 | 152 | static PRUint32 ISO2022JP_cls [ 256 / 8 ] = { 153 | PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 154 | PCK4BITS(0,0,0,0,0,0,2,2), // 08 - 0f 155 | PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 156 | PCK4BITS(0,0,0,1,0,0,0,0), // 18 - 1f 157 | PCK4BITS(0,0,0,0,7,0,0,0), // 20 - 27 158 | PCK4BITS(3,0,0,0,0,0,0,0), // 28 - 2f 159 | PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37 160 | PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f 161 | PCK4BITS(6,0,4,0,8,0,0,0), // 40 - 47 162 | PCK4BITS(0,9,5,0,0,0,0,0), // 48 - 4f 163 | PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57 164 | PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f 165 | PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67 166 | PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f 167 | PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77 168 | PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f 169 | PCK4BITS(2,2,2,2,2,2,2,2), // 80 - 87 170 | PCK4BITS(2,2,2,2,2,2,2,2), // 88 - 8f 171 | PCK4BITS(2,2,2,2,2,2,2,2), // 90 - 97 172 | PCK4BITS(2,2,2,2,2,2,2,2), // 98 - 9f 173 | PCK4BITS(2,2,2,2,2,2,2,2), // a0 - a7 174 | PCK4BITS(2,2,2,2,2,2,2,2), // a8 - af 175 | PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 176 | PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf 177 | PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 178 | 
PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf 179 | PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 180 | PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df 181 | PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7 182 | PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef 183 | PCK4BITS(2,2,2,2,2,2,2,2), // f0 - f7 184 | PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff 185 | }; 186 | 187 | 188 | static PRUint32 ISO2022JP_st [ 9] = { 189 | PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07 190 | PCK4BITS(eStart,eStart,eError,eError,eError,eError,eError,eError),//08-0f 191 | PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//10-17 192 | PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError),//18-1f 193 | PCK4BITS(eError, 5,eError,eError,eError, 4,eError,eError),//20-27 194 | PCK4BITS(eError,eError,eError, 6,eItsMe,eError,eItsMe,eError),//28-2f 195 | PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//30-37 196 | PCK4BITS(eError,eError,eError,eItsMe,eError,eError,eError,eError),//38-3f 197 | PCK4BITS(eError,eError,eError,eError,eItsMe,eError,eStart,eStart) //40-47 198 | }; 199 | // One char-length entry per byte class: ISO2022JP_cls uses classes 0-9, matching the class factor of 10 below. 200 | static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 201 | 202 | SMModel ISO2022JPSMModel = { 203 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls }, 204 | 10, 205 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st }, 206 | ISO2022JPCharLenTable, 207 | "ISO-2022-JP", 208 | }; 209 | 210 | static PRUint32 ISO2022KR_cls [ 256 / 8 ] = { 211 | PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 212 | PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f 213 | PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 214 | PCK4BITS(0,0,0,1,0,0,0,0), // 18 - 1f 215 | PCK4BITS(0,0,0,0,3,0,0,0), // 20 - 27 216 | PCK4BITS(0,4,0,0,0,0,0,0), // 28 - 2f 217 | PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37 218 | PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f 219 | PCK4BITS(0,0,0,5,0,0,0,0), // 40 - 47 220 | PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f 221 | PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57 222 | 
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f 223 | PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67 224 | PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f 225 | PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77 226 | PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f 227 | PCK4BITS(2,2,2,2,2,2,2,2), // 80 - 87 228 | PCK4BITS(2,2,2,2,2,2,2,2), // 88 - 8f 229 | PCK4BITS(2,2,2,2,2,2,2,2), // 90 - 97 230 | PCK4BITS(2,2,2,2,2,2,2,2), // 98 - 9f 231 | PCK4BITS(2,2,2,2,2,2,2,2), // a0 - a7 232 | PCK4BITS(2,2,2,2,2,2,2,2), // a8 - af 233 | PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 234 | PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf 235 | PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 236 | PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf 237 | PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 238 | PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df 239 | PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7 240 | PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef 241 | PCK4BITS(2,2,2,2,2,2,2,2), // f0 - f7 242 | PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff 243 | }; 244 | 245 | 246 | static PRUint32 ISO2022KR_st [ 5] = { 247 | PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eError,eError),//00-07 248 | PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f 249 | PCK4BITS(eItsMe,eItsMe,eError,eError,eError, 4,eError,eError),//10-17 250 | PCK4BITS(eError,eError,eError,eError, 5,eError,eError,eError),//18-1f 251 | PCK4BITS(eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart) //20-27 252 | }; 253 | 254 | static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0}; 255 | 256 | SMModel ISO2022KRSMModel = { 257 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls }, 258 | 6, 259 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st }, 260 | ISO2022KRCharLenTable, 261 | "ISO-2022-KR", 262 | }; 263 | 264 | -------------------------------------------------------------------------------- /universalchardet/nsGB2312Prober.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; 
c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | // for S-JIS encoding, observe characteristics: 39 | // 1, kana character (or hankaku?) 
often have high frequency of appearance 40 | // 2, kana character often exist in group 41 | // 3, certain combination of kana is never used in japanese language 42 | 43 | #include "nsGB2312Prober.h" 44 | 45 | void nsGB18030Prober::Reset(void) 46 | { 47 | mCodingSM->Reset(); 48 | mState = eDetecting; 49 | mDistributionAnalyser.Reset(); 50 | //mContextAnalyser.Reset(); 51 | } 52 | // Feeds each byte of aBuf to the coding state machine; whenever a character completes (eStart) it is handed to the distribution analyser. Returns the prober state. 53 | nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen) 54 | { 55 | nsSMState codingState; 56 | 57 | for (PRUint32 i = 0; i < aLen; i++) 58 | { 59 | codingState = mCodingSM->NextState(aBuf[i]); 60 | if (codingState == eItsMe) 61 | { 62 | mState = eFoundIt; 63 | break; 64 | } 65 | if (codingState == eStart) 66 | { 67 | PRUint32 charLen = mCodingSM->GetCurrentCharLen(); 68 | 69 | if (i == 0) 70 | { 71 | mLastChar[1] = aBuf[0]; 72 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 73 | } 74 | else 75 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 76 | } 77 | } 78 | // Keep the last byte so a character split across two buffers can be completed on the next call; skip for an empty buffer because aLen is unsigned and aLen-1 would wrap. 79 | if (aLen > 0) mLastChar[0] = aBuf[aLen-1]; 80 | 81 | if (mState == eDetecting) 82 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 83 | mState = eFoundIt; 84 | // else 85 | // mDistributionAnalyser.HandleData(aBuf, aLen); 86 | 87 | return mState; 88 | } 89 | 90 | float nsGB18030Prober::GetConfidence(void) 91 | { 92 | float distribCf = mDistributionAnalyser.GetConfidence(); 93 | 94 | return (float)distribCf; 95 | } 96 | 97 | -------------------------------------------------------------------------------- /universalchardet/nsGB2312Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #ifndef nsGB2312Prober_h__ 39 | #define nsGB2312Prober_h__ 40 | 41 | #include "nsCharSetProber.h" 42 | #include "nsCodingStateMachine.h" 43 | #include "CharDistribution.h" 44 | 45 | // We use gb18030 to replace gb2312, because 18030 is a superset. 
46 | 47 | class nsGB18030Prober: public nsCharSetProber { 48 | public: 49 | nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel); 50 | Reset();} 51 | virtual ~nsGB18030Prober(void){delete mCodingSM;} 52 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 53 | const char* GetCharSetName() {return "gb18030";} 54 | nsProbingState GetState(void) {return mState;} 55 | void Reset(void); 56 | float GetConfidence(void); 57 | void SetOpion() {} 58 | 59 | protected: 60 | void GetDistribution(PRUint32 aCharLen, const char* aStr); 61 | 62 | nsCodingStateMachine* mCodingSM; 63 | nsProbingState mState; 64 | 65 | //GB2312ContextAnalysis mContextAnalyser; 66 | GB2312DistributionAnalysis mDistributionAnalyser; 67 | char mLastChar[2]; 68 | 69 | }; 70 | 71 | 72 | #endif /* nsGB2312Prober_h__ */ 73 | 74 | -------------------------------------------------------------------------------- /universalchardet/nsHebrewProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Universal charset detector code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Shy Shalom 19 | * Portions created by the Initial Developer are Copyright (C) 2005 20 | * the Initial Developer. All Rights Reserved. 
21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #include "nsHebrewProber.h" 39 | #include <stdio.h> 40 | 41 | // windows-1255 / ISO-8859-8 code points of interest 42 | #define FINAL_KAF ('\xea') 43 | #define NORMAL_KAF ('\xeb') 44 | #define FINAL_MEM ('\xed') 45 | #define NORMAL_MEM ('\xee') 46 | #define FINAL_NUN ('\xef') 47 | #define NORMAL_NUN ('\xf0') 48 | #define FINAL_PE ('\xf3') 49 | #define NORMAL_PE ('\xf4') 50 | #define FINAL_TSADI ('\xf5') 51 | #define NORMAL_TSADI ('\xf6') 52 | 53 | // Minimum Visual vs Logical final letter score difference. 54 | // If the difference is below this, don't rely solely on the final letter score distance. 55 | #define MIN_FINAL_CHAR_DISTANCE (5) 56 | 57 | // Minimum Visual vs Logical model score difference. 58 | // If the difference is below this, don't rely at all on the model score distance. 
59 | #define MIN_MODEL_DISTANCE (0.01) 60 | 61 | #define VISUAL_HEBREW_NAME ("ISO-8859-8") 62 | #define LOGICAL_HEBREW_NAME ("windows-1255") 63 | 64 | PRBool nsHebrewProber::isFinal(char c) 65 | { 66 | return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); 67 | } 68 | 69 | PRBool nsHebrewProber::isNonFinal(char c) 70 | { 71 | return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); 72 | // The normal Tsadi is not a good Non-Final letter due to words like 73 | // 'lechotet' (to chat) containing an apostrophe after the tsadi. This 74 | // apostrophe is converted to a space in FilterWithoutEnglishLetters causing 75 | // the Non-Final tsadi to appear at an end of a word even though this is not 76 | // the case in the original text. 77 | // The letters Pe and Kaf rarely display a related behavior of not being a 78 | // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for 79 | // example legally end with a Non-Final Pe or Kaf. However, the benefit of 80 | // these letters as Non-Final letters outweighs the damage since these words 81 | // are quite rare. 82 | } 83 | 84 | /** HandleData 85 | * Final letter analysis for logical-visual decision. 86 | * Look for evidence that the received buffer is either logical Hebrew or 87 | * visual Hebrew. 88 | * The following cases are checked: 89 | * 1) A word longer than 1 letter, ending with a final letter. This is an 90 | * indication that the text is laid out "naturally" since the final letter 91 | * really appears at the end. +1 for logical score. 92 | * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal 93 | * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with 94 | * the Non-Final form of that letter. Exceptions to this rule are mentioned 95 | * above in isNonFinal(). This is an indication that the text is laid out 96 | * backwards. 
+1 for visual score. 97 | * 3) A word longer than 1 letter, starting with a final letter. Final letters 98 | * should not appear at the beginning of a word. This is an indication that 99 | * the text is laid out backwards. +1 for visual score. 100 | * 101 | * The visual score and logical score are accumulated throughout the text and 102 | * are finally checked against each other in GetCharSetName(). 103 | * No checking for final letters in the middle of words is done since that case 104 | * is not an indication for either Logical or Visual text. 105 | * 106 | * The input buffer should not contain any white spaces that are not (' ') 107 | * or any low-ascii punctuation marks. 108 | */ 109 | nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen) 110 | { 111 | // Both model probers say it's not them. No reason to continue. 112 | if (GetState() == eNotMe) 113 | return eNotMe; 114 | 115 | const char *curPtr, *endPtr = aBuf+aLen; 116 | char cur; 117 | 118 | for (curPtr = aBuf; curPtr < endPtr; ++curPtr) 119 | { 120 | cur = *curPtr; 121 | if (cur == ' ') // We stand on a space - a word just ended 122 | { 123 | if (mBeforePrev != ' ') // *(curPtr-2) was not a space so prev is not a 1 letter word 124 | { 125 | if (isFinal(mPrev)) // case (1) [-2:not space][-1:final letter][cur:space] 126 | ++mFinalCharLogicalScore; 127 | else if (isNonFinal(mPrev)) // case (2) [-2:not space][-1:Non-Final letter][cur:space] 128 | ++mFinalCharVisualScore; 129 | } 130 | } 131 | else // Not standing on a space 132 | { 133 | if ((mBeforePrev == ' ') && isFinal(mPrev)) // case (3) [-2:space][-1:final letter][cur:not space] 134 | ++mFinalCharVisualScore; 135 | } 136 | mBeforePrev = mPrev; 137 | mPrev = cur; 138 | } 139 | 140 | // Forever detecting, till the end or until both model probers return eNotMe (handled above). 141 | return eDetecting; 142 | } 143 | 144 | // Make the decision: is it Logical or Visual? 
145 | const char* nsHebrewProber::GetCharSetName() 146 | { 147 | // If the final letter score distance is dominant enough, rely on it. 148 | PRInt32 finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; 149 | if (finalsub >= MIN_FINAL_CHAR_DISTANCE) 150 | return LOGICAL_HEBREW_NAME; 151 | if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) 152 | return VISUAL_HEBREW_NAME; 153 | 154 | // It's not dominant enough, try to rely on the model scores instead. 155 | float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); 156 | if (modelsub > MIN_MODEL_DISTANCE) 157 | return LOGICAL_HEBREW_NAME; 158 | if (modelsub < -(MIN_MODEL_DISTANCE)) 159 | return VISUAL_HEBREW_NAME; 160 | 161 | // Still no good, back to final letter distance, maybe it'll save the day. 162 | if (finalsub < 0) 163 | return VISUAL_HEBREW_NAME; 164 | 165 | // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. 166 | return LOGICAL_HEBREW_NAME; 167 | } 168 | 169 | 170 | void nsHebrewProber::Reset(void) 171 | { 172 | mFinalCharLogicalScore = 0; 173 | mFinalCharVisualScore = 0; 174 | 175 | // mPrev and mBeforePrev are initialized to space in order to simulate a word 176 | // delimiter at the beginning of the data 177 | mPrev = ' '; 178 | mBeforePrev = ' '; 179 | } 180 | 181 | nsProbingState nsHebrewProber::GetState(void) 182 | { 183 | // Remain active as long as any of the model probers are active. 
184 | if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) 185 | return eNotMe; 186 | return eDetecting; 187 | } 188 | 189 | #ifdef DEBUG_chardet 190 | void nsHebrewProber::DumpStatus() 191 | { 192 | printf(" HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore); 193 | } 194 | #endif 195 | -------------------------------------------------------------------------------- /universalchardet/nsHebrewProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Universal charset detector code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Shy Shalom 19 | * Portions created by the Initial Developer are Copyright (C) 2005 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. 
If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #ifndef nsHebrewProber_h__ 39 | #define nsHebrewProber_h__ 40 | 41 | #include "nsSBCharSetProber.h" 42 | 43 | // This prober doesn't actually recognize a language or a charset. 44 | // It is a helper prober for the use of the Hebrew model probers 45 | class nsHebrewProber: public nsCharSetProber 46 | { 47 | public: 48 | nsHebrewProber(void) :mLogicalProb(0), mVisualProb(0) { Reset(); } 49 | 50 | virtual ~nsHebrewProber(void) {} 51 | virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 52 | virtual const char* GetCharSetName(); 53 | virtual void Reset(void); 54 | 55 | virtual nsProbingState GetState(void); 56 | 57 | virtual float GetConfidence(void) { return (float)0.0; } 58 | virtual void SetOpion() {} 59 | 60 | void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb) 61 | { mLogicalProb = logicalPrb; mVisualProb = visualPrb; } 62 | 63 | #ifdef DEBUG_chardet 64 | virtual void DumpStatus(); 65 | #endif 66 | 67 | protected: 68 | static PRBool isFinal(char c); 69 | static PRBool isNonFinal(char c); 70 | 71 | PRInt32 mFinalCharLogicalScore, mFinalCharVisualScore; 72 | 73 | // The two last characters seen in the previous buffer. 74 | char mPrev, mBeforePrev; 75 | 76 | // These probers are owned by the group prober. 
77 | nsCharSetProber *mLogicalProb, *mVisualProb; 78 | }; 79 | 80 | /** 81 | * ** General ideas of the Hebrew charset recognition ** 82 | * 83 | * Four main charsets exist in Hebrew: 84 | * "ISO-8859-8" - Visual Hebrew 85 | * "windows-1255" - Logical Hebrew 86 | * "ISO-8859-8-I" - Logical Hebrew 87 | * "x-mac-hebrew" - ?? Logical Hebrew ?? 88 | * 89 | * Both "ISO" charsets use a completely identical set of code points, whereas 90 | * "windows-1255" and "x-mac-hebrew" are two different proper supersets of 91 | * these code points. windows-1255 defines additional characters in the range 92 | * 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific 93 | * diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6. 94 | * x-mac-hebrew defines similar additional code points but with a different 95 | * mapping. 96 | * 97 | * As far as an average Hebrew text with no diacritics is concerned, all four 98 | * charsets are identical with respect to code points. Meaning that for the 99 | * main Hebrew alphabet, all four map the same values to all 27 Hebrew letters 100 | * (including final letters). 101 | * 102 | * The dominant difference between these charsets is their directionality. 103 | * "Visual" directionality means that the text is ordered as if the renderer is 104 | * not aware of a BIDI rendering algorithm. The renderer sees the text and 105 | * draws it from left to right. The text itself when ordered naturally is read 106 | * backwards. A buffer of Visual Hebrew generally looks like so: 107 | * "[last word of first line spelled backwards] [whole line ordered backwards 108 | * and spelled backwards] [first word of first line spelled backwards] 109 | * [end of line] [last word of second line] ... etc' " 110 | * adding punctuation marks, numbers and English text to visual text is 111 | * naturally also "visual" and from left to right. 
112 | * 113 | * "Logical" directionality means the text is ordered "naturally" according to 114 | * the order it is read. It is the responsibility of the renderer to display 115 | * the text from right to left. A BIDI algorithm is used to place general 116 | * punctuation marks, numbers and English text in the text. 117 | * 118 | * Texts in x-mac-hebrew are almost impossible to find on the Internet. From 119 | * what little evidence I could find, it seems that its general directionality 120 | * is Logical. 121 | * 122 | * To sum up all of the above, the Hebrew probing mechanism knows about two 123 | * charsets: 124 | * Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are 125 | * backwards while line order is natural. For charset recognition purposes 126 | * the line order is unimportant (In fact, for this implementation, even 127 | * word order is unimportant). 128 | * Logical Hebrew - "windows-1255" - normal, naturally ordered text. 129 | * 130 | * "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be 131 | * specifically identified. 132 | * "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew 133 | * that contains special punctuation marks or diacritics is displayed with 134 | * some unconverted characters showing as question marks. This problem might 135 | * be corrected using another model prober for x-mac-hebrew. Due to the fact 136 | * that x-mac-hebrew texts are so rare, writing another model prober isn't 137 | * worth the effort and performance hit. 138 | * 139 | * *** The Prober *** 140 | * 141 | * The prober is divided between two nsSBCharSetProbers and an nsHebrewProber, 142 | * all of which are managed, created, fed data, inquired and deleted by the 143 | * nsSBCSGroupProber. The two nsSBCharSetProbers identify that the text is in 144 | * fact some kind of Hebrew, Logical or Visual. 
The final decision between the 145 | * two is made by the nsHebrewProber by combining final-letter scores 146 | * with the scores of the two nsSBCharSetProbers to produce a final answer. 147 | * 148 | * The nsSBCSGroupProber is responsible for stripping the original text of HTML 149 | * tags, English characters, numbers, low-ASCII punctuation characters, spaces 150 | * and new lines. It reduces any sequence of such characters to a single space. 151 | * The buffer fed to each prober in the SBCS group prober is pure text in 152 | * high-ASCII. 153 | * The two nsSBCharSetProbers (model probers) share the same language model: 154 | * Win1255Model. 155 | * The first nsSBCharSetProber uses the model normally as any other 156 | * nsSBCharSetProber does, to recognize windows-1255, upon which this model was 157 | * built. The second nsSBCharSetProber is told to make the pair-of-letter 158 | * lookup in the language model backwards. This in practice exactly simulates 159 | * a visual Hebrew model using the windows-1255 logical Hebrew model. 160 | * 161 | * The nsHebrewProber does not use any language model. All it does is look for 162 | * final-letter evidence suggesting the text is either logical Hebrew or visual 163 | * Hebrew. Taken separately from the model probers, the results of the nsHebrewProber 164 | * alone are meaningless. nsHebrewProber always returns 0.00 as confidence 165 | * since it never identifies a charset by itself. Instead, the pointer to the 166 | * nsHebrewProber is passed to the model probers as a helper "Name Prober". 167 | * When the Group prober receives a positive identification from any prober, 168 | * it asks for the name of the charset identified. If the prober queried is a 169 | * Hebrew model prober, the model prober forwards the call to the 170 | * nsHebrewProber to make the final decision. In the nsHebrewProber, the 171 | * decision is made according to the final-letter scores it maintains and both 172 | * model probers' scores.
The answer is returned in the form of the name of the 173 | * charset identified, either "windows-1255" or "ISO-8859-8". 174 | * 175 | */ 176 | #endif /* nsHebrewProber_h__ */ 177 | -------------------------------------------------------------------------------- /universalchardet/nsLatin1Prober.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Universal charset detector code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 2001 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * Shy Shalom 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. 
If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | #include "nsLatin1Prober.h" 40 | #include "prmem.h" 41 | #include 42 | 43 | #define UDF 0 // undefined 44 | #define OTH 1 //other 45 | #define ASC 2 // ascii capital letter 46 | #define ASS 3 // ascii small letter 47 | #define ACV 4 // accent capital vowel 48 | #define ACO 5 // accent capital other 49 | #define ASV 6 // accent small vowel 50 | #define ASO 7 // accent small other 51 | #define CLASS_NUM 8 // total classes 52 | 53 | static unsigned char Latin1_CharToClass[] = 54 | { 55 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07 56 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F 57 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17 58 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F 59 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27 60 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F 61 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37 62 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F 63 | OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47 64 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F 65 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57 66 | ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F 67 | OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67 68 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F 69 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77 70 | ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F 71 | OTH, UDF, OTH, ASO, OTH, OTH, 
OTH, OTH, // 80 - 87 72 | OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F 73 | UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97 74 | OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F 75 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7 76 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF 77 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7 78 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF 79 | ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7 80 | ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF 81 | ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7 82 | ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF 83 | ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7 84 | ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF 85 | ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7 86 | ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF 87 | }; 88 | 89 | 90 | /* 0 : illegal 91 | 1 : very unlikely 92 | 2 : normal 93 | 3 : very likely 94 | */ 95 | static unsigned char Latin1ClassModel[] = 96 | { 97 | /* UDF OTH ASC ASS ACV ACO ASV ASO */ 98 | /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0, 99 | /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3, 100 | /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3, 101 | /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3, 102 | /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2, 103 | /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3, 104 | /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3, 105 | /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3, 106 | }; 107 | 108 | void nsLatin1Prober::Reset(void) 109 | { 110 | mState = eDetecting; 111 | mLastCharClass = OTH; 112 | for (int i = 0; i < FREQ_CAT_NUM; i++) 113 | mFreqCounter[i] = 0; 114 | } 115 | 116 | 117 | nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen) 118 | { 119 | char *newBuf1 = 0; 120 | PRUint32 newLen1 = 0; 121 | 122 | if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) { 123 | newBuf1 = (char*)aBuf; 124 | newLen1 = aLen; 125 | } 126 | 127 | unsigned char charClass; 128 | unsigned char freq; 129 | for (PRUint32 i = 0; i < newLen1; i++) 130 | { 131 | charClass = 
Latin1_CharToClass[(unsigned char)newBuf1[i]]; 132 | freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass]; 133 | if (freq == 0) { 134 | mState = eNotMe; 135 | break; 136 | } 137 | mFreqCounter[freq]++; 138 | mLastCharClass = charClass; 139 | } 140 | 141 | if (newBuf1 != aBuf) 142 | PR_FREEIF(newBuf1); 143 | 144 | return mState; 145 | } 146 | 147 | float nsLatin1Prober::GetConfidence(void) 148 | { 149 | if (mState == eNotMe) 150 | return 0.01f; 151 | 152 | float confidence; 153 | PRUint32 total = 0; 154 | for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++) 155 | total += mFreqCounter[i]; 156 | 157 | if(!total) 158 | confidence = 0.0f; 159 | else 160 | { 161 | confidence = mFreqCounter[3]*1.0f / total; 162 | confidence -= mFreqCounter[1]*20.0f/total; 163 | } 164 | 165 | if (confidence < 0.0f) 166 | confidence = 0.0f; 167 | 168 | // lower the confidence of latin1 so that other more accurate detector 169 | // can take priority. 170 | confidence *= 0.50f; 171 | 172 | return confidence; 173 | } 174 | 175 | #ifdef DEBUG_chardet 176 | void nsLatin1Prober::DumpStatus() 177 | { 178 | printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); 179 | } 180 | #endif 181 | 182 | 183 | -------------------------------------------------------------------------------- /universalchardet/nsLatin1Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Universal charset detector code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 2001 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * Shy Shalom 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 
36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | #ifndef nsLatin1Prober_h__ 40 | #define nsLatin1Prober_h__ 41 | 42 | #include "nsCharSetProber.h" 43 | /* Number of likelihood categories counted in mFreqCounter (the class model uses values 0-3). */ 44 | #define FREQ_CAT_NUM 4 45 | /* Prober for Latin text; it always reports the charset name "windows-1252". */ 46 | class nsLatin1Prober: public nsCharSetProber { 47 | public: 48 | nsLatin1Prober(void){Reset();} 49 | virtual ~nsLatin1Prober(void){} 50 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 51 | const char* GetCharSetName() {return "windows-1252";} 52 | nsProbingState GetState(void) {return mState;} 53 | void Reset(void); 54 | float GetConfidence(void); 55 | void SetOpion() {} /* intentionally empty */ 56 | 57 | #ifdef DEBUG_chardet 58 | virtual void DumpStatus(); 59 | #endif 60 | 61 | protected: 62 | 63 | nsProbingState mState; /* current probing state, returned by GetState() */ 64 | char mLastCharClass; /* character class of the previously handled byte */ 65 | PRUint32 mFreqCounter[FREQ_CAT_NUM]; /* number of class pairs seen per likelihood category */ 66 | }; 67 | 68 | 69 | #endif /* nsLatin1Prober_h__ */ 70 | 71 | -------------------------------------------------------------------------------- /universalchardet/nsMBCSGroupProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Universal charset detector code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation.
19 | * Portions created by the Initial Developer are Copyright (C) 2001 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * Shy Shalom 24 | * Proofpoint, Inc. 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 
37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | #include <stdio.h> 40 | 41 | #include "nsMBCSGroupProber.h" 42 | #include "nsUniversalDetector.h" 43 | /* Debug names, listed in the same order as the mProbers slots filled by the constructor. */ 44 | #if defined(DEBUG_chardet) || defined(DEBUG_jgmyers) 45 | const char *ProberName[] = 46 | { 47 | "UTF8", 48 | "SJIS", 49 | "EUCJP", 50 | "GB18030", 51 | "EUCKR", 52 | "Big5", 53 | "EUCTW", 54 | }; 55 | 56 | #endif 57 | /* Always creates the UTF-8 prober (slot 0); the other slots are filled only for the languages selected in aLanguageFilter and stay null otherwise. */ 58 | nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) 59 | { 60 | for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) 61 | mProbers[i] = nsnull; 62 | 63 | mProbers[0] = new nsUTF8Prober(); 64 | if (aLanguageFilter & NS_FILTER_JAPANESE) 65 | { 66 | mProbers[1] = new nsSJISProber(); 67 | mProbers[2] = new nsEUCJPProber(); 68 | } 69 | if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED) 70 | mProbers[3] = new nsGB18030Prober(); 71 | if (aLanguageFilter & NS_FILTER_KOREAN) 72 | mProbers[4] = new nsEUCKRProber(); 73 | if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL) 74 | { 75 | mProbers[5] = new nsBig5Prober(); 76 | mProbers[6] = new nsEUCTWProber(); 77 | } 78 | Reset(); 79 | } 80 | 81 | nsMBCSGroupProber::~nsMBCSGroupProber() 82 | { 83 | for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) 84 | { 85 | delete mProbers[i]; 86 | } 87 | } 88 | /* Returns the charset name of the best-guess prober; when no guess exists even after GetConfidence(), falls back to slot 0 (the UTF-8 prober). */ 89 | const char* nsMBCSGroupProber::GetCharSetName() 90 | { 91 | if (mBestGuess == -1) 92 | { 93 | GetConfidence(); 94 | if (mBestGuess == -1) 95 | mBestGuess = 0; 96 | } 97 | return mProbers[mBestGuess]->GetCharSetName(); 98 | } 99 | /* Resets every existing prober and marks it active, marks null slots inactive, and clears the best guess, the group state and the carried filter counter. */ 100 | void nsMBCSGroupProber::Reset(void) 101 | { 102 | mActiveNum = 0; 103 | for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) 104 | { 105 | if (mProbers[i]) 106 | { 107 | mProbers[i]->Reset(); 108 | mIsActive[i] = PR_TRUE; 109 | ++mActiveNum; 110 | } 111 | else 112 | mIsActive[i] = PR_FALSE; 113 | } 114 | mBestGuess = -1; 115 | mState = eDetecting; 116 | mKeepNext = 0; 117 | } 118 | /* Feeds the active probers only with runs that contain high-bit bytes. Returns eFoundIt as soon as one prober reports it (mKeepNext is not updated on that path); an eNotMe result from a member prober is not acted on here. */ 119 | nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) 120 | { 121 | nsProbingState st; 122 | PRUint32 start = 0; 123 | PRUint32
keepNext = mKeepNext; 124 | /* keepNext: set to 2 by every high-bit byte and decremented by each following 7-bit byte; a run is handed over when it reaches 0, so it includes the two 7-bit bytes after its last high-bit byte */ 125 | //do filtering to reduce load to probers 126 | for (PRUint32 pos = 0; pos < aLen; ++pos) 127 | { 128 | if (aBuf[pos] & 0x80) 129 | { 130 | if (!keepNext) 131 | start = pos; 132 | keepNext = 2; 133 | } 134 | else if (keepNext) 135 | { 136 | if (--keepNext == 0) 137 | { 138 | for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) 139 | { 140 | if (!mIsActive[i]) 141 | continue; 142 | st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start); 143 | if (st == eFoundIt) 144 | { 145 | mBestGuess = i; 146 | mState = eFoundIt; 147 | return mState; 148 | } 149 | } 150 | } 151 | } 152 | } 153 | /* a run still open at the end of the buffer is handed over as far as it goes; keepNext is stored so the next call continues it */ 154 | if (keepNext) { 155 | for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) 156 | { 157 | if (!mIsActive[i]) 158 | continue; 159 | st = mProbers[i]->HandleData(aBuf + start, aLen - start); 160 | if (st == eFoundIt) 161 | { 162 | mBestGuess = i; 163 | mState = eFoundIt; 164 | return mState; 165 | } 166 | } 167 | } 168 | mKeepNext = keepNext; 169 | 170 | return mState; 171 | } 172 | /* Returns 0.99 for eFoundIt and 0.01 for eNotMe; otherwise the highest confidence among the active probers, recording that prober's index in mBestGuess. */ 173 | float nsMBCSGroupProber::GetConfidence(void) 174 | { 175 | PRUint32 i; 176 | float bestConf = 0.0, cf; 177 | 178 | switch (mState) 179 | { 180 | case eFoundIt: 181 | return (float)0.99; 182 | case eNotMe: 183 | return (float)0.01; 184 | default: 185 | for (i = 0; i < NUM_OF_PROBERS; i++) 186 | { 187 | if (!mIsActive[i]) 188 | continue; 189 | cf = mProbers[i]->GetConfidence(); 190 | if (bestConf < cf) 191 | { 192 | bestConf = cf; 193 | mBestGuess = i; 194 | } 195 | } 196 | } 197 | return bestConf; 198 | } 199 | 200 | #ifdef DEBUG_chardet 201 | void nsMBCSGroupProber::DumpStatus() 202 | { 203 | PRUint32 i; 204 | float cf; 205 | 206 | GetConfidence(); 207 | for (i = 0; i < NUM_OF_PROBERS; i++) 208 | { 209 | if (!mIsActive[i]) 210 | printf(" MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]); 211 | else 212 | { 213 | cf = mProbers[i]->GetConfidence(); 214 | printf(" MBCS %1.3f: [%s]\r\n", cf, ProberName[i]); 215 | } 216 | } 217 | } 218 | #endif 219 | 220 | #ifdef
DEBUG_jgmyers 221 | void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset) 222 | { /* writes one entry per prober slot starting at states[offset] and advances offset past them */ 223 | for (PRUint32 i = 0; i < NUM_OF_PROBERS; ++i) { 224 | states[offset].name = ProberName[i]; 225 | states[offset].isActive = mIsActive[i]; 226 | states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence() : 0.0; 227 | ++offset; 228 | } 229 | } 230 | #endif /* DEBUG_jgmyers */ 231 | -------------------------------------------------------------------------------- /universalchardet/nsMBCSGroupProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * Proofpoint, Inc.
24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | #ifndef nsMBCSGroupProber_h__ 40 | #define nsMBCSGroupProber_h__ 41 | 42 | #include "nsSJISProber.h" 43 | #include "nsUTF8Prober.h" 44 | #include "nsEUCJPProber.h" 45 | #include "nsGB2312Prober.h" 46 | #include "nsEUCKRProber.h" 47 | #include "nsBig5Prober.h" 48 | #include "nsEUCTWProber.h" 49 | /* Number of prober slots: the UTF-8 prober plus six CJK probers. */ 50 | #define NUM_OF_PROBERS 7 51 | /* Group prober that owns the multi-byte charset probers and reports the best guess among them. */ 52 | class nsMBCSGroupProber: public nsCharSetProber { 53 | public: 54 | nsMBCSGroupProber(PRUint32 aLanguageFilter); 55 | virtual ~nsMBCSGroupProber(); 56 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 57 | const char* GetCharSetName(); 58 | nsProbingState GetState(void) {return mState;} 59 | void Reset(void); 60 | float GetConfidence(void); 61 | void SetOpion() {} /* intentionally empty */ 62 | 63 | #ifdef DEBUG_chardet 64 | void DumpStatus(); 65 | #endif 66 | #ifdef DEBUG_jgmyers 67 | void GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset); 68 | #endif 69 | 70 | protected: 71 | nsProbingState mState; /* state of the group as a whole */ 72 | nsCharSetProber* mProbers[NUM_OF_PROBERS]; /* owned probers; a slot is null when its language was filtered out */ 73 | PRBool
mIsActive[NUM_OF_PROBERS]; /* per-slot flag: prober exists and is still consulted */ 74 | PRInt32 mBestGuess; /* index into mProbers of the current best guess, -1 when there is none */ 75 | PRUint32 mActiveNum; /* number of active probers, counted by Reset() */ 76 | PRUint32 mKeepNext; /* filter counter carried over between HandleData() calls */ 77 | }; 78 | 79 | #endif /* nsMBCSGroupProber_h__ */ 80 | 81 | -------------------------------------------------------------------------------- /universalchardet/nsPkgInt.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above.
If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #ifndef nsPkgInt_h__ 39 | #define nsPkgInt_h__ 40 | #include "nscore.h" 41 | /* nsIdxSft: right shift that turns a unit index into a 32-bit word index (8, 4 or 2 units per word). */ 42 | typedef enum { 43 | eIdxSft4bits = 3, 44 | eIdxSft8bits = 2, 45 | eIdxSft16bits = 1 46 | } nsIdxSft; 47 | /* nsSftMsk: mask that keeps the unit's position inside its word. */ 48 | typedef enum { 49 | eSftMsk4bits = 7, 50 | eSftMsk8bits = 3, 51 | eSftMsk16bits = 1 52 | } nsSftMsk; 53 | /* nsBitSft: left shift that converts that position into a bit offset (position times 4, 8 or 16). */ 54 | typedef enum { 55 | eBitSft4bits = 2, 56 | eBitSft8bits = 3, 57 | eBitSft16bits = 4 58 | } nsBitSft; 59 | /* nsUnitMsk: mask that isolates a single unit once it has been shifted down. */ 60 | typedef enum { 61 | eUnitMsk4bits = 0x0000000FL, 62 | eUnitMsk8bits = 0x000000FFL, 63 | eUnitMsk16bits = 0x0000FFFFL 64 | } nsUnitMsk; 65 | /* Descriptor of a packed table: the four packing parameters plus the array of 32-bit words. */ 66 | typedef struct nsPkgInt { 67 | nsIdxSft idxsft; 68 | nsSftMsk sftmsk; 69 | nsBitSft bitsft; 70 | nsUnitMsk unitmsk; 71 | PRUint32 *data; 72 | } nsPkgInt; 73 | 74 | /* PCK16BITS/PCK8BITS/PCK4BITS pack two, four or eight values into one 32-bit word, first argument in the lowest bits. */ 75 | #define PCK16BITS(a,b) ((PRUint32)(((b) << 16) | (a))) 76 | 77 | #define PCK8BITS(a,b,c,d) PCK16BITS( ((PRUint32)(((b) << 8) | (a))), \ 78 | ((PRUint32)(((d) << 8) | (c)))) 79 | 80 | #define PCK4BITS(a,b,c,d,e,f,g,h) PCK8BITS( ((PRUint32)(((b) << 4) | (a))), \ 81 | ((PRUint32)(((d) << 4) | (c))), \ 82 | ((PRUint32)(((f) << 4) | (e))), \ 83 | ((PRUint32)(((h) << 4) | (g))) ) 84 | /* GETFROMPCK(i, c): reads unit i of packed table c - word i >> idxsft, shifted right by (i & sftmsk) << bitsft, masked with unitmsk. */ 85 | #define GETFROMPCK(i, c) \ 86 | (((((c).data)[(i)>>(c).idxsft])>>(((i)&(c).sftmsk)<<(c).bitsft))&(c).unitmsk) 87 | 88 | #endif /* nsPkgInt_h__ */ 89 | 90 | -------------------------------------------------------------------------------- /universalchardet/nsSBCSGroupProber.cpp:
-------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Universal charset detector code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 2001 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * Shy Shalom 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 
36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | #include <stdio.h> 40 | #include "prmem.h" 41 | 42 | #include "nsSBCharSetProber.h" 43 | #include "nsSBCSGroupProber.h" 44 | 45 | #include "nsHebrewProber.h" 46 | /* Creates ten language-model probers (slots 0-9) and the three Hebrew-related probers (slots 10-12), then resets the group. */ 47 | nsSBCSGroupProber::nsSBCSGroupProber() 48 | { 49 | mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model); 50 | mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel); 51 | mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model); 52 | mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel); 53 | mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model); 54 | mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model); 55 | mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model); 56 | mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model); 57 | mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); 58 | mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); 59 | 60 | nsHebrewProber *hebprober = new nsHebrewProber(); 61 | // Notice: Any change in these indexes - 10,11,12 must be reflected 62 | // in the code below as well. 63 | mProbers[10] = hebprober; 64 | mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew 65 | mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew 66 | // Tell the Hebrew prober about the logical and visual probers 67 | if (mProbers[10] && mProbers[11] && mProbers[12]) // all are not null 68 | { 69 | hebprober->SetModelProbers(mProbers[11], mProbers[12]); 70 | } 71 | else // One or more is null. avoid any Hebrew probing, null them all 72 | { 73 | for (PRUint32 i = 10; i <= 12; ++i) 74 | { 75 | delete mProbers[i]; 76 | mProbers[i] = 0; 77 | } 78 | } 79 | 80 | // disable latin2 before latin1 is available, otherwise all latin1 81 | // will be detected as latin2 because of their similarity.
82 | //mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel); 83 | //mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel); 84 | 85 | Reset(); 86 | } 87 | 88 | nsSBCSGroupProber::~nsSBCSGroupProber() 89 | { 90 | for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++) 91 | { 92 | delete mProbers[i]; 93 | } 94 | } 95 | 96 | /* Returns the charset name of the best-guess prober, falling back to slot 0 when no prober has a positive confidence. */ 97 | const char* nsSBCSGroupProber::GetCharSetName() 98 | { 99 | //if we have no answer yet 100 | if (mBestGuess == -1) 101 | { 102 | GetConfidence(); 103 | //no charset seems positive 104 | if (mBestGuess == -1) 105 | //we will use default. 106 | mBestGuess = 0; 107 | } 108 | return mProbers[mBestGuess]->GetCharSetName(); 109 | } 110 | /* Resets and activates every existing prober, deactivates null slots, and clears the best guess and the group state. */ 111 | void nsSBCSGroupProber::Reset(void) 112 | { 113 | mActiveNum = 0; 114 | for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++) 115 | { 116 | if (mProbers[i]) // not null 117 | { 118 | mProbers[i]->Reset(); 119 | mIsActive[i] = PR_TRUE; 120 | ++mActiveNum; 121 | } 122 | else 123 | mIsActive[i] = PR_FALSE; 124 | } 125 | mBestGuess = -1; 126 | mState = eDetecting; 127 | } 128 | 129 | /* Filters the input once and hands the filtered buffer to every active prober; stops at the first eFoundIt, deactivates probers that answer eNotMe, and turns the group to eNotMe when none is left. */ 130 | nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) 131 | { 132 | nsProbingState st; 133 | PRUint32 i; 134 | char *newBuf1 = 0; 135 | PRUint32 newLen1 = 0; 136 | 137 | //apply filter to original buffer, and we got new buffer back 138 | //depend on what script it is, we will feed them the new buffer 139 | //we got after applying proper filter 140 | //this is done without any consideration to KeepEnglishLetters 141 | //of each prober since as of now, there are no probers here which 142 | //recognize languages with English characters. 143 | if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) 144 | goto done; 145 | 146 | if (newLen1 == 0) 147 | goto done; // Nothing to see here, move on.
148 | 149 | for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) 150 | { 151 | if (!mIsActive[i]) 152 | continue; 153 | st = mProbers[i]->HandleData(newBuf1, newLen1); 154 | if (st == eFoundIt) 155 | { 156 | mBestGuess = i; 157 | mState = eFoundIt; 158 | break; 159 | } 160 | else if (st == eNotMe) 161 | { 162 | mIsActive[i] = PR_FALSE; 163 | mActiveNum--; 164 | if (mActiveNum <= 0) 165 | { 166 | mState = eNotMe; 167 | break; 168 | } 169 | } 170 | } 171 | 172 | done: 173 | PR_FREEIF(newBuf1); 174 | 175 | return mState; 176 | } 177 | /* Returns 0.99 for eFoundIt and 0.01 for eNotMe; otherwise the highest confidence among the active probers, recording that prober's index in mBestGuess. */ 178 | float nsSBCSGroupProber::GetConfidence(void) 179 | { 180 | PRUint32 i; 181 | float bestConf = 0.0, cf; 182 | 183 | switch (mState) 184 | { 185 | case eFoundIt: 186 | return (float)0.99; //sure yes 187 | case eNotMe: 188 | return (float)0.01; //sure no 189 | default: 190 | for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) 191 | { 192 | if (!mIsActive[i]) 193 | continue; 194 | cf = mProbers[i]->GetConfidence(); 195 | if (bestConf < cf) 196 | { 197 | bestConf = cf; 198 | mBestGuess = i; 199 | } 200 | } 201 | } 202 | return bestConf; 203 | } 204 | 205 | #ifdef DEBUG_chardet 206 | void nsSBCSGroupProber::DumpStatus() 207 | { 208 | PRUint32 i; 209 | float cf; 210 | 211 | cf = GetConfidence(); 212 | printf(" SBCS Group Prober --------begin status \r\n"); 213 | for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) 214 | { 215 | if (!mIsActive[i]) /* an inactive slot may hold a null prober (see the constructor), so it is checked before use */ 216 | printf(" inactive: [%s] (i.e.
confidence is too low).\r\n", mProbers[i] ? mProbers[i]->GetCharSetName() : "(null)"); 217 | else 218 | mProbers[i]->DumpStatus(); 219 | } 220 | printf(" SBCS Group found best match [%s] confidence %f.\r\n", 221 | mBestGuess >= 0 ? mProbers[mBestGuess]->GetCharSetName() : "(none)", cf); /* mBestGuess is still -1 when no prober has scored above zero */ 222 | } 223 | #endif 224 | -------------------------------------------------------------------------------- /universalchardet/nsSBCSGroupProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Universal charset detector code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 2001 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * Shy Shalom 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above.
If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | #ifndef nsSBCSGroupProber_h__ 40 | #define nsSBCSGroupProber_h__ 41 | 42 | 43 | #define NUM_OF_SBCS_PROBERS 13 44 | 45 | class nsCharSetProber; 46 | class nsSBCSGroupProber: public nsCharSetProber { 47 | public: 48 | nsSBCSGroupProber(); 49 | virtual ~nsSBCSGroupProber(); 50 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 51 | const char* GetCharSetName(); 52 | nsProbingState GetState(void) {return mState;} 53 | void Reset(void); 54 | float GetConfidence(void); 55 | void SetOpion() {} 56 | 57 | #ifdef DEBUG_chardet 58 | void DumpStatus(); 59 | #endif 60 | 61 | protected: 62 | nsProbingState mState; 63 | nsCharSetProber* mProbers[NUM_OF_SBCS_PROBERS]; 64 | PRBool mIsActive[NUM_OF_SBCS_PROBERS]; 65 | PRInt32 mBestGuess; 66 | PRUint32 mActiveNum; 67 | }; 68 | 69 | #endif /* nsSBCSGroupProber_h__ */ 70 | 71 | -------------------------------------------------------------------------------- /universalchardet/nsSBCharSetProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Universal charset detector code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 2001 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * Shy Shalom 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 
36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | #include 39 | #include "nsSBCharSetProber.h" 40 | 41 | nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen) 42 | { 43 | unsigned char order; 44 | 45 | for (PRUint32 i = 0; i < aLen; i++) 46 | { 47 | order = mModel->charToOrderMap[(unsigned char)aBuf[i]]; 48 | 49 | if (order < SYMBOL_CAT_ORDER) 50 | mTotalChar++; 51 | if (order < SAMPLE_SIZE) 52 | { 53 | mFreqChar++; 54 | 55 | if (mLastOrder < SAMPLE_SIZE) 56 | { 57 | mTotalSeqs++; 58 | if (!mReversed) 59 | ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]); 60 | else // reverse the order of the letters in the lookup 61 | ++(mSeqCounters[mModel->precedenceMatrix[order*SAMPLE_SIZE+mLastOrder]]); 62 | } 63 | } 64 | mLastOrder = order; 65 | } 66 | 67 | if (mState == eDetecting) 68 | if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD) 69 | { 70 | float cf = GetConfidence(); 71 | if (cf > POSITIVE_SHORTCUT_THRESHOLD) 72 | mState = eFoundIt; 73 | else if (cf < NEGATIVE_SHORTCUT_THRESHOLD) 74 | mState = eNotMe; 75 | } 76 | 77 | return mState; 78 | } 79 | 80 | void nsSingleByteCharSetProber::Reset(void) 81 | { 82 | mState = eDetecting; 83 | mLastOrder = 255; 84 | for (PRUint32 i = 0; i < NUMBER_OF_SEQ_CAT; i++) 85 | mSeqCounters[i] = 0; 86 | mTotalSeqs = 0; 87 | mTotalChar = 0; 88 | mFreqChar = 0; 89 | } 90 | 91 | //#define NEGATIVE_APPROACH 1 92 | 93 | float nsSingleByteCharSetProber::GetConfidence(void) 94 | { 95 | #ifdef NEGATIVE_APPROACH 96 | if (mTotalSeqs > 0) 97 | if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 ) 98 | return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar; 99 | return (float)0.01; 100 | #else //POSITIVE_APPROACH 101 | float r; 102 | 103 | if (mTotalSeqs > 0) { 104 | r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio; 105 | r = r*mFreqChar/mTotalChar; 106 | if (r >= (float)1.00) 107 | r = (float)0.99; 108 | return r; 109 | } 
110 | return (float)0.01; 111 | #endif 112 | } 113 | 114 | const char* nsSingleByteCharSetProber::GetCharSetName() 115 | { 116 | if (!mNameProber) 117 | return mModel->charsetName; 118 | return mNameProber->GetCharSetName(); 119 | } 120 | 121 | #ifdef DEBUG_chardet 122 | void nsSingleByteCharSetProber::DumpStatus() 123 | { 124 | printf(" SBCS: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); 125 | } 126 | #endif 127 | -------------------------------------------------------------------------------- /universalchardet/nsSBCharSetProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Universal charset detector code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 2001 20 | * the Initial Developer. All Rights Reserved. 
21 | * 22 | * Contributor(s): 23 | * Shy Shalom 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | #ifndef nsSingleByteCharSetProber_h__ 39 | #define nsSingleByteCharSetProber_h__ 40 | 41 | #include "nsCharSetProber.h" 42 | 43 | #define SAMPLE_SIZE 64 44 | #define SB_ENOUGH_REL_THRESHOLD 1024 45 | #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 46 | #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 47 | #define SYMBOL_CAT_ORDER 250 48 | #define NUMBER_OF_SEQ_CAT 4 49 | #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1) 50 | #define NEGATIVE_CAT 0 51 | 52 | typedef struct 53 | { 54 | unsigned char *charToOrderMap; // [256] table use to find a char's order 55 | char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency 56 | float mTypicalPositiveRatio; // = freqSeqs / totalSeqs 57 | PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) 58 | const char* charsetName; 59 | } SequenceModel; 60 | 61 | 62 | class nsSingleByteCharSetProber : public nsCharSetProber{ 63 | public: 64 | nsSingleByteCharSetProber(SequenceModel *model) 65 | 
:mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); } 66 | nsSingleByteCharSetProber(SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber) 67 | :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } 68 | 69 | virtual const char* GetCharSetName(); 70 | virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 71 | virtual nsProbingState GetState(void) {return mState;} 72 | virtual void Reset(void); 73 | virtual float GetConfidence(void); 74 | virtual void SetOpion() {} 75 | 76 | // This feature is not implemented yet. any current language model 77 | // contain this parameter as PR_FALSE. No one is looking at this 78 | // parameter or calling this method. 79 | // Moreover, the nsSBCSGroupProber which calls the HandleData of this 80 | // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid 81 | // of the English letters. 82 | PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented) 83 | 84 | #ifdef DEBUG_chardet 85 | virtual void DumpStatus(); 86 | #endif 87 | 88 | protected: 89 | nsProbingState mState; 90 | const SequenceModel *mModel; 91 | const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup 92 | 93 | //char order of last character 94 | unsigned char mLastOrder; 95 | 96 | PRUint32 mTotalSeqs; 97 | PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT]; 98 | 99 | PRUint32 mTotalChar; 100 | //characters that fall in our sampling range 101 | PRUint32 mFreqChar; 102 | 103 | // Optional auxiliary prober for name decision. 
created and destroyed by the GroupProber 104 | nsCharSetProber* mNameProber; 105 | 106 | }; 107 | 108 | 109 | extern SequenceModel Koi8rModel; 110 | extern SequenceModel Win1251Model; 111 | extern SequenceModel Latin5Model; 112 | extern SequenceModel MacCyrillicModel; 113 | extern SequenceModel Ibm866Model; 114 | extern SequenceModel Ibm855Model; 115 | extern SequenceModel Latin7Model; 116 | extern SequenceModel Win1253Model; 117 | extern SequenceModel Latin5BulgarianModel; 118 | extern SequenceModel Win1251BulgarianModel; 119 | extern SequenceModel Latin2HungarianModel; 120 | extern SequenceModel Win1250HungarianModel; 121 | extern SequenceModel Win1255Model; 122 | 123 | #endif /* nsSingleByteCharSetProber_h__ */ 124 | 125 | -------------------------------------------------------------------------------- /universalchardet/nsSJISProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 
21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | // for S-JIS encoding, obeserve characteristic: 39 | // 1, kana character (or hankaku?) 
often have hight frequency of appereance 40 | // 2, kana character often exist in group 41 | // 3, certain combination of kana is never used in japanese language 42 | 43 | #include "nsSJISProber.h" 44 | 45 | void nsSJISProber::Reset(void) 46 | { 47 | mCodingSM->Reset(); 48 | mState = eDetecting; 49 | mContextAnalyser.Reset(); 50 | mDistributionAnalyser.Reset(); 51 | } 52 | 53 | nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen) 54 | { 55 | nsSMState codingState; 56 | 57 | for (PRUint32 i = 0; i < aLen; i++) 58 | { 59 | codingState = mCodingSM->NextState(aBuf[i]); 60 | if (codingState == eItsMe) 61 | { 62 | mState = eFoundIt; 63 | break; 64 | } 65 | if (codingState == eStart) 66 | { 67 | PRUint32 charLen = mCodingSM->GetCurrentCharLen(); 68 | if (i == 0) 69 | { 70 | mLastChar[1] = aBuf[0]; 71 | mContextAnalyser.HandleOneChar(mLastChar+2-charLen, charLen); 72 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 73 | } 74 | else 75 | { 76 | mContextAnalyser.HandleOneChar(aBuf+i+1-charLen, charLen); 77 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 78 | } 79 | } 80 | } 81 | 82 | mLastChar[0] = aBuf[aLen-1]; 83 | 84 | if (mState == eDetecting) 85 | if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 86 | mState = eFoundIt; 87 | 88 | return mState; 89 | } 90 | 91 | float nsSJISProber::GetConfidence(void) 92 | { 93 | float contxtCf = mContextAnalyser.GetConfidence(); 94 | float distribCf = mDistributionAnalyser.GetConfidence(); 95 | 96 | return (contxtCf > distribCf ? 
contxtCf : distribCf); 97 | } 98 | 99 | -------------------------------------------------------------------------------- /universalchardet/nsSJISProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. 
If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | // for S-JIS encoding, obeserve characteristic: 39 | // 1, kana character (or hankaku?) often have hight frequency of appereance 40 | // 2, kana character often exist in group 41 | // 3, certain combination of kana is never used in japanese language 42 | 43 | #ifndef nsSJISProber_h__ 44 | #define nsSJISProber_h__ 45 | 46 | #include "nsCharSetProber.h" 47 | #include "nsCodingStateMachine.h" 48 | #include "JpCntx.h" 49 | #include "CharDistribution.h" 50 | 51 | 52 | class nsSJISProber: public nsCharSetProber { 53 | public: 54 | nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel); 55 | Reset();} 56 | virtual ~nsSJISProber(void){delete mCodingSM;} 57 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 58 | const char* GetCharSetName() {return "Shift_JIS";} 59 | nsProbingState GetState(void) {return mState;} 60 | void Reset(void); 61 | float GetConfidence(void); 62 | void SetOpion() {} 63 | 64 | protected: 65 | nsCodingStateMachine* mCodingSM; 66 | nsProbingState mState; 67 | 68 | SJISContextAnalysis mContextAnalyser; 69 | SJISDistributionAnalysis mDistributionAnalyser; 70 | 71 | char mLastChar[2]; 72 | 73 | }; 74 | 75 | 76 | #endif /* nsSJISProber_h__ */ 77 | 78 | -------------------------------------------------------------------------------- /universalchardet/nsUTF8Prober.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #include "nsUTF8Prober.h" 39 | 40 | void nsUTF8Prober::Reset(void) 41 | { 42 | mCodingSM->Reset(); 43 | mNumOfMBChar = 0; 44 | mState = eDetecting; 45 | } 46 | 47 | nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen) 48 | { 49 | nsSMState codingState; 50 | 51 | for (PRUint32 i = 0; i < aLen; i++) 52 | { 53 | codingState = mCodingSM->NextState(aBuf[i]); 54 | if (codingState == eItsMe) 55 | { 56 | mState = eFoundIt; 57 | break; 58 | } 59 | if (codingState == eStart) 60 | { 61 | if (mCodingSM->GetCurrentCharLen() >= 2) 62 | mNumOfMBChar++; 63 | } 64 | } 65 | 66 | if (mState == eDetecting) 67 | if (GetConfidence() > SHORTCUT_THRESHOLD) 68 | mState = eFoundIt; 69 | return mState; 70 | } 71 | 72 | #define ONE_CHAR_PROB (float)0.50 73 | 74 | float nsUTF8Prober::GetConfidence(void) 75 | { 76 | float unlike = (float)0.99; 77 | 78 | if (mNumOfMBChar < 6) 79 | { 80 | for (PRUint32 i = 0; i < mNumOfMBChar; i++) 81 | unlike *= ONE_CHAR_PROB; 82 | return (float)1.0 - unlike; 83 | } 84 | else 85 | return (float)0.99; 86 | } 87 | 88 | -------------------------------------------------------------------------------- /universalchardet/nsUTF8Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 
14 | * 15 | * The Original Code is mozilla.org code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #ifndef nsUTF8Prober_h__ 39 | #define nsUTF8Prober_h__ 40 | 41 | #include "nsCharSetProber.h" 42 | #include "nsCodingStateMachine.h" 43 | 44 | class nsUTF8Prober: public nsCharSetProber { 45 | public: 46 | nsUTF8Prober(){mNumOfMBChar = 0; 47 | mCodingSM = new nsCodingStateMachine(&UTF8SMModel); 48 | Reset(); } 49 | virtual ~nsUTF8Prober(){delete mCodingSM;} 50 | nsProbingState HandleData(const char* aBuf, PRUint32 aLen); 51 | const char* GetCharSetName() {return "UTF-8";} 52 | nsProbingState GetState(void) {return mState;} 53 | void Reset(void); 54 | float GetConfidence(void); 55 | void SetOpion() {} 56 | 57 | protected: 58 | nsCodingStateMachine* mCodingSM; 59 | nsProbingState mState; 60 | PRUint32 mNumOfMBChar; 61 | }; 62 | 63 | #endif /* nsUTF8Prober_h__ */ 64 | 65 | -------------------------------------------------------------------------------- /universalchardet/nsUniversalDetector.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Universal charset detector code. 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 
19 | * Portions created by the Initial Developer are Copyright (C) 2001 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * Shy Shalom 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 
36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | #include "nscore.h" 40 | 41 | #include "nsUniversalDetector.h" 42 | 43 | #include "nsMBCSGroupProber.h" 44 | #include "nsSBCSGroupProber.h" 45 | #include "nsEscCharsetProber.h" 46 | #include "nsLatin1Prober.h" 47 | 48 | nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) 49 | { 50 | mDone = PR_FALSE; 51 | mBestGuess = -1; //illegal value as signal 52 | mInTag = PR_FALSE; 53 | mEscCharSetProber = nsnull; 54 | 55 | mStart = PR_TRUE; 56 | mDetectedCharset = nsnull; 57 | mGotData = PR_FALSE; 58 | mInputState = ePureAscii; 59 | mLastChar = '\0'; 60 | mLanguageFilter = aLanguageFilter; 61 | 62 | PRUint32 i; 63 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 64 | mCharSetProbers[i] = nsnull; 65 | } 66 | 67 | nsUniversalDetector::~nsUniversalDetector() 68 | { 69 | for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 70 | if (mCharSetProbers[i]) 71 | delete mCharSetProbers[i]; 72 | if (mEscCharSetProber) 73 | delete mEscCharSetProber; 74 | } 75 | 76 | void 77 | nsUniversalDetector::Reset() 78 | { 79 | mDone = PR_FALSE; 80 | mBestGuess = -1; //illegal value as signal 81 | mInTag = PR_FALSE; 82 | 83 | mStart = PR_TRUE; 84 | mDetectedCharset = nsnull; 85 | mGotData = PR_FALSE; 86 | mInputState = ePureAscii; 87 | mLastChar = '\0'; 88 | 89 | if (mEscCharSetProber) 90 | mEscCharSetProber->Reset(); 91 | 92 | PRUint32 i; 93 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 94 | if (mCharSetProbers[i]) 95 | mCharSetProbers[i]->Reset(); 96 | } 97 | 98 | //--------------------------------------------------------------------- 99 | #define SHORTCUT_THRESHOLD (float)0.95 100 | #define MINIMUM_THRESHOLD (float)0.20 101 | 102 | nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) 103 | { 104 | if(mDone) 105 | return NS_OK; 106 | 107 | if (aLen > 0) 108 | mGotData = PR_TRUE; 109 | 110 | //If the data starts with BOM, we know it is UTF 111 | if (mStart) 112 | { 113 | mStart = PR_FALSE; 114 | if (aLen > 
3) 115 | switch (aBuf[0]) 116 | { 117 | case '\xEF': 118 | if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) 119 | // EF BB BF UTF-8 encoded BOM 120 | mDetectedCharset = "UTF-8"; 121 | break; 122 | case '\xFE': 123 | if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 124 | // FE FF 00 00 UCS-4, unusual octet order BOM (3412) 125 | mDetectedCharset = "X-ISO-10646-UCS-4-3412"; 126 | else if ('\xFF' == aBuf[1]) 127 | // FE FF UTF-16, big endian BOM 128 | mDetectedCharset = "UTF-16BE"; 129 | break; 130 | case '\x00': 131 | if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3])) 132 | // 00 00 FE FF UTF-32, big-endian BOM 133 | mDetectedCharset = "UTF-32BE"; 134 | else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) 135 | // 00 00 FF FE UCS-4, unusual octet order BOM (2143) 136 | mDetectedCharset = "X-ISO-10646-UCS-4-2143"; 137 | break; 138 | case '\xFF': 139 | if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 140 | // FF FE 00 00 UTF-32, little-endian BOM 141 | mDetectedCharset = "UTF-32LE"; 142 | else if ('\xFE' == aBuf[1]) 143 | // FF FE UTF-16, little endian BOM 144 | mDetectedCharset = "UTF-16LE"; 145 | break; 146 | } // switch 147 | 148 | if (mDetectedCharset) 149 | { 150 | mDone = PR_TRUE; 151 | return NS_OK; 152 | } 153 | } 154 | 155 | PRUint32 i; 156 | for (i = 0; i < aLen; i++) 157 | { 158 | //other than 0xa0, if every othe character is ascii, the page is ascii 159 | if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP 160 | { 161 | //we got a non-ascii byte (high-byte) 162 | if (mInputState != eHighbyte) 163 | { 164 | //adjust state 165 | mInputState = eHighbyte; 166 | 167 | //kill mEscCharSetProber if it is active 168 | if (mEscCharSetProber) { 169 | delete mEscCharSetProber; 170 | mEscCharSetProber = nsnull; 171 | } 172 | 173 | //start multibyte and singlebyte charset prober 174 | if (nsnull == mCharSetProbers[0]) 175 | { 176 | mCharSetProbers[0] = 
new nsMBCSGroupProber(mLanguageFilter); 177 | if (nsnull == mCharSetProbers[0]) 178 | return NS_ERROR_OUT_OF_MEMORY; 179 | } 180 | if (nsnull == mCharSetProbers[1] && 181 | (mLanguageFilter & NS_FILTER_NON_CJK)) 182 | { 183 | mCharSetProbers[1] = new nsSBCSGroupProber; 184 | if (nsnull == mCharSetProbers[1]) 185 | return NS_ERROR_OUT_OF_MEMORY; 186 | } 187 | mCharSetProbers[2] = new nsLatin1Prober; 188 | if (nsnull == mCharSetProbers[2]) 189 | return NS_ERROR_OUT_OF_MEMORY; 190 | } 191 | } 192 | else 193 | { 194 | //ok, just pure ascii so far 195 | if ( ePureAscii == mInputState && 196 | (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) 197 | { 198 | //found escape character or HZ "~{" 199 | mInputState = eEscAscii; 200 | } 201 | mLastChar = aBuf[i]; 202 | } 203 | } 204 | 205 | nsProbingState st; 206 | switch (mInputState) 207 | { 208 | case eEscAscii: 209 | if (nsnull == mEscCharSetProber) { 210 | mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter); 211 | if (nsnull == mEscCharSetProber) 212 | return NS_ERROR_OUT_OF_MEMORY; 213 | } 214 | st = mEscCharSetProber->HandleData(aBuf, aLen); 215 | if (st == eFoundIt) 216 | { 217 | mDone = PR_TRUE; 218 | mDetectedCharset = mEscCharSetProber->GetCharSetName(); 219 | } 220 | break; 221 | case eHighbyte: 222 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 223 | { 224 | if (mCharSetProbers[i]) 225 | { 226 | st = mCharSetProbers[i]->HandleData(aBuf, aLen); 227 | if (st == eFoundIt) 228 | { 229 | mDone = PR_TRUE; 230 | mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); 231 | return NS_OK; 232 | } 233 | } 234 | } 235 | break; 236 | 237 | default: //pure ascii 238 | ;//do nothing here 239 | } 240 | return NS_OK; 241 | } 242 | 243 | 244 | //--------------------------------------------------------------------- 245 | void nsUniversalDetector::DataEnd() 246 | { 247 | if (!mGotData) 248 | { 249 | // we haven't got any data yet, return immediately 250 | // caller program sometimes call DataEnd before anything 
has been sent to detector 251 | return; 252 | } 253 | 254 | if (mDetectedCharset) 255 | { 256 | mDone = PR_TRUE; 257 | Report(mDetectedCharset); 258 | return; 259 | } 260 | 261 | switch (mInputState) 262 | { 263 | case eHighbyte: 264 | { 265 | float proberConfidence; 266 | float maxProberConfidence = (float)0.0; 267 | PRInt32 maxProber = 0; 268 | 269 | for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 270 | { 271 | if (mCharSetProbers[i]) 272 | { 273 | proberConfidence = mCharSetProbers[i]->GetConfidence(); 274 | if (proberConfidence > maxProberConfidence) 275 | { 276 | maxProberConfidence = proberConfidence; 277 | maxProber = i; 278 | } 279 | } 280 | } 281 | //do not report anything because we are not confident of it, that's in fact a negative answer 282 | if (maxProberConfidence > MINIMUM_THRESHOLD && mCharSetProbers[maxProber]) 283 | Report(mCharSetProbers[maxProber]->GetCharSetName()); 284 | } 285 | break; 286 | case eEscAscii: 287 | break; 288 | default: 289 | ; 290 | } 291 | return; 292 | } 293 | -------------------------------------------------------------------------------- /universalchardet/nsUniversalDetector.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is Mozilla Communicator client code. 
16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #ifndef nsUniversalDetector_h__ 39 | #define nsUniversalDetector_h__ 40 | 41 | class nsCharSetProber; 42 | 43 | #define NUM_OF_CHARSET_PROBERS 3 44 | 45 | typedef enum { 46 | ePureAscii = 0, 47 | eEscAscii = 1, 48 | eHighbyte = 2 49 | } nsInputState; 50 | 51 | #define NS_FILTER_CHINESE_SIMPLIFIED 0x01 52 | #define NS_FILTER_CHINESE_TRADITIONAL 0x02 53 | #define NS_FILTER_JAPANESE 0x04 54 | #define NS_FILTER_KOREAN 0x08 55 | #define NS_FILTER_NON_CJK 0x10 56 | #define NS_FILTER_ALL 0x1F 57 | #define NS_FILTER_CHINESE (NS_FILTER_CHINESE_SIMPLIFIED | \ 58 | NS_FILTER_CHINESE_TRADITIONAL) 59 | #define NS_FILTER_CJK (NS_FILTER_CHINESE_SIMPLIFIED | \ 60 | NS_FILTER_CHINESE_TRADITIONAL | \ 61 | NS_FILTER_JAPANESE | \ 62 | NS_FILTER_KOREAN) 63 | 64 | class nsUniversalDetector { 65 | public: 66 | nsUniversalDetector(PRUint32 aLanguageFilter); 67 | virtual ~nsUniversalDetector(); 68 | virtual nsresult HandleData(const char* aBuf, PRUint32 aLen); 69 | virtual void DataEnd(void); 70 | 71 | protected: 72 | virtual void Report(const char* aCharset) = 0; 73 | virtual void Reset(); 74 | nsInputState mInputState; 75 | PRBool mDone; 76 | PRBool mInTag; 77 | PRBool mStart; 78 | PRBool mGotData; 79 | char mLastChar; 80 | const char * mDetectedCharset; 81 | PRInt32 mBestGuess; 82 | PRUint32 mLanguageFilter; 83 | 84 | nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS]; 85 | nsCharSetProber *mEscCharSetProber; 86 | }; 87 | 88 | #endif 89 | 90 | -------------------------------------------------------------------------------- /universalchardet/nscore.h: -------------------------------------------------------------------------------- 1 | #include "prtypes.h" 2 | 3 | typedef PRUint32 nsresult; 4 | 5 | #define nsnull 0 6 | #define NS_COM 7 | 8 | #include "nsError.h" 9 | -------------------------------------------------------------------------------- /universalchardet/prcpucfg.h: 
-------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is the Netscape Portable Runtime (NSPR). 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998-2000 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | #ifndef nspr_cpucfg___ 39 | #define nspr_cpucfg___ 40 | 41 | #ifndef XP_MAC 42 | #define XP_MAC 43 | #endif 44 | 45 | #undef IS_LITTLE_ENDIAN 46 | #define IS_BIG_ENDIAN 1 47 | 48 | #define HAVE_LONG_LONG 49 | 50 | #define PR_AF_INET6 30 /* same as AF_INET6 */ 51 | 52 | #define PR_BYTES_PER_BYTE 1L 53 | #define PR_BYTES_PER_SHORT 2L 54 | #define PR_BYTES_PER_INT 4L 55 | #define PR_BYTES_PER_INT64 8L 56 | #define PR_BYTES_PER_LONG 4L 57 | #define PR_BYTES_PER_FLOAT 4L 58 | #define PR_BYTES_PER_DOUBLE 8L 59 | #define PR_BYTES_PER_WORD 4L 60 | #define PR_BYTES_PER_DWORD 8L 61 | 62 | #define PR_BITS_PER_BYTE 8L 63 | #define PR_BITS_PER_SHORT 16L 64 | #define PR_BITS_PER_INT 32L 65 | #define PR_BITS_PER_INT64 64L 66 | #define PR_BITS_PER_LONG 32L 67 | #define PR_BITS_PER_FLOAT 32L 68 | #define PR_BITS_PER_DOUBLE 64L 69 | #define PR_BITS_PER_WORD 32L 70 | 71 | #define PR_BITS_PER_BYTE_LOG2 3L 72 | #define PR_BITS_PER_SHORT_LOG2 4L 73 | #define PR_BITS_PER_INT_LOG2 5L 74 | #define PR_BITS_PER_INT64_LOG2 6L 75 | #define PR_BITS_PER_LONG_LOG2 5L 76 | #define PR_BITS_PER_FLOAT_LOG2 5L 77 | #define PR_BITS_PER_DOUBLE_LOG2 6L 78 | #define PR_BITS_PER_WORD_LOG2 5L 79 | 80 | #define PR_ALIGN_OF_SHORT 2L 81 | #define PR_ALIGN_OF_INT 4L 82 | #define PR_ALIGN_OF_LONG 4L 83 | #define PR_ALIGN_OF_INT64 2L 84 | #define PR_ALIGN_OF_FLOAT 4L 85 | #define PR_ALIGN_OF_DOUBLE 4L 86 | #define PR_ALIGN_OF_POINTER 4L 87 | #define PR_ALIGN_OF_WORD 4L 88 | 89 | #define PR_BYTES_PER_WORD_LOG2 2L 90 | #define PR_BYTES_PER_DWORD_LOG2 3L 91 | #define PR_WORDS_PER_DWORD_LOG2 1L 92 | 93 | #ifndef NO_NSPR_10_SUPPORT 94 | #define BYTES_PER_BYTE PR_BYTES_PER_BYTE 95 | #define BYTES_PER_SHORT PR_BYTES_PER_SHORT 96 | #define BYTES_PER_INT PR_BYTES_PER_INT 97 | #define BYTES_PER_INT64 PR_BYTES_PER_INT64 98 | #define BYTES_PER_LONG PR_BYTES_PER_LONG 99 | #define BYTES_PER_FLOAT PR_BYTES_PER_FLOAT 100 | #define BYTES_PER_DOUBLE PR_BYTES_PER_DOUBLE 101 | 
#define BYTES_PER_WORD PR_BYTES_PER_WORD 102 | #define BYTES_PER_DWORD PR_BYTES_PER_DWORD 103 | 104 | #define BITS_PER_BYTE PR_BITS_PER_BYTE 105 | #define BITS_PER_SHORT PR_BITS_PER_SHORT 106 | #define BITS_PER_INT PR_BITS_PER_INT 107 | #define BITS_PER_INT64 PR_BITS_PER_INT64 108 | #define BITS_PER_LONG PR_BITS_PER_LONG 109 | #define BITS_PER_FLOAT PR_BITS_PER_FLOAT 110 | #define BITS_PER_DOUBLE PR_BITS_PER_DOUBLE 111 | #define BITS_PER_WORD PR_BITS_PER_WORD 112 | 113 | #define BITS_PER_BYTE_LOG2 PR_BITS_PER_BYTE_LOG2 114 | #define BITS_PER_SHORT_LOG2 PR_BITS_PER_SHORT_LOG2 115 | #define BITS_PER_INT_LOG2 PR_BITS_PER_INT_LOG2 116 | #define BITS_PER_INT64_LOG2 PR_BITS_PER_INT64_LOG2 117 | #define BITS_PER_LONG_LOG2 PR_BITS_PER_LONG_LOG2 118 | #define BITS_PER_FLOAT_LOG2 PR_BITS_PER_FLOAT_LOG2 119 | #define BITS_PER_DOUBLE_LOG2 PR_BITS_PER_DOUBLE_LOG2 120 | #define BITS_PER_WORD_LOG2 PR_BITS_PER_WORD_LOG2 121 | 122 | #define ALIGN_OF_SHORT PR_ALIGN_OF_SHORT 123 | #define ALIGN_OF_INT PR_ALIGN_OF_INT 124 | #define ALIGN_OF_LONG PR_ALIGN_OF_LONG 125 | #define ALIGN_OF_INT64 PR_ALIGN_OF_INT64 126 | #define ALIGN_OF_FLOAT PR_ALIGN_OF_FLOAT 127 | #define ALIGN_OF_DOUBLE PR_ALIGN_OF_DOUBLE 128 | #define ALIGN_OF_POINTER PR_ALIGN_OF_POINTER 129 | #define ALIGN_OF_WORD PR_ALIGN_OF_WORD 130 | 131 | #define BYTES_PER_WORD_LOG2 PR_BYTES_PER_WORD_LOG2 132 | #define BYTES_PER_DWORD_LOG2 PR_BYTES_PER_DWORD_LOG2 133 | #define WORDS_PER_DWORD_LOG2 PR_WORDS_PER_DWORD_LOG2 134 | #endif /* NO_NSPR_10_SUPPORT */ 135 | 136 | #endif /* nspr_cpucfg___ */ 137 | -------------------------------------------------------------------------------- /universalchardet/prmem.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License 
Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is the Netscape Portable Runtime (NSPR). 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998-2000 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | /* 39 | ** File: prmem.h 40 | ** Description: API to NSPR memory management functions 41 | ** 42 | */ 43 | #ifndef prmem_h___ 44 | #define prmem_h___ 45 | 46 | #include "prtypes.h" 47 | #include <stdlib.h> 48 | 49 | PR_BEGIN_EXTERN_C 50 | 51 | /* 52 | ** Thread safe memory allocation. 
53 | ** 54 | ** NOTE: pr wraps up malloc, free, calloc, realloc so they are already 55 | ** thread safe (and are not declared here - look in stdlib.h). 56 | */ 57 | 58 | /* 59 | ** PR_Malloc, PR_Calloc, PR_Realloc, and PR_Free have the same signatures 60 | ** as their libc equivalent malloc, calloc, realloc, and free, and have 61 | ** the same semantics. (Note that the argument type size_t is replaced 62 | ** by PRUint32.) Memory allocated by PR_Malloc, PR_Calloc, or PR_Realloc 63 | ** must be freed by PR_Free. 64 | */ 65 | 66 | NSPR_API(void *) PR_Malloc(PRUint32 size); 67 | 68 | NSPR_API(void *) PR_Calloc(PRUint32 nelem, PRUint32 elsize); 69 | 70 | NSPR_API(void *) PR_Realloc(void *ptr, PRUint32 size); 71 | 72 | NSPR_API(void) PR_Free(void *ptr); 73 | 74 | /* 75 | ** The following are some convenience macros defined in terms of 76 | ** PR_Malloc, PR_Calloc, PR_Realloc, and PR_Free. 77 | */ 78 | 79 | /*********************************************************************** 80 | ** FUNCTION: PR_MALLOC() 81 | ** DESCRIPTION: 82 | ** PR_MALLOC() allocates an untyped item of size _bytes from the heap. 83 | ** INPUTS: _bytes: size in bytes of item to be allocated 84 | ** OUTPUTS: untyped pointer to the node allocated 85 | ** RETURN: pointer to node or error returned from malloc(). 86 | ***********************************************************************/ 87 | #define PR_MALLOC(_bytes) (PR_Malloc((_bytes))) 88 | 89 | /*********************************************************************** 90 | ** FUNCTION: PR_NEW() 91 | ** DESCRIPTION: 92 | ** PR_NEW() allocates an item of type _struct from the heap. 93 | ** INPUTS: _struct: a data type 94 | ** OUTPUTS: pointer to _struct 95 | ** RETURN: pointer to _struct or error returns from malloc(). 
96 | ***********************************************************************/ 97 | #define PR_NEW(_struct) ((_struct *) PR_MALLOC(sizeof(_struct))) 98 | 99 | /*********************************************************************** 100 | ** FUNCTION: PR_REALLOC() 101 | ** DESCRIPTION: 102 | ** PR_REALLOC() re-allocates _ptr bytes from the heap as a _size 103 | ** untyped item. 104 | ** INPUTS: _ptr: pointer to node to reallocate 105 | ** _size: size of node to allocate 106 | ** OUTPUTS: pointer to node allocated 107 | ** RETURN: pointer to node allocated 108 | ***********************************************************************/ 109 | #define PR_REALLOC(_ptr, _size) (PR_Realloc((_ptr), (_size))) 110 | 111 | /*********************************************************************** 112 | ** FUNCTION: PR_CALLOC() 113 | ** DESCRIPTION: 114 | ** PR_CALLOC() allocates a _size bytes untyped item from the heap 115 | ** and sets the allocated memory to all 0x00. 116 | ** INPUTS: _size: size of node to allocate 117 | ** OUTPUTS: pointer to node allocated 118 | ** RETURN: pointer to node allocated 119 | ***********************************************************************/ 120 | #define PR_CALLOC(_size) (PR_Calloc(1, (_size))) 121 | 122 | /*********************************************************************** 123 | ** FUNCTION: PR_NEWZAP() 124 | ** DESCRIPTION: 125 | ** PR_NEWZAP() allocates an item of type _struct from the heap 126 | ** and sets the allocated memory to all 0x00. 
127 | ** INPUTS: _struct: a data type 128 | ** OUTPUTS: pointer to _struct 129 | ** RETURN: pointer to _struct 130 | ***********************************************************************/ 131 | #define PR_NEWZAP(_struct) ((_struct*)PR_Calloc(1, sizeof(_struct))) 132 | 133 | /*********************************************************************** 134 | ** FUNCTION: PR_DELETE() 135 | ** DESCRIPTION: 136 | ** PR_DELETE() unallocates an object previously allocated via PR_NEW() 137 | ** or PR_NEWZAP() to the heap. 138 | ** INPUTS: pointer to previously allocated object 139 | ** OUTPUTS: the referenced object is returned to the heap 140 | ** RETURN: void 141 | ***********************************************************************/ 142 | #define PR_DELETE(_ptr) { PR_Free(_ptr); (_ptr) = NULL; } 143 | 144 | /*********************************************************************** 145 | ** FUNCTION: PR_FREEIF() 146 | ** DESCRIPTION: 147 | ** PR_FREEIF() conditionally unallocates an object previously allocated 148 | ** via PR_NEW() or PR_NEWZAP(). If the pointer to the object is 149 | ** equal to zero (0), the object is not released. 
150 | ** INPUTS: pointer to previously allocated object 151 | ** OUTPUTS: the referenced object is conditionally returned to the heap 152 | ** RETURN: void 153 | ***********************************************************************/ 154 | #define PR_FREEIF(_ptr) if (_ptr) PR_DELETE(_ptr) 155 | 156 | PR_END_EXTERN_C 157 | 158 | #endif /* prmem_h___ */ 159 | -------------------------------------------------------------------------------- /universalchardet/protypes.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is the Netscape Portable Runtime (NSPR). 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998-2000 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. 
If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | /* 39 | * This header typedefs the old 'native' types to the new PRs. 40 | * These definitions are scheduled to be eliminated at the earliest 41 | * possible time. The NSPR API is implemented and documented using 42 | * the new definitions. 43 | */ 44 | 45 | #if !defined(PROTYPES_H) 46 | #define PROTYPES_H 47 | 48 | typedef PRUintn uintn; 49 | #ifndef _XP_Core_ 50 | typedef PRIntn intn; 51 | #endif 52 | 53 | /* 54 | * It is trickier to define uint, int8, uint8, int16, uint16, 55 | * int32, uint32, int64, and uint64 because some of these int 56 | * types are defined by standard header files on some platforms. 57 | * Our strategy here is to include all such standard headers 58 | * first, and then define these int types only if they are not 59 | * defined by those standard headers. 60 | */ 61 | 62 | /* 63 | * BeOS defines all the int types below in its standard header 64 | * file SupportDefs.h. 65 | */ 66 | #ifdef XP_BEOS 67 | #include <support/SupportDefs.h> 68 | #endif 69 | 70 | /* 71 | * OpenVMS defines all the int types below in its standard 72 | * header files ints.h and types.h. 73 | */ 74 | #ifdef VMS 75 | #include <ints.h> 76 | #include <types.h> 77 | #endif 78 | 79 | /* 80 | * SVR4 typedef of uint is commonly found on UNIX machines. 81 | * 82 | * On AIX 4.3, sys/inttypes.h (which is included by sys/types.h) 83 | * defines the types int8, int16, int32, and int64. 
84 | */ 85 | #ifdef XP_UNIX 86 | #include <sys/types.h> 87 | #endif 88 | 89 | /* model.h on HP-UX defines int8, int16, and int32. */ 90 | #ifdef HPUX 91 | #include <model.h> 92 | #endif 93 | 94 | /* 95 | * uint 96 | */ 97 | 98 | #if !defined(XP_BEOS) && !defined(VMS) \ 99 | && !defined(XP_UNIX) || defined(NTO) 100 | typedef PRUintn uint; 101 | #endif 102 | 103 | /* 104 | * uint64 105 | */ 106 | 107 | #if !defined(XP_BEOS) && !defined(VMS) 108 | typedef PRUint64 uint64; 109 | #endif 110 | 111 | /* 112 | * uint32 113 | */ 114 | 115 | #if !defined(XP_BEOS) && !defined(VMS) 116 | #if !defined(XP_MAC) && !defined(_WIN32) && !defined(XP_OS2) && !defined(NTO) 117 | typedef PRUint32 uint32; 118 | #else 119 | typedef unsigned long uint32; 120 | #endif 121 | #endif 122 | 123 | /* 124 | * uint16 125 | */ 126 | 127 | #if !defined(XP_BEOS) && !defined(VMS) 128 | typedef PRUint16 uint16; 129 | #endif 130 | 131 | /* 132 | * uint8 133 | */ 134 | 135 | #if !defined(XP_BEOS) && !defined(VMS) 136 | typedef PRUint8 uint8; 137 | #endif 138 | 139 | /* 140 | * int64 141 | */ 142 | 143 | #if !defined(XP_BEOS) && !defined(VMS) \ 144 | && !defined(_PR_AIX_HAVE_BSD_INT_TYPES) 145 | typedef PRInt64 int64; 146 | #endif 147 | 148 | /* 149 | * int32 150 | */ 151 | 152 | #if !defined(XP_BEOS) && !defined(VMS) \ 153 | && !defined(_PR_AIX_HAVE_BSD_INT_TYPES) \ 154 | && !defined(HPUX) 155 | #if !defined(WIN32) || !defined(_WINSOCK2API_) /* defines its own "int32" */ 156 | #if !defined(XP_MAC) && !defined(_WIN32) && !defined(XP_OS2) && !defined(NTO) 157 | typedef PRInt32 int32; 158 | #else 159 | typedef long int32; 160 | #endif 161 | #endif 162 | #endif 163 | 164 | /* 165 | * int16 166 | */ 167 | 168 | #if !defined(XP_BEOS) && !defined(VMS) \ 169 | && !defined(_PR_AIX_HAVE_BSD_INT_TYPES) \ 170 | && !defined(HPUX) 171 | typedef PRInt16 int16; 172 | #endif 173 | 174 | /* 175 | * int8 176 | */ 177 | 178 | #if !defined(XP_BEOS) && !defined(VMS) \ 179 | && !defined(_PR_AIX_HAVE_BSD_INT_TYPES) \ 180 | && !defined(HPUX) 181 | 
typedef PRInt8 int8; 182 | #endif 183 | 184 | typedef PRFloat64 float64; 185 | typedef PRUptrdiff uptrdiff_t; 186 | typedef PRUword uprword_t; 187 | typedef PRWord prword_t; 188 | 189 | 190 | /* Re: prbit.h */ 191 | #define TEST_BIT PR_TEST_BIT 192 | #define SET_BIT PR_SET_BIT 193 | #define CLEAR_BIT PR_CLEAR_BIT 194 | 195 | /* Re: prarena.h->plarena.h */ 196 | #define PRArena PLArena 197 | #define PRArenaPool PLArenaPool 198 | #define PRArenaStats PLArenaStats 199 | #define PR_ARENA_ALIGN PL_ARENA_ALIGN 200 | #define PR_INIT_ARENA_POOL PL_INIT_ARENA_POOL 201 | #define PR_ARENA_ALLOCATE PL_ARENA_ALLOCATE 202 | #define PR_ARENA_GROW PL_ARENA_GROW 203 | #define PR_ARENA_MARK PL_ARENA_MARK 204 | #define PR_CLEAR_UNUSED PL_CLEAR_UNUSED 205 | #define PR_CLEAR_ARENA PL_CLEAR_ARENA 206 | #define PR_ARENA_RELEASE PL_ARENA_RELEASE 207 | #define PR_COUNT_ARENA PL_COUNT_ARENA 208 | #define PR_ARENA_DESTROY PL_ARENA_DESTROY 209 | #define PR_InitArenaPool PL_InitArenaPool 210 | #define PR_FreeArenaPool PL_FreeArenaPool 211 | #define PR_FinishArenaPool PL_FinishArenaPool 212 | #define PR_CompactArenaPool PL_CompactArenaPool 213 | #define PR_ArenaFinish PL_ArenaFinish 214 | #define PR_ArenaAllocate PL_ArenaAllocate 215 | #define PR_ArenaGrow PL_ArenaGrow 216 | #define PR_ArenaRelease PL_ArenaRelease 217 | #define PR_ArenaCountAllocation PL_ArenaCountAllocation 218 | #define PR_ArenaCountInplaceGrowth PL_ArenaCountInplaceGrowth 219 | #define PR_ArenaCountGrowth PL_ArenaCountGrowth 220 | #define PR_ArenaCountRelease PL_ArenaCountRelease 221 | #define PR_ArenaCountRetract PL_ArenaCountRetract 222 | 223 | /* Re: prhash.h->plhash.h */ 224 | #define PRHashEntry PLHashEntry 225 | #define PRHashTable PLHashTable 226 | #define PRHashNumber PLHashNumber 227 | #define PRHashFunction PLHashFunction 228 | #define PRHashComparator PLHashComparator 229 | #define PRHashEnumerator PLHashEnumerator 230 | #define PRHashAllocOps PLHashAllocOps 231 | #define PR_NewHashTable PL_NewHashTable 232 | 
#define PR_HashTableDestroy PL_HashTableDestroy 233 | #define PR_HashTableRawLookup PL_HashTableRawLookup 234 | #define PR_HashTableRawAdd PL_HashTableRawAdd 235 | #define PR_HashTableRawRemove PL_HashTableRawRemove 236 | #define PR_HashTableAdd PL_HashTableAdd 237 | #define PR_HashTableRemove PL_HashTableRemove 238 | #define PR_HashTableEnumerateEntries PL_HashTableEnumerateEntries 239 | #define PR_HashTableLookup PL_HashTableLookup 240 | #define PR_HashTableDump PL_HashTableDump 241 | #define PR_HashString PL_HashString 242 | #define PR_CompareStrings PL_CompareStrings 243 | #define PR_CompareValues PL_CompareValues 244 | 245 | #if defined(XP_MAC) 246 | #ifndef TRUE /* Mac standard is lower case true */ 247 | #define TRUE 1 248 | #endif 249 | #ifndef FALSE /* Mac standard is lower case false */ 250 | #define FALSE 0 251 | #endif 252 | #endif 253 | 254 | #endif /* !defined(PROTYPES_H) */ 255 | --------------------------------------------------------------------------------