├── .gitignore ├── DetectorTest.m ├── Info.plist ├── LICENSE.md ├── README ├── UniversalDetector.h ├── UniversalDetector.m ├── UniversalDetector.xcodeproj ├── project.pbxproj ├── project.xcworkspace │ └── contents.xcworkspacedata └── xcuserdata │ └── NILTSH.xcuserdatad │ └── xcschemes │ ├── DetectorTest.xcscheme │ ├── UniversalDetector.xcscheme │ └── xcschememanagement.plist ├── UniversalDetector_Prefix.pch ├── WrappedUniversalDetector.cpp ├── WrappedUniversalDetector.h ├── kludge ├── kludge.c ├── nsError.h ├── nscore.h ├── prcpucfg.h ├── prmem.h ├── protypes.h └── prtypes.h ├── mozilla-release └── xpcom │ └── glue │ ├── nsDebug.h │ └── nsMemory.h ├── scan.pl └── universalchardet ├── moz.build ├── src ├── base │ ├── Big5Freq.tab │ ├── CharDistribution.cpp │ ├── CharDistribution.h │ ├── EUCKRFreq.tab │ ├── EUCTWFreq.tab │ ├── GB2312Freq.tab │ ├── JISFreq.tab │ ├── JpCntx.cpp │ ├── JpCntx.h │ ├── LangBulgarianModel.cpp │ ├── LangCyrillicModel.cpp │ ├── LangGreekModel.cpp │ ├── LangHebrewModel.cpp │ ├── LangHungarianModel.cpp │ ├── LangThaiModel.cpp │ ├── moz.build │ ├── nsBig5Prober.cpp │ ├── nsBig5Prober.h │ ├── nsCharSetProber.cpp │ ├── nsCharSetProber.h │ ├── nsCodingStateMachine.h │ ├── nsEUCJPProber.cpp │ ├── nsEUCJPProber.h │ ├── nsEUCKRProber.cpp │ ├── nsEUCKRProber.h │ ├── nsEUCTWProber.cpp │ ├── nsEUCTWProber.h │ ├── nsEscCharsetProber.cpp │ ├── nsEscCharsetProber.h │ ├── nsEscSM.cpp │ ├── nsGB2312Prober.cpp │ ├── nsGB2312Prober.h │ ├── nsHebrewProber.cpp │ ├── nsHebrewProber.h │ ├── nsLatin1Prober.cpp │ ├── nsLatin1Prober.h │ ├── nsMBCSGroupProber.cpp │ ├── nsMBCSGroupProber.h │ ├── nsMBCSSM.cpp │ ├── nsPkgInt.h │ ├── nsSBCSGroupProber.cpp │ ├── nsSBCSGroupProber.h │ ├── nsSBCharSetProber.cpp │ ├── nsSBCharSetProber.h │ ├── nsSJISProber.cpp │ ├── nsSJISProber.h │ ├── nsUTF8Prober.cpp │ ├── nsUTF8Prober.h │ ├── nsUniversalDetector.cpp │ └── nsUniversalDetector.h ├── moz.build └── xpcom │ ├── Makefile.in │ ├── moz.build │ ├── nsUdetXPCOMWrapper.cpp │ ├── 
nsUdetXPCOMWrapper.h │ ├── nsUniversalCharDetDll.h │ └── nsUniversalCharDetModule.cpp └── tests ├── CharsetDetectionTests.js ├── Makefile.in ├── bug171813_text.html ├── bug306272_text.html ├── bug421271_text.html ├── bug426271_text-euc-jp.html ├── bug426271_text-utf-8.html ├── bug431054_text.html ├── bug488426_text.html ├── bug547487_text.html ├── bug620106_text.html ├── bug631751be_text.html ├── bug631751le_text.html ├── bug638318_text.html ├── bug811363-1.text ├── bug811363-2.text ├── bug811363-3.text ├── bug811363-4.text ├── bug811363-5.text ├── bug811363-6.text ├── bug811363-7.text ├── bug811363-8.text ├── bug811363-9.text ├── bug811363-invalid-1.text ├── bug811363-invalid-2.text ├── bug811363-invalid-3.text ├── bug811363-invalid-4.text ├── bug811363-invalid-5.text ├── bug9357_text.html ├── moz.build ├── test_bug171813.html ├── test_bug306272.html ├── test_bug421271.html ├── test_bug426271-euc-jp.html ├── test_bug426271-utf-8.html ├── test_bug431054-japanese.html ├── test_bug431054.html ├── test_bug488426.html ├── test_bug547487.html ├── test_bug620106.html ├── test_bug631751be.html ├── test_bug631751le.html ├── test_bug638318.html ├── test_bug811363-1-1.html ├── test_bug811363-1-2.html ├── test_bug811363-1-3.html ├── test_bug811363-1-4.html ├── test_bug811363-1-5.html ├── test_bug811363-2-1.html ├── test_bug811363-2-2.html ├── test_bug811363-2-3.html ├── test_bug811363-2-4.html ├── test_bug811363-2-5.html ├── test_bug811363-2-6.html ├── test_bug811363-2-7.html ├── test_bug811363-2-8.html ├── test_bug811363-2-9.html └── test_bug9357.html /.gitignore: -------------------------------------------------------------------------------- 1 | # Xcode 2 | build/* 3 | *.pbxuser 4 | !default.pbxuser 5 | *.mode1v3 6 | !default.mode1v3 7 | *.mode2v3 8 | !default.mode2v3 9 | *.perspectivev3 10 | !default.perspectivev3 11 | *.xcworkspace 12 | !default.xcworkspace 13 | xcuserdata 14 | profile 15 | *.moved-aside 16 | 17 | ## Ignore incredibly annoying .DS_Store files 18 | 
/*
 * Command line test driver.
 *
 * For every path passed in argv this prints the file name, the encoding guessed by
 * UniversalDetector (localized encoding name and MIME charset), the detector's confidence
 * as a percentage, and the encoding AppKit's NSAttributedString chose for the same bytes
 * ("(same result)" when both agree).
 *
 * Files that cannot be read, and files AppKit cannot load, are reported through NSLog and
 * skipped. Empty files produce an "Error: empty file!" entry on stdout.
 *
 * Always returns EXIT_SUCCESS.
 */
int main(int argc, char **argv)
{
	@autoreleasepool {
		// Switch on the Mac Roman heuristic that -[UniversalDetector analyzeBytes:length:] consults.
		[[NSUserDefaults standardUserDefaults] setBool:YES
												forKey:UniversalDetectorUseMacRomanHeuristic];

		for (int i = 1; i < argc; i++)
		{
			// One pool per file: the file contents, the detector and all temporaries are
			// released at the end of every iteration, including the early `continue` exits.
			@autoreleasepool {
				// Start every file with a clean error so a previous failure is never reported twice.
				NSError *error = nil;

				NSString *filePath = [NSString stringWithUTF8String:argv[i]];
				NSString *fileName = [filePath lastPathComponent];

				NSData *data = [NSData dataWithContentsOfFile:filePath
													  options:0
														error:&error];

				if (data == nil) {
					NSLog(@"%@", error);
					continue;
				}

				if ([data length] == 0) {
					NSString *emptyMessage = [NSString stringWithFormat:@"%@\n\t%@", fileName, @"Error: empty file!"];
					printf("%s\n\n", [emptyMessage UTF8String]);
					continue;
				}

				// You need a new detector for each piece of data you want to examine!
				// It is created only once the input is known to be usable, and autoreleased so
				// that no exit path below can leak it. The string returned by -MIMECharset is
				// owned by the detector, so the detector has to outlive the formatting below.
				UniversalDetector *detector = [[[UniversalDetector alloc] init] autorelease];

				[detector analyzeData:data];
				NSString *MIMECharsetName = [detector MIMECharset];
				NSStringEncoding encoding = [detector encoding];
				NSStringEncoding appKitEncoding = 0;

				{
					NSDictionary *documentAttributes = nil;

					// UniversalDetector does not differentiate between Windows Latin 1 and Mac Roman
					// while AppKit has an apparent Mac Roman bias.
					NSAttributedString *text = [[NSAttributedString alloc] initWithData:data
																				options:@{}
																	 documentAttributes:&documentAttributes
																				  error:&error];

					if (text == nil) {
						NSLog(@"%@", error);
						continue;
					}

					[text release];

					// NSStringEncoding is an NSUInteger. Reading it back as a signed int would
					// mangle encodings that have the high bit set and make them compare unequal
					// to the detector's result.
					NSNumber *encodingNumber = documentAttributes[NSCharacterEncodingDocumentAttribute];
					appKitEncoding = [encodingNumber unsignedIntegerValue];
				}

				NSString *appKitResultString = nil;
				if (appKitEncoding != 0) {
					if (appKitEncoding != encoding) {
						appKitResultString = [NSString stringWithFormat:@"\"%@\"",
											  [NSString localizedNameOfStringEncoding:appKitEncoding]];
					}
					else {
						appKitResultString = @"(same result)";
					}
				}

				NSString *report = [NSString stringWithFormat:
									@"%@\n"
									"\t" "\"%@\" (%@)\n"
									"\t" "confidence:% 6.1f%%"
									"\n"
									"\t" "AppKit: %@",
									fileName,
									(encoding != 0) ? [NSString localizedNameOfStringEncoding:encoding] : @"UNKNOWN",
									(MIMECharsetName != nil) ? MIMECharsetName : @"UNKNOWN",
									([detector confidence] * 100.0f),
									(appKitResultString != nil) ? appKitResultString : @"UNDEFINED"];

				printf("%s\n\n", [report UTF8String]);
			}
		}
	}

	return EXIT_SUCCESS;
}
21 | CFBundleVersion 22 | 1.0 23 | NSPrincipalClass 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | This is a branch-off of the UniversalDetector.framework source code taken from the Perian SVN using the following procedure: 2 | 3 | git svn clone http://svn.perian.org/trunk/ 4 | git clone --no-hardlinks trunk UniversalDetector 5 | cd UniversalDetector 6 | git filter-branch --subdirectory-filter UniversalDetector HEAD -- --all 7 | git reset --hard 8 | git gc --aggressive 9 | git prune 10 | git remote rm origin 11 | 12 | The procedure above is based on 13 | http://stackoverflow.com/questions/359424/detach-subdirectory-into-separate-git-repository 14 | and keeps the history intact. 15 | 16 | I have put this up with the intention of collecting the various efforts to maintain and change this framework's source code here on GitHub. 17 | 18 | The core of the framework is the (LGPL) Mozilla automatic charset detection library. You can read more about it here: 19 | http://www.mozilla.org/projects/intl/detectorsrc.html -------------------------------------------------------------------------------- /UniversalDetector.h: -------------------------------------------------------------------------------- 1 | #import 2 | 3 | // You can enable this heuristic by setting the BOOL with that key in NSUserDefaults -standardUserDefaults to YES. 4 | // In this case, only -encoding will be valid and -MIMECharset will be invalid. 
// Largest slice handed to UniversalDetectorHandleData() in one call. Its length parameter is
// an `int`, so larger buffers are fed in pieces of this size (1 GiB, below the int maximum).
static const NSUInteger UniversalDetectorMaxChunkLength = 0x40000000;

// Number of leading bytes the Mac Roman heuristic scans for a carriage return.
static const NSUInteger UniversalDetectorCRSearchWindowSize = 4096;

// Returns YES when the first carriage return found within the search window is NOT followed
// by a newline, i.e. the data could use CR-only line endings and thus possibly be Mac Roman.
// A carriage return in the very last byte has no successor to inspect and counts as bare.
static BOOL UniversalDetectorHasBareCarriageReturn(const char *bytes, NSUInteger length)
{
    if (bytes == NULL || length == 0) {
        return NO;
    }

    const char *crPtr = memchr(bytes, '\r', MIN(length, UniversalDetectorCRSearchWindowSize));
    if (crPtr == NULL) {
        return NO;
    }

    NSUInteger crIndex = (NSUInteger)(crPtr - bytes);
    if (crIndex + 1 >= length) {
        return YES;
    }
    return bytes[crIndex + 1] != '\n';
}


@implementation UniversalDetector

// Creates a detector backed by a freshly allocated wrapped detector object.
-(instancetype)init
{
    self = [super init];

    if(self)
    {
        detectorPtr = AllocUniversalDetector();
        charsetName = nil;
        confidence = 0;
        possiblyMacRoman = NO;
    }
    return self;
}

-(void)dealloc
{
    FreeUniversalDetector(detectorPtr);
    [charsetName release];
    [super dealloc];
}

// Drops every cached result so the next query asks the wrapped detector again.
-(void)invalidateCachedResult
{
    [charsetName release];
    charsetName = nil;
    confidence = 0;
}

// Shared implementation behind the three public -analyze… methods.
// Feeds `length` bytes to the wrapped detector in slices that fit its `int` length parameter,
// re-evaluates the Mac Roman heuristic on the start of the buffer and invalidates the cache.
// NULL or empty input is ignored.
-(void)feedAllBytes:(const char *)bytes length:(NSUInteger)length
{
    if (bytes == NULL || length == 0) {
        return;
    }

    NSUInteger offset = 0;
    while (offset < length) {
        NSUInteger chunkLength = MIN(length - offset, UniversalDetectorMaxChunkLength);
        UniversalDetectorHandleData(detectorPtr, bytes + offset, (int)chunkLength);
        offset += chunkLength;
    }

    BOOL useMacRomanHeuristic = [[NSUserDefaults standardUserDefaults] boolForKey:UniversalDetectorUseMacRomanHeuristic];

    // Search for a carriage return (cr) without a following newline.
    // We do this to determine, if the data could possibly be MacRoman.
    possiblyMacRoman = useMacRomanHeuristic && UniversalDetectorHasBareCarriageReturn(bytes, length);

    [self invalidateCachedResult];
}

// Reads the file at `path` (memory-mapped when safe) and analyzes its contents.
// Does nothing if the file cannot be read.
-(void)analyzeContentsOfFile:(NSString *)path
{
    NSData *data = [[NSData alloc] initWithContentsOfFile:path options:NSDataReadingMappedIfSafe error:NULL];

    if (data) {
        [self feedAllBytes:(const char *)[data bytes] length:[data length]];
    }
    [data release];
}

// Analyzes the bytes of `data`. The full NSUInteger length is honoured; it is no longer
// truncated to an int.
-(void)analyzeData:(NSData *)data
{
    [self feedAllBytes:(const char *)[data bytes] length:[data length]];
}

// Analyzes `len` bytes starting at `data`. A NULL pointer or a non-positive length is ignored.
-(void)analyzeBytes:(const char *)data length:(int)len
{
    if (len <= 0) {
        return;
    }
    [self feedAllBytes:data length:(NSUInteger)len];
}

// Resets the wrapped detector and forgets everything derived from earlier data, so that
// -MIMECharset, -encoding and -confidence cannot return results from before the reset.
-(void)reset
{
    UniversalDetectorReset(detectorPtr);

    [self invalidateCachedResult];
    possiblyMacRoman = NO;
}

// Returns whatever UniversalDetectorDone() reports for the wrapped detector.
-(BOOL)done
{
    return UniversalDetectorDone(detectorPtr) ? YES : NO;
}

// Returns the charset name reported by the wrapped detector, or nil if it reports none.
// The string is cached until more data is analyzed or the detector is reset, and it is owned
// by the receiver: retain it if it has to outlive the detector.
-(NSString *)MIMECharset
{
    if(!charsetName)
    {
        const char *cstr=UniversalDetectorCharset(detectorPtr, &confidence);
        if(!cstr) return nil;
        charsetName=[[NSString alloc] initWithUTF8String:cstr];
    }
    return charsetName;
}

// Maps the detected charset name to an NSStringEncoding. Returns 0 when nothing was detected
// or the name is unknown to CoreFoundation.
-(NSStringEncoding)encoding
{
    NSString *mimecharset=[self MIMECharset];
    if(!mimecharset) return 0;

    CFStringEncoding cfenc=CFStringConvertIANACharSetNameToEncoding((CFStringRef)mimecharset);
    if(cfenc==kCFStringEncodingInvalidId) return 0;

    // UniversalDetector detects CP949 but returns "EUC-KR" because CP949 lacks an IANA name.
    // Kludge to make strings decode properly anyway.
    if(cfenc==kCFStringEncodingEUC_KR) cfenc=kCFStringEncodingDOSKorean;
    // Something similar happens with "Shift_JIS".
    if(cfenc==kCFStringEncodingShiftJIS) cfenc=kCFStringEncodingDOSJapanese;

    NSStringEncoding encoding = CFStringConvertEncodingToNSStringEncoding(cfenc);

    // With the heuristic enabled, data containing a bare carriage return that was detected as
    // Windows Latin 1 or Shift JIS is reported as Mac Roman instead.
    if (possiblyMacRoman &&
        (encoding == NSWindowsCP1252StringEncoding ||
         encoding == NSShiftJISStringEncoding)) {
        encoding = NSMacOSRomanStringEncoding;
    }

    return encoding;
}

// Returns the confidence value the wrapped detector reported together with the charset,
// querying the detector first if no result is cached yet.
-(float)confidence
{
    if(!charsetName) [self MIMECharset];
    return confidence;
}

@end
orderHint 15 | 1 16 | 17 | 18 | SuppressBuildableAutocreation 19 | 20 | 1B0DDCAA0A2D0AD10009B697 21 | 22 | primary 23 | 24 | 25 | 3DBF4F560BCD396800DA401A 26 | 27 | primary 28 | 29 | 30 | 8DC2EF4F0486A6940098B216 31 | 32 | primary 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /UniversalDetector_Prefix.pch: -------------------------------------------------------------------------------- 1 | // 2 | // Prefix header for all source files of the 'UniversalCharDet' target in the 'UniversalCharDet' project. 3 | // 4 | 5 | #ifdef __OBJC__ 6 | #import 7 | #endif 8 | 9 | #include 10 | 11 | #ifndef HAS_MOVE_SEMANTICS // Crude C++11 detection. 12 | # define nullptr 0 13 | #endif 14 | 15 | #define OS_X_FRAMEWORK_BUILD 1 -------------------------------------------------------------------------------- /WrappedUniversalDetector.cpp: -------------------------------------------------------------------------------- 1 | #include "WrappedUniversalDetector.h" 2 | 3 | #include "nscore.h" 4 | #include "nsUniversalDetector.h" 5 | #include "nsCharSetProber.h" 6 | 7 | // You are welcome to fix this ObjC wrapper to allow initializing nsUniversalDetector with a non-zero value for aLanguageFilter! 
8 | 9 | class wrappedUniversalDetector:public nsUniversalDetector 10 | { 11 | public: 12 | wrappedUniversalDetector():nsUniversalDetector(NS_FILTER_ALL) {} 13 | 14 | void Report(const char* aCharset) {} 15 | 16 | const char *charset(float &confidence) 17 | { 18 | if(!mGotData) 19 | { 20 | confidence=0; 21 | return 0; 22 | } 23 | 24 | if(mDetectedCharset) 25 | { 26 | confidence=1; 27 | return mDetectedCharset; 28 | } 29 | 30 | switch(mInputState) 31 | { 32 | case eHighbyte: 33 | { 34 | float proberConfidence; 35 | float maxProberConfidence = (float)0.0; 36 | PRInt32 maxProber = 0; 37 | 38 | for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 39 | { 40 | if (mCharSetProbers[i]) 41 | { 42 | proberConfidence = mCharSetProbers[i]->GetConfidence(); 43 | if (proberConfidence > maxProberConfidence) 44 | { 45 | maxProberConfidence = proberConfidence; 46 | maxProber = i; 47 | } 48 | } 49 | } 50 | 51 | if (mCharSetProbers[maxProber]) { 52 | confidence=maxProberConfidence; 53 | return mCharSetProbers[maxProber]->GetCharSetName(); 54 | } 55 | } 56 | break; 57 | 58 | case ePureAscii: 59 | confidence=1.0; 60 | return "UTF-8"; 61 | default: 62 | break; 63 | } 64 | 65 | confidence=0; 66 | return 0; 67 | } 68 | 69 | bool done() 70 | { 71 | if(mDetectedCharset) return true; 72 | return false; 73 | } 74 | 75 | /* 76 | void debug() 77 | { 78 | for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 79 | { 80 | // If no data was received the array might stay filled with nulls 81 | // the way it was initialized in the constructor. 
82 | if (mCharSetProbers[i]) 83 | mCharSetProbers[i]->DumpStatus(); 84 | } 85 | } 86 | */ 87 | 88 | void reset() { Reset(); } 89 | }; 90 | 91 | 92 | 93 | extern "C" { 94 | 95 | void *AllocUniversalDetector() 96 | { 97 | return (void *)new wrappedUniversalDetector; 98 | } 99 | 100 | void FreeUniversalDetector(void *detectorPtr) 101 | { 102 | delete (wrappedUniversalDetector *)detectorPtr; 103 | } 104 | 105 | void UniversalDetectorHandleData(void *detectorPtr,const char *data,int length) 106 | { 107 | wrappedUniversalDetector *detector=(wrappedUniversalDetector *)detectorPtr; 108 | if(detector->done()) return; 109 | detector->HandleData(data,length); 110 | } 111 | 112 | void UniversalDetectorReset(void *detectorPtr) 113 | { 114 | wrappedUniversalDetector *detector=(wrappedUniversalDetector *)detectorPtr; 115 | detector->reset(); 116 | } 117 | 118 | int UniversalDetectorDone(void *detectorPtr) 119 | { 120 | wrappedUniversalDetector *detector=(wrappedUniversalDetector *)detectorPtr; 121 | return detector->done()?1:0; 122 | } 123 | 124 | const char *UniversalDetectorCharset(void *detectorPtr, float *confidence) 125 | { 126 | wrappedUniversalDetector *detector=(wrappedUniversalDetector *)detectorPtr; 127 | return detector->charset(*confidence); 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /WrappedUniversalDetector.h: -------------------------------------------------------------------------------- 1 | #ifndef __WrappedUniversalDetector_h__ 2 | #define __WrappedUniversalDetector_h__ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | void *AllocUniversalDetector(void); 9 | void FreeUniversalDetector(void *detectorPtr); 10 | void UniversalDetectorHandleData(void *detectorPtr,const char *data,int length); 11 | void UniversalDetectorReset(void *detectorPtr); 12 | int UniversalDetectorDone(void *detectorPtr); 13 | const char *UniversalDetectorCharset(void *detectorPtr,float *confidence); 14 | 15 | 16 | 
/*
 * Minimal stand-ins for the PR_* memory functions: each one forwards directly to the
 * C library function of the same purpose.
 * NOTE(review): PR_IMPLEMENT and PRUint32 are presumably supplied by "prmem.h" and the
 * headers it pulls in - confirm against the kludge headers.
 */

/* Allocates `size` bytes with malloc() and returns malloc()'s result unchanged. */
PR_IMPLEMENT(void *) PR_Malloc(PRUint32 size)
{
    return malloc(size);
}

/* Allocates `nelem` elements of `elsize` bytes each with calloc(). */
PR_IMPLEMENT(void *) PR_Calloc(PRUint32 nelem, PRUint32 elsize)
{
    return calloc(nelem, elsize);
}

/* Resizes the allocation at `ptr` to `size` bytes with realloc(). */
PR_IMPLEMENT(void *) PR_Realloc(void *ptr, PRUint32 size)
{
    return realloc(ptr, size);
}

/* Releases `ptr` with free(). */
PR_IMPLEMENT(void) PR_Free(void *ptr)
{
    free(ptr);
}
16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998-2000 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 
/*
 * Static CPU/ABI description used by the kludge headers: byte order plus the size,
 * bit width and alignment of the basic C types, expressed as preprocessor constants.
 *
 * NOTE(review): every value below is hard-coded. Together they describe a big-endian
 * target where int, long, pointers and machine words are all 4 bytes wide. Nothing here
 * adapts to the compiler's actual target - verify that the code including this header
 * does not rely on these values when building for a little-endian or 64-bit architecture.
 */
#ifndef nspr_cpucfg___
#define nspr_cpucfg___

#ifndef XP_MAC
#define XP_MAC
#endif

/* Byte order: unconditionally big-endian (see the review note above). */
#undef IS_LITTLE_ENDIAN
#define IS_BIG_ENDIAN 1

#define HAVE_LONG_LONG

#define PR_AF_INET6 30 /* same as AF_INET6 */

/* Sizes in bytes. */
#define PR_BYTES_PER_BYTE 1L
#define PR_BYTES_PER_SHORT 2L
#define PR_BYTES_PER_INT 4L
#define PR_BYTES_PER_INT64 8L
#define PR_BYTES_PER_LONG 4L
#define PR_BYTES_PER_FLOAT 4L
#define PR_BYTES_PER_DOUBLE 8L
#define PR_BYTES_PER_WORD 4L
#define PR_BYTES_PER_DWORD 8L

/* Sizes in bits. */
#define PR_BITS_PER_BYTE 8L
#define PR_BITS_PER_SHORT 16L
#define PR_BITS_PER_INT 32L
#define PR_BITS_PER_INT64 64L
#define PR_BITS_PER_LONG 32L
#define PR_BITS_PER_FLOAT 32L
#define PR_BITS_PER_DOUBLE 64L
#define PR_BITS_PER_WORD 32L

/* Base-2 logarithms of the bit sizes above. */
#define PR_BITS_PER_BYTE_LOG2 3L
#define PR_BITS_PER_SHORT_LOG2 4L
#define PR_BITS_PER_INT_LOG2 5L
#define PR_BITS_PER_INT64_LOG2 6L
#define PR_BITS_PER_LONG_LOG2 5L
#define PR_BITS_PER_FLOAT_LOG2 5L
#define PR_BITS_PER_DOUBLE_LOG2 6L
#define PR_BITS_PER_WORD_LOG2 5L

/* Alignment requirements in bytes. */
#define PR_ALIGN_OF_SHORT 2L
#define PR_ALIGN_OF_INT 4L
#define PR_ALIGN_OF_LONG 4L
#define PR_ALIGN_OF_INT64 2L
#define PR_ALIGN_OF_FLOAT 4L
#define PR_ALIGN_OF_DOUBLE 4L
#define PR_ALIGN_OF_POINTER 4L
#define PR_ALIGN_OF_WORD 4L

/* Base-2 logarithms of the word/dword sizes. */
#define PR_BYTES_PER_WORD_LOG2 2L
#define PR_BYTES_PER_DWORD_LOG2 3L
#define PR_WORDS_PER_DWORD_LOG2 1L

/*
 * Unprefixed aliases for the PR_* constants above, available unless
 * NO_NSPR_10_SUPPORT is defined.
 */
#ifndef NO_NSPR_10_SUPPORT
#define BYTES_PER_BYTE PR_BYTES_PER_BYTE
#define BYTES_PER_SHORT PR_BYTES_PER_SHORT
#define BYTES_PER_INT PR_BYTES_PER_INT
#define BYTES_PER_INT64 PR_BYTES_PER_INT64
#define BYTES_PER_LONG PR_BYTES_PER_LONG
#define BYTES_PER_FLOAT PR_BYTES_PER_FLOAT
#define BYTES_PER_DOUBLE PR_BYTES_PER_DOUBLE
#define BYTES_PER_WORD PR_BYTES_PER_WORD
#define BYTES_PER_DWORD PR_BYTES_PER_DWORD

#define BITS_PER_BYTE PR_BITS_PER_BYTE
#define BITS_PER_SHORT PR_BITS_PER_SHORT
#define BITS_PER_INT PR_BITS_PER_INT
#define BITS_PER_INT64 PR_BITS_PER_INT64
#define BITS_PER_LONG PR_BITS_PER_LONG
#define BITS_PER_FLOAT PR_BITS_PER_FLOAT
#define BITS_PER_DOUBLE PR_BITS_PER_DOUBLE
#define BITS_PER_WORD PR_BITS_PER_WORD

#define BITS_PER_BYTE_LOG2 PR_BITS_PER_BYTE_LOG2
#define BITS_PER_SHORT_LOG2 PR_BITS_PER_SHORT_LOG2
#define BITS_PER_INT_LOG2 PR_BITS_PER_INT_LOG2
#define BITS_PER_INT64_LOG2 PR_BITS_PER_INT64_LOG2
#define BITS_PER_LONG_LOG2 PR_BITS_PER_LONG_LOG2
#define BITS_PER_FLOAT_LOG2 PR_BITS_PER_FLOAT_LOG2
#define BITS_PER_DOUBLE_LOG2 PR_BITS_PER_DOUBLE_LOG2
#define BITS_PER_WORD_LOG2 PR_BITS_PER_WORD_LOG2

#define ALIGN_OF_SHORT PR_ALIGN_OF_SHORT
#define ALIGN_OF_INT PR_ALIGN_OF_INT
#define ALIGN_OF_LONG PR_ALIGN_OF_LONG
#define ALIGN_OF_INT64 PR_ALIGN_OF_INT64
#define ALIGN_OF_FLOAT PR_ALIGN_OF_FLOAT
#define ALIGN_OF_DOUBLE PR_ALIGN_OF_DOUBLE
#define ALIGN_OF_POINTER PR_ALIGN_OF_POINTER
#define ALIGN_OF_WORD PR_ALIGN_OF_WORD

#define BYTES_PER_WORD_LOG2 PR_BYTES_PER_WORD_LOG2
#define BYTES_PER_DWORD_LOG2 PR_BYTES_PER_DWORD_LOG2
#define WORDS_PER_DWORD_LOG2 PR_WORDS_PER_DWORD_LOG2
#endif /* NO_NSPR_10_SUPPORT */

#endif /* nspr_cpucfg___ */
1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is the Netscape Portable Runtime (NSPR). 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998-2000 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | /* 39 | ** File: prmem.h 40 | ** Description: API to NSPR memory management functions 41 | ** 42 | */ 43 | #ifndef prmem_h___ 44 | #define prmem_h___ 45 | 46 | #include "prtypes.h" 47 | #include 48 | 49 | PR_BEGIN_EXTERN_C 50 | 51 | /* 52 | ** Thread safe memory allocation. 
53 | ** 54 | ** NOTE: pr wraps up malloc, free, calloc, realloc so they are already 55 | ** thread safe (and are not declared here - look in stdlib.h). 56 | */ 57 | 58 | /* 59 | ** PR_Malloc, PR_Calloc, PR_Realloc, and PR_Free have the same signatures 60 | ** as their libc equivalent malloc, calloc, realloc, and free, and have 61 | ** the same semantics. (Note that the argument type size_t is replaced 62 | ** by PRUint32.) Memory allocated by PR_Malloc, PR_Calloc, or PR_Realloc 63 | ** must be freed by PR_Free. 64 | */ 65 | 66 | NSPR_API(void *) PR_Malloc(PRUint32 size); 67 | 68 | NSPR_API(void *) PR_Calloc(PRUint32 nelem, PRUint32 elsize); 69 | 70 | NSPR_API(void *) PR_Realloc(void *ptr, PRUint32 size); 71 | 72 | NSPR_API(void) PR_Free(void *ptr); 73 | 74 | /* 75 | ** The following are some convenience macros defined in terms of 76 | ** PR_Malloc, PR_Calloc, PR_Realloc, and PR_Free. 77 | */ 78 | 79 | /*********************************************************************** 80 | ** FUNCTION: PR_MALLOC() 81 | ** DESCRIPTION: 82 | ** PR_NEW() allocates an untyped item of size _size from the heap. 83 | ** INPUTS: _size: size in bytes of item to be allocated 84 | ** OUTPUTS: untyped pointer to the node allocated 85 | ** RETURN: pointer to node or error returned from malloc(). 86 | ***********************************************************************/ 87 | #define PR_MALLOC(_bytes) (PR_Malloc((_bytes))) 88 | 89 | /*********************************************************************** 90 | ** FUNCTION: PR_NEW() 91 | ** DESCRIPTION: 92 | ** PR_NEW() allocates an item of type _struct from the heap. 93 | ** INPUTS: _struct: a data type 94 | ** OUTPUTS: pointer to _struct 95 | ** RETURN: pointer to _struct or error returns from malloc(). 
96 | ***********************************************************************/ 97 | #define PR_NEW(_struct) ((_struct *) PR_MALLOC(sizeof(_struct))) 98 | 99 | /*********************************************************************** 100 | ** FUNCTION: PR_REALLOC() 101 | ** DESCRIPTION: 102 | ** PR_REALLOC() re-allocates _ptr bytes from the heap as a _size 103 | ** untyped item. 104 | ** INPUTS: _ptr: pointer to node to reallocate 105 | ** _size: size of node to allocate 106 | ** OUTPUTS: pointer to node allocated 107 | ** RETURN: pointer to node allocated 108 | ***********************************************************************/ 109 | #define PR_REALLOC(_ptr, _size) (PR_Realloc((_ptr), (_size))) 110 | 111 | /*********************************************************************** 112 | ** FUNCTION: PR_CALLOC() 113 | ** DESCRIPTION: 114 | ** PR_CALLOC() allocates a _size bytes untyped item from the heap 115 | ** and sets the allocated memory to all 0x00. 116 | ** INPUTS: _size: size of node to allocate 117 | ** OUTPUTS: pointer to node allocated 118 | ** RETURN: pointer to node allocated 119 | ***********************************************************************/ 120 | #define PR_CALLOC(_size) (PR_Calloc(1, (_size))) 121 | 122 | /*********************************************************************** 123 | ** FUNCTION: PR_NEWZAP() 124 | ** DESCRIPTION: 125 | ** PR_NEWZAP() allocates an item of type _struct from the heap 126 | ** and sets the allocated memory to all 0x00. 
127 | ** INPUTS: _struct: a data type 128 | ** OUTPUTS: pointer to _struct 129 | ** RETURN: pointer to _struct 130 | ***********************************************************************/ 131 | #define PR_NEWZAP(_struct) ((_struct*)PR_Calloc(1, sizeof(_struct))) 132 | 133 | /*********************************************************************** 134 | ** FUNCTION: PR_DELETE() 135 | ** DESCRIPTION: 136 | ** PR_DELETE() unallocates an object previosly allocated via PR_NEW() 137 | ** or PR_NEWZAP() to the heap. 138 | ** INPUTS: pointer to previously allocated object 139 | ** OUTPUTS: the referenced object is returned to the heap 140 | ** RETURN: void 141 | ***********************************************************************/ 142 | #define PR_DELETE(_ptr) { PR_Free(_ptr); (_ptr) = NULL; } 143 | 144 | /*********************************************************************** 145 | ** FUNCTION: PR_FREEIF() 146 | ** DESCRIPTION: 147 | ** PR_FREEIF() conditionally unallocates an object previously allocated 148 | ** vial PR_NEW() or PR_NEWZAP(). If the pointer to the object is 149 | ** equal to zero (0), the object is not released. 
150 | ** INPUTS: pointer to previously allocated object 151 | ** OUTPUTS: the referenced object is conditionally returned to the heap 152 | ** RETURN: void 153 | ***********************************************************************/ 154 | #define PR_FREEIF(_ptr) if (_ptr) PR_DELETE(_ptr) 155 | 156 | PR_END_EXTERN_C 157 | 158 | #endif /* prmem_h___ */ 159 | -------------------------------------------------------------------------------- /kludge/protypes.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* ***** BEGIN LICENSE BLOCK ***** 3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 | * 5 | * The contents of this file are subject to the Mozilla Public License Version 6 | * 1.1 (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.mozilla.org/MPL/ 9 | * 10 | * Software distributed under the License is distributed on an "AS IS" basis, 11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 | * for the specific language governing rights and limitations under the 13 | * License. 14 | * 15 | * The Original Code is the Netscape Portable Runtime (NSPR). 16 | * 17 | * The Initial Developer of the Original Code is 18 | * Netscape Communications Corporation. 19 | * Portions created by the Initial Developer are Copyright (C) 1998-2000 20 | * the Initial Developer. All Rights Reserved. 21 | * 22 | * Contributor(s): 23 | * 24 | * Alternatively, the contents of this file may be used under the terms of 25 | * either the GNU General Public License Version 2 or later (the "GPL"), or 26 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 27 | * in which case the provisions of the GPL or the LGPL are applicable instead 28 | * of those above. 
If you wish to allow use of your version of this file only 29 | * under the terms of either the GPL or the LGPL, and not to allow others to 30 | * use your version of this file under the terms of the MPL, indicate your 31 | * decision by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL or the LGPL. If you do not delete 33 | * the provisions above, a recipient may use your version of this file under 34 | * the terms of any one of the MPL, the GPL or the LGPL. 35 | * 36 | * ***** END LICENSE BLOCK ***** */ 37 | 38 | /* 39 | * This header typedefs the old 'native' types to the new PRs. 40 | * These definitions are scheduled to be eliminated at the earliest 41 | * possible time. The NSPR API is implemented and documented using 42 | * the new definitions. 43 | */ 44 | 45 | #if !defined(PROTYPES_H) 46 | #define PROTYPES_H 47 | 48 | typedef PRUintn uintn; 49 | #ifndef _XP_Core_ 50 | typedef PRIntn intn; 51 | #endif 52 | 53 | /* 54 | * It is trickier to define uint, int8, uint8, int16, uint16, 55 | * int32, uint32, int64, and uint64 because some of these int 56 | * types are defined by standard header files on some platforms. 57 | * Our strategy here is to include all such standard headers 58 | * first, and then define these int types only if they are not 59 | * defined by those standard headers. 60 | */ 61 | 62 | /* 63 | * BeOS defines all the int types below in its standard header 64 | * file SupportDefs.h. 65 | */ 66 | #ifdef XP_BEOS 67 | #include 68 | #endif 69 | 70 | /* 71 | * OpenVMS defines all the int types below in its standard 72 | * header files ints.h and types.h. 73 | */ 74 | #ifdef VMS 75 | #include 76 | #include 77 | #endif 78 | 79 | /* 80 | * SVR4 typedef of uint is commonly found on UNIX machines. 81 | * 82 | * On AIX 4.3, sys/inttypes.h (which is included by sys/types.h) 83 | * defines the types int8, int16, int32, and int64. 
84 | */ 85 | #ifdef XP_UNIX 86 | #include 87 | #endif 88 | 89 | /* model.h on HP-UX defines int8, int16, and int32. */ 90 | #ifdef HPUX 91 | #include 92 | #endif 93 | 94 | /* 95 | * uint 96 | */ 97 | 98 | #if !defined(XP_BEOS) && !defined(VMS) \ 99 | && !defined(XP_UNIX) || defined(NTO) 100 | typedef PRUintn uint; 101 | #endif 102 | 103 | /* 104 | * uint64 105 | */ 106 | 107 | #if !defined(XP_BEOS) && !defined(VMS) 108 | typedef PRUint64 uint64; 109 | #endif 110 | 111 | /* 112 | * uint32 113 | */ 114 | 115 | #if !defined(XP_BEOS) && !defined(VMS) && !defined(MAC_OS_X_VERSION_10_5) 116 | #if !defined(XP_MAC) && !defined(_WIN32) && !defined(XP_OS2) && !defined(NTO) 117 | typedef PRUint32 uint32; 118 | #else 119 | typedef unsigned long uint32; 120 | #endif 121 | #endif 122 | 123 | /* 124 | * uint16 125 | */ 126 | 127 | #if !defined(XP_BEOS) && !defined(VMS) 128 | typedef PRUint16 uint16; 129 | #endif 130 | 131 | /* 132 | * uint8 133 | */ 134 | 135 | #if !defined(XP_BEOS) && !defined(VMS) 136 | typedef PRUint8 uint8; 137 | #endif 138 | 139 | /* 140 | * int64 141 | */ 142 | 143 | #if !defined(XP_BEOS) && !defined(VMS) \ 144 | && !defined(_PR_AIX_HAVE_BSD_INT_TYPES) 145 | typedef PRInt64 int64; 146 | #endif 147 | 148 | /* 149 | * int32 150 | */ 151 | 152 | #if !defined(XP_BEOS) && !defined(VMS) \ 153 | && !defined(_PR_AIX_HAVE_BSD_INT_TYPES) \ 154 | && !defined(HPUX) 155 | #if !defined(WIN32) || !defined(_WINSOCK2API_) /* defines its own "int32" */ 156 | #if !defined(XP_MAC) && !defined(_WIN32) && !defined(XP_OS2) && !defined(NTO) 157 | typedef PRInt32 int32; 158 | #else 159 | typedef long int32; 160 | #endif 161 | #endif 162 | #endif 163 | 164 | /* 165 | * int16 166 | */ 167 | 168 | #if !defined(XP_BEOS) && !defined(VMS) \ 169 | && !defined(_PR_AIX_HAVE_BSD_INT_TYPES) \ 170 | && !defined(HPUX) 171 | typedef PRInt16 int16; 172 | #endif 173 | 174 | /* 175 | * int8 176 | */ 177 | 178 | #if !defined(XP_BEOS) && !defined(VMS) \ 179 | && !defined(_PR_AIX_HAVE_BSD_INT_TYPES) \ 
180 | && !defined(HPUX) 181 | typedef PRInt8 int8; 182 | #endif 183 | 184 | typedef PRFloat64 float64; 185 | typedef PRUptrdiff uptrdiff_t; 186 | typedef PRUword uprword_t; 187 | typedef PRWord prword_t; 188 | 189 | 190 | /* Re: prbit.h */ 191 | #define TEST_BIT PR_TEST_BIT 192 | #define SET_BIT PR_SET_BIT 193 | #define CLEAR_BIT PR_CLEAR_BIT 194 | 195 | /* Re: prarena.h->plarena.h */ 196 | #define PRArena PLArena 197 | #define PRArenaPool PLArenaPool 198 | #define PRArenaStats PLArenaStats 199 | #define PR_ARENA_ALIGN PL_ARENA_ALIGN 200 | #define PR_INIT_ARENA_POOL PL_INIT_ARENA_POOL 201 | #define PR_ARENA_ALLOCATE PL_ARENA_ALLOCATE 202 | #define PR_ARENA_GROW PL_ARENA_GROW 203 | #define PR_ARENA_MARK PL_ARENA_MARK 204 | #define PR_CLEAR_UNUSED PL_CLEAR_UNUSED 205 | #define PR_CLEAR_ARENA PL_CLEAR_ARENA 206 | #define PR_ARENA_RELEASE PL_ARENA_RELEASE 207 | #define PR_COUNT_ARENA PL_COUNT_ARENA 208 | #define PR_ARENA_DESTROY PL_ARENA_DESTROY 209 | #define PR_InitArenaPool PL_InitArenaPool 210 | #define PR_FreeArenaPool PL_FreeArenaPool 211 | #define PR_FinishArenaPool PL_FinishArenaPool 212 | #define PR_CompactArenaPool PL_CompactArenaPool 213 | #define PR_ArenaFinish PL_ArenaFinish 214 | #define PR_ArenaAllocate PL_ArenaAllocate 215 | #define PR_ArenaGrow PL_ArenaGrow 216 | #define PR_ArenaRelease PL_ArenaRelease 217 | #define PR_ArenaCountAllocation PL_ArenaCountAllocation 218 | #define PR_ArenaCountInplaceGrowth PL_ArenaCountInplaceGrowth 219 | #define PR_ArenaCountGrowth PL_ArenaCountGrowth 220 | #define PR_ArenaCountRelease PL_ArenaCountRelease 221 | #define PR_ArenaCountRetract PL_ArenaCountRetract 222 | 223 | /* Re: prhash.h->plhash.h */ 224 | #define PRHashEntry PLHashEntry 225 | #define PRHashTable PLHashTable 226 | #define PRHashNumber PLHashNumber 227 | #define PRHashFunction PLHashFunction 228 | #define PRHashComparator PLHashComparator 229 | #define PRHashEnumerator PLHashEnumerator 230 | #define PRHashAllocOps PLHashAllocOps 231 | #define 
PR_NewHashTable PL_NewHashTable 232 | #define PR_HashTableDestroy PL_HashTableDestroy 233 | #define PR_HashTableRawLookup PL_HashTableRawLookup 234 | #define PR_HashTableRawAdd PL_HashTableRawAdd 235 | #define PR_HashTableRawRemove PL_HashTableRawRemove 236 | #define PR_HashTableAdd PL_HashTableAdd 237 | #define PR_HashTableRemove PL_HashTableRemove 238 | #define PR_HashTableEnumerateEntries PL_HashTableEnumerateEntries 239 | #define PR_HashTableLookup PL_HashTableLookup 240 | #define PR_HashTableDump PL_HashTableDump 241 | #define PR_HashString PL_HashString 242 | #define PR_CompareStrings PL_CompareStrings 243 | #define PR_CompareValues PL_CompareValues 244 | 245 | #if defined(XP_MAC) 246 | #ifndef TRUE /* Mac standard is lower case true */ 247 | #define TRUE 1 248 | #endif 249 | #ifndef FALSE /* Mac standard is lower case false */ 250 | #define FALSE 0 251 | #endif 252 | #endif 253 | 254 | #endif /* !defined(PROTYPES_H) */ 255 | -------------------------------------------------------------------------------- /mozilla-release/xpcom/glue/nsMemory.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | #ifndef nsMemory_h__ 7 | #define nsMemory_h__ 8 | 9 | #ifndef OS_X_FRAMEWORK_BUILD 10 | 11 | #include "nsXPCOM.h" 12 | 13 | class nsIMemory; 14 | 15 | #define NS_MEMORY_CONTRACTID "@mozilla.org/xpcom/memory-service;1" 16 | #define NS_MEMORY_CID \ 17 | { /* 30a04e40-38e7-11d4-8cf5-0060b0fc14a3 */ \ 18 | 0x30a04e40, \ 19 | 0x38e7, \ 20 | 0x11d4, \ 21 | {0x8c, 0xf5, 0x00, 0x60, 0xb0, 0xfc, 0x14, 0xa3} \ 22 | } 23 | 24 | 25 | /** 26 | * Static helper routines to manage memory. 
These routines allow easy access 27 | * to xpcom's built-in (global) nsIMemory implementation, without needing 28 | * to go through the service manager to get it. However this requires clients 29 | * to link with the xpcom DLL. 30 | * 31 | * This class is not threadsafe and is intented for use only on the main 32 | * thread. 33 | */ 34 | class nsMemory 35 | { 36 | public: 37 | static NS_HIDDEN_(void*) Alloc(size_t size) 38 | { return NS_Alloc(size); } 39 | 40 | static NS_HIDDEN_(void*) Realloc(void* ptr, size_t size) 41 | { return NS_Realloc(ptr, size); } 42 | 43 | static NS_HIDDEN_(void) Free(void* ptr) 44 | { NS_Free(ptr); } 45 | 46 | static NS_COM_GLUE nsresult HeapMinimize(bool aImmediate); 47 | static NS_COM_GLUE void* Clone(const void* ptr, size_t size); 48 | static NS_COM_GLUE nsIMemory* GetGlobalMemoryService(); // AddRefs 49 | }; 50 | 51 | /** 52 | * Macro to free all elements of an XPCOM array of a given size using 53 | * freeFunc, then frees the array itself using nsMemory::Free(). 54 | * 55 | * Note that this macro (and its wrappers) can be used to deallocate a 56 | * partially- or completely-built array while unwinding an error 57 | * condition inside the XPCOM routine that was going to return the 58 | * array. For this to work on a partially-built array, your code 59 | * needs to be building the array from index 0 upwards, and simply 60 | * pass the number of elements that have already been built (and thus 61 | * need to be freed) as |size|. 62 | * 63 | * Thanks to for suggesting this form, which 64 | * allows the macro to be used with NS_RELEASE / NS_RELEASE_IF in 65 | * addition to nsMemory::Free. 66 | * 67 | * @param size Number of elements in the array. If not a constant, this 68 | * should be a int32_t. Note that this means this macro 69 | * will not work if size >= 2^31. 70 | * @param array The array to be freed. 71 | * @param freeFunc The function or macro to be used to free it. 
72 | * For arrays of nsISupports (or any class derived 73 | * from it), NS_IF_RELEASE (or NS_RELEASE) should be 74 | * passed as freeFunc. For most (all?) other pointer 75 | * types (including XPCOM strings and wstrings), 76 | * nsMemory::Free should be used, since the 77 | * shared-allocator (nsMemory) is what will have been 78 | * used to allocate the memory. 79 | */ 80 | #define NS_FREE_XPCOM_POINTER_ARRAY(size, array, freeFunc) \ 81 | PR_BEGIN_MACRO \ 82 | int32_t iter_ = int32_t(size); \ 83 | while (--iter_ >= 0) \ 84 | freeFunc((array)[iter_]); \ 85 | NS_Free((array)); \ 86 | PR_END_MACRO 87 | 88 | // convenience macros for commonly used calls. mmmmm. syntactic sugar. 89 | 90 | /** 91 | * Macro to free arrays of non-refcounted objects allocated by the 92 | * shared allocator (nsMemory) such as strings and wstrings. A 93 | * convenience wrapper around NS_FREE_XPCOM_POINTER_ARRAY. 94 | * 95 | * @param size Number of elements in the array. If not a constant, this 96 | * should be a int32_t. Note that this means this macro 97 | * will not work if size >= 2^31. 98 | * @param array The array to be freed. 99 | */ 100 | #define NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(size, array) \ 101 | NS_FREE_XPCOM_POINTER_ARRAY((size), (array), NS_Free) 102 | 103 | /** 104 | * Macro to free an array of pointers to nsISupports (or classes 105 | * derived from it). A convenience wrapper around 106 | * NS_FREE_XPCOM_POINTER_ARRAY. 107 | * 108 | * Note that if you know that none of your nsISupports pointers are 109 | * going to be 0, you can gain a bit of speed by calling 110 | * NS_FREE_XPCOM_POINTER_ARRAY directly and using NS_RELEASE as your 111 | * free function. 112 | * 113 | * @param size Number of elements in the array. If not a constant, this 114 | * should be a int32_t. Note that this means this macro 115 | * will not work if size >= 2^31. 116 | * @param array The array to be freed. 
117 | */ 118 | #define NS_FREE_XPCOM_ISUPPORTS_POINTER_ARRAY(size, array) \ 119 | NS_FREE_XPCOM_POINTER_ARRAY((size), (array), NS_IF_RELEASE) 120 | 121 | #endif /* OS_X_FRAMEWORK_BUILD */ 122 | 123 | /** 124 | * Helpful array length function for calculating the length of a 125 | * statically declared array. 126 | */ 127 | 128 | #define NS_ARRAY_LENGTH(array_) \ 129 | (sizeof(array_)/sizeof(array_[0])) 130 | 131 | #ifndef OS_X_FRAMEWORK_BUILD 132 | 133 | /** 134 | * A macro, NS_ALIGNMENT_OF(t_) that determines the alignment 135 | * requirements of a type. 136 | */ 137 | namespace mozilla { 138 | template 139 | struct AlignmentTestStruct 140 | { 141 | char c; 142 | T t; 143 | }; 144 | } 145 | 146 | #define NS_ALIGNMENT_OF(t_) \ 147 | (sizeof(mozilla::AlignmentTestStruct) - sizeof(t_)) 148 | 149 | /** 150 | * An enumeration type used to represent a method of assignment. 151 | */ 152 | enum nsAssignmentType { 153 | NS_ASSIGNMENT_COPY, // copy by value 154 | NS_ASSIGNMENT_DEPEND, // copy by reference 155 | NS_ASSIGNMENT_ADOPT // copy by reference (take ownership of resource) 156 | }; 157 | 158 | #endif /* OS_X_FRAMEWORK_BUILD */ 159 | 160 | #endif // nsMemory_h__ 161 | 162 | -------------------------------------------------------------------------------- /scan.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | 5 | my %charsets; 6 | 7 | for(@ARGV) 8 | { 9 | open FILE,$_ or die; 10 | $_=do {local $/; }; 11 | 12 | # $charsets{$1}=1 while(/SequenceModel.*?=.*?\{[^}"]+"([^"]*)"[^}]+\}/gs); 13 | $charsets{$1}=1 while(/"([A-Za-z0-9_\-]+)"/g); 14 | } 15 | 16 | print join "\n",sort keys %charsets; 17 | print "\n"; -------------------------------------------------------------------------------- /universalchardet/moz.build: -------------------------------------------------------------------------------- 1 | # -*- Mode: python; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 40 -*- 2 | # vim: set 
filetype=python: 3 | # This Source Code Form is subject to the terms of the Mozilla Public 4 | # License, v. 2.0. If a copy of the MPL was not distributed with this 5 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | 7 | DIRS += ['src'] 8 | TEST_DIRS += ['tests'] 9 | 10 | MODULE = 'universalchardet' 11 | 12 | -------------------------------------------------------------------------------- /universalchardet/src/base/CharDistribution.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | #include "CharDistribution.h" 7 | 8 | #include "JISFreq.tab" 9 | #include "Big5Freq.tab" 10 | #include "EUCKRFreq.tab" 11 | #include "EUCTWFreq.tab" 12 | #include "GB2312Freq.tab" 13 | #include "nsMemory.h" 14 | 15 | #define SURE_YES 0.99f 16 | #define SURE_NO 0.01f 17 | 18 | //return confidence base on received data 19 | float CharDistributionAnalysis::GetConfidence(void) 20 | { 21 | //if we didn't receive any character in our consideration range, or the 22 | // number of frequent characters is below the minimum threshold, return 23 | // negative answer 24 | if (mTotalChars <= 0 || mFreqChars <= mDataThreshold) 25 | return SURE_NO; 26 | 27 | if (mTotalChars != mFreqChars) { 28 | float r = mFreqChars / ((mTotalChars - mFreqChars) * mTypicalDistributionRatio); 29 | 30 | if (r < SURE_YES) 31 | return r; 32 | } 33 | //normalize confidence, (we don't want to be 100% sure) 34 | return SURE_YES; 35 | } 36 | 37 | EUCTWDistributionAnalysis::EUCTWDistributionAnalysis() 38 | { 39 | mCharToFreqOrder = EUCTWCharToFreqOrder; 40 | mTableSize = NS_ARRAY_LENGTH(EUCTWCharToFreqOrder); 41 | mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO; 42 
| } 43 | 44 | EUCKRDistributionAnalysis::EUCKRDistributionAnalysis() 45 | { 46 | mCharToFreqOrder = EUCKRCharToFreqOrder; 47 | mTableSize = NS_ARRAY_LENGTH(EUCKRCharToFreqOrder); 48 | mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO; 49 | } 50 | 51 | GB2312DistributionAnalysis::GB2312DistributionAnalysis() 52 | { 53 | mCharToFreqOrder = GB2312CharToFreqOrder; 54 | mTableSize = NS_ARRAY_LENGTH(GB2312CharToFreqOrder); 55 | mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO; 56 | } 57 | 58 | Big5DistributionAnalysis::Big5DistributionAnalysis() 59 | { 60 | mCharToFreqOrder = Big5CharToFreqOrder; 61 | mTableSize = NS_ARRAY_LENGTH(Big5CharToFreqOrder); 62 | mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO; 63 | } 64 | 65 | SJISDistributionAnalysis::SJISDistributionAnalysis() 66 | { 67 | mCharToFreqOrder = JISCharToFreqOrder; 68 | mTableSize = NS_ARRAY_LENGTH(JISCharToFreqOrder); 69 | mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO; 70 | } 71 | 72 | EUCJPDistributionAnalysis::EUCJPDistributionAnalysis() 73 | { 74 | mCharToFreqOrder = JISCharToFreqOrder; 75 | mTableSize = NS_ARRAY_LENGTH(JISCharToFreqOrder); 76 | mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO; 77 | } 78 | 79 | -------------------------------------------------------------------------------- /universalchardet/src/base/CharDistribution.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #ifndef CharDistribution_h__ 7 | #define CharDistribution_h__ 8 | 9 | #include "nscore.h" 10 | 11 | #define ENOUGH_DATA_THRESHOLD 1024 12 | 13 | #define MINIMUM_DATA_THRESHOLD 4 14 | 15 | class CharDistributionAnalysis 16 | { 17 | public: 18 | CharDistributionAnalysis() {Reset(false);} 19 | 20 | //feed a block of data and do distribution analysis 21 | void HandleData(const char* aBuf, uint32_t aLen) {} 22 | 23 | //Feed a character with known length 24 | void HandleOneChar(const char* aStr, uint32_t aCharLen) 25 | { 26 | int32_t order; 27 | 28 | //we only care about 2-bytes character in our distribution analysis 29 | order = (aCharLen == 2) ? GetOrder(aStr) : -1; 30 | 31 | if (order >= 0) 32 | { 33 | mTotalChars++; 34 | //order is valid 35 | if ((uint32_t)order < mTableSize) 36 | { 37 | if (512 > mCharToFreqOrder[order]) 38 | mFreqChars++; 39 | } 40 | } 41 | } 42 | 43 | //return confidence base on existing data 44 | float GetConfidence(void); 45 | 46 | //Reset analyser, clear any state 47 | void Reset(bool aIsPreferredLanguage) 48 | { 49 | mDone = false; 50 | mTotalChars = 0; 51 | mFreqChars = 0; 52 | mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD; 53 | } 54 | 55 | //It is not necessary to receive all data to draw conclusion. For charset detection, 56 | // certain amount of data is enough 57 | bool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;} 58 | 59 | protected: 60 | //we do not handle character base on its original encoding string, but 61 | //convert this encoding string to a number, here called order. 62 | //This allow multiple encoding of a language to share one frequency table 63 | virtual int32_t GetOrder(const char* str) {return -1;} 64 | 65 | //If this flag is set to true, detection is done and conclusion has been made 66 | bool mDone; 67 | 68 | //The number of characters whose frequency order is less than 512 69 | uint32_t mFreqChars; 70 | 71 | //Total character encounted. 
72 | uint32_t mTotalChars; 73 | 74 | //Number of hi-byte characters needed to trigger detection 75 | uint32_t mDataThreshold; 76 | 77 | //Mapping table to get frequency order from char order (get from GetOrder()) 78 | const int16_t *mCharToFreqOrder; 79 | 80 | //Size of above table 81 | uint32_t mTableSize; 82 | 83 | //This is a constant value varies from language to language, it is used in 84 | //calculating confidence. See my paper for further detail. 85 | float mTypicalDistributionRatio; 86 | }; 87 | 88 | 89 | class EUCTWDistributionAnalysis: public CharDistributionAnalysis 90 | { 91 | public: 92 | EUCTWDistributionAnalysis(); 93 | protected: 94 | 95 | //for euc-TW encoding, we are interested 96 | // first byte range: 0xc4 -- 0xfe 97 | // second byte range: 0xa1 -- 0xfe 98 | //no validation needed here. State machine has done that 99 | int32_t GetOrder(const char* str) 100 | { if ((unsigned char)*str >= (unsigned char)0xc4) 101 | return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1; 102 | else 103 | return -1; 104 | } 105 | }; 106 | 107 | 108 | class EUCKRDistributionAnalysis : public CharDistributionAnalysis 109 | { 110 | public: 111 | EUCKRDistributionAnalysis(); 112 | protected: 113 | //for euc-KR encoding, we are interested 114 | // first byte range: 0xb0 -- 0xfe 115 | // second byte range: 0xa1 -- 0xfe 116 | //no validation needed here. 
State machine has done that 117 | int32_t GetOrder(const char* str) 118 | { if ((unsigned char)*str >= (unsigned char)0xb0) 119 | return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 120 | else 121 | return -1; 122 | } 123 | }; 124 | 125 | class GB2312DistributionAnalysis : public CharDistributionAnalysis 126 | { 127 | public: 128 | GB2312DistributionAnalysis(); 129 | protected: 130 | //for GB2312 encoding, we are interested 131 | // first byte range: 0xb0 -- 0xfe 132 | // second byte range: 0xa1 -- 0xfe 133 | //no validation needed here. State machine has done that 134 | int32_t GetOrder(const char* str) 135 | { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) 136 | return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 137 | else 138 | return -1; 139 | } 140 | }; 141 | 142 | 143 | class Big5DistributionAnalysis : public CharDistributionAnalysis 144 | { 145 | public: 146 | Big5DistributionAnalysis(); 147 | protected: 148 | //for big5 encoding, we are interested 149 | // first byte range: 0xa4 -- 0xfe 150 | // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe 151 | //no validation needed here. 
State machine has done that 152 | int32_t GetOrder(const char* str) 153 | { if ((unsigned char)*str >= (unsigned char)0xa4) 154 | if ((unsigned char)str[1] >= (unsigned char)0xa1) 155 | return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63; 156 | else 157 | return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; 158 | else 159 | return -1; 160 | } 161 | }; 162 | 163 | class SJISDistributionAnalysis : public CharDistributionAnalysis 164 | { 165 | public: 166 | SJISDistributionAnalysis(); 167 | protected: 168 | //for sjis encoding, we are interested 169 | // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe 170 | // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe 171 | //no validation needed here. State machine has done that 172 | int32_t GetOrder(const char* str) 173 | { 174 | int32_t order; 175 | if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) 176 | order = 188 * ((unsigned char)str[0]-(unsigned char)0x81); 177 | else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) 178 | order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31); 179 | else 180 | return -1; 181 | order += (unsigned char)*(str+1) - 0x40; 182 | if ((unsigned char)str[1] > (unsigned char)0x7f) 183 | order--; 184 | return order; 185 | } 186 | }; 187 | 188 | class EUCJPDistributionAnalysis : public CharDistributionAnalysis 189 | { 190 | public: 191 | EUCJPDistributionAnalysis(); 192 | protected: 193 | //for euc-JP encoding, we are interested 194 | // first byte range: 0xa0 -- 0xfe 195 | // second byte range: 0xa1 -- 0xfe 196 | //no validation needed here. 
State machine has done that 197 | int32_t GetOrder(const char* str) 198 | { if ((unsigned char)*str >= (unsigned char)0xa0) 199 | return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; 200 | else 201 | return -1; 202 | } 203 | }; 204 | 205 | #endif //CharDistribution_h__ 206 | 207 | -------------------------------------------------------------------------------- /universalchardet/src/base/JpCntx.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | #ifndef __JPCNTX_H__ 7 | #define __JPCNTX_H__ 8 | 9 | #define NUM_OF_CATEGORY 6 10 | 11 | #include "nscore.h" 12 | 13 | #define ENOUGH_REL_THRESHOLD 100 14 | #define MAX_REL_THRESHOLD 1000 15 | 16 | //hiragana frequency category table 17 | extern const uint8_t jp2CharContext[83][83]; 18 | 19 | class JapaneseContextAnalysis 20 | { 21 | public: 22 | JapaneseContextAnalysis() {Reset(false);} 23 | 24 | void HandleData(const char* aBuf, uint32_t aLen); 25 | 26 | void HandleOneChar(const char* aStr, uint32_t aCharLen) 27 | { 28 | int32_t order; 29 | 30 | //if we received enough data, stop here 31 | if (mTotalRel > MAX_REL_THRESHOLD) mDone = true; 32 | if (mDone) return; 33 | 34 | //Only 2-bytes characters are of our interest 35 | order = (aCharLen == 2) ? 
GetOrder(aStr) : -1; 36 | if (order != -1 && mLastCharOrder != -1) 37 | { 38 | mTotalRel++; 39 | //count this sequence to its category counter 40 | mRelSample[jp2CharContext[mLastCharOrder][order]]++; 41 | } 42 | mLastCharOrder = order; 43 | } 44 | 45 | float GetConfidence(void); 46 | void Reset(bool aIsPreferredLanguage); 47 | bool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;} 48 | 49 | protected: 50 | virtual int32_t GetOrder(const char* str, uint32_t *charLen) = 0; 51 | virtual int32_t GetOrder(const char* str) = 0; 52 | 53 | //category counters, each integer counts sequences in its category 54 | uint32_t mRelSample[NUM_OF_CATEGORY]; 55 | 56 | //total sequence received 57 | uint32_t mTotalRel; 58 | 59 | //Number of sequences needed to trigger detection 60 | uint32_t mDataThreshold; 61 | 62 | //The order of previous char 63 | int32_t mLastCharOrder; 64 | 65 | //if last byte in current buffer is not the last byte of a character, we 66 | //need to know how many byte to skip in next buffer. 
67 | uint32_t mNeedToSkipCharNum; 68 | 69 | //If this flag is set to true, detection is done and conclusion has been made 70 | bool mDone; 71 | }; 72 | 73 | 74 | class SJISContextAnalysis : public JapaneseContextAnalysis 75 | { 76 | //SJISContextAnalysis(){}; 77 | protected: 78 | int32_t GetOrder(const char* str, uint32_t *charLen); 79 | 80 | int32_t GetOrder(const char* str) 81 | { 82 | //We only interested in Hiragana, so first byte is '\202' 83 | if (*str == '\202' && 84 | (unsigned char)*(str+1) >= (unsigned char)0x9f && 85 | (unsigned char)*(str+1) <= (unsigned char)0xf1) 86 | return (unsigned char)*(str+1) - (unsigned char)0x9f; 87 | return -1; 88 | } 89 | }; 90 | 91 | class EUCJPContextAnalysis : public JapaneseContextAnalysis 92 | { 93 | protected: 94 | int32_t GetOrder(const char* str, uint32_t *charLen); 95 | int32_t GetOrder(const char* str) 96 | //We only interested in Hiragana, so first byte is '\244' 97 | { 98 | if (*str == '\244' && 99 | (unsigned char)*(str+1) >= (unsigned char)0xa1 && 100 | (unsigned char)*(str+1) <= (unsigned char)0xf3) 101 | return (unsigned char)*(str+1) - (unsigned char)0xa1; 102 | return -1; 103 | } 104 | }; 105 | 106 | #endif /* __JPCNTX_H__ */ 107 | 108 | -------------------------------------------------------------------------------- /universalchardet/src/base/moz.build: -------------------------------------------------------------------------------- 1 | # -*- Mode: python; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 40 -*- 2 | # vim: set filetype=python: 3 | # This Source Code Form is subject to the terms of the Mozilla Public 4 | # License, v. 2.0. If a copy of the MPL was not distributed with this 5 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 

# Module name for the sources in this directory.
MODULE = 'universalchardet_s'

# C++ sources from this directory that are appended to the build's source list.
CPP_SOURCES += [
    'CharDistribution.cpp',
    'JpCntx.cpp',
    'LangBulgarianModel.cpp',
    'LangCyrillicModel.cpp',
    'LangGreekModel.cpp',
    'LangHebrewModel.cpp',
    'LangHungarianModel.cpp',
    'LangThaiModel.cpp',
    'nsBig5Prober.cpp',
    'nsCharSetProber.cpp',
    'nsEUCJPProber.cpp',
    'nsEUCKRProber.cpp',
    'nsEUCTWProber.cpp',
    'nsEscCharsetProber.cpp',
    'nsEscSM.cpp',
    'nsGB2312Prober.cpp',
    'nsHebrewProber.cpp',
    'nsLatin1Prober.cpp',
    'nsMBCSGroupProber.cpp',
    'nsMBCSSM.cpp',
    'nsSBCSGroupProber.cpp',
    'nsSBCharSetProber.cpp',
    'nsSJISProber.cpp',
    'nsUTF8Prober.cpp',
    'nsUniversalDetector.cpp',
]

# Name of the library built from the sources above.
LIBRARY_NAME = 'universalchardet_s'

# NOTE(review): presumably marks the library for linking into libxul — confirm
# against the moz.build documentation for this tree.
LIBXUL_LIBRARY = True


--------------------------------------------------------------------------------
/universalchardet/src/base/nsBig5Prober.cpp:
--------------------------------------------------------------------------------
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ 5 | 6 | #include "nsBig5Prober.h" 7 | #include "nsDebug.h" 8 | 9 | void nsBig5Prober::Reset(void) 10 | { 11 | mCodingSM->Reset(); 12 | mState = eDetecting; 13 | mDistributionAnalyser.Reset(mIsPreferredLanguage); 14 | } 15 | 16 | nsProbingState nsBig5Prober::HandleData(const char* aBuf, uint32_t aLen) 17 | { 18 | NS_ASSERTION(aLen, "HandleData called with empty buffer"); 19 | nsSMState codingState; 20 | 21 | for (uint32_t i = 0; i < aLen; i++) 22 | { 23 | codingState = mCodingSM->NextState(aBuf[i]); 24 | if (codingState == eItsMe) 25 | { 26 | mState = eFoundIt; 27 | break; 28 | } 29 | if (codingState == eStart) 30 | { 31 | uint32_t charLen = mCodingSM->GetCurrentCharLen(); 32 | 33 | if (i == 0) 34 | { 35 | mLastChar[1] = aBuf[0]; 36 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 37 | } 38 | else 39 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 40 | } 41 | } 42 | 43 | mLastChar[0] = aBuf[aLen-1]; 44 | 45 | if (mState == eDetecting) 46 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 47 | mState = eFoundIt; 48 | 49 | return mState; 50 | } 51 | 52 | float nsBig5Prober::GetConfidence(void) 53 | { 54 | float distribCf = mDistributionAnalyser.GetConfidence(); 55 | 56 | return (float)distribCf; 57 | } 58 | 59 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsBig5Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #ifndef nsBig5Prober_h__ 7 | #define nsBig5Prober_h__ 8 | 9 | #include "nsCharSetProber.h" 10 | #include "nsCodingStateMachine.h" 11 | #include "CharDistribution.h" 12 | 13 | class nsBig5Prober: public nsCharSetProber { 14 | public: 15 | nsBig5Prober(bool aIsPreferredLanguage) 16 | :mIsPreferredLanguage(aIsPreferredLanguage) 17 | {mCodingSM = new nsCodingStateMachine(&Big5SMModel); 18 | Reset();} 19 | virtual ~nsBig5Prober(void){delete mCodingSM;} 20 | nsProbingState HandleData(const char* aBuf, uint32_t aLen); 21 | const char* GetCharSetName() {return "Big5";} 22 | nsProbingState GetState(void) {return mState;} 23 | void Reset(void); 24 | float GetConfidence(void); 25 | 26 | protected: 27 | void GetDistribution(uint32_t aCharLen, const char* aStr); 28 | 29 | nsCodingStateMachine* mCodingSM; 30 | nsProbingState mState; 31 | 32 | //Big5ContextAnalysis mContextAnalyser; 33 | Big5DistributionAnalysis mDistributionAnalyser; 34 | char mLastChar[2]; 35 | bool mIsPreferredLanguage; 36 | 37 | }; 38 | 39 | 40 | #endif /* nsBig5Prober_h__ */ 41 | 42 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsCharSetProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #include "nsCharSetProber.h" 7 | #include "prmem.h" 8 | 9 | //This filter applies to all scripts which do not use English characters 10 | bool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen) 11 | { 12 | char *newptr; 13 | char *prevPtr, *curPtr; 14 | 15 | bool meetMSB = false; 16 | newptr = *newBuf = (char*)PR_Malloc(aLen); 17 | if (!newptr) 18 | return false; 19 | 20 | for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) 21 | { 22 | if (*curPtr & 0x80) 23 | { 24 | meetMSB = true; 25 | } 26 | else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') 27 | { 28 | //current char is a symbol, most likely a punctuation. we treat it as segment delimiter 29 | if (meetMSB && curPtr > prevPtr) 30 | //this segment contains more than single symbol, and it has upper ASCII, we need to keep it 31 | { 32 | while (prevPtr < curPtr) *newptr++ = *prevPtr++; 33 | prevPtr++; 34 | *newptr++ = ' '; 35 | meetMSB = false; 36 | } 37 | else //ignore current segment. (either because it is just a symbol or just an English word) 38 | prevPtr = curPtr+1; 39 | } 40 | } 41 | if (meetMSB && curPtr > prevPtr) 42 | while (prevPtr < curPtr) *newptr++ = *prevPtr++; 43 | 44 | newLen = (PRUint32)(newptr - *newBuf); 45 | 46 | return true; 47 | } 48 | 49 | //This filter applies to all scripts which contain both English characters and upper ASCII characters. 
50 | bool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen) 51 | { 52 | //do filtering to reduce load to probers 53 | char *newptr; 54 | char *prevPtr, *curPtr; 55 | bool isInTag = false; 56 | 57 | newptr = *newBuf = (char*)PR_Malloc(aLen); 58 | if (!newptr) 59 | return false; 60 | 61 | for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) 62 | { 63 | if (*curPtr == '>') 64 | isInTag = false; 65 | else if (*curPtr == '<') 66 | isInTag = true; 67 | 68 | if (!(*curPtr & 0x80) && 69 | (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') ) 70 | { 71 | if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol 72 | // and it is not inside a tag, keep it. 73 | { 74 | while (prevPtr < curPtr) *newptr++ = *prevPtr++; 75 | prevPtr++; 76 | *newptr++ = ' '; 77 | } 78 | else 79 | prevPtr = curPtr+1; 80 | } 81 | } 82 | 83 | // If the current segment contains more than just a symbol 84 | // and it is not inside a tag then keep it. 85 | if (!isInTag) 86 | while (prevPtr < curPtr) 87 | *newptr++ = *prevPtr++; 88 | 89 | newLen = (PRUint32)(newptr - *newBuf); 90 | 91 | return true; 92 | } 93 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsCharSetProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | #ifndef nsCharSetProber_h__ 6 | #define nsCharSetProber_h__ 7 | 8 | #include "nscore.h" 9 | 10 | //#define DEBUG_chardet // Uncomment this for debug dump. 11 | 12 | typedef enum { 13 | eDetecting = 0, //We are still detecting, no sure answer yet, but caller can ask for confidence. 
  eFoundIt = 1,   //That's a positive answer
  eNotMe = 2      //Negative answer
} nsProbingState;

// Confidence above which the probers in this directory stop early and report
// eFoundIt.
#define SHORTCUT_THRESHOLD      (float)0.95

// Abstract interface implemented by every charset prober.
class nsCharSetProber {
public:
  virtual ~nsCharSetProber() {}
  // Name of the charset this prober stands for.
  virtual const char* GetCharSetName() = 0;
  // Feeds aLen bytes of aBuf to the prober; returns the resulting state.
  virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen) = 0;
  // Current probing state.
  virtual nsProbingState GetState(void) = 0;
  // Returns the prober to its initial state.
  virtual void      Reset(void)  = 0;
  // Current confidence estimate.
  virtual float     GetConfidence(void) = 0;

#ifdef DEBUG_chardet
  virtual void  DumpStatus() {};
#endif

  // Helper functions used in the Latin1 and Group probers.
  // both functions Allocate a new buffer for newBuf. This buffer should be
  // freed by the caller using PR_FREEIF.
  // Both functions return false in case of memory allocation failure.
  static bool FilterWithoutEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen);
  static bool FilterWithEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen);

};

#endif /* nsCharSetProber_h__ */

--------------------------------------------------------------------------------
/universalchardet/src/base/nsCodingStateMachine.h:
--------------------------------------------------------------------------------
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ 5 | #ifndef nsCodingStateMachine_h__ 6 | #define nsCodingStateMachine_h__ 7 | 8 | #include "nsPkgInt.h" 9 | //#include "mozilla/Util.h" 10 | 11 | typedef enum { 12 | eStart = 0, 13 | eError = 1, 14 | eItsMe = 2 15 | } nsSMState; 16 | 17 | #define GETCLASS(c) GETFROMPCK(((unsigned char)(c)), mModel->classTable) 18 | 19 | //state machine model 20 | typedef struct 21 | { 22 | nsPkgInt classTable; 23 | uint32_t classFactor; 24 | nsPkgInt stateTable; 25 | const uint32_t* charLenTable; 26 | #ifdef DEBUG 27 | const size_t charLenTableLength; 28 | #endif 29 | const char* name; 30 | } SMModel; 31 | 32 | class nsCodingStateMachine { 33 | public: 34 | nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; } 35 | nsSMState NextState(char c){ 36 | //for each byte we get its class , if it is first byte, we also get byte length 37 | uint32_t byteCls = GETCLASS(c); 38 | if (mCurrentState == eStart) 39 | { 40 | mCurrentBytePos = 0; 41 | //MOZ_ASSERT(byteCls < mModel->charLenTableLength); 42 | mCurrentCharLen = mModel->charLenTable[byteCls]; 43 | } 44 | //from byte's class and stateTable, we get its next state 45 | mCurrentState=(nsSMState)GETFROMPCK(mCurrentState*(mModel->classFactor)+byteCls, 46 | mModel->stateTable); 47 | mCurrentBytePos++; 48 | return mCurrentState; 49 | } 50 | uint32_t GetCurrentCharLen(void) {return mCurrentCharLen;} 51 | void Reset(void) {mCurrentState = eStart;} 52 | const char * GetCodingStateMachine() {return mModel->name;} 53 | 54 | protected: 55 | nsSMState mCurrentState; 56 | uint32_t mCurrentCharLen; 57 | uint32_t mCurrentBytePos; 58 | 59 | const SMModel *mModel; 60 | }; 61 | 62 | extern const SMModel UTF8SMModel; 63 | extern const SMModel Big5SMModel; 64 | extern const SMModel EUCJPSMModel; 65 | extern const SMModel EUCKRSMModel; 66 | extern const SMModel EUCTWSMModel; 67 | extern const SMModel GB18030SMModel; 68 | extern const SMModel SJISSMModel; 69 | 70 | 71 | extern const SMModel HZSMModel; 72 | extern const SMModel 
ISO2022CNSMModel; 73 | extern const SMModel ISO2022JPSMModel; 74 | extern const SMModel ISO2022KRSMModel; 75 | 76 | #undef CHAR_LEN_TABLE 77 | #ifdef DEBUG 78 | #define CHAR_LEN_TABLE(x) x, mozilla::ArrayLength(x) 79 | #else 80 | #define CHAR_LEN_TABLE(x) x 81 | #endif 82 | 83 | #endif /* nsCodingStateMachine_h__ */ 84 | 85 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsEUCJPProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | // for japanese encoding, obeserve characteristic: 7 | // 1, kana character (or hankaku?) often have hight frequency of appereance 8 | // 2, kana character often exist in group 9 | // 3, certain combination of kana is never used in japanese language 10 | 11 | #include "nsEUCJPProber.h" 12 | #include "nsDebug.h" 13 | 14 | void nsEUCJPProber::Reset(void) 15 | { 16 | mCodingSM->Reset(); 17 | mState = eDetecting; 18 | mContextAnalyser.Reset(mIsPreferredLanguage); 19 | mDistributionAnalyser.Reset(mIsPreferredLanguage); 20 | } 21 | 22 | nsProbingState nsEUCJPProber::HandleData(const char* aBuf, uint32_t aLen) 23 | { 24 | NS_ASSERTION(aLen, "HandleData called with empty buffer"); 25 | nsSMState codingState; 26 | 27 | for (uint32_t i = 0; i < aLen; i++) 28 | { 29 | codingState = mCodingSM->NextState(aBuf[i]); 30 | if (codingState == eItsMe) 31 | { 32 | mState = eFoundIt; 33 | break; 34 | } 35 | if (codingState == eStart) 36 | { 37 | uint32_t charLen = mCodingSM->GetCurrentCharLen(); 38 | 39 | if (i == 0) 40 | { 41 | mLastChar[1] = aBuf[0]; 42 | mContextAnalyser.HandleOneChar(mLastChar, charLen); 43 | mDistributionAnalyser.HandleOneChar(mLastChar, 
charLen); 44 | } 45 | else 46 | { 47 | mContextAnalyser.HandleOneChar(aBuf+i-1, charLen); 48 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 49 | } 50 | } 51 | } 52 | 53 | mLastChar[0] = aBuf[aLen-1]; 54 | 55 | if (mState == eDetecting) 56 | if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 57 | mState = eFoundIt; 58 | 59 | return mState; 60 | } 61 | 62 | float nsEUCJPProber::GetConfidence(void) 63 | { 64 | float contxtCf = mContextAnalyser.GetConfidence(); 65 | float distribCf = mDistributionAnalyser.GetConfidence(); 66 | 67 | return (contxtCf > distribCf ? contxtCf : distribCf); 68 | } 69 | 70 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsEUCJPProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | // for S-JIS encoding, obeserve characteristic: 7 | // 1, kana character (or hankaku?) 
often have hight frequency of appereance 8 | // 2, kana character often exist in group 9 | // 3, certain combination of kana is never used in japanese language 10 | 11 | #ifndef nsEUCJPProber_h__ 12 | #define nsEUCJPProber_h__ 13 | 14 | #include "nsCharSetProber.h" 15 | #include "nsCodingStateMachine.h" 16 | #include "JpCntx.h" 17 | #include "CharDistribution.h" 18 | 19 | class nsEUCJPProber: public nsCharSetProber { 20 | public: 21 | nsEUCJPProber(bool aIsPreferredLanguage) 22 | :mIsPreferredLanguage(aIsPreferredLanguage) 23 | {mCodingSM = new nsCodingStateMachine(&EUCJPSMModel); 24 | Reset();} 25 | virtual ~nsEUCJPProber(void){delete mCodingSM;} 26 | nsProbingState HandleData(const char* aBuf, uint32_t aLen); 27 | const char* GetCharSetName() {return "EUC-JP";} 28 | nsProbingState GetState(void) {return mState;} 29 | void Reset(void); 30 | float GetConfidence(void); 31 | 32 | protected: 33 | nsCodingStateMachine* mCodingSM; 34 | nsProbingState mState; 35 | 36 | EUCJPContextAnalysis mContextAnalyser; 37 | EUCJPDistributionAnalysis mDistributionAnalyser; 38 | 39 | char mLastChar[2]; 40 | bool mIsPreferredLanguage; 41 | }; 42 | 43 | 44 | #endif /* nsEUCJPProber_h__ */ 45 | 46 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsEUCKRProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #include "nsEUCKRProber.h" 7 | #include "nsDebug.h" 8 | 9 | void nsEUCKRProber::Reset(void) 10 | { 11 | mCodingSM->Reset(); 12 | mState = eDetecting; 13 | mDistributionAnalyser.Reset(mIsPreferredLanguage); 14 | //mContextAnalyser.Reset(); 15 | } 16 | 17 | nsProbingState nsEUCKRProber::HandleData(const char* aBuf, uint32_t aLen) 18 | { 19 | NS_ASSERTION(aLen, "HandleData called with empty buffer"); 20 | nsSMState codingState; 21 | 22 | for (uint32_t i = 0; i < aLen; i++) 23 | { 24 | codingState = mCodingSM->NextState(aBuf[i]); 25 | if (codingState == eItsMe) 26 | { 27 | mState = eFoundIt; 28 | break; 29 | } 30 | if (codingState == eStart) 31 | { 32 | uint32_t charLen = mCodingSM->GetCurrentCharLen(); 33 | 34 | if (i == 0) 35 | { 36 | mLastChar[1] = aBuf[0]; 37 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 38 | } 39 | else 40 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 41 | } 42 | } 43 | 44 | mLastChar[0] = aBuf[aLen-1]; 45 | 46 | if (mState == eDetecting) 47 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 48 | mState = eFoundIt; 49 | // else 50 | // mDistributionAnalyser.HandleData(aBuf, aLen); 51 | 52 | return mState; 53 | } 54 | 55 | float nsEUCKRProber::GetConfidence(void) 56 | { 57 | float distribCf = mDistributionAnalyser.GetConfidence(); 58 | 59 | return (float)distribCf; 60 | } 61 | 62 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsEUCKRProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #ifndef nsEUCKRProber_h__ 7 | #define nsEUCKRProber_h__ 8 | 9 | #include "nsCharSetProber.h" 10 | #include "nsCodingStateMachine.h" 11 | #include "CharDistribution.h" 12 | 13 | class nsEUCKRProber: public nsCharSetProber { 14 | public: 15 | nsEUCKRProber(bool aIsPreferredLanguage) 16 | :mIsPreferredLanguage(aIsPreferredLanguage) 17 | {mCodingSM = new nsCodingStateMachine(&EUCKRSMModel); 18 | Reset(); 19 | } 20 | virtual ~nsEUCKRProber(void){delete mCodingSM;} 21 | nsProbingState HandleData(const char* aBuf, uint32_t aLen); 22 | const char* GetCharSetName() {return "EUC-KR";} 23 | nsProbingState GetState(void) {return mState;} 24 | void Reset(void); 25 | float GetConfidence(void); 26 | 27 | protected: 28 | void GetDistribution(uint32_t aCharLen, const char* aStr); 29 | 30 | nsCodingStateMachine* mCodingSM; 31 | nsProbingState mState; 32 | 33 | //EUCKRContextAnalysis mContextAnalyser; 34 | EUCKRDistributionAnalysis mDistributionAnalyser; 35 | char mLastChar[2]; 36 | bool mIsPreferredLanguage; 37 | 38 | }; 39 | 40 | 41 | #endif /* nsEUCKRProber_h__ */ 42 | 43 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsEUCTWProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #include "nsEUCTWProber.h" 7 | #include "nsDebug.h" 8 | 9 | void nsEUCTWProber::Reset(void) 10 | { 11 | mCodingSM->Reset(); 12 | mState = eDetecting; 13 | mDistributionAnalyser.Reset(mIsPreferredLanguage); 14 | //mContextAnalyser.Reset(); 15 | } 16 | 17 | nsProbingState nsEUCTWProber::HandleData(const char* aBuf, uint32_t aLen) 18 | { 19 | NS_ASSERTION(aLen, "HandleData called with empty buffer"); 20 | nsSMState codingState; 21 | 22 | for (uint32_t i = 0; i < aLen; i++) 23 | { 24 | codingState = mCodingSM->NextState(aBuf[i]); 25 | if (codingState == eItsMe) 26 | { 27 | mState = eFoundIt; 28 | break; 29 | } 30 | if (codingState == eStart) 31 | { 32 | uint32_t charLen = mCodingSM->GetCurrentCharLen(); 33 | 34 | if (i == 0) 35 | { 36 | mLastChar[1] = aBuf[0]; 37 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 38 | } 39 | else 40 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 41 | } 42 | } 43 | 44 | mLastChar[0] = aBuf[aLen-1]; 45 | 46 | if (mState == eDetecting) 47 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 48 | mState = eFoundIt; 49 | // else 50 | // mDistributionAnalyser.HandleData(aBuf, aLen); 51 | 52 | return mState; 53 | } 54 | 55 | float nsEUCTWProber::GetConfidence(void) 56 | { 57 | float distribCf = mDistributionAnalyser.GetConfidence(); 58 | 59 | return (float)distribCf; 60 | } 61 | 62 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsEUCTWProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #ifndef nsEUCTWProber_h__ 7 | #define nsEUCTWProber_h__ 8 | 9 | #include "nsCharSetProber.h" 10 | #include "nsCodingStateMachine.h" 11 | #include "CharDistribution.h" 12 | 13 | class nsEUCTWProber: public nsCharSetProber { 14 | public: 15 | nsEUCTWProber(bool aIsPreferredLanguage) 16 | :mIsPreferredLanguage(aIsPreferredLanguage) 17 | {mCodingSM = new nsCodingStateMachine(&EUCTWSMModel); 18 | Reset();} 19 | virtual ~nsEUCTWProber(void){delete mCodingSM;} 20 | nsProbingState HandleData(const char* aBuf, uint32_t aLen); 21 | const char* GetCharSetName() {return "x-euc-tw";} 22 | nsProbingState GetState(void) {return mState;} 23 | void Reset(void); 24 | float GetConfidence(void); 25 | 26 | protected: 27 | void GetDistribution(uint32_t aCharLen, const char* aStr); 28 | 29 | nsCodingStateMachine* mCodingSM; 30 | nsProbingState mState; 31 | 32 | //EUCTWContextAnalysis mContextAnalyser; 33 | EUCTWDistributionAnalysis mDistributionAnalyser; 34 | char mLastChar[2]; 35 | bool mIsPreferredLanguage; 36 | 37 | }; 38 | 39 | 40 | #endif /* nsEUCTWProber_h__ */ 41 | 42 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsEscCharsetProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | 7 | #include "nsEscCharsetProber.h" 8 | #include "nsUniversalDetector.h" 9 | 10 | nsEscCharSetProber::nsEscCharSetProber(uint32_t aLanguageFilter) 11 | { 12 | for (uint32_t i = 0; i < NUM_OF_ESC_CHARSETS; i++) 13 | mCodingSM[i] = nullptr; 14 | if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED) 15 | { 16 | mCodingSM[0] = new nsCodingStateMachine(&HZSMModel); 17 | mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel); 18 | } 19 | if (aLanguageFilter & NS_FILTER_JAPANESE) 20 | mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel); 21 | if (aLanguageFilter & NS_FILTER_KOREAN) 22 | mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel); 23 | mActiveSM = NUM_OF_ESC_CHARSETS; 24 | mState = eDetecting; 25 | mDetectedCharset = nullptr; 26 | } 27 | 28 | nsEscCharSetProber::~nsEscCharSetProber(void) 29 | { 30 | for (uint32_t i = 0; i < NUM_OF_ESC_CHARSETS; i++) 31 | delete mCodingSM[i]; 32 | } 33 | 34 | void nsEscCharSetProber::Reset(void) 35 | { 36 | mState = eDetecting; 37 | for (uint32_t i = 0; i < NUM_OF_ESC_CHARSETS; i++) 38 | if (mCodingSM[i]) 39 | mCodingSM[i]->Reset(); 40 | mActiveSM = NUM_OF_ESC_CHARSETS; 41 | mDetectedCharset = nullptr; 42 | } 43 | 44 | nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, uint32_t aLen) 45 | { 46 | nsSMState codingState; 47 | int32_t j; 48 | uint32_t i; 49 | 50 | for ( i = 0; i < aLen && mState == eDetecting; i++) 51 | { 52 | for (j = mActiveSM-1; j>= 0; j--) 53 | { 54 | if (mCodingSM[j]) 55 | { 56 | codingState = mCodingSM[j]->NextState(aBuf[i]); 57 | if (codingState == eItsMe) 58 | { 59 | mState = eFoundIt; 60 | mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); 61 | return mState; 62 | } 63 | } 64 | } 65 | } 66 | 67 | return mState; 68 | } 69 | 70 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsEscCharsetProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 
 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef nsEscCharSetProber_h__
#define nsEscCharSetProber_h__

#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"

// Number of slots in the mCodingSM array.
#define NUM_OF_ESC_CHARSETS   4

// Prober that runs its input through up to NUM_OF_ESC_CHARSETS coding state
// machines and reports the name of the first one that recognises it.
class nsEscCharSetProber: public nsCharSetProber {
public:
  nsEscCharSetProber(uint32_t aLanguageFilter);
  virtual ~nsEscCharSetProber(void);
  nsProbingState HandleData(const char* aBuf, uint32_t aLen);
  // Returns mDetectedCharset as-is.
  const char* GetCharSetName() {return mDetectedCharset;}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  // Always reports a fixed confidence of 0.99.
  float     GetConfidence(void){return (float)0.99;}

protected:
  // NOTE(review): declared here, but no definition appears in the part of
  // nsEscCharsetProber.cpp reviewed alongside this header — confirm it is used.
  void      GetDistribution(uint32_t aCharLen, const char* aStr);

  // One state machine per slot.
  nsCodingStateMachine* mCodingSM[NUM_OF_ESC_CHARSETS] ;
  uint32_t    mActiveSM;
  nsProbingState mState;
  const char *  mDetectedCharset;
};

#endif /* nsEscCharSetProber_h__ */


--------------------------------------------------------------------------------
/universalchardet/src/base/nsGB2312Prober.cpp:
--------------------------------------------------------------------------------
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// for S-JIS encoding, obeserve characteristic:
// 1, kana character (or hankaku?)
often have hight frequency of appereance 8 | // 2, kana character often exist in group 9 | // 3, certain combination of kana is never used in japanese language 10 | 11 | #include "nsGB2312Prober.h" 12 | #include "nsDebug.h" 13 | 14 | void nsGB18030Prober::Reset(void) 15 | { 16 | mCodingSM->Reset(); 17 | mState = eDetecting; 18 | mDistributionAnalyser.Reset(mIsPreferredLanguage); 19 | //mContextAnalyser.Reset(); 20 | } 21 | 22 | nsProbingState nsGB18030Prober::HandleData(const char* aBuf, uint32_t aLen) 23 | { 24 | NS_ASSERTION(aLen, "HandleData called with empty buffer"); 25 | nsSMState codingState; 26 | 27 | for (uint32_t i = 0; i < aLen; i++) 28 | { 29 | codingState = mCodingSM->NextState(aBuf[i]); 30 | if (codingState == eItsMe) 31 | { 32 | mState = eFoundIt; 33 | break; 34 | } 35 | if (codingState == eStart) 36 | { 37 | uint32_t charLen = mCodingSM->GetCurrentCharLen(); 38 | 39 | if (i == 0) 40 | { 41 | mLastChar[1] = aBuf[0]; 42 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 43 | } 44 | else 45 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 46 | } 47 | } 48 | 49 | mLastChar[0] = aBuf[aLen-1]; 50 | 51 | if (mState == eDetecting) 52 | if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 53 | mState = eFoundIt; 54 | // else 55 | // mDistributionAnalyser.HandleData(aBuf, aLen); 56 | 57 | return mState; 58 | } 59 | 60 | float nsGB18030Prober::GetConfidence(void) 61 | { 62 | float distribCf = mDistributionAnalyser.GetConfidence(); 63 | 64 | return (float)distribCf; 65 | } 66 | 67 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsGB2312Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. 
If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | #ifndef nsGB2312Prober_h__ 7 | #define nsGB2312Prober_h__ 8 | 9 | #include "nsCharSetProber.h" 10 | #include "nsCodingStateMachine.h" 11 | #include "CharDistribution.h" 12 | 13 | // We use gb18030 to replace gb2312, because 18030 is a superset. 14 | 15 | class nsGB18030Prober: public nsCharSetProber { 16 | public: 17 | nsGB18030Prober(bool aIsPreferredLanguage) 18 | :mIsPreferredLanguage(aIsPreferredLanguage) 19 | {mCodingSM = new nsCodingStateMachine(&GB18030SMModel); 20 | Reset();} 21 | virtual ~nsGB18030Prober(void){delete mCodingSM;} 22 | nsProbingState HandleData(const char* aBuf, uint32_t aLen); 23 | const char* GetCharSetName() {return "gb18030";} 24 | nsProbingState GetState(void) {return mState;} 25 | void Reset(void); 26 | float GetConfidence(void); 27 | 28 | protected: 29 | void GetDistribution(uint32_t aCharLen, const char* aStr); 30 | 31 | nsCodingStateMachine* mCodingSM; 32 | nsProbingState mState; 33 | 34 | //GB2312ContextAnalysis mContextAnalyser; 35 | GB2312DistributionAnalysis mDistributionAnalyser; 36 | char mLastChar[2]; 37 | bool mIsPreferredLanguage; 38 | 39 | }; 40 | 41 | 42 | #endif /* nsGB2312Prober_h__ */ 43 | 44 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsHebrewProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #include "nsHebrewProber.h" 7 | #include 8 | 9 | // windows-1255 / ISO-8859-8 code points of interest 10 | #define FINAL_KAF ('\xea') 11 | #define NORMAL_KAF ('\xeb') 12 | #define FINAL_MEM ('\xed') 13 | #define NORMAL_MEM ('\xee') 14 | #define FINAL_NUN ('\xef') 15 | #define NORMAL_NUN ('\xf0') 16 | #define FINAL_PE ('\xf3') 17 | #define NORMAL_PE ('\xf4') 18 | #define FINAL_TSADI ('\xf5') 19 | #define NORMAL_TSADI ('\xf6') 20 | 21 | // Minimum Visual vs Logical final letter score difference. 22 | // If the difference is below this, don't rely solely on the final letter score distance. 23 | #define MIN_FINAL_CHAR_DISTANCE (5) 24 | 25 | // Minimum Visual vs Logical model score difference. 26 | // If the difference is below this, don't rely at all on the model score distance. 27 | #define MIN_MODEL_DISTANCE (0.01) 28 | 29 | #define VISUAL_HEBREW_NAME ("ISO-8859-8") 30 | #define LOGICAL_HEBREW_NAME ("windows-1255") 31 | 32 | bool nsHebrewProber::isFinal(char c) 33 | { 34 | return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); 35 | } 36 | 37 | bool nsHebrewProber::isNonFinal(char c) 38 | { 39 | return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); 40 | // The normal Tsadi is not a good Non-Final letter due to words like 41 | // 'lechotet' (to chat) containing an apostrophe after the tsadi. This 42 | // apostrophe is converted to a space in FilterWithoutEnglishLetters causing 43 | // the Non-Final tsadi to appear at an end of a word even though this is not 44 | // the case in the original text. 45 | // The letters Pe and Kaf rarely display a related behavior of not being a 46 | // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for 47 | // example legally end with a Non-Final Pe or Kaf. However, the benefit of 48 | // these letters as Non-Final letters outweighs the damage since these words 49 | // are quite rare. 
50 | } 51 | 52 | /** HandleData 53 | * Final letter analysis for logical-visual decision. 54 | * Look for evidence that the received buffer is either logical Hebrew or 55 | * visual Hebrew. 56 | * The following cases are checked: 57 | * 1) A word longer than 1 letter, ending with a final letter. This is an 58 | * indication that the text is laid out "naturally" since the final letter 59 | * really appears at the end. +1 for logical score. 60 | * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal 61 | * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with 62 | * the Non-Final form of that letter. Exceptions to this rule are mentioned 63 | * above in isNonFinal(). This is an indication that the text is laid out 64 | * backwards. +1 for visual score 65 | * 3) A word longer than 1 letter, starting with a final letter. Final letters 66 | * should not appear at the beginning of a word. This is an indication that 67 | * the text is laid out backwards. +1 for visual score. 68 | * 69 | * The visual score and logical score are accumulated throughout the text and 70 | * are finally checked against each other in GetCharSetName(). 71 | * No checking for final letters in the middle of words is done since that case 72 | * is not an indication for either Logical or Visual text. 73 | * 74 | * The input buffer should not contain any white spaces that are not (' ') 75 | * or any low-ascii punctuation marks. 76 | */ 77 | nsProbingState nsHebrewProber::HandleData(const char* aBuf, uint32_t aLen) 78 | { 79 | // Both model probers say it's not them. No reason to continue. 
80 | if (GetState() == eNotMe) 81 | return eNotMe; 82 | 83 | const char *curPtr, *endPtr = aBuf+aLen; 84 | char cur; 85 | 86 | for (curPtr = (char*)aBuf; curPtr < endPtr; ++curPtr) 87 | { 88 | cur = *curPtr; 89 | if (cur == ' ') // We stand on a space - a word just ended 90 | { 91 | if (mBeforePrev != ' ') // *(curPtr-2) was not a space so prev is not a 1 letter word 92 | { 93 | if (isFinal(mPrev)) // case (1) [-2:not space][-1:final letter][cur:space] 94 | ++mFinalCharLogicalScore; 95 | else if (isNonFinal(mPrev)) // case (2) [-2:not space][-1:Non-Final letter][cur:space] 96 | ++mFinalCharVisualScore; 97 | } 98 | } 99 | else // Not standing on a space 100 | { 101 | if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) // case (3) [-2:space][-1:final letter][cur:not space] 102 | ++mFinalCharVisualScore; 103 | } 104 | mBeforePrev = mPrev; 105 | mPrev = cur; 106 | } 107 | 108 | // Forever detecting, till the end or until both model probers return eNotMe (handled above). 109 | return eDetecting; 110 | } 111 | 112 | // Make the decision: is it Logical or Visual? 113 | const char* nsHebrewProber::GetCharSetName() 114 | { 115 | // If the final letter score distance is dominant enough, rely on it. 116 | int32_t finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; 117 | if (finalsub >= MIN_FINAL_CHAR_DISTANCE) 118 | return LOGICAL_HEBREW_NAME; 119 | if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) 120 | return VISUAL_HEBREW_NAME; 121 | 122 | // It's not dominant enough, try to rely on the model scores instead. 123 | float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); 124 | if (modelsub > MIN_MODEL_DISTANCE) 125 | return LOGICAL_HEBREW_NAME; 126 | if (modelsub < -(MIN_MODEL_DISTANCE)) 127 | return VISUAL_HEBREW_NAME; 128 | 129 | // Still no good, back to final letter distance, maybe it'll save the day. 
130 | if (finalsub < 0) 131 | return VISUAL_HEBREW_NAME; 132 | 133 | // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. 134 | return LOGICAL_HEBREW_NAME; 135 | } 136 | 137 | 138 | void nsHebrewProber::Reset(void) 139 | { 140 | mFinalCharLogicalScore = 0; 141 | mFinalCharVisualScore = 0; 142 | 143 | // mPrev and mBeforePrev are initialized to space in order to simulate a word 144 | // delimiter at the beginning of the data 145 | mPrev = ' '; 146 | mBeforePrev = ' '; 147 | } 148 | 149 | nsProbingState nsHebrewProber::GetState(void) 150 | { 151 | // Remain active as long as any of the model probers are active. 152 | if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) 153 | return eNotMe; 154 | return eDetecting; 155 | } 156 | 157 | #ifdef DEBUG_chardet 158 | void nsHebrewProber::DumpStatus() 159 | { 160 | printf(" HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore); 161 | } 162 | #endif 163 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsHebrewProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | #ifndef nsHebrewProber_h__ 7 | #define nsHebrewProber_h__ 8 | 9 | #include "nsSBCharSetProber.h" 10 | 11 | // This prober doesn't actually recognize a language or a charset. 
12 | // It is a helper prober for the use of the Hebrew model probers 13 | class nsHebrewProber: public nsCharSetProber 14 | { 15 | public: 16 | nsHebrewProber(void) :mLogicalProb(0), mVisualProb(0) { Reset(); } 17 | 18 | virtual ~nsHebrewProber(void) {} 19 | virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen); 20 | virtual const char* GetCharSetName(); 21 | virtual void Reset(void); 22 | 23 | virtual nsProbingState GetState(void); 24 | 25 | virtual float GetConfidence(void) { return (float)0.0; } 26 | 27 | void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb) 28 | { mLogicalProb = logicalPrb; mVisualProb = visualPrb; } 29 | 30 | #ifdef DEBUG_chardet 31 | virtual void DumpStatus(); 32 | #endif 33 | 34 | protected: 35 | static bool isFinal(char c); 36 | static bool isNonFinal(char c); 37 | 38 | int32_t mFinalCharLogicalScore, mFinalCharVisualScore; 39 | 40 | // The two last characters seen in the previous buffer. 41 | char mPrev, mBeforePrev; 42 | 43 | // These probers are owned by the group prober. 44 | nsCharSetProber *mLogicalProb, *mVisualProb; 45 | }; 46 | 47 | /** 48 | * ** General ideas of the Hebrew charset recognition ** 49 | * 50 | * Four main charsets exist in Hebrew: 51 | * "ISO-8859-8" - Visual Hebrew 52 | * "windows-1255" - Logical Hebrew 53 | * "ISO-8859-8-I" - Logical Hebrew 54 | * "x-mac-hebrew" - ?? Logical Hebrew ?? 55 | * 56 | * Both "ISO" charsets use a completely identical set of code points, whereas 57 | * "windows-1255" and "x-mac-hebrew" are two different proper supersets of 58 | * these code points. windows-1255 defines additional characters in the range 59 | * 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific 60 | * diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6. 61 | * x-mac-hebrew defines similar additional code points but with a different 62 | * mapping. 
63 | * 64 | * As far as an average Hebrew text with no diacritics is concerned, all four 65 | * charsets are identical with respect to code points. Meaning that for the 66 | * main Hebrew alphabet, all four map the same values to all 27 Hebrew letters 67 | * (including final letters). 68 | * 69 | * The dominant difference between these charsets is their directionality. 70 | * "Visual" directionality means that the text is ordered as if the renderer is 71 | * not aware of a BIDI rendering algorithm. The renderer sees the text and 72 | * draws it from left to right. The text itself when ordered naturally is read 73 | * backwards. A buffer of Visual Hebrew generally looks like so: 74 | * "[last word of first line spelled backwards] [whole line ordered backwards 75 | * and spelled backwards] [first word of first line spelled backwards] 76 | * [end of line] [last word of second line] ... etc' " 77 | * adding punctuation marks, numbers and English text to visual text is 78 | * naturally also "visual" and from left to right. 79 | * 80 | * "Logical" directionality means the text is ordered "naturally" according to 81 | * the order it is read. It is the responsibility of the renderer to display 82 | * the text from right to left. A BIDI algorithm is used to place general 83 | * punctuation marks, numbers and English text in the text. 84 | * 85 | * Texts in x-mac-hebrew are almost impossible to find on the Internet. From 86 | * what little evidence I could find, it seems that its general directionality 87 | * is Logical. 88 | * 89 | * To sum up all of the above, the Hebrew probing mechanism knows about two 90 | * charsets: 91 | * Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are 92 | * backwards while line order is natural. For charset recognition purposes 93 | * the line order is unimportant (In fact, for this implementation, even 94 | * word order is unimportant). 95 | * Logical Hebrew - "windows-1255" - normal, naturally ordered text. 
96 | * 97 | * "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be 98 | * specifically identified. 99 | * "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew 100 | * that contain special punctuation marks or diacritics is displayed with 101 | * some unconverted characters showing as question marks. This problem might 102 | * be corrected using another model prober for x-mac-hebrew. Due to the fact 103 | * that x-mac-hebrew texts are so rare, writing another model prober isn't 104 | * worth the effort and performance hit. 105 | * 106 | * *** The Prober *** 107 | * 108 | * The prober is divided between two nsSBCharSetProbers and an nsHebrewProber, 109 | * all of which are managed, created, fed data, inquired and deleted by the 110 | * nsSBCSGroupProber. The two nsSBCharSetProbers identify that the text is in 111 | * fact some kind of Hebrew, Logical or Visual. The final decision about which 112 | * one is it is made by the nsHebrewProber by combining final-letter scores 113 | * with the scores of the two nsSBCharSetProbers to produce a final answer. 114 | * 115 | * The nsSBCSGroupProber is responsible for stripping the original text of HTML 116 | * tags, English characters, numbers, low-ASCII punctuation characters, spaces 117 | * and new lines. It reduces any sequence of such characters to a single space. 118 | * The buffer fed to each prober in the SBCS group prober is pure text in 119 | * high-ASCII. 120 | * The two nsSBCharSetProbers (model probers) share the same language model: 121 | * Win1255Model. 122 | * The first nsSBCharSetProber uses the model normally as any other 123 | * nsSBCharSetProber does, to recognize windows-1255, upon which this model was 124 | * built. The second nsSBCharSetProber is told to make the pair-of-letter 125 | * lookup in the language model backwards. This in practice exactly simulates 126 | * a visual Hebrew model using the windows-1255 logical Hebrew model. 
127 | * 128 | * The nsHebrewProber is not using any language model. All it does is look for 129 | * final-letter evidence suggesting the text is either logical Hebrew or visual 130 | * Hebrew. Disjointed from the model probers, the results of the nsHebrewProber 131 | * alone are meaningless. nsHebrewProber always returns 0.00 as confidence 132 | * since it never identifies a charset by itself. Instead, the pointer to the 133 | * nsHebrewProber is passed to the model probers as a helper "Name Prober". 134 | * When the Group prober receives a positive identification from any prober, 135 | * it asks for the name of the charset identified. If the prober queried is a 136 | * Hebrew model prober, the model prober forwards the call to the 137 | * nsHebrewProber to make the final decision. In the nsHebrewProber, the 138 | * decision is made according to the final-letters scores maintained and Both 139 | * model probers scores. The answer is returned in the form of the name of the 140 | * charset identified, either "windows-1255" or "ISO-8859-8". 141 | * 142 | */ 143 | #endif /* nsHebrewProber_h__ */ 144 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsLatin1Prober.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #include "nsLatin1Prober.h" 7 | #include "prmem.h" 8 | #include 9 | 10 | #define UDF 0 // undefined 11 | #define OTH 1 //other 12 | #define ASC 2 // ascii capital letter 13 | #define ASS 3 // ascii small letter 14 | #define ACV 4 // accent capital vowel 15 | #define ACO 5 // accent capital other 16 | #define ASV 6 // accent small vowel 17 | #define ASO 7 // accent small other 18 | #define CLASS_NUM 8 // total classes 19 | 20 | static const unsigned char Latin1_CharToClass[] = 21 | { 22 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07 23 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F 24 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17 25 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F 26 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27 27 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F 28 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37 29 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F 30 | OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47 31 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F 32 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57 33 | ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F 34 | OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67 35 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F 36 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77 37 | ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F 38 | OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87 39 | OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F 40 | UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97 41 | OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F 42 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7 43 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF 44 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7 45 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF 46 | ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7 47 | ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF 48 | ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, 
// D0 - D7 49 | ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF 50 | ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7 51 | ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF 52 | ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7 53 | ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF 54 | }; 55 | 56 | 57 | /* 0 : illegal 58 | 1 : very unlikely 59 | 2 : normal 60 | 3 : very likely 61 | */ 62 | static const unsigned char Latin1ClassModel[] = 63 | { 64 | /* UDF OTH ASC ASS ACV ACO ASV ASO */ 65 | /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0, 66 | /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3, 67 | /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3, 68 | /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3, 69 | /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2, 70 | /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3, 71 | /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3, 72 | /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3, 73 | }; 74 | 75 | void nsLatin1Prober::Reset(void) 76 | { 77 | mState = eDetecting; 78 | mLastCharClass = OTH; 79 | for (int i = 0; i < FREQ_CAT_NUM; i++) 80 | mFreqCounter[i] = 0; 81 | } 82 | 83 | 84 | nsProbingState nsLatin1Prober::HandleData(const char* aBuf, uint32_t aLen) 85 | { 86 | char *newBuf1 = 0; 87 | uint32_t newLen1 = 0; 88 | 89 | if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) { 90 | newBuf1 = (char*)aBuf; 91 | newLen1 = aLen; 92 | } 93 | 94 | unsigned char charClass; 95 | unsigned char freq; 96 | for (uint32_t i = 0; i < newLen1; i++) 97 | { 98 | charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]]; 99 | freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass]; 100 | if (freq == 0) { 101 | mState = eNotMe; 102 | break; 103 | } 104 | mFreqCounter[freq]++; 105 | mLastCharClass = charClass; 106 | } 107 | 108 | if (newBuf1 != aBuf) 109 | PR_FREEIF(newBuf1); 110 | 111 | return mState; 112 | } 113 | 114 | float nsLatin1Prober::GetConfidence(void) 115 | { 116 | if (mState == eNotMe) 117 | return 0.01f; 118 | 119 | float confidence; 120 | uint32_t total = 0; 121 | for (int32_t i = 0; i < FREQ_CAT_NUM; i++) 122 | total += mFreqCounter[i]; 123 | 124 | 
if(!total) 125 | confidence = 0.0f; 126 | else 127 | { 128 | confidence = mFreqCounter[3]*1.0f / total; 129 | confidence -= mFreqCounter[1]*20.0f/total; 130 | } 131 | 132 | if (confidence < 0.0f) 133 | confidence = 0.0f; 134 | 135 | // lower the confidence of latin1 so that other more accurate detector 136 | // can take priority. 137 | confidence *= 0.50f; 138 | 139 | return confidence; 140 | } 141 | 142 | #ifdef DEBUG_chardet 143 | void nsLatin1Prober::DumpStatus() 144 | { 145 | printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); 146 | } 147 | #endif 148 | 149 | 150 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsLatin1Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #ifndef nsLatin1Prober_h__ 7 | #define nsLatin1Prober_h__ 8 | 9 | #include "nsCharSetProber.h" 10 | 11 | #define FREQ_CAT_NUM 4 12 | 13 | class nsLatin1Prober: public nsCharSetProber { 14 | public: 15 | nsLatin1Prober(void){Reset();} 16 | virtual ~nsLatin1Prober(void){} 17 | nsProbingState HandleData(const char* aBuf, uint32_t aLen); 18 | const char* GetCharSetName() {return "windows-1252";} 19 | nsProbingState GetState(void) {return mState;} 20 | void Reset(void); 21 | float GetConfidence(void); 22 | 23 | #ifdef DEBUG_chardet 24 | virtual void DumpStatus(); 25 | #endif 26 | 27 | protected: 28 | 29 | nsProbingState mState; 30 | char mLastCharClass; 31 | uint32_t mFreqCounter[FREQ_CAT_NUM]; 32 | }; 33 | 34 | 35 | #endif /* nsLatin1Prober_h__ */ 36 | 37 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsMBCSGroupProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | #include 6 | 7 | #include "nsMBCSGroupProber.h" 8 | #include "nsUniversalDetector.h" 9 | 10 | #if defined(DEBUG_chardet) || defined(DEBUG_jgmyers) 11 | const char *ProberName[] = 12 | { 13 | "UTF8", 14 | "SJIS", 15 | "EUCJP", 16 | "GB18030", 17 | "EUCKR", 18 | "Big5", 19 | "EUCTW", 20 | }; 21 | 22 | #endif 23 | 24 | nsMBCSGroupProber::nsMBCSGroupProber(uint32_t aLanguageFilter) 25 | { 26 | for (uint32_t i = 0; i < NUM_OF_PROBERS; i++) 27 | mProbers[i] = nullptr; 28 | 29 | mProbers[0] = new nsUTF8Prober(); 30 | if (aLanguageFilter & NS_FILTER_JAPANESE) 31 | { 32 | mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE); 33 | mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE); 34 | } 35 | if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED) 36 | mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED); 37 | if (aLanguageFilter & NS_FILTER_KOREAN) 38 | mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN); 39 | if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL) 40 | { 41 | mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); 42 | mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); 43 | } 44 | Reset(); 45 | } 46 | 47 | nsMBCSGroupProber::~nsMBCSGroupProber() 48 | { 49 | for (uint32_t i = 0; i < NUM_OF_PROBERS; i++) 50 | { 51 | delete mProbers[i]; 52 | } 53 | } 54 | 55 | const char* nsMBCSGroupProber::GetCharSetName() 56 | { 57 | if (mBestGuess == -1) 58 | { 59 | GetConfidence(); 60 | if (mBestGuess == -1) 61 | mBestGuess = 0; 62 | } 63 | return mProbers[mBestGuess]->GetCharSetName(); 64 | } 65 | 66 | void nsMBCSGroupProber::Reset(void) 67 | { 68 | mActiveNum = 0; 69 | for (uint32_t i = 0; i < NUM_OF_PROBERS; i++) 70 | { 71 | if (mProbers[i]) 72 | { 73 | mProbers[i]->Reset(); 74 | mIsActive[i] = true; 75 | ++mActiveNum; 76 | } 77 | else 78 | mIsActive[i] = false; 79 | } 80 | mBestGuess = -1; 81 | mState = eDetecting; 82 | 
mKeepNext = 0; 83 | } 84 | 85 | nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, uint32_t aLen) 86 | { 87 | nsProbingState st; 88 | uint32_t start = 0; 89 | uint32_t keepNext = mKeepNext; 90 | 91 | //do filtering to reduce load to probers 92 | for (uint32_t pos = 0; pos < aLen; ++pos) 93 | { 94 | if (aBuf[pos] & 0x80) 95 | { 96 | if (!keepNext) 97 | start = pos; 98 | keepNext = 2; 99 | } 100 | else if (keepNext) 101 | { 102 | if (--keepNext == 0) 103 | { 104 | for (uint32_t i = 0; i < NUM_OF_PROBERS; i++) 105 | { 106 | if (!mIsActive[i]) 107 | continue; 108 | st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start); 109 | if (st == eFoundIt) 110 | { 111 | mBestGuess = i; 112 | mState = eFoundIt; 113 | return mState; 114 | } 115 | } 116 | } 117 | } 118 | } 119 | 120 | if (keepNext) { 121 | for (uint32_t i = 0; i < NUM_OF_PROBERS; i++) 122 | { 123 | if (!mIsActive[i]) 124 | continue; 125 | st = mProbers[i]->HandleData(aBuf + start, aLen - start); 126 | if (st == eFoundIt) 127 | { 128 | mBestGuess = i; 129 | mState = eFoundIt; 130 | return mState; 131 | } 132 | } 133 | } 134 | mKeepNext = keepNext; 135 | 136 | return mState; 137 | } 138 | 139 | float nsMBCSGroupProber::GetConfidence(void) 140 | { 141 | uint32_t i; 142 | float bestConf = 0.0, cf; 143 | 144 | switch (mState) 145 | { 146 | case eFoundIt: 147 | return (float)0.99; 148 | case eNotMe: 149 | return (float)0.01; 150 | default: 151 | for (i = 0; i < NUM_OF_PROBERS; i++) 152 | { 153 | if (!mIsActive[i]) 154 | continue; 155 | cf = mProbers[i]->GetConfidence(); 156 | if (bestConf < cf) 157 | { 158 | bestConf = cf; 159 | mBestGuess = i; 160 | } 161 | } 162 | } 163 | return bestConf; 164 | } 165 | 166 | #ifdef DEBUG_chardet 167 | void nsMBCSGroupProber::DumpStatus() 168 | { 169 | uint32_t i; 170 | float cf; 171 | 172 | GetConfidence(); 173 | for (i = 0; i < NUM_OF_PROBERS; i++) 174 | { 175 | if (!mIsActive[i]) 176 | printf(" MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]); 177 | 
else 178 | { 179 | cf = mProbers[i]->GetConfidence(); 180 | printf(" MBCS %1.3f: [%s]\r\n", cf, ProberName[i]); 181 | } 182 | } 183 | } 184 | #endif 185 | 186 | #ifdef DEBUG_jgmyers 187 | void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], uint32_t &offset) 188 | { 189 | for (uint32_t i = 0; i < NUM_OF_PROBERS; ++i) { 190 | states[offset].name = ProberName[i]; 191 | states[offset].isActive = mIsActive[i]; 192 | states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence() : 0.0; 193 | ++offset; 194 | } 195 | } 196 | #endif /* DEBUG_jgmyers */ 197 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsMBCSGroupProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #ifndef nsMBCSGroupProber_h__ 7 | #define nsMBCSGroupProber_h__ 8 | 9 | #include "nsSJISProber.h" 10 | #include "nsUTF8Prober.h" 11 | #include "nsEUCJPProber.h" 12 | #include "nsGB2312Prober.h" 13 | #include "nsEUCKRProber.h" 14 | #include "nsBig5Prober.h" 15 | #include "nsEUCTWProber.h" 16 | 17 | #define NUM_OF_PROBERS 7 18 | 19 | class nsMBCSGroupProber: public nsCharSetProber { 20 | public: 21 | nsMBCSGroupProber(uint32_t aLanguageFilter); 22 | virtual ~nsMBCSGroupProber(); 23 | nsProbingState HandleData(const char* aBuf, uint32_t aLen); 24 | const char* GetCharSetName(); 25 | nsProbingState GetState(void) {return mState;} 26 | void Reset(void); 27 | float GetConfidence(void); 28 | 29 | #ifdef DEBUG_chardet 30 | void DumpStatus(); 31 | #endif 32 | #ifdef DEBUG_jgmyers 33 | void GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], uint32_t &offset); 34 | #endif 35 | 36 | protected: 37 | nsProbingState mState; 38 | nsCharSetProber* mProbers[NUM_OF_PROBERS]; 39 | bool mIsActive[NUM_OF_PROBERS]; 40 | int32_t mBestGuess; 41 | uint32_t mActiveNum; 42 | uint32_t mKeepNext; 43 | }; 44 | 45 | #endif /* nsMBCSGroupProber_h__ */ 46 | 47 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsPkgInt.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #ifndef nsPkgInt_h__ 7 | #define nsPkgInt_h__ 8 | #include "nscore.h" 9 | 10 | typedef enum { 11 | eIdxSft4bits = 3, 12 | eIdxSft8bits = 2, 13 | eIdxSft16bits = 1 14 | } nsIdxSft; 15 | 16 | typedef enum { 17 | eSftMsk4bits = 7, 18 | eSftMsk8bits = 3, 19 | eSftMsk16bits = 1 20 | } nsSftMsk; 21 | 22 | typedef enum { 23 | eBitSft4bits = 2, 24 | eBitSft8bits = 3, 25 | eBitSft16bits = 4 26 | } nsBitSft; 27 | 28 | typedef enum { 29 | eUnitMsk4bits = 0x0000000FL, 30 | eUnitMsk8bits = 0x000000FFL, 31 | eUnitMsk16bits = 0x0000FFFFL 32 | } nsUnitMsk; 33 | 34 | typedef struct nsPkgInt { 35 | nsIdxSft idxsft; 36 | nsSftMsk sftmsk; 37 | nsBitSft bitsft; 38 | nsUnitMsk unitmsk; 39 | const uint32_t* const data; 40 | } nsPkgInt; 41 | 42 | 43 | #define PCK16BITS(a,b) ((uint32_t)(((b) << 16) | (a))) 44 | 45 | #define PCK8BITS(a,b,c,d) PCK16BITS( ((uint32_t)(((b) << 8) | (a))), \ 46 | ((uint32_t)(((d) << 8) | (c)))) 47 | 48 | #define PCK4BITS(a,b,c,d,e,f,g,h) PCK8BITS( ((uint32_t)(((b) << 4) | (a))), \ 49 | ((uint32_t)(((d) << 4) | (c))), \ 50 | ((uint32_t)(((f) << 4) | (e))), \ 51 | ((uint32_t)(((h) << 4) | (g))) ) 52 | 53 | #define GETFROMPCK(i, c) \ 54 | (((((c).data)[(i)>>(c).idxsft])>>(((i)&(c).sftmsk)<<(c).bitsft))&(c).unitmsk) 55 | 56 | #endif /* nsPkgInt_h__ */ 57 | 58 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsSBCSGroupProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #include 7 | #include "prmem.h" 8 | 9 | #include "nsSBCharSetProber.h" 10 | #include "nsSBCSGroupProber.h" 11 | 12 | #include "nsHebrewProber.h" 13 | 14 | nsSBCSGroupProber::nsSBCSGroupProber() 15 | { 16 | mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model); 17 | mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel); 18 | mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model); 19 | mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel); 20 | mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model); 21 | mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model); 22 | mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model); 23 | mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model); 24 | mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); 25 | mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); 26 | mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel); 27 | 28 | nsHebrewProber *hebprober = new nsHebrewProber(); 29 | // Notice: Any change in these indexes - 10,11,12 must be reflected 30 | // in the code below as well. 31 | mProbers[11] = hebprober; 32 | mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, false, hebprober); // Logical Hebrew 33 | mProbers[13] = new nsSingleByteCharSetProber(&Win1255Model, true, hebprober); // Visual Hebrew 34 | // Tell the Hebrew prober about the logical and visual probers 35 | if (mProbers[11] && mProbers[12] && mProbers[13]) // all are not null 36 | { 37 | hebprober->SetModelProbers(mProbers[12], mProbers[13]); 38 | } 39 | else // One or more is null. avoid any Hebrew probing, null them all 40 | { 41 | for (uint32_t i = 11; i <= 13; ++i) 42 | { 43 | delete mProbers[i]; 44 | mProbers[i] = 0; 45 | } 46 | } 47 | 48 | // disable latin2 before latin1 is available, otherwise all latin1 49 | // will be detected as latin2 because of their similarity. 
50 | //mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel); 51 | //mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel); 52 | 53 | Reset(); 54 | } 55 | 56 | nsSBCSGroupProber::~nsSBCSGroupProber() 57 | { 58 | for (uint32_t i = 0; i < NUM_OF_SBCS_PROBERS; i++) 59 | { 60 | delete mProbers[i]; 61 | } 62 | } 63 | 64 | 65 | const char* nsSBCSGroupProber::GetCharSetName() 66 | { 67 | //if we have no answer yet 68 | if (mBestGuess == -1) 69 | { 70 | GetConfidence(); 71 | //no charset seems positive 72 | if (mBestGuess == -1) 73 | //we will use default. 74 | mBestGuess = 0; 75 | } 76 | return mProbers[mBestGuess]->GetCharSetName(); 77 | } 78 | 79 | void nsSBCSGroupProber::Reset(void) 80 | { 81 | mActiveNum = 0; 82 | for (uint32_t i = 0; i < NUM_OF_SBCS_PROBERS; i++) 83 | { 84 | if (mProbers[i]) // not null 85 | { 86 | mProbers[i]->Reset(); 87 | mIsActive[i] = true; 88 | ++mActiveNum; 89 | } 90 | else 91 | mIsActive[i] = false; 92 | } 93 | mBestGuess = -1; 94 | mState = eDetecting; 95 | } 96 | 97 | 98 | nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, uint32_t aLen) 99 | { 100 | nsProbingState st; 101 | uint32_t i; 102 | char *newBuf1 = 0; 103 | uint32_t newLen1 = 0; 104 | 105 | //apply filter to original buffer, and we got new buffer back 106 | //depend on what script it is, we will feed them the new buffer 107 | //we got after applying proper filter 108 | //this is done without any consideration to KeepEnglishLetters 109 | //of each prober since as of now, there are no probers here which 110 | //recognize languages with English characters. 111 | if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) 112 | goto done; 113 | 114 | if (newLen1 == 0) 115 | goto done; // Nothing to see here, move on. 
116 | 117 | for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) 118 | { 119 | if (!mIsActive[i]) 120 | continue; 121 | st = mProbers[i]->HandleData(newBuf1, newLen1); 122 | if (st == eFoundIt) 123 | { 124 | mBestGuess = i; 125 | mState = eFoundIt; 126 | break; 127 | } 128 | else if (st == eNotMe) 129 | { 130 | mIsActive[i] = false; 131 | mActiveNum--; 132 | if (mActiveNum <= 0) 133 | { 134 | mState = eNotMe; 135 | break; 136 | } 137 | } 138 | } 139 | 140 | done: 141 | PR_FREEIF(newBuf1); 142 | 143 | return mState; 144 | } 145 | 146 | float nsSBCSGroupProber::GetConfidence(void) 147 | { 148 | uint32_t i; 149 | float bestConf = 0.0, cf; 150 | 151 | switch (mState) 152 | { 153 | case eFoundIt: 154 | return (float)0.99; //sure yes 155 | case eNotMe: 156 | return (float)0.01; //sure no 157 | default: 158 | for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) 159 | { 160 | if (!mIsActive[i]) 161 | continue; 162 | cf = mProbers[i]->GetConfidence(); 163 | if (bestConf < cf) 164 | { 165 | bestConf = cf; 166 | mBestGuess = i; 167 | } 168 | } 169 | } 170 | return bestConf; 171 | } 172 | 173 | #ifdef DEBUG_chardet 174 | void nsSBCSGroupProber::DumpStatus() 175 | { 176 | uint32_t i; 177 | float cf; 178 | 179 | cf = GetConfidence(); 180 | printf(" SBCS Group Prober --------begin status \r\n"); 181 | for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) 182 | { 183 | if (!mIsActive[i]) 184 | printf(" inactive: [%s] (i.e. 
confidence is too low).\r\n", mProbers[i]->GetCharSetName()); 185 | else 186 | mProbers[i]->DumpStatus(); 187 | } 188 | printf(" SBCS Group found best match [%s] confidence %f.\r\n", 189 | mProbers[mBestGuess]->GetCharSetName(), cf); 190 | } 191 | #endif 192 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsSBCSGroupProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | #ifndef nsSBCSGroupProber_h__ 7 | #define nsSBCSGroupProber_h__ 8 | 9 | 10 | #define NUM_OF_SBCS_PROBERS 14 11 | 12 | class nsCharSetProber; 13 | class nsSBCSGroupProber: public nsCharSetProber { 14 | public: 15 | nsSBCSGroupProber(); 16 | virtual ~nsSBCSGroupProber(); 17 | nsProbingState HandleData(const char* aBuf, uint32_t aLen); 18 | const char* GetCharSetName(); 19 | nsProbingState GetState(void) {return mState;} 20 | void Reset(void); 21 | float GetConfidence(void); 22 | 23 | #ifdef DEBUG_chardet 24 | void DumpStatus(); 25 | #endif 26 | 27 | protected: 28 | nsProbingState mState; 29 | nsCharSetProber* mProbers[NUM_OF_SBCS_PROBERS]; 30 | bool mIsActive[NUM_OF_SBCS_PROBERS]; 31 | int32_t mBestGuess; 32 | uint32_t mActiveNum; 33 | }; 34 | 35 | #endif /* nsSBCSGroupProber_h__ */ 36 | 37 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsSBCharSetProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. 
If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | #include 6 | #include "nsSBCharSetProber.h" 7 | 8 | nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, uint32_t aLen) 9 | { 10 | unsigned char order; 11 | 12 | for (uint32_t i = 0; i < aLen; i++) 13 | { 14 | order = mModel->charToOrderMap[(unsigned char)aBuf[i]]; 15 | 16 | if (order < SYMBOL_CAT_ORDER) 17 | mTotalChar++; 18 | if (order < SAMPLE_SIZE) 19 | { 20 | mFreqChar++; 21 | 22 | if (mLastOrder < SAMPLE_SIZE) 23 | { 24 | mTotalSeqs++; 25 | if (!mReversed) 26 | ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]); 27 | else // reverse the order of the letters in the lookup 28 | ++(mSeqCounters[mModel->precedenceMatrix[order*SAMPLE_SIZE+mLastOrder]]); 29 | } 30 | } 31 | mLastOrder = order; 32 | } 33 | 34 | if (mState == eDetecting) 35 | if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD) 36 | { 37 | float cf = GetConfidence(); 38 | if (cf > POSITIVE_SHORTCUT_THRESHOLD) 39 | mState = eFoundIt; 40 | else if (cf < NEGATIVE_SHORTCUT_THRESHOLD) 41 | mState = eNotMe; 42 | } 43 | 44 | return mState; 45 | } 46 | 47 | void nsSingleByteCharSetProber::Reset(void) 48 | { 49 | mState = eDetecting; 50 | mLastOrder = 255; 51 | for (uint32_t i = 0; i < NUMBER_OF_SEQ_CAT; i++) 52 | mSeqCounters[i] = 0; 53 | mTotalSeqs = 0; 54 | mTotalChar = 0; 55 | mFreqChar = 0; 56 | } 57 | 58 | //#define NEGATIVE_APPROACH 1 59 | 60 | float nsSingleByteCharSetProber::GetConfidence(void) 61 | { 62 | #ifdef NEGATIVE_APPROACH 63 | if (mTotalSeqs > 0) 64 | if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 ) 65 | return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar; 66 | return (float)0.01; 67 | #else //POSITIVE_APPROACH 68 | float r; 69 | 70 | if (mTotalSeqs > 0) { 71 | r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio; 72 | r = r*mFreqChar/mTotalChar; 73 | if (r >= 
(float)1.00) 74 | r = (float)0.99; 75 | return r; 76 | } 77 | return (float)0.01; 78 | #endif 79 | } 80 | 81 | const char* nsSingleByteCharSetProber::GetCharSetName() 82 | { 83 | if (!mNameProber) 84 | return mModel->charsetName; 85 | return mNameProber->GetCharSetName(); 86 | } 87 | 88 | #ifdef DEBUG_chardet 89 | void nsSingleByteCharSetProber::DumpStatus() 90 | { 91 | printf(" SBCS: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); 92 | } 93 | #endif 94 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsSBCharSetProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | #ifndef nsSingleByteCharSetProber_h__ 6 | #define nsSingleByteCharSetProber_h__ 7 | 8 | #include "nsCharSetProber.h" 9 | 10 | #define SAMPLE_SIZE 64 11 | #define SB_ENOUGH_REL_THRESHOLD 1024 12 | #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 13 | #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 14 | #define SYMBOL_CAT_ORDER 250 15 | #define NUMBER_OF_SEQ_CAT 4 16 | #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1) 17 | #define NEGATIVE_CAT 0 18 | 19 | typedef struct 20 | { 21 | const unsigned char* const charToOrderMap; // [256] table use to find a char's order 22 | const uint8_t* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency 23 | float mTypicalPositiveRatio; // = freqSeqs / totalSeqs 24 | bool keepEnglishLetter; // says if this script contains English characters (not implemented) 25 | const char* const charsetName; 26 | } SequenceModel; 27 | 28 | 29 | class nsSingleByteCharSetProber : public nsCharSetProber{ 30 | public: 31 | nsSingleByteCharSetProber(const 
SequenceModel *model) 32 | :mModel(model), mReversed(false), mNameProber(0) { Reset(); } 33 | nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber* nameProber) 34 | :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } 35 | 36 | virtual const char* GetCharSetName(); 37 | virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen); 38 | virtual nsProbingState GetState(void) {return mState;} 39 | virtual void Reset(void); 40 | virtual float GetConfidence(void); 41 | 42 | // This feature is not implemented yet. any current language model 43 | // contain this parameter as false. No one is looking at this 44 | // parameter or calling this method. 45 | // Moreover, the nsSBCSGroupProber which calls the HandleData of this 46 | // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid 47 | // of the English letters. 48 | bool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented) 49 | 50 | #ifdef DEBUG_chardet 51 | virtual void DumpStatus(); 52 | #endif 53 | 54 | protected: 55 | nsProbingState mState; 56 | const SequenceModel* const mModel; 57 | const bool mReversed; // true if we need to reverse every pair in the model lookup 58 | 59 | //char order of last character 60 | unsigned char mLastOrder; 61 | 62 | uint32_t mTotalSeqs; 63 | uint32_t mSeqCounters[NUMBER_OF_SEQ_CAT]; 64 | 65 | uint32_t mTotalChar; 66 | //characters that fall in our sampling range 67 | uint32_t mFreqChar; 68 | 69 | // Optional auxiliary prober for name decision. 
created and destroyed by the GroupProber 70 | nsCharSetProber* mNameProber; 71 | 72 | }; 73 | 74 | 75 | extern const SequenceModel Koi8rModel; 76 | extern const SequenceModel Win1251Model; 77 | extern const SequenceModel Latin5Model; 78 | extern const SequenceModel MacCyrillicModel; 79 | extern const SequenceModel Ibm866Model; 80 | extern const SequenceModel Ibm855Model; 81 | extern const SequenceModel Latin7Model; 82 | extern const SequenceModel Win1253Model; 83 | extern const SequenceModel Latin5BulgarianModel; 84 | extern const SequenceModel Win1251BulgarianModel; 85 | extern const SequenceModel Latin2HungarianModel; 86 | extern const SequenceModel Win1250HungarianModel; 87 | extern const SequenceModel Win1255Model; 88 | extern const SequenceModel TIS620ThaiModel; 89 | 90 | #endif /* nsSingleByteCharSetProber_h__ */ 91 | 92 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsSJISProber.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | // for S-JIS encoding, obeserve characteristic: 7 | // 1, kana character (or hankaku?) 
often have hight frequency of appereance 8 | // 2, kana character often exist in group 9 | // 3, certain combination of kana is never used in japanese language 10 | 11 | #include "nsSJISProber.h" 12 | #include "nsDebug.h" 13 | 14 | void nsSJISProber::Reset(void) 15 | { 16 | mCodingSM->Reset(); 17 | mState = eDetecting; 18 | mContextAnalyser.Reset(mIsPreferredLanguage); 19 | mDistributionAnalyser.Reset(mIsPreferredLanguage); 20 | } 21 | 22 | nsProbingState nsSJISProber::HandleData(const char* aBuf, uint32_t aLen) 23 | { 24 | NS_ASSERTION(aLen, "HandleData called with empty buffer"); 25 | nsSMState codingState; 26 | 27 | for (uint32_t i = 0; i < aLen; i++) 28 | { 29 | codingState = mCodingSM->NextState(aBuf[i]); 30 | if (codingState == eItsMe) 31 | { 32 | mState = eFoundIt; 33 | break; 34 | } 35 | if (codingState == eStart) 36 | { 37 | uint32_t charLen = mCodingSM->GetCurrentCharLen(); 38 | if (i == 0) 39 | { 40 | mLastChar[1] = aBuf[0]; 41 | mContextAnalyser.HandleOneChar(mLastChar+2-charLen, charLen); 42 | mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 43 | } 44 | else 45 | { 46 | mContextAnalyser.HandleOneChar(aBuf+i+1-charLen, charLen); 47 | mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen); 48 | } 49 | } 50 | } 51 | 52 | mLastChar[0] = aBuf[aLen-1]; 53 | 54 | if (mState == eDetecting) 55 | if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 56 | mState = eFoundIt; 57 | 58 | return mState; 59 | } 60 | 61 | float nsSJISProber::GetConfidence(void) 62 | { 63 | float contxtCf = mContextAnalyser.GetConfidence(); 64 | float distribCf = mDistributionAnalyser.GetConfidence(); 65 | 66 | return (contxtCf > distribCf ? 
contxtCf : distribCf); 67 | } 68 | 69 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsSJISProber.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | // for S-JIS encoding, obeserve characteristic: 7 | // 1, kana character (or hankaku?) often have hight frequency of appereance 8 | // 2, kana character often exist in group 9 | // 3, certain combination of kana is never used in japanese language 10 | 11 | #ifndef nsSJISProber_h__ 12 | #define nsSJISProber_h__ 13 | 14 | #include "nsCharSetProber.h" 15 | #include "nsCodingStateMachine.h" 16 | #include "JpCntx.h" 17 | #include "CharDistribution.h" 18 | 19 | 20 | class nsSJISProber: public nsCharSetProber { 21 | public: 22 | nsSJISProber(bool aIsPreferredLanguage) 23 | :mIsPreferredLanguage(aIsPreferredLanguage) 24 | {mCodingSM = new nsCodingStateMachine(&SJISSMModel); 25 | Reset();} 26 | virtual ~nsSJISProber(void){delete mCodingSM;} 27 | nsProbingState HandleData(const char* aBuf, uint32_t aLen); 28 | const char* GetCharSetName() {return "Shift_JIS";} 29 | nsProbingState GetState(void) {return mState;} 30 | void Reset(void); 31 | float GetConfidence(void); 32 | 33 | protected: 34 | nsCodingStateMachine* mCodingSM; 35 | nsProbingState mState; 36 | 37 | SJISContextAnalysis mContextAnalyser; 38 | SJISDistributionAnalysis mDistributionAnalyser; 39 | 40 | char mLastChar[2]; 41 | bool mIsPreferredLanguage; 42 | 43 | }; 44 | 45 | 46 | #endif /* nsSJISProber_h__ */ 47 | 48 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsUTF8Prober.cpp: 
-------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | #include "nsUTF8Prober.h" 7 | 8 | void nsUTF8Prober::Reset(void) 9 | { 10 | mCodingSM->Reset(); 11 | mNumOfMBChar = 0; 12 | mState = eDetecting; 13 | } 14 | 15 | nsProbingState nsUTF8Prober::HandleData(const char* aBuf, uint32_t aLen) 16 | { 17 | nsSMState codingState; 18 | 19 | for (uint32_t i = 0; i < aLen; i++) 20 | { 21 | codingState = mCodingSM->NextState(aBuf[i]); 22 | if (codingState == eItsMe) 23 | { 24 | mState = eFoundIt; 25 | break; 26 | } 27 | if (codingState == eStart) 28 | { 29 | if (mCodingSM->GetCurrentCharLen() >= 2) 30 | mNumOfMBChar++; 31 | } 32 | } 33 | 34 | if (mState == eDetecting) 35 | if (GetConfidence() > SHORTCUT_THRESHOLD) 36 | mState = eFoundIt; 37 | return mState; 38 | } 39 | 40 | #define ONE_CHAR_PROB (float)0.50 41 | 42 | float nsUTF8Prober::GetConfidence(void) 43 | { 44 | float unlike = (float)0.99; 45 | 46 | if (mNumOfMBChar < 6) 47 | { 48 | for (uint32_t i = 0; i < mNumOfMBChar; i++) 49 | unlike *= ONE_CHAR_PROB; 50 | return (float)1.0 - unlike; 51 | } 52 | else 53 | return (float)0.99; 54 | } 55 | 56 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsUTF8Prober.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/

#ifndef nsUTF8Prober_h__
#define nsUTF8Prober_h__

#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"

// UTF-8 prober built on a coding state machine driven by UTF8SMModel.
class nsUTF8Prober: public nsCharSetProber {
public:
  // Allocates the state machine and resets the prober.
  nsUTF8Prober(){mNumOfMBChar = 0;
                mCodingSM = new nsCodingStateMachine(&UTF8SMModel);
                Reset(); }
  virtual ~nsUTF8Prober(){delete mCodingSM;}
  nsProbingState HandleData(const char* aBuf, uint32_t aLen);
  const char* GetCharSetName() {return "UTF-8";}
  nsProbingState GetState(void) {return mState;}
  void      Reset(void);
  float     GetConfidence(void);

protected:
  nsCodingStateMachine* mCodingSM;  // owned; deleted in the destructor
  nsProbingState mState;            // value returned by GetState()
  uint32_t mNumOfMBChar;            // presumably counts multi-byte characters seen -- see nsUTF8Prober.cpp
};

#endif /* nsUTF8Prober_h__ */

-------------------------------------------------------------------------------- /universalchardet/src/base/nsUniversalDetector.cpp: --------------------------------------------------------------------------------
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ 5 | 6 | #include "nscore.h" 7 | 8 | #include "nsUniversalDetector.h" 9 | 10 | #include "nsMBCSGroupProber.h" 11 | #include "nsSBCSGroupProber.h" 12 | #include "nsEscCharsetProber.h" 13 | #include "nsLatin1Prober.h" 14 | 15 | nsUniversalDetector::nsUniversalDetector(uint32_t aLanguageFilter) 16 | { 17 | mDone = false; 18 | mBestGuess = -1; //illegal value as signal 19 | mInTag = false; 20 | mEscCharSetProber = nullptr; 21 | 22 | mStart = true; 23 | mDetectedCharset = nullptr; 24 | mGotData = false; 25 | mInputState = ePureAscii; 26 | mLastChar = '\0'; 27 | mLanguageFilter = aLanguageFilter; 28 | 29 | uint32_t i; 30 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 31 | mCharSetProbers[i] = nullptr; 32 | } 33 | 34 | nsUniversalDetector::~nsUniversalDetector() 35 | { 36 | for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 37 | delete mCharSetProbers[i]; 38 | 39 | delete mEscCharSetProber; 40 | } 41 | 42 | void 43 | nsUniversalDetector::Reset() 44 | { 45 | mDone = false; 46 | mBestGuess = -1; //illegal value as signal 47 | mInTag = false; 48 | 49 | mStart = true; 50 | mDetectedCharset = nullptr; 51 | mGotData = false; 52 | mInputState = ePureAscii; 53 | mLastChar = '\0'; 54 | 55 | if (mEscCharSetProber) 56 | mEscCharSetProber->Reset(); 57 | 58 | uint32_t i; 59 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 60 | if (mCharSetProbers[i]) 61 | mCharSetProbers[i]->Reset(); 62 | } 63 | 64 | //--------------------------------------------------------------------- 65 | #define SHORTCUT_THRESHOLD (float)0.95 66 | #define MINIMUM_THRESHOLD (float)0.20 67 | 68 | nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen) 69 | { 70 | if(mDone) 71 | return NS_OK; 72 | 73 | if (aLen > 0) 74 | mGotData = true; 75 | 76 | //If the data starts with BOM, we know it is UTF 77 | if (mStart) 78 | { 79 | mStart = false; 80 | if (aLen >= 2) { 81 | switch (aBuf[0]) { 82 | case '\xEF': 83 | if ((aLen > 2) && ('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) { 84 | // EF BB BF UTF-8 
encoded BOM 85 | mDetectedCharset = "UTF-8"; 86 | } 87 | break; 88 | case '\xFE': 89 | if ('\xFF' == aBuf[1]) { 90 | // FE FF UTF-16, big endian BOM 91 | mDetectedCharset = "UTF-16BE"; 92 | } 93 | break; 94 | case '\xFF': 95 | if ('\xFE' == aBuf[1]) { 96 | // FF FE UTF-16, little endian BOM 97 | mDetectedCharset = "UTF-16LE"; 98 | } 99 | break; 100 | } // switch 101 | } 102 | 103 | if (mDetectedCharset) 104 | { 105 | mDone = true; 106 | return NS_OK; 107 | } 108 | } 109 | 110 | uint32_t i; 111 | for (i = 0; i < aLen; i++) 112 | { 113 | //other than 0xa0, if every othe character is ascii, the page is ascii 114 | if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP 115 | { 116 | //we got a non-ascii byte (high-byte) 117 | if (mInputState != eHighbyte) 118 | { 119 | //adjust state 120 | mInputState = eHighbyte; 121 | 122 | //kill mEscCharSetProber if it is active 123 | if (mEscCharSetProber) { 124 | delete mEscCharSetProber; 125 | mEscCharSetProber = nullptr; 126 | } 127 | 128 | //start multibyte and singlebyte charset prober 129 | if (nullptr == mCharSetProbers[0]) 130 | { 131 | mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter); 132 | if (nullptr == mCharSetProbers[0]) 133 | return NS_ERROR_OUT_OF_MEMORY; 134 | } 135 | if (nullptr == mCharSetProbers[1] && 136 | (mLanguageFilter & NS_FILTER_NON_CJK)) 137 | { 138 | mCharSetProbers[1] = new nsSBCSGroupProber; 139 | if (nullptr == mCharSetProbers[1]) 140 | return NS_ERROR_OUT_OF_MEMORY; 141 | } 142 | if (nullptr == mCharSetProbers[2]) 143 | { 144 | mCharSetProbers[2] = new nsLatin1Prober; 145 | if (nullptr == mCharSetProbers[2]) 146 | return NS_ERROR_OUT_OF_MEMORY; 147 | } 148 | } 149 | } 150 | else 151 | { 152 | //ok, just pure ascii so far 153 | if ( ePureAscii == mInputState && 154 | (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) 155 | { 156 | //found escape character or HZ "~{" 157 | mInputState = eEscAscii; 158 | } 159 | mLastChar = aBuf[i]; 160 | } 161 | } 
162 | 163 | nsProbingState st; 164 | switch (mInputState) 165 | { 166 | case eEscAscii: 167 | if (nullptr == mEscCharSetProber) { 168 | mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter); 169 | if (nullptr == mEscCharSetProber) 170 | return NS_ERROR_OUT_OF_MEMORY; 171 | } 172 | st = mEscCharSetProber->HandleData(aBuf, aLen); 173 | if (st == eFoundIt) 174 | { 175 | mDone = true; 176 | mDetectedCharset = mEscCharSetProber->GetCharSetName(); 177 | } 178 | break; 179 | case eHighbyte: 180 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 181 | { 182 | if (mCharSetProbers[i]) 183 | { 184 | st = mCharSetProbers[i]->HandleData(aBuf, aLen); 185 | if (st == eFoundIt) 186 | { 187 | mDone = true; 188 | mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); 189 | return NS_OK; 190 | } 191 | } 192 | } 193 | break; 194 | 195 | default: //pure ascii 196 | ;//do nothing here 197 | } 198 | return NS_OK; 199 | } 200 | 201 | 202 | //--------------------------------------------------------------------- 203 | void nsUniversalDetector::DataEnd() 204 | { 205 | if (!mGotData) 206 | { 207 | // we haven't got any data yet, return immediately 208 | // caller program sometimes call DataEnd before anything has been sent to detector 209 | return; 210 | } 211 | 212 | if (mDetectedCharset) 213 | { 214 | mDone = true; 215 | Report(mDetectedCharset); 216 | return; 217 | } 218 | 219 | switch (mInputState) 220 | { 221 | case eHighbyte: 222 | { 223 | float proberConfidence; 224 | float maxProberConfidence = (float)0.0; 225 | int32_t maxProber = 0; 226 | 227 | for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 228 | { 229 | if (mCharSetProbers[i]) 230 | { 231 | proberConfidence = mCharSetProbers[i]->GetConfidence(); 232 | if (proberConfidence > maxProberConfidence) 233 | { 234 | maxProberConfidence = proberConfidence; 235 | maxProber = i; 236 | } 237 | } 238 | } 239 | //do not report anything because we are not confident of it, that's in fact a negative answer 240 | if (maxProberConfidence > 
MINIMUM_THRESHOLD) 241 | { 242 | if (mCharSetProbers[maxProber]) 243 | { 244 | Report(mCharSetProbers[maxProber]->GetCharSetName()); 245 | } 246 | } 247 | } 248 | break; 249 | case eEscAscii: 250 | break; 251 | default: 252 | ; 253 | } 254 | return; 255 | } 256 | -------------------------------------------------------------------------------- /universalchardet/src/base/nsUniversalDetector.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | #ifndef nsUniversalDetector_h__ 7 | #define nsUniversalDetector_h__ 8 | 9 | class nsCharSetProber; 10 | 11 | #define NUM_OF_CHARSET_PROBERS 3 12 | 13 | typedef enum { 14 | ePureAscii = 0, 15 | eEscAscii = 1, 16 | eHighbyte = 2 17 | } nsInputState; 18 | 19 | #define NS_FILTER_CHINESE_SIMPLIFIED 0x01 20 | #define NS_FILTER_CHINESE_TRADITIONAL 0x02 21 | #define NS_FILTER_JAPANESE 0x04 22 | #define NS_FILTER_KOREAN 0x08 23 | #define NS_FILTER_NON_CJK 0x10 24 | #define NS_FILTER_ALL 0x1F 25 | #define NS_FILTER_CHINESE (NS_FILTER_CHINESE_SIMPLIFIED | \ 26 | NS_FILTER_CHINESE_TRADITIONAL) 27 | #define NS_FILTER_CJK (NS_FILTER_CHINESE_SIMPLIFIED | \ 28 | NS_FILTER_CHINESE_TRADITIONAL | \ 29 | NS_FILTER_JAPANESE | \ 30 | NS_FILTER_KOREAN) 31 | 32 | class nsUniversalDetector { 33 | public: 34 | nsUniversalDetector(uint32_t aLanguageFilter); 35 | virtual ~nsUniversalDetector(); 36 | virtual nsresult HandleData(const char* aBuf, uint32_t aLen); 37 | virtual void DataEnd(void); 38 | 39 | protected: 40 | virtual void Report(const char* aCharset) = 0; 41 | virtual void Reset(); 42 | nsInputState mInputState; 43 | bool mDone; 44 | bool mInTag; 45 | bool mStart; 46 | bool mGotData; 47 | char mLastChar; 48 | const 
char * mDetectedCharset; 49 | int32_t mBestGuess; 50 | uint32_t mLanguageFilter; 51 | 52 | nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS]; 53 | nsCharSetProber *mEscCharSetProber; 54 | }; 55 | 56 | #endif 57 | 58 | -------------------------------------------------------------------------------- /universalchardet/src/moz.build: -------------------------------------------------------------------------------- 1 | # -*- Mode: python; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 40 -*- 2 | # vim: set filetype=python: 3 | # This Source Code Form is subject to the terms of the Mozilla Public 4 | # License, v. 2.0. If a copy of the MPL was not distributed with this 5 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | 7 | DIRS += ['base', 'xpcom'] 8 | 9 | MODULE = 'universalchardet' 10 | 11 | -------------------------------------------------------------------------------- /universalchardet/src/xpcom/Makefile.in: -------------------------------------------------------------------------------- 1 | # 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this 4 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 5 | 6 | EXPORT_LIBRARY = 1 7 | LOCAL_INCLUDES = -I$(srcdir)/../base 8 | 9 | SHARED_LIBRARY_LIBS = \ 10 | ../base/$(LIB_PREFIX)universalchardet_s.$(LIB_SUFFIX) \ 11 | $(NULL) 12 | -------------------------------------------------------------------------------- /universalchardet/src/xpcom/moz.build: -------------------------------------------------------------------------------- 1 | # -*- Mode: python; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 40 -*- 2 | # vim: set filetype=python: 3 | # This Source Code Form is subject to the terms of the Mozilla Public 4 | # License, v. 2.0. If a copy of the MPL was not distributed with this 5 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
6 | 7 | MODULE = 'universalchardet' 8 | 9 | CPP_SOURCES += [ 10 | 'nsUdetXPCOMWrapper.cpp', 11 | 'nsUniversalCharDetModule.cpp', 12 | ] 13 | 14 | LIBRARY_NAME = 'universalchardet' 15 | 16 | LIBXUL_LIBRARY = True 17 | 18 | -------------------------------------------------------------------------------- /universalchardet/src/xpcom/nsUdetXPCOMWrapper.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | #include "nscore.h" 7 | 8 | #include "nsUniversalDetector.h" 9 | #include "nsUdetXPCOMWrapper.h" 10 | #include "nsCharSetProber.h" // for DumpStatus 11 | 12 | #include "nsUniversalCharDetDll.h" 13 | //---- for XPCOM 14 | #include "nsIFactory.h" 15 | #include "nsISupports.h" 16 | #include "nsCOMPtr.h" 17 | 18 | static NS_DEFINE_CID(kUniversalDetectorCID, NS_UNIVERSAL_DETECTOR_CID); 19 | static NS_DEFINE_CID(kUniversalStringDetectorCID, NS_UNIVERSAL_STRING_DETECTOR_CID); 20 | 21 | //--------------------------------------------------------------------- 22 | nsXPCOMDetector:: nsXPCOMDetector(uint32_t aLanguageFilter) 23 | : nsUniversalDetector(aLanguageFilter) 24 | { 25 | } 26 | //--------------------------------------------------------------------- 27 | nsXPCOMDetector::~nsXPCOMDetector() 28 | { 29 | } 30 | //--------------------------------------------------------------------- 31 | 32 | NS_IMPL_ISUPPORTS1(nsXPCOMDetector, nsICharsetDetector) 33 | 34 | //--------------------------------------------------------------------- 35 | NS_IMETHODIMP nsXPCOMDetector::Init( 36 | nsICharsetDetectionObserver* aObserver) 37 | { 38 | NS_ASSERTION(mObserver == nullptr , "Init twice"); 39 | if(nullptr == aObserver) 40 | return NS_ERROR_ILLEGAL_VALUE; 41 | 
42 | mObserver = aObserver; 43 | return NS_OK; 44 | } 45 | //---------------------------------------------------------- 46 | NS_IMETHODIMP nsXPCOMDetector::DoIt(const char* aBuf, 47 | uint32_t aLen, bool* oDontFeedMe) 48 | { 49 | NS_ASSERTION(mObserver != nullptr , "have not init yet"); 50 | 51 | if((nullptr == aBuf) || (nullptr == oDontFeedMe)) 52 | return NS_ERROR_ILLEGAL_VALUE; 53 | 54 | this->Reset(); 55 | nsresult rv = this->HandleData(aBuf, aLen); 56 | if (NS_FAILED(rv)) 57 | return rv; 58 | 59 | if (mDone) 60 | { 61 | if (mDetectedCharset) 62 | Report(mDetectedCharset); 63 | } 64 | 65 | // Tell the caller to stop feeding data exactly when detection has finished. 66 | *oDontFeedMe = mDone; 67 | return NS_OK; 68 | } 69 | //---------------------------------------------------------- 70 | NS_IMETHODIMP nsXPCOMDetector::Done() 71 | { 72 | NS_ASSERTION(mObserver != nullptr , "have not init yet"); 73 | #ifdef DEBUG_chardet 74 | for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 75 | { 76 | // If no data was received the array might stay filled with nulls 77 | // the way it was initialized in the constructor. 78 | if (mCharSetProbers[i]) 79 | mCharSetProbers[i]->DumpStatus(); 80 | } 81 | #endif 82 | 83 | this->DataEnd(); 84 | return NS_OK; 85 | } 86 | //---------------------------------------------------------- 87 | void nsXPCOMDetector::Report(const char* aCharset) 88 | { 89 | NS_ASSERTION(mObserver != nullptr , "have not init yet"); 90 | #ifdef DEBUG_chardet 91 | printf("Universal Charset Detector report charset %s . 
\r\n", aCharset); 92 | #endif 93 | mObserver->Notify(aCharset, eBestAnswer); 94 | } 95 | 96 | 97 | //--------------------------------------------------------------------- 98 | nsXPCOMStringDetector:: nsXPCOMStringDetector(uint32_t aLanguageFilter) 99 | : nsUniversalDetector(aLanguageFilter) 100 | { 101 | } 102 | //--------------------------------------------------------------------- 103 | nsXPCOMStringDetector::~nsXPCOMStringDetector() 104 | { 105 | } 106 | //--------------------------------------------------------------------- 107 | NS_IMPL_ISUPPORTS1(nsXPCOMStringDetector, nsIStringCharsetDetector) 108 | //--------------------------------------------------------------------- 109 | void nsXPCOMStringDetector::Report(const char *aCharset) 110 | { 111 | mResult = aCharset; 112 | #ifdef DEBUG_chardet 113 | printf("New Charset Prober report charset %s . \r\n", aCharset); 114 | #endif 115 | } 116 | //--------------------------------------------------------------------- 117 | NS_IMETHODIMP nsXPCOMStringDetector::DoIt(const char* aBuf, 118 | uint32_t aLen, const char** oCharset, 119 | nsDetectionConfident &oConf) 120 | { 121 | mResult = nullptr; 122 | this->Reset(); 123 | nsresult rv = this->HandleData(aBuf, aLen); 124 | if (NS_FAILED(rv)) 125 | return rv; 126 | this->DataEnd(); 127 | if (mResult) 128 | { 129 | *oCharset=mResult; 130 | oConf = eBestAnswer; 131 | } 132 | return NS_OK; 133 | } 134 | -------------------------------------------------------------------------------- /universalchardet/src/xpcom/nsUdetXPCOMWrapper.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #ifndef _nsUdetXPCOMWrapper_h__ 7 | #define _nsUdetXPCOMWrapper_h__ 8 | #include "nsISupports.h" 9 | #include "nsICharsetDetector.h" 10 | #include "nsIStringCharsetDetector.h" 11 | #include "nsICharsetDetectionObserver.h" 12 | #include "nsCOMPtr.h" 13 | 14 | #include "nsIFactory.h" 15 | 16 | // {374E0CDE-F605-4259-8C92-E639C6C2EEEF} 17 | #define NS_UNIVERSAL_DETECTOR_CID \ 18 | { 0x374e0cde, 0xf605, 0x4259, { 0x8c, 0x92, 0xe6, 0x39, 0xc6, 0xc2, 0xee, 0xef } } 19 | 20 | // {6EE5301A-3981-49bd-85F8-1A2CC228CF3E} 21 | #define NS_UNIVERSAL_STRING_DETECTOR_CID \ 22 | { 0x6ee5301a, 0x3981, 0x49bd, { 0x85, 0xf8, 0x1a, 0x2c, 0xc2, 0x28, 0xcf, 0x3e } } 23 | 24 | // {12BB8F1B-2389-11d3-B3BF-00805F8A6670} 25 | #define NS_JA_PSMDETECTOR_CID \ 26 | { 0x12bb8f1b, 0x2389, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } 27 | 28 | // {12BB8F1C-2389-11d3-B3BF-00805F8A6670} 29 | #define NS_JA_STRING_PSMDETECTOR_CID \ 30 | { 0x12bb8f1c, 0x2389, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } 31 | 32 | // {EA06D4E1-2B3D-11d3-B3BF-00805F8A6670} 33 | #define NS_KO_PSMDETECTOR_CID \ 34 | { 0xea06d4e1, 0x2b3d, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } 35 | 36 | // {EA06D4E2-2B3D-11d3-B3BF-00805F8A6670} 37 | #define NS_ZHCN_PSMDETECTOR_CID \ 38 | { 0xea06d4e2, 0x2b3d, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } 39 | 40 | // {EA06D4E3-2B3D-11d3-B3BF-00805F8A6670} 41 | #define NS_ZHTW_PSMDETECTOR_CID \ 42 | { 0xea06d4e3, 0x2b3d, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } 43 | 44 | 45 | // {EA06D4E4-2B3D-11d3-B3BF-00805F8A6670} 46 | #define NS_KO_STRING_PSMDETECTOR_CID \ 47 | { 0xea06d4e4, 0x2b3d, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } 48 | 49 | // {EA06D4E5-2B3D-11d3-B3BF-00805F8A6670} 50 | #define NS_ZHCN_STRING_PSMDETECTOR_CID \ 51 | { 0xea06d4e5, 0x2b3d, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } 52 | 53 | // {EA06D4E6-2B3D-11d3-B3BF-00805F8A6670} 54 | #define 
NS_ZHTW_STRING_PSMDETECTOR_CID \ 55 | { 0xea06d4e6, 0x2b3d, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } 56 | 57 | 58 | // {FCACEF21-2B40-11d3-B3BF-00805F8A6670} 59 | #define NS_ZH_STRING_PSMDETECTOR_CID \ 60 | { 0xfcacef21, 0x2b40, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } 61 | 62 | // {FCACEF22-2B40-11d3-B3BF-00805F8A6670} 63 | #define NS_CJK_STRING_PSMDETECTOR_CID \ 64 | { 0xfcacef22, 0x2b40, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } 65 | 66 | 67 | // {FCACEF23-2B40-11d3-B3BF-00805F8A6670} 68 | #define NS_ZH_PSMDETECTOR_CID \ 69 | { 0xfcacef23, 0x2b40, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } 70 | 71 | // {FCACEF24-2B40-11d3-B3BF-00805F8A6670} 72 | #define NS_CJK_PSMDETECTOR_CID \ 73 | { 0xfcacef24, 0x2b40, 0x11d3, { 0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } 74 | 75 | //===================================================================== 76 | class nsXPCOMDetector : 77 | public nsUniversalDetector, 78 | public nsICharsetDetector 79 | { 80 | NS_DECL_ISUPPORTS 81 | public: 82 | nsXPCOMDetector(uint32_t aLanguageFilter); 83 | virtual ~nsXPCOMDetector(); 84 | NS_IMETHOD Init(nsICharsetDetectionObserver* aObserver); 85 | NS_IMETHOD DoIt(const char* aBuf, uint32_t aLen, bool *oDontFeedMe); 86 | NS_IMETHOD Done(); 87 | protected: 88 | virtual void Report(const char* aCharset); 89 | private: 90 | nsCOMPtr mObserver; 91 | }; 92 | 93 | 94 | //===================================================================== 95 | class nsXPCOMStringDetector : 96 | public nsUniversalDetector, 97 | public nsIStringCharsetDetector 98 | { 99 | NS_DECL_ISUPPORTS 100 | public: 101 | nsXPCOMStringDetector(uint32_t aLanguageFilter); 102 | virtual ~nsXPCOMStringDetector(); 103 | NS_IMETHOD DoIt(const char* aBuf, uint32_t aLen, 104 | const char** oCharset, nsDetectionConfident &oConf); 105 | protected: 106 | virtual void Report(const char* aCharset); 107 | private: 108 | nsCOMPtr mObserver; 109 | const char* mResult; 
110 | }; 111 | 112 | //===================================================================== 113 | class nsUniversalXPCOMDetector : public nsXPCOMDetector 114 | { 115 | public: 116 | nsUniversalXPCOMDetector() 117 | : nsXPCOMDetector(NS_FILTER_ALL) {} 118 | }; 119 | 120 | class nsUniversalXPCOMStringDetector : public nsXPCOMStringDetector 121 | { 122 | public: 123 | nsUniversalXPCOMStringDetector() 124 | : nsXPCOMStringDetector(NS_FILTER_ALL) {} 125 | }; 126 | 127 | class nsJAPSMDetector : public nsXPCOMDetector 128 | { 129 | public: 130 | nsJAPSMDetector() 131 | : nsXPCOMDetector(NS_FILTER_JAPANESE) {} 132 | }; 133 | 134 | class nsJAStringPSMDetector : public nsXPCOMStringDetector 135 | { 136 | public: 137 | nsJAStringPSMDetector() 138 | : nsXPCOMStringDetector(NS_FILTER_JAPANESE) {} 139 | }; 140 | 141 | class nsKOPSMDetector : public nsXPCOMDetector 142 | { 143 | public: 144 | nsKOPSMDetector() 145 | : nsXPCOMDetector(NS_FILTER_KOREAN) {} 146 | }; 147 | 148 | class nsKOStringPSMDetector : public nsXPCOMStringDetector 149 | { 150 | public: 151 | nsKOStringPSMDetector() 152 | : nsXPCOMStringDetector(NS_FILTER_KOREAN) {} 153 | }; 154 | 155 | class nsZHTWPSMDetector : public nsXPCOMDetector 156 | { 157 | public: 158 | nsZHTWPSMDetector() 159 | : nsXPCOMDetector(NS_FILTER_CHINESE_TRADITIONAL) {} 160 | }; 161 | 162 | class nsZHTWStringPSMDetector : public nsXPCOMStringDetector 163 | { 164 | public: 165 | nsZHTWStringPSMDetector() 166 | : nsXPCOMStringDetector(NS_FILTER_CHINESE_TRADITIONAL) {} 167 | }; 168 | 169 | class nsZHCNPSMDetector : public nsXPCOMDetector 170 | { 171 | public: 172 | nsZHCNPSMDetector() 173 | : nsXPCOMDetector(NS_FILTER_CHINESE_SIMPLIFIED) {} 174 | }; 175 | 176 | class nsZHCNStringPSMDetector : public nsXPCOMStringDetector 177 | { 178 | public: 179 | nsZHCNStringPSMDetector() 180 | : nsXPCOMStringDetector(NS_FILTER_CHINESE_SIMPLIFIED) {} 181 | }; 182 | 183 | class nsZHPSMDetector : public nsXPCOMDetector 184 | { 185 | public: 186 | 
nsZHPSMDetector() 187 | : nsXPCOMDetector(NS_FILTER_CHINESE) {} 188 | }; 189 | 190 | class nsZHStringPSMDetector : public nsXPCOMStringDetector 191 | { 192 | public: 193 | nsZHStringPSMDetector() 194 | : nsXPCOMStringDetector(NS_FILTER_CHINESE) {} 195 | }; 196 | 197 | class nsCJKPSMDetector : public nsXPCOMDetector 198 | { 199 | public: 200 | nsCJKPSMDetector() 201 | : nsXPCOMDetector(NS_FILTER_CJK) {} 202 | }; 203 | 204 | class nsCJKStringPSMDetector : public nsXPCOMStringDetector 205 | { 206 | public: 207 | nsCJKStringPSMDetector() 208 | : nsXPCOMStringDetector(NS_FILTER_CJK) {} 209 | }; 210 | 211 | #endif //_nsUdetXPCOMWrapper_h__ 212 | -------------------------------------------------------------------------------- /universalchardet/src/xpcom/nsUniversalCharDetDll.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 | 6 | #ifndef nsCharDetDll_h__ 7 | #define nsCharDetDll_h__ 8 | 9 | #include "prtypes.h" 10 | 11 | #endif /* nsCharDetDll_h__ */ 12 | -------------------------------------------------------------------------------- /universalchardet/src/xpcom/nsUniversalCharDetModule.cpp: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* This Source Code Form is subject to the terms of the Mozilla Public 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ 5 | 6 | #include "mozilla/ModuleUtils.h" 7 | 8 | #include "nsCOMPtr.h" 9 | 10 | #include "nspr.h" 11 | #include "nsString.h" 12 | #include "nsUniversalCharDetDll.h" 13 | #include "nsISupports.h" 14 | #include "nsICategoryManager.h" 15 | #include "nsIComponentManager.h" 16 | #include "nsIServiceManager.h" 17 | #include "nsICharsetDetector.h" 18 | #include "nsIStringCharsetDetector.h" 19 | 20 | #include "nsUniversalDetector.h" 21 | #include "nsUdetXPCOMWrapper.h" 22 | 23 | 24 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsUniversalXPCOMDetector) 25 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsUniversalXPCOMStringDetector) 26 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsJAPSMDetector) 27 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsJAStringPSMDetector) 28 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsKOPSMDetector) 29 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsKOStringPSMDetector) 30 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsZHTWPSMDetector) 31 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsZHTWStringPSMDetector) 32 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsZHCNPSMDetector) 33 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsZHCNStringPSMDetector) 34 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsZHPSMDetector) 35 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsZHStringPSMDetector) 36 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsCJKPSMDetector) 37 | NS_GENERIC_FACTORY_CONSTRUCTOR(nsCJKStringPSMDetector) 38 | NS_DEFINE_NAMED_CID(NS_UNIVERSAL_DETECTOR_CID); 39 | NS_DEFINE_NAMED_CID(NS_UNIVERSAL_STRING_DETECTOR_CID); 40 | NS_DEFINE_NAMED_CID(NS_JA_PSMDETECTOR_CID); 41 | NS_DEFINE_NAMED_CID(NS_JA_STRING_PSMDETECTOR_CID); 42 | NS_DEFINE_NAMED_CID(NS_KO_PSMDETECTOR_CID); 43 | NS_DEFINE_NAMED_CID(NS_KO_STRING_PSMDETECTOR_CID); 44 | NS_DEFINE_NAMED_CID(NS_ZHTW_PSMDETECTOR_CID); 45 | NS_DEFINE_NAMED_CID(NS_ZHTW_STRING_PSMDETECTOR_CID); 46 | NS_DEFINE_NAMED_CID(NS_ZHCN_PSMDETECTOR_CID); 47 | NS_DEFINE_NAMED_CID(NS_ZHCN_STRING_PSMDETECTOR_CID); 48 | NS_DEFINE_NAMED_CID(NS_ZH_PSMDETECTOR_CID); 49 | NS_DEFINE_NAMED_CID(NS_ZH_STRING_PSMDETECTOR_CID); 50 | NS_DEFINE_NAMED_CID(NS_CJK_PSMDETECTOR_CID); 51 | 
NS_DEFINE_NAMED_CID(NS_CJK_STRING_PSMDETECTOR_CID); 52 | 53 | static const mozilla::Module::CIDEntry kChardetCIDs[] = { 54 | { &kNS_UNIVERSAL_DETECTOR_CID, false, nullptr, nsUniversalXPCOMDetectorConstructor }, 55 | { &kNS_UNIVERSAL_STRING_DETECTOR_CID, false, nullptr, nsUniversalXPCOMStringDetectorConstructor }, 56 | { &kNS_JA_PSMDETECTOR_CID, false, nullptr, nsJAPSMDetectorConstructor }, 57 | { &kNS_JA_STRING_PSMDETECTOR_CID, false, nullptr, nsJAStringPSMDetectorConstructor }, 58 | { &kNS_KO_PSMDETECTOR_CID, false, nullptr, nsKOPSMDetectorConstructor }, 59 | { &kNS_KO_STRING_PSMDETECTOR_CID, false, nullptr, nsKOStringPSMDetectorConstructor }, 60 | { &kNS_ZHTW_PSMDETECTOR_CID, false, nullptr, nsZHTWPSMDetectorConstructor }, 61 | { &kNS_ZHTW_STRING_PSMDETECTOR_CID, false, nullptr, nsZHTWStringPSMDetectorConstructor }, 62 | { &kNS_ZHCN_PSMDETECTOR_CID, false, nullptr, nsZHCNPSMDetectorConstructor }, 63 | { &kNS_ZHCN_STRING_PSMDETECTOR_CID, false, nullptr, nsZHCNStringPSMDetectorConstructor }, 64 | { &kNS_ZH_PSMDETECTOR_CID, false, nullptr, nsZHPSMDetectorConstructor }, 65 | { &kNS_ZH_STRING_PSMDETECTOR_CID, false, nullptr, nsZHStringPSMDetectorConstructor }, 66 | { &kNS_CJK_PSMDETECTOR_CID, false, nullptr, nsCJKPSMDetectorConstructor }, 67 | { &kNS_CJK_STRING_PSMDETECTOR_CID, false, nullptr, nsCJKStringPSMDetectorConstructor }, 68 | { nullptr } 69 | }; 70 | 71 | static const mozilla::Module::ContractIDEntry kChardetContracts[] = { 72 | { NS_CHARSET_DETECTOR_CONTRACTID_BASE "universal_charset_detector", &kNS_UNIVERSAL_DETECTOR_CID }, 73 | { NS_STRCDETECTOR_CONTRACTID_BASE "universal_charset_detector", &kNS_UNIVERSAL_STRING_DETECTOR_CID }, 74 | { NS_CHARSET_DETECTOR_CONTRACTID_BASE "ja_parallel_state_machine", &kNS_JA_PSMDETECTOR_CID }, 75 | { NS_STRCDETECTOR_CONTRACTID_BASE "ja_parallel_state_machine", &kNS_JA_STRING_PSMDETECTOR_CID }, 76 | { NS_CHARSET_DETECTOR_CONTRACTID_BASE "ko_parallel_state_machine", &kNS_KO_PSMDETECTOR_CID }, 77 | { 
NS_STRCDETECTOR_CONTRACTID_BASE "ko_parallel_state_machine", &kNS_KO_STRING_PSMDETECTOR_CID }, 78 | { NS_CHARSET_DETECTOR_CONTRACTID_BASE "zhtw_parallel_state_machine", &kNS_ZHTW_PSMDETECTOR_CID }, 79 | { NS_STRCDETECTOR_CONTRACTID_BASE "zhtw_parallel_state_machine", &kNS_ZHTW_STRING_PSMDETECTOR_CID }, 80 | { NS_CHARSET_DETECTOR_CONTRACTID_BASE "zhcn_parallel_state_machine", &kNS_ZHCN_PSMDETECTOR_CID }, 81 | { NS_STRCDETECTOR_CONTRACTID_BASE "zhcn_parallel_state_machine", &kNS_ZHCN_STRING_PSMDETECTOR_CID }, 82 | { NS_CHARSET_DETECTOR_CONTRACTID_BASE "zh_parallel_state_machine", &kNS_ZH_PSMDETECTOR_CID }, 83 | { NS_STRCDETECTOR_CONTRACTID_BASE "zh_parallel_state_machine", &kNS_ZH_STRING_PSMDETECTOR_CID }, 84 | { NS_CHARSET_DETECTOR_CONTRACTID_BASE "cjk_parallel_state_machine", &kNS_CJK_PSMDETECTOR_CID }, 85 | { NS_STRCDETECTOR_CONTRACTID_BASE "cjk_parallel_state_machine", &kNS_CJK_STRING_PSMDETECTOR_CID }, 86 | { nullptr } 87 | }; 88 | 89 | static const mozilla::Module::CategoryEntry kChardetCategories[] = { 90 | { NS_CHARSET_DETECTOR_CATEGORY, "universal_charset_detector", NS_CHARSET_DETECTOR_CONTRACTID_BASE "universal_charset_detector" }, 91 | { NS_CHARSET_DETECTOR_CATEGORY, "ja_parallel_state_machine", NS_CHARSET_DETECTOR_CONTRACTID_BASE "ja_parallel_state_machine" }, 92 | { NS_CHARSET_DETECTOR_CATEGORY, "ko_parallel_state_machine", NS_CHARSET_DETECTOR_CONTRACTID_BASE "ko_parallel_state_machine" }, 93 | { NS_CHARSET_DETECTOR_CATEGORY, "zhtw_parallel_state_machine", NS_CHARSET_DETECTOR_CONTRACTID_BASE "zhtw_parallel_state_machine" }, 94 | { NS_CHARSET_DETECTOR_CATEGORY, "zhcn_parallel_state_machine", NS_CHARSET_DETECTOR_CONTRACTID_BASE "zhcn_parallel_state_machine" }, 95 | { NS_CHARSET_DETECTOR_CATEGORY, "zh_parallel_state_machine", NS_CHARSET_DETECTOR_CONTRACTID_BASE "zh_parallel_state_machine" }, 96 | { NS_CHARSET_DETECTOR_CATEGORY, "cjk_parallel_state_machine", NS_CHARSET_DETECTOR_CONTRACTID_BASE "cjk_parallel_state_machine" }, 97 | { nullptr } 98 | }; 99 | 100 
| static const mozilla::Module kChardetModule = { 101 | mozilla::Module::kVersion, 102 | kChardetCIDs, 103 | kChardetContracts, 104 | kChardetCategories 105 | }; 106 | 107 | NSMODULE_DEFN(nsUniversalCharDetModule) = &kChardetModule; 108 | -------------------------------------------------------------------------------- /universalchardet/tests/CharsetDetectionTests.js: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 | /* vim: set ts=8 et sw=4 tw=80: */ 3 | var gExpectedCharset; 4 | var gOldPref; 5 | var gDetectorList; 6 | var gTestIndex; 7 | var gLocalDir; 8 | const Cc = Components.classes; 9 | const Ci = Components.interfaces; 10 | 11 | function CharsetDetectionTests(aTestFile, aExpectedCharset, aDetectorList) 12 | { 13 | gExpectedCharset = aExpectedCharset; 14 | gDetectorList = aDetectorList; 15 | 16 | InitDetectorTests(); 17 | 18 | var fileURI = gLocalDir + aTestFile; 19 | $("testframe").src = fileURI; 20 | 21 | SimpleTest.waitForExplicitFinish(); 22 | } 23 | 24 | function InitDetectorTests() 25 | { 26 | var prefService = Cc["@mozilla.org/preferences-service;1"] 27 | .getService(Ci.nsIPrefBranch); 28 | var str = Cc["@mozilla.org/supports-string;1"] 29 | .createInstance(Ci.nsISupportsString); 30 | var loader = Cc["@mozilla.org/moz/jssubscript-loader;1"] 31 | .getService(Ci.mozIJSSubScriptLoader); 32 | var ioService = Cc['@mozilla.org/network/io-service;1'] 33 | .getService(Ci.nsIIOService); 34 | loader.loadSubScript("chrome://mochikit/content/chrome-harness.js"); 35 | 36 | try { 37 | gOldPref = prefService 38 | .getComplexValue("intl.charset.detector", 39 | Ci.nsIPrefLocalizedString).data; 40 | } catch (e) { 41 | gOldPref = ""; 42 | } 43 | SetDetectorPref(gDetectorList[0]); 44 | gTestIndex = 0; 45 | $("testframe").onload = DoDetectionTest; 46 | 47 | if (gExpectedCharset == "default") { 48 | try { 49 | gExpectedCharset = prefService 50 | 
.getComplexValue("intl.charset.default", 51 | Ci.nsIPrefLocalizedString) 52 | .data; 53 | if (gExpectedCharset == "ISO-8859-1") { 54 | gExpectedCharset = "windows-1252"; 55 | } 56 | } catch (e) { 57 | gExpectedCharset = "windows-1252"; 58 | } 59 | } 60 | 61 | // Get the local directory. This needs to be a file: URI because chrome: 62 | // URIs are always UTF-8 (bug 617339) and we are testing decoding from other 63 | // charsets. 64 | var jar = getJar(getRootDirectory(window.location.href)); 65 | var dir = jar ? 66 | extractJarToTmp(jar) : 67 | getChromeDir(getResolvedURI(window.location.href)); 68 | gLocalDir = ioService.newFileURI(dir).spec; 69 | } 70 | 71 | function SetDetectorPref(aPrefValue) 72 | { 73 | var prefService = Cc["@mozilla.org/preferences-service;1"] 74 | .getService(Ci.nsIPrefBranch); 75 | var str = Cc["@mozilla.org/supports-string;1"] 76 | .createInstance(Ci.nsISupportsString); 77 | str.data = aPrefValue; 78 | prefService.setComplexValue("intl.charset.detector", 79 | Ci.nsISupportsString, str); 80 | gCurrentDetector = aPrefValue; 81 | } 82 | 83 | function DoDetectionTest() { 84 | var iframeDoc = $("testframe").contentDocument; 85 | var charset = iframeDoc.characterSet; 86 | 87 | is(charset, gExpectedCharset, 88 | "decoded as " + gExpectedCharset + " by " + gDetectorList[gTestIndex]); 89 | 90 | if (++gTestIndex < gDetectorList.length) { 91 | SetDetectorPref(gDetectorList[gTestIndex]); 92 | iframeDoc.location.reload(); 93 | } else { 94 | CleanUpDetectionTests(); 95 | } 96 | } 97 | 98 | function CleanUpDetectionTests() { 99 | SetDetectorPref(gOldPref); 100 | SimpleTest.finish(); 101 | } 102 | 103 | -------------------------------------------------------------------------------- /universalchardet/tests/Makefile.in: -------------------------------------------------------------------------------- 1 | # 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. 
If a copy of the MPL was not distributed with this 4 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 5 | 6 | MOCHITEST_CHROME_FILES = \ 7 | CharsetDetectionTests.js \ 8 | bug9357_text.html \ 9 | test_bug9357.html \ 10 | bug171813_text.html \ 11 | test_bug171813.html \ 12 | bug306272_text.html \ 13 | test_bug306272.html \ 14 | bug421271_text.html \ 15 | test_bug421271.html \ 16 | bug426271_text-euc-jp.html \ 17 | test_bug426271-euc-jp.html \ 18 | bug426271_text-utf-8.html \ 19 | test_bug426271-utf-8.html \ 20 | bug431054_text.html \ 21 | test_bug431054.html \ 22 | test_bug431054-japanese.html \ 23 | bug488426_text.html \ 24 | test_bug488426.html \ 25 | bug547487_text.html \ 26 | test_bug547487.html \ 27 | bug620106_text.html \ 28 | test_bug620106.html \ 29 | bug631751le_text.html \ 30 | test_bug631751le.html \ 31 | bug631751be_text.html \ 32 | test_bug631751be.html \ 33 | bug638318_text.html \ 34 | test_bug638318.html \ 35 | bug811363-1.text \ 36 | bug811363-2.text \ 37 | bug811363-3.text \ 38 | bug811363-4.text \ 39 | bug811363-5.text \ 40 | bug811363-6.text \ 41 | bug811363-7.text \ 42 | bug811363-8.text \ 43 | bug811363-9.text \ 44 | bug811363-invalid-1.text \ 45 | bug811363-invalid-2.text \ 46 | bug811363-invalid-3.text \ 47 | bug811363-invalid-4.text \ 48 | bug811363-invalid-5.text \ 49 | test_bug811363-1-1.html \ 50 | test_bug811363-1-2.html \ 51 | test_bug811363-1-3.html \ 52 | test_bug811363-1-4.html \ 53 | test_bug811363-1-5.html \ 54 | test_bug811363-2-1.html \ 55 | test_bug811363-2-2.html \ 56 | test_bug811363-2-3.html \ 57 | test_bug811363-2-4.html \ 58 | test_bug811363-2-5.html \ 59 | test_bug811363-2-6.html \ 60 | test_bug811363-2-7.html \ 61 | test_bug811363-2-8.html \ 62 | test_bug811363-2-9.html \ 63 | $(NULL) 64 | -------------------------------------------------------------------------------- /universalchardet/tests/bug171813_text.html: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JanX2/UniversalDetector/1c8cc1378a7938a2b96e0133e1c76e83030aa03f/universalchardet/tests/bug171813_text.html -------------------------------------------------------------------------------- /universalchardet/tests/bug306272_text.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 306272 5 | 6 | 7 | 8 | Antti Näyhä <Antti.Nayha@somewhere.fi> 9 | -------------------------------------------------------------------------------- /universalchardet/tests/bug421271_text.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanX2/UniversalDetector/1c8cc1378a7938a2b96e0133e1c76e83030aa03f/universalchardet/tests/bug421271_text.html -------------------------------------------------------------------------------- /universalchardet/tests/bug426271_text-euc-jp.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanX2/UniversalDetector/1c8cc1378a7938a2b96e0133e1c76e83030aa03f/universalchardet/tests/bug426271_text-euc-jp.html -------------------------------------------------------------------------------- /universalchardet/tests/bug426271_text-utf-8.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 日本語エンコードテスト 5 | 6 | 7 | これはUTF-8です昔々、ある所に子供のいない老夫婦が住んでいた。ある日、お婆さんが川で洗濯をしていると、大きな桃が流れて来たので、お爺さんと食べようと持ち帰った。二人で桃を割ると中から男の子が生まれたので、「桃太郎」と名付けて大事に育てた。 8 | 9 | 成長した桃太郎は、鬼ヶ島の鬼が人々を苦しめていることを知り、鬼退治を決意する。両親から黍団子を餞別に貰い、道中にそれを分け与えてイヌ、サル、キジを家来に従える。鬼ヶ島で鬼と戦い、見事に勝利を収め、鬼が方々から奪っていった財宝を持ち帰り、お爺さん・お婆さんの元に返り、幸せに暮らしたという。出典: フリー百科事典『ウィキペディア(Wikipedia)』 10 | 11 | 12 | -------------------------------------------------------------------------------- /universalchardet/tests/bug431054_text.html: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JanX2/UniversalDetector/1c8cc1378a7938a2b96e0133e1c76e83030aa03f/universalchardet/tests/bug431054_text.html -------------------------------------------------------------------------------- /universalchardet/tests/bug488426_text.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanX2/UniversalDetector/1c8cc1378a7938a2b96e0133e1c76e83030aa03f/universalchardet/tests/bug488426_text.html -------------------------------------------------------------------------------- /universalchardet/tests/bug547487_text.html: -------------------------------------------------------------------------------- 1 | The quick brown fox jumps over the lazy dog. 2 | -------------------------------------------------------------------------------- /universalchardet/tests/bug620106_text.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanX2/UniversalDetector/1c8cc1378a7938a2b96e0133e1c76e83030aa03f/universalchardet/tests/bug620106_text.html -------------------------------------------------------------------------------- /universalchardet/tests/bug631751be_text.html: -------------------------------------------------------------------------------- 1 | <!DOCTYPE html> 2 | <html> 3 | <head> 4 | <title>Big-endian BOMless UTF-16 with Basic Latin content</title> 5 | </head> 6 | <body> 7 | Big-endian BOMless UTF-16 with Basic Latin content 8 | <body> 9 | </html> 10 | -------------------------------------------------------------------------------- /universalchardet/tests/bug631751le_text.html: -------------------------------------------------------------------------------- 1 | <!DOCTYPE html> 2 | <html> 3 | <head> 4 | <title>Little-endian BOMless UTF-16 with Basic Latin content</title> 5 | </head> 6 | <body> 7 | Little-endian BOMless UTF-16 with Basic Latin content 8 | <body> 9 | </html> 10 | 
-------------------------------------------------------------------------------- /universalchardet/tests/bug638318_text.html: -------------------------------------------------------------------------------- 1 | aNOT UTF-16!NOT UTF-16! -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-1.text: -------------------------------------------------------------------------------- 1 | Two-byte UTF-8 including the first and last characters in the range: €Шерлок߿ 2 | -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-2.text: -------------------------------------------------------------------------------- 1 | Three byte UTF-8, first byte 0xE0, including first and last characters 2 | in the range: ࠀशर्लक࿿ 3 | 4 | -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-3.text: -------------------------------------------------------------------------------- 1 | Three byte UTF-8, first byte 0xE1-EC, including first and last characters 2 | in the range: ကシャーロック쿿 3 | 4 | -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-4.text: -------------------------------------------------------------------------------- 1 | Three byte UTF-8, first byte 0xED, including first and last characters 2 | in the range: 퀀홈하홈탐퟿ 3 | 4 | -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-5.text: -------------------------------------------------------------------------------- 1 | Three byte UTF-8, first byte 0xEE-EF, including first and last characters 2 | in the range: ﴍﻟﻮﻙ￿ 3 | 4 | -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-6.text: -------------------------------------------------------------------------------- 1 | Four byte 
UTF-8, first byte 0xF0, including first and last characters 2 | in the range: 𐀀𐌲𐌿𐍄𐌹𐍃𐌺 𿿿 3 | 4 | -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-7.text: -------------------------------------------------------------------------------- 1 | Four byte UTF-8, first byte 0xF1-F3, including first and last characters 2 | in the range: 񀀀񠀀 񠀁 񠀂󿿿 3 | 4 | -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-8.text: -------------------------------------------------------------------------------- 1 | Four byte UTF-8, first byte 0xF4, including first and last characters 2 | in the range:􀀀􈀀 􈀁 􈀂􏿿 3 | 4 | -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-9.text: -------------------------------------------------------------------------------- 1 | Four byte UTF-8, first byte 0xF0, including BMP only:𐤔𐤓𐤋𐤅𐤒 2 | 3 | -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-invalid-1.text: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanX2/UniversalDetector/1c8cc1378a7938a2b96e0133e1c76e83030aa03f/universalchardet/tests/bug811363-invalid-1.text -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-invalid-2.text: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanX2/UniversalDetector/1c8cc1378a7938a2b96e0133e1c76e83030aa03f/universalchardet/tests/bug811363-invalid-2.text -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-invalid-3.text: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JanX2/UniversalDetector/1c8cc1378a7938a2b96e0133e1c76e83030aa03f/universalchardet/tests/bug811363-invalid-3.text -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-invalid-4.text: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanX2/UniversalDetector/1c8cc1378a7938a2b96e0133e1c76e83030aa03f/universalchardet/tests/bug811363-invalid-4.text -------------------------------------------------------------------------------- /universalchardet/tests/bug811363-invalid-5.text: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanX2/UniversalDetector/1c8cc1378a7938a2b96e0133e1c76e83030aa03f/universalchardet/tests/bug811363-invalid-5.text -------------------------------------------------------------------------------- /universalchardet/tests/bug9357_text.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanX2/UniversalDetector/1c8cc1378a7938a2b96e0133e1c76e83030aa03f/universalchardet/tests/bug9357_text.html -------------------------------------------------------------------------------- /universalchardet/tests/moz.build: -------------------------------------------------------------------------------- 1 | # -*- Mode: python; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 40 -*- 2 | # vim: set filetype=python: 3 | # This Source Code Form is subject to the terms of the Mozilla Public 4 | # License, v. 2.0. If a copy of the MPL was not distributed with this 5 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
6 | 7 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug171813.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 171813 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 171813 17 |

18 | 20 | 21 |
22 | 
31 | 
32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug306272.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 306272 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 306272 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug421271.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 421271 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 421271 17 |

18 | 20 | 21 |
22 | 
34 | 
35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug426271-euc-jp.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 426271 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 426271 17 |

18 | 20 | 21 |
22 | 
30 | 
31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug426271-utf-8.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 426271 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 426271 17 |

18 | 20 | 21 |
22 | 
34 | 
35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug431054-japanese.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 431054 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 431054 17 |

18 | 20 | 21 |
22 | 
28 | 
29 | 30 | 31 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug431054.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 431054 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 431054 17 |

18 | 20 | 21 |
22 | 
32 | 
33 | 34 | 35 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug488426.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 488426 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 488426 17 |

18 | 20 | 21 |
22 | 
28 | 
29 | 30 | 31 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug547487.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 547487 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 547487 17 |

18 | 20 | 21 |
22 | 
34 | 
35 | 36 | 37 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug620106.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 620106 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 620106 17 |

18 | 20 | 21 |
22 | 
28 | 
29 | 30 | 31 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug631751be.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 631751 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 631751 17 |

18 | 20 | 21 |
22 | 
29 | 
30 | 31 | 32 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug631751le.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 631751 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 631751 17 |

18 | 20 | 21 |
22 | 
29 | 
30 | 31 | 32 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug638318.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 638318 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 638318 17 |

18 | 20 | 21 |
22 | 
29 | 
30 | 31 | 32 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-1-1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-1-2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-1-3.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-1-4.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-1-5.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-2-1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-2-2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-2-3.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-2-4.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-2-5.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-2-6.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-2-7.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-2-8.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug811363-2-9.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 811363 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 811363 17 |

18 | 20 | 21 |
22 | 
33 | 
34 | 35 | 36 | -------------------------------------------------------------------------------- /universalchardet/tests/test_bug9357.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | Test for Bug 9357 8 | 11 | 12 | 14 | 15 | 16 | Mozilla Bug 9357 17 |

18 | 20 | 21 |
22 | 
30 | 
31 | 32 | 33 | 34 | --------------------------------------------------------------------------------