├── .npmrc ├── .npmignore ├── .prettierrc.json ├── src ├── fs │ ├── browser.ts │ └── node.ts ├── test │ └── data │ │ ├── collation.zip │ │ └── encodings │ │ ├── big5 │ │ ├── euc_jp │ │ ├── euc_kr │ │ ├── koi8r │ │ ├── gb18030 │ │ ├── iso88598 │ │ ├── shiftjis │ │ ├── utf16be │ │ ├── utf16le │ │ ├── utf32be │ │ ├── utf32le │ │ ├── iso88592_cs │ │ ├── iso88595_ru │ │ ├── iso88596_ar │ │ ├── iso88597_el │ │ ├── iso88598_he │ │ ├── iso88599_tr │ │ ├── ascii │ │ ├── windows_1250 │ │ ├── windows_1251 │ │ ├── windows_1252 │ │ ├── windows_1253 │ │ ├── windows_1254 │ │ ├── windows_1255 │ │ ├── windows_1256 │ │ ├── lang_chinese │ │ ├── utf8 │ │ ├── lang_russian │ │ ├── lang_greek │ │ ├── lang_japanese │ │ ├── lang_korean │ │ ├── lang_czech │ │ ├── iso2022cn │ │ ├── lang_arabic │ │ ├── lang_hebrew │ │ ├── lang_turkish │ │ ├── iso2022jp │ │ ├── iso88591_en │ │ └── iso2022kr ├── encoding │ ├── utf8.test.ts │ ├── ascii.test.ts │ ├── index.ts │ ├── ascii.ts │ ├── unicode.test.ts │ ├── mbcs.test.ts │ ├── iso2022.test.ts │ ├── utf8.ts │ ├── sbcs.test.ts │ ├── unicode.ts │ ├── iso2022.ts │ ├── mbcs.ts │ └── sbcs.ts ├── utils.ts ├── utils.test.ts ├── match.ts ├── index.test.ts └── index.ts ├── .github └── workflows │ ├── test-build.sh │ ├── build.yml │ ├── release.yml │ ├── test-build.js │ └── test-build.ts ├── .gitignore ├── jest.config.js ├── renovate.json ├── tsconfig.json ├── LICENSE ├── package.json └── README.md /.npmrc: -------------------------------------------------------------------------------- 1 | package-lock=false 2 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | test 2 | scripts 3 | yarn.lock 4 | -------------------------------------------------------------------------------- /.prettierrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "singleQuote": true, 3 | "printWidth": 80 4 | } 5 | 
-------------------------------------------------------------------------------- /src/fs/browser.ts: -------------------------------------------------------------------------------- 1 | export default () => { 2 | throw new Error('File system is not available'); 3 | }; 4 | -------------------------------------------------------------------------------- /src/test/data/collation.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/collation.zip -------------------------------------------------------------------------------- /src/test/data/encodings/big5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/big5 -------------------------------------------------------------------------------- /src/test/data/encodings/euc_jp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/euc_jp -------------------------------------------------------------------------------- /src/test/data/encodings/euc_kr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/euc_kr -------------------------------------------------------------------------------- /src/test/data/encodings/koi8r: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/koi8r -------------------------------------------------------------------------------- /src/test/data/encodings/gb18030: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/gb18030 
-------------------------------------------------------------------------------- /src/test/data/encodings/iso88598: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88598 -------------------------------------------------------------------------------- /src/test/data/encodings/shiftjis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/shiftjis -------------------------------------------------------------------------------- /src/test/data/encodings/utf16be: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/utf16be -------------------------------------------------------------------------------- /src/test/data/encodings/utf16le: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/utf16le -------------------------------------------------------------------------------- /src/test/data/encodings/utf32be: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/utf32be -------------------------------------------------------------------------------- /src/test/data/encodings/utf32le: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/utf32le -------------------------------------------------------------------------------- /src/test/data/encodings/iso88592_cs: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88592_cs -------------------------------------------------------------------------------- /src/test/data/encodings/iso88595_ru: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88595_ru -------------------------------------------------------------------------------- /src/test/data/encodings/iso88596_ar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88596_ar -------------------------------------------------------------------------------- /src/test/data/encodings/iso88597_el: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88597_el -------------------------------------------------------------------------------- /src/test/data/encodings/iso88598_he: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88598_he -------------------------------------------------------------------------------- /src/test/data/encodings/iso88599_tr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88599_tr -------------------------------------------------------------------------------- /src/test/data/encodings/ascii: -------------------------------------------------------------------------------- 1 | !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ -------------------------------------------------------------------------------- /src/test/data/encodings/windows_1250: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1250 -------------------------------------------------------------------------------- /src/test/data/encodings/windows_1251: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1251 -------------------------------------------------------------------------------- /src/test/data/encodings/windows_1252: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1252 -------------------------------------------------------------------------------- /src/test/data/encodings/windows_1253: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1253 -------------------------------------------------------------------------------- /src/test/data/encodings/windows_1254: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1254 -------------------------------------------------------------------------------- /src/test/data/encodings/windows_1255: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1255 -------------------------------------------------------------------------------- /src/test/data/encodings/windows_1256: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1256 
-------------------------------------------------------------------------------- /.github/workflows/test-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -ex 2 | 3 | node ./.github/workflows/test-build.js 4 | npx ts-node ./.github/workflows/test-build.ts 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | testing.js 2 | .DS_Store 3 | node_modules 4 | coverage 5 | npm-debug.log 6 | lib 7 | TODO.md 8 | package-lock.json 9 | .vscode 10 | -------------------------------------------------------------------------------- /src/encoding/utf8.test.ts: -------------------------------------------------------------------------------- 1 | import * as chardet from '..'; 2 | 3 | describe('UTF-8', () => { 4 | it('should return UTF-8', () => { 5 | expect( 6 | chardet.detectFileSync(__dirname + '/../test/data/encodings/utf8'), 7 | ).toBe('UTF-8'); 8 | }); 9 | }); 10 | -------------------------------------------------------------------------------- /src/encoding/ascii.test.ts: -------------------------------------------------------------------------------- 1 | import * as chardet from '..'; 2 | 3 | describe('ASCII', () => { 4 | it('should return ASCII', () => { 5 | expect( 6 | chardet.detectFileSync(__dirname + '/../test/data/encodings/ascii'), 7 | ).toBe('ASCII'); 8 | }); 9 | }); 10 | -------------------------------------------------------------------------------- /src/fs/node.ts: -------------------------------------------------------------------------------- 1 | let fsModule: any; 2 | 3 | export default () => { 4 | if (typeof module === 'object' && typeof module.exports === 'object') { 5 | fsModule = fsModule ? 
fsModule : require('fs'); 6 | return fsModule; 7 | } 8 | throw new Error('File system is not available'); 9 | }; 10 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | // Jest configuration: TypeScript tests under src/ are compiled by ts-jest. 2 | module.exports = { 3 | testEnvironment: 'node', 4 | // Dots are escaped so that only files named *.test.ts are picked up. 5 | testRegex: '.*\\.test\\.ts$', 6 | // Compile .ts (and .tsx) sources with ts-jest. 7 | transform: { '^.+\\.tsx?$': 'ts-jest' }, 8 | moduleFileExtensions: ['ts', 'js', 'json'], 9 | rootDir: 'src', 10 | collectCoverage: true, 11 | // Resolved against rootDir, so reports land in a coverage directory next to src. 12 | coverageDirectory: '<rootDir>/../coverage', 13 | }; 14 | -------------------------------------------------------------------------------- /src/utils.ts: -------------------------------------------------------------------------------- 1 | // Loose structural check for byte-array-like input: accepts any non-null 2 | // object whose `length` is finite and non-negative (e.g. Buffer, Uint8Array, 3 | // plain arrays). May also check if every element is a number <= 255 but 4 | // it is a little bit slower. 5 | export const isByteArray = (input: any): input is Uint8Array => { 6 | if (input == null || typeof input !== 'object') return false; 7 | 8 | return isFinite(input.length) && input.length >= 0; 9 | }; 10 | -------------------------------------------------------------------------------- /src/test/data/encodings/lang_chinese: -------------------------------------------------------------------------------- 1 | 政府資訊科技總監辦公室和平等機會委員會合辦無障礙網頁嘉許計劃,希望透過表彰採用無障礙網頁設計的網站,推動更多企業和機構在其網站採用無障礙網頁設計,讓社會各階層包括殘疾人士更方便地獲取網上資訊和使用網上服務。無障礙網頁嘉許計劃頒獎典禮將於2013年4月15日舉行,為首次舉辦的「國際IT匯」的精彩活動之一。有關詳情,請瀏覽這裡。 2 | 政府一向致力推動長者在生活上更廣泛應用資訊科技。政府資訊科技總監辦公室已開展一項全港性嘉許計劃「智醒長者嘉許計劃」,表揚在日常生活中積極使用資訊及通訊科技的長者,以鼓勵他們繼續使用資訊及通訊科技。嘉許計劃設有金、銀、銅獎,長者於特定期間完成指定要求,可獲頒贈嘉許證書及獎牌。有關詳情,請瀏覽這裡。 -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["config:base"], 3 | "dependencyDashboard": false, 4 | "schedule": "on the first day of the month", 5 | "packageRules": [ 6 | { 7 | "updateTypes": ["minor", "patch", "pin", "digest"], 8 | "automerge": true 9 | }, 10 | { 11 | 
"depTypeList": ["devDependencies"], 12 | "automerge": true 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /src/encoding/index.ts: -------------------------------------------------------------------------------- 1 | import type { EncodingName, Match } from '../match'; 2 | 3 | export interface Recogniser { 4 | match(input: Context): Match | null; 5 | name(input?: Context): EncodingName; 6 | language?(): string | undefined; 7 | } 8 | 9 | export interface Context { 10 | byteStats: number[]; 11 | c1Bytes: boolean; 12 | rawInput: Uint8Array; 13 | rawLen: number; 14 | inputBytes: Uint8Array; 15 | inputLen: number; 16 | } 17 | -------------------------------------------------------------------------------- /src/encoding/ascii.ts: -------------------------------------------------------------------------------- 1 | import type { Context, Recogniser } from '.'; 2 | import match, { type EncodingName, type Match } from '../match'; 3 | 4 | export default class Ascii implements Recogniser { 5 | name(): EncodingName { 6 | return 'ASCII'; 7 | } 8 | 9 | match(det: Context): Match | null { 10 | const input = det.rawInput; 11 | 12 | for (let i = 0; i < det.rawLen; i++) { 13 | const b = input[i]; 14 | if (b < 32 || b > 126) { 15 | return match(det, this, 0); 16 | } 17 | } 18 | 19 | return match(det, this, 100); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "forceConsistentCasingInFileNames": true, 4 | "outDir": "lib", 5 | "rootDir": "src", 6 | "allowJs": false, 7 | "allowSyntheticDefaultImports": true, 8 | "declaration": true, 9 | "diagnostics": true, 10 | "esModuleInterop": true, 11 | "extendedDiagnostics": false, 12 | "listEmittedFiles": true, 13 | "module": "commonjs", 14 | "removeComments": true, 15 | "sourceMap": true, 16 | "strict": true, 
17 | "target": "ES2019", 18 | "noUnusedLocals": true, 19 | "noUnusedParameters": true 20 | }, 21 | "exclude": ["node_modules", "**/*.spec.ts", "**/*.test.ts", "__mocks__", "lib"] 22 | } 23 | -------------------------------------------------------------------------------- /src/encoding/unicode.test.ts: -------------------------------------------------------------------------------- 1 | import * as chardet from '..'; 2 | 3 | describe('Unicode', () => { 4 | const base = __dirname + '/../test/data/encodings'; 5 | 6 | it('should return UTF-16LE', () => { 7 | expect(chardet.detectFileSync(base + '/utf16le')).toBe('UTF-16LE'); 8 | }); 9 | 10 | it('should return UTF-16BE', () => { 11 | expect(chardet.detectFileSync(base + '/utf16be')).toBe('UTF-16BE'); 12 | }); 13 | 14 | it('should return UTF-32LE', () => { 15 | expect(chardet.detectFileSync(base + '/utf32le')).toBe('UTF-32LE'); 16 | }); 17 | 18 | it('should return UTF-32BE', () => { 19 | expect(chardet.detectFileSync(base + '/utf32be')).toBe('UTF-32BE'); 20 | }); 21 | }); 22 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: ["*"] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | node-version: [18.x, 20.x, 22.x] 16 | 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v5 20 | with: 21 | fetch-depth: 0 22 | - name: Use Node.js ${{ matrix.node-version }} 23 | uses: actions/setup-node@v6 24 | with: 25 | node-version: ${{ matrix.node-version }} 26 | - run: npm i 27 | - run: npm test 28 | - run: npm run build 29 | - run: .github/workflows/test-build.sh 30 | -------------------------------------------------------------------------------- /src/test/data/encodings/utf8: 
-------------------------------------------------------------------------------- 1 | Euro Symbol: €. 2 | Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα. 3 | Íslenska / Icelandic: Ég get etið gler án þess að meiða mig. 4 | Polish: Mogę jeść szkło, i mi nie szkodzi. 5 | Romanian: Pot să mănânc sticlă și ea nu mă rănește. 6 | Ukrainian: Я можу їсти шкло, й воно мені не пошкодить. 7 | Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։ 8 | Georgian: მინას ვჭამ და არა მტკივა. 9 | Hindi: मैं काँच खा सकता हूँ, मुझे उस से कोई पीडा नहीं होती. 10 | Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי. 11 | Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ. 12 | Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني. 13 | Japanese: 私はガラスを食べられます。それは私を傷つけません。 14 | Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | push: 4 | branches: 5 | - master 6 | jobs: 7 | release: 8 | name: Release 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout 12 | uses: actions/checkout@v5 13 | with: 14 | fetch-depth: 0 15 | - name: Setup Node.js 16 | uses: actions/setup-node@v6 17 | with: 18 | node-version: 24 19 | - name: Install dependencies 20 | run: npm i 21 | - name: Build module 22 | run: npm run build 23 | - name: Release 24 | env: 25 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 26 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }} 27 | run: npx semantic-release 28 | -------------------------------------------------------------------------------- /src/utils.test.ts: -------------------------------------------------------------------------------- 1 | import { isByteArray } from './utils'; 2 | 3 | describe('isByteArray', () => { 4 | test('positives', () => { 5 | expect(isByteArray(Buffer.from('hello'))).toBe(true); 6 | expect(isByteArray(new Uint8Array(0))).toBe(true); 7 | 
expect(isByteArray(new Uint8Array(1))).toBe(true); 8 | expect(isByteArray([])).toBe(true); 9 | expect(isByteArray([1])).toBe(true); 10 | }); 11 | 12 | test('negatives', () => { 13 | expect(isByteArray(null)).toBe(false); 14 | expect(isByteArray('')).toBe(false); 15 | expect(isByteArray('hello')).toBe(false); 16 | expect(isByteArray(123)).toBe(false); 17 | expect(isByteArray('123')).toBe(false); 18 | expect(isByteArray({})).toBe(false); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /src/encoding/mbcs.test.ts: -------------------------------------------------------------------------------- 1 | import * as chardet from '..'; 2 | 3 | describe('Multibyte Character Sets', () => { 4 | const base = __dirname + '/../test/data/encodings'; 5 | 6 | it('should return Shift_JIS', () => { 7 | expect(chardet.detectFileSync(base + '/shiftjis')).toBe('Shift_JIS'); 8 | }); 9 | 10 | it('should return GB18030', () => { 11 | expect(chardet.detectFileSync(base + '/gb18030')).toBe('GB18030'); 12 | }); 13 | 14 | it('should return Big5', () => { 15 | expect(chardet.detectFileSync(base + '/big5')).toBe('Big5'); 16 | }); 17 | 18 | it('should return EUC-JP', () => { 19 | expect(chardet.detectFileSync(base + '/euc_jp')).toBe('EUC-JP'); 20 | }); 21 | 22 | it('should return EUC-KR', () => { 23 | expect(chardet.detectFileSync(base + '/euc_kr')).toBe('EUC-KR'); 24 | }); 25 | }); 26 | -------------------------------------------------------------------------------- /src/encoding/iso2022.test.ts: -------------------------------------------------------------------------------- 1 | import * as chardet from '..'; 2 | import fs from 'fs'; 3 | import path from 'path'; 4 | 5 | describe('ISO-2022', () => { 6 | const base = __dirname + '/../test/data/encodings'; 7 | 8 | const analyse = (asset: string) => 9 | chardet.analyse(fs.readFileSync(path.join(base, asset))).shift(); 10 | 11 | it('should return ISO-2022-JP', () => { 12 | 
expect(analyse('iso2022jp')).toEqual({ 13 | confidence: 100, 14 | lang: 'ja', 15 | name: 'ISO-2022-JP', 16 | }); 17 | }); 18 | 19 | it('should return ISO-2022-KR', () => { 20 | expect(analyse('iso2022kr')).toEqual({ 21 | confidence: 100, 22 | lang: 'kr', 23 | name: 'ISO-2022-KR', 24 | }); 25 | }); 26 | 27 | it('should return ISO-2022-CN', () => { 28 | expect(analyse('iso2022cn')).toEqual({ 29 | confidence: 100, 30 | lang: 'zh', 31 | name: 'ISO-2022-CN', 32 | }); 33 | }); 34 | }); 35 | -------------------------------------------------------------------------------- /.github/workflows/test-build.js: -------------------------------------------------------------------------------- 1 | const assert = require('assert'); 2 | 3 | const chardet = require(process.cwd()); 4 | 5 | assert(typeof chardet.analyse, 'function'); 6 | assert(typeof chardet.detect, 'function'); 7 | assert(typeof chardet.detectFile, 'function'); 8 | assert(typeof chardet.detectFileSync, 'function'); 9 | 10 | assert.deepStrictEqual(chardet.analyse(Buffer.from('This is a test')), [ 11 | { confidence: 100, name: 'ASCII', lang: undefined }, 12 | { confidence: 98, name: 'ISO-8859-1', lang: 'en' }, 13 | { confidence: 98, name: 'ISO-8859-2', lang: 'hu' }, 14 | { confidence: 10, name: 'UTF-8', lang: undefined }, 15 | { confidence: 10, name: 'Shift_JIS', lang: 'ja' }, 16 | { confidence: 10, name: 'Big5', lang: 'zh' }, 17 | { confidence: 10, name: 'EUC-JP', lang: 'ja' }, 18 | { confidence: 10, name: 'EUC-KR', lang: 'ko' }, 19 | { confidence: 10, name: 'GB18030', lang: 'zh' }, 20 | ]); 21 | 22 | console.log(' > test-build.js OK'); 23 | -------------------------------------------------------------------------------- /src/test/data/encodings/lang_russian: -------------------------------------------------------------------------------- 1 | Первомай в современном виде возник в конце XIX века в рабочем движении, выдвинувшем в качестве одного из основных требований введение восьмичасового рабочего дня. 
1 мая 1886 года социалистические, коммунистические и анархические организации США и Канады устроили ряд митингов и демонстраций. При разгоне такой демонстрации в Чикаго 4 мая погибло шесть демонстрантов. В ходе последовавших за этим массовых выступлений протеста против жестоких действий полиции в результате взрыва бомбы последовавшей перестрелке было убито восемь полицейских и минимум четверо рабочих (по некоторым данным, до пятидесяти убитых и раненых[2]), несколько десятков человек получили ранения. По обвинению в организации взрыва четверо рабочих-анархистов были приговорены к повешению (впоследствии было доказано, что обвинение было ложным)[3]. Именно в память о казнённых Парижский конгресс II Интернационала (июль 1889) объявил 1 мая Днём солидарности рабочих всего мира и предложил ежегодно отмечать его демонстрациями с социальными требованиями. -------------------------------------------------------------------------------- /src/test/data/encodings/lang_greek: -------------------------------------------------------------------------------- 1 | Η Λαϊκή ή Δημώδης Λατινική (λατ. sermo vulgaris) είναι ένας όρος-ομπρέλα, ο οποίος καλύπτει τις διαλέκτους τής λατινικής γλώσσας που ομιλούνταν κυρίως στις δυτικές επαρχίες τής Ρωμαϊκής Αυτοκρατορίας, μέχρις ότου αυτές οι διάλεκτοι, αποκλίνοντας ακόμη περισσότερο, εξελίχθηκαν στις πρώιμες ρομανικές γλώσσες κατά τον 9ο αιώνα. 2 | Η ομιλουμένη Λατινική διέφερε από τη λογοτεχνική κλασική Λατινική στην προφορά, το λεξιλόγιο και τη γραμματική. Κάποια χαρακτηριστικά της δημώδους Λατινικής δεν εμφανίστηκαν παρά στην ύστερη Αυτοκρατορία. Άλλα χαρακτηριστικά της υπήρχαν πιθανόν στην ομιλουμένη Λατινική, τουλάχιστον στις πρωτογενείς μορφές τους, πολύ νωρίτερα. Οι περισσότεροι ορισμοί τής δημώδους Λατινικής την παρουσιάζουν ως προφορική παρά ως γραπτή γλώσσα, επειδή οι μαρτυρίες οδηγούν στο συμπέρασμα ότι η ομιλουμένη Λατινική διασπάστηκε σε αποκλίνουσες διαλέκτους αυτή την περίοδο. 
Επειδή κανείς τότε δεν μετέγραψε φωνητικά την καθημερινή ομιλία των Λατίνων, οι μελετητές τής λαϊκής Λατινικής πρέπει να χρησιμοποιούν έμμεσες μεθόδους. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2024 Dmitry Shirokov 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /.github/workflows/test-build.ts: -------------------------------------------------------------------------------- 1 | // Smoke test for the built package: loads it with a dynamic import() from the 2 | // current working directory and checks the public API and a sample analysis. 3 | import assert from 'assert'; 4 | 5 | const main = async () => { 6 | const chardet = await import(process.cwd()); 7 | 8 | // assert(value, message) only tests truthiness, and `typeof x` is always a 9 | // non-empty string, so compare the type name explicitly. 10 | assert.strictEqual(typeof chardet.analyse, 'function'); 11 | assert.strictEqual(typeof chardet.detect, 'function'); 12 | assert.strictEqual(typeof chardet.detectFile, 'function'); 13 | assert.strictEqual(typeof chardet.detectFileSync, 'function'); 14 | 15 | assert.deepStrictEqual(chardet.analyse(Buffer.from('This is a test')), [ 16 | { confidence: 100, name: 'ASCII', lang: undefined }, 17 | { confidence: 98, name: 'ISO-8859-1', lang: 'en' }, 18 | { confidence: 98, name: 'ISO-8859-2', lang: 'hu' }, 19 | { confidence: 10, name: 'UTF-8', lang: undefined }, 20 | { confidence: 10, name: 'Shift_JIS', lang: 'ja' }, 21 | { confidence: 10, name: 'Big5', lang: 'zh' }, 22 | { confidence: 10, name: 'EUC-JP', lang: 'ja' }, 23 | { confidence: 10, name: 'EUC-KR', lang: 'ko' }, 24 | { confidence: 10, name: 'GB18030', lang: 'zh' }, 25 | ]); 26 | }; 27 | 28 | main() 29 | .then(() => console.log(' > test-build.ts OK')) 30 | .catch((err) => { 31 | console.error(err); 32 | process.exit(1); 33 | }); 34 | -------------------------------------------------------------------------------- /src/match.ts: -------------------------------------------------------------------------------- 1 | import { Context, Recogniser } from './encoding'; 2 | 3 | export type EncodingName = 4 | | 'ASCII' 5 | | 'Big5' 6 | | 'EUC-JP' 7 | | 'EUC-KR' 8 | | 'GB18030' 9 | | 'ISO_2022' // TODO: Use hyphen 10 | | 'ISO-2022-CN' 11 | | 'ISO-2022-JP' 12 | | 'ISO-2022-KR' 13 | | 'ISO-8859-1' 14 | | 'ISO-8859-2' 15 | | 'ISO-8859-5' 16 | | 'ISO-8859-6' 17 | | 'ISO-8859-7' 18 | | 'ISO-8859-8' 19 | | 'ISO-8859-9' 20 | | 'ISO-8859-9' 21 | | 'KOI8-R' 22 | | 'mbcs' 23 | | 'sbcs' 24 | | 'Shift_JIS' // TODO: Use hyphen 25 | | 'UTF-16BE' 26 | | 'UTF-16LE' 27 | | 
'UTF-32' 28 | | 'UTF-32BE' 29 | | 'UTF-32LE' 30 | | 'UTF-8' 31 | | 'windows-1250' 32 | | 'windows-1251' 33 | | 'windows-1252' 34 | | 'windows-1253' 35 | | 'windows-1254' 36 | | 'windows-1254' 37 | | 'windows-1255' 38 | | 'windows-1256' 39 | 40 | export interface Match { 41 | confidence: number; 42 | name: EncodingName; 43 | lang?: string; 44 | } 45 | 46 | export default (ctx: Context, rec: Recogniser, confidence: number): Match => ({ 47 | confidence, 48 | name: rec.name(ctx), 49 | lang: rec.language ? rec.language() : undefined, 50 | }); 51 | -------------------------------------------------------------------------------- /src/test/data/encodings/lang_japanese: -------------------------------------------------------------------------------- 1 | コンピューターは、本質的には数字しか扱うことができません。コンピューターは、文字や記号などのそれぞれに番号を割り振ることによって扱えるようにします。ユニコードが出来るまでは、これらの番号を割り振る仕組みが何百種類も存在しました。どの一つをとっても、十分な文字を含んではいませんでした。例えば、欧州連合一つを見ても、そのすべての言語をカバーするためには、いくつかの異なる符号化の仕組みが必要でした。英語のような一つの言語に限っても、一つだけの符号化の仕組みでは、一般的に使われるすべての文字、句読点、技術的な記号などを扱うには不十分でした。 2 | 3 | これらの符号化の仕組みは、相互に矛盾するものでもありました。二つの異なる符号化の仕組みが、二つの異なる文字に同一の番号を付けることもできるし、同じ文字に異なる番号を付けることもできるのです。どのようなコンピューターも(特にサーバーは)多くの異なった符号化の仕組みをサポートする必要があります。たとえデータが異なる符号化の仕組みやプラットフォームを通過しても、いつどこでデータが乱れるか分からない危険を冒すことのなるのです。 4 | ユニコードはすべてを変えます 5 | 6 | ユニコードは、プラットフォームに係わらず、プログラムに係わらず、言語に係わらず、すべての文字に独立した番号を与えます。ユニコード標準は、アップル、ヒューレットパッカード、IBM、ジャストシステム、マイクロソフト、オラクル、SAP、サン、サイベースなどの産業界の主導的企業と他の多くの企業に採用されています。ユニコードは、XML、Java、ECMAScript(JavaScript)、LDAP、CORBA 3.0などの最先端の標準の前提となっており、ユニコードを実装すれば、ISO/IEC 10646に適合することになります。ユニコードは、多くのオペレーティングシステムとすべての最新のブラウザーと他の多くの製品でサポートされています。ユニコード標準の出現とユニコードをサポートするツール類は、昨今顕著になっているソフトウエア技術のグローバル化の流れに対して、特に役に立っています。 7 | 8 | ユニコードをクライアントサーバー型のアプリケーションや、多層構造を持つアプリケーション、ウェブサイトなどにに組み込むことで、従来の文字コードセットを用いるよりも明らかなコスト削減が可能です。ユニコードは、単一のソフトウエア製品、単一のウェブサイトに、何ら手を加えることなく、複数のプラットフォーム、複数の言語、複数の国をカバーすることが出来るのです。ユニコードは、データが多くの異なるシステムの間を、何の乱れもなしに転送することを可能とするのです。 9 | ユニコードコンソーシアムについて 10 | 11 | 
ユニコードコンソーシアムは、最新のソフトウエア製品と標準においてテキストを表現することを意味する“ユニコード標準”の構築、発展、普及、利用促進を目的として設立された非営利組織です。同コンソーシアムの会員は、コンピューターと情報処理に係わる広汎な企業や組織から構成されています。同コンソーシアムは、財政的には、純粋に会費のみによって運営されています。ユニコード標準を支持し、その拡張と実装を支援する世界中の組織や個人は、だれもがユニコードコンソーシアムの会員なることができます。 12 | -------------------------------------------------------------------------------- /src/test/data/encodings/lang_korean: -------------------------------------------------------------------------------- 1 | 기본적으로 컴퓨터는 숫자만 처리합니다. 글자나 다른 문자에도 숫자를 지정하여 저장합니다. 유니코드가 개발되기 전에는 이러한 숫자를 지정하기 위해 수백 가지의 다른 기호화 시스템을 사용했습니다. 단일 기호화 방법으로는 모든 문자를 포함할 수 없었습니다. 예를 들어 유럽 연합에서만 보더라도 모든 각 나라별 언어를 처리하려면 여러 개의 다른 기호화 방법이 필요합니다. 영어와 같은 단일 언어의 경우도 공통적으로 사용되는 모든 글자, 문장 부호 및 테크니컬 기호에 맞는 단일 기호화 방법을 갖고 있지 못하였습니다. 2 | 3 | 이러한 기호화 시스템은 또한 다른 기호화 시스템과 충돌합니다. 즉 두 가지 기호화 방법이 두 개의 다른 문자에 대해 같은 번호를 사용하거나 같은 문자에 대해 다른 번호를 사용할 수 있습니다. 주어진 모든 컴퓨터(특히 서버)는 서로 다른 여러 가지 기호화 방법을 지원해야 합니다. 그러나, 데이터를 서로 다른 기호화 방법이나 플랫폼 간에 전달할 때마다 그 데이터는 항상 손상의 위험을 겪게 됩니다. 4 | 유니코드로 모든 것을 해결할 수 있습니다! 5 | 6 | 유니코드는 사용 중인 플랫폼, 프로그램, 언어에 관계없이 문자마다 고유한 숫자를 제공합니다. 유니코드 표준은 Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys 및 기타 여러 회사와 같은 업계 선두주자에 의해 채택되었습니다. 유니코드는 XML, Java, ECMAScript(JavaScript), LDAP, CORBA 3.0, WML 등과 같이 현재 널리 사용되는 표준에서 필요하며 이는 ISO/IEC 10646을 구현하는 공식적인 방법입니다. 이는 많은 운영 체제, 요즘 사용되는 모든 브라우저 및 기타 많은 제품에서 지원됩니다. 유니코드 표준의 부상과 이를 지원하는 도구의 가용성은 최근 전 세계에 불고 있는 기술 경향에서 가장 중요한 부분을 차지하고 있습니다. 7 | 8 | 유니코드를 클라이언트-서버 또는 다중-연결 응용 프로그램과 웹 사이트에 통합하면 레거시 문자 세트 사용에 있어서 상당한 비용 절감 효과가 나타납니다. 유니코드를 통해 리엔지니어링 없이 다중 플랫폼, 언어 및 국가 간에 단일 소프트웨어 플랫폼 또는 단일 웹 사이트를 목표로 삼을 수 있습니다. 이를 사용하면 데이터를 손상 없이 여러 시스템을 통해 전송할 수 있습니다. 9 | 유니코드 콘소시엄에 대해 10 | 11 | 유니코드 콘소시엄은 비영리 조직으로서 현대 소프트웨어 제품과 표준에서 텍스트의 표현을 지정하는 유니코드 표준의 사용을 개발하고 확장하며 장려하기 위해 세워졌습니다. 콘소시엄 멤버쉽은 컴퓨터와 정보 처리 산업에 종사하고 있는 광범위한 회사 및 조직의 범위를 나타냅니다. 콘소시엄의 재정은 전적으로 회비에 의해 충당됩니다. 유니코드 컨소시엄에서의 멤버쉽은 전 세계 어느 곳에서나 유니코드 표준을 지원하고 그 확장과 구현을 지원하고자하는 조직과 개인에게 개방되어 있습니다. 12 | 13 | 더 자세한 내용은 용어집, 예제 유니코드 사용 가능 제품, 기술 정보 및 기타 유용한 정보를 참조하십시오. 
14 | -------------------------------------------------------------------------------- /src/test/data/encodings/lang_czech: -------------------------------------------------------------------------------- 1 | Velký a Malý Tisý je národní přírodní rezervace ev. č. 498 poblíž města Lomnice nad Lužnicí v okrese Jindřichův Hradec ležící na území CHKO Třeboňsko. Řadí se mezi nejvýznamnější rybniční rezervace v Česku a je významná rozsáhlým litorálním porostem na březích rybníků. Oblast spravuje AOPK ČR Správa CHKO Třeboňsko a je evidována i v rámci světové organizace UNESCO jako biosférická rezervace, Natura 2000 a další. Důvodem ochrany je jedna z nejvýznamnějších ornitologických rezervací v Česku. Význam má i z pohledu entomologického. 2 | Součástí rezervace je 11 větších rybníků, mimo jiné i dvojice rybníků Velký a Malý Tisý, které daly lokalitě název. Pro rybníky v rezervaci je charakteristické, že mají velmi členité pobřeží tvořené zarostlými břehy, zátokami, poloostrovy a ostrůvky. Na břehy volně navazují podmáčené louky, lesy, vřesoviště a pole. Vlivem rozmanitosti různých stanovišť se zde nachází bohatá řada druhů z flory i fauny, které zde sídlí. Hlavně ptactvo využívá lokalitu jako důležitou migrační zastávku či shromaždiště před pravidelnými tahy. 3 | I přes to, že je lokalita po desetiletí chráněna, došlo nevhodnými hospodářskými zásahy v podobě nadměrného chovu ryb od 50. let 20. století k postupné degradaci a ústupu litorálních porostů. Od 90. let 20. století se ochranáři snaží snižováním počtu nasazovaných ryb a změnou jejich druhové skladby společně s vodohospodářskými zásahy do výšky vodní hladiny rybníku Velký Tisý podpořit rozvoj rákosových porostů. Výsledky těchto opatření ukázaly, že na obnovu porostů by i za vhodných podmínek byla potřeba doba dosahující až desítek let. 
-------------------------------------------------------------------------------- /src/test/data/encodings/iso2022cn: -------------------------------------------------------------------------------- 1 | $)A;y1>IO#,MPhR*:C<8VV2;M,5D1`Bk@40|@(KySP5DSoQT!#<4J9JG5%R;VVSoQT#,@}HgS"So#,R2C;SPDDR;8v1`Bk?IRTJJSCSZKySP5DWVD8#,1j5c7{:E#,:M3#SC5D<MJGK5#,A=VV1`Bk?ID\J9SCO`M,5DJ}WV4z1mA=8v2;M,5DWV7{#,;rJ9SC2;M,5DJ}WV4z1mO`M,5DWV7{!#HN:NR;L(LX6(5D]M(9}2;M,5D1`Bk;rF=L(V.]W\;aSPKp;55DN#OU!# 4 | Unicode$)AU}TZ8D1dKySPUbR;GP#! 5 | 6 | Unicode$)A8xC?8vWV7{La9)AKR;8vN(R;5DJ}WV#,2;B[JGJ2C4F=L(#,2;B[JGJ2C43LPr#,2;B[J2C4SoQT!#Unicode1jW-1;UbP)9$R5=g5DAl5!#WnPB5D1jW<6_5D4fTZ#,JG=|@4H+GrHm<~<=a:O#,1HJ9SC4+M3WV7{D\9;9a4)6`8vF=L(#,SoQT:M9z]4+Jd5=Pm6`2;M,5DO5M3#,6xN^Kp;5!# 9 | $)A9XSZUnicodeQ'JuQ';a 10 | 11 | Unicode$)AQ'JuQ';aJGR;8v7GS/@{5DWiV/#,JGN*7"U9#,@)U9:MMF9cJ9SCUnicode1jW<6x=(A"5D#,UnicodeQ'JuQ';aIhA"AKOV4zHm<~2zF7:M1jW5D1mJ>7(!#Q'JuQ';a5D;aT14z1mAK9c7:AlSr5D:MWiV/!#Q'JuQ';aV;SI;aT1La9)WJ=p!#UnicodeQ'JuQ';a5D;aT1WJ8q?*7E8xJ@=gIOHN:NV'3VUnicode1jW<:MO#M{P-VzFd@)U9:MV4PP5DWiV/<08vHK!# 12 | 13 | $)AS{V*8|6`PEO"#,Gk2NTDJuSo4J;c1m#,Unicode2zF7Qy1>#,<=4" 31 | }, 32 | "readmeFilename": "README.md", 33 | "directories": { 34 | "test": "test" 35 | }, 36 | "devDependencies": { 37 | "@types/jest": "^30.0.0", 38 | "@types/node": "^24.0.0", 39 | "jest": "^30.0.0", 40 | "prettier": "^3.0.0", 41 | "semantic-release": "^25.0.0", 42 | "ts-jest": "^29.0.0", 43 | "ts-node": "^10.9.1", 44 | "typescript": "^5.0.0" 45 | }, 46 | "keywords": [ 47 | "encoding", 48 | "character", 49 | "utf8", 50 | "detector", 51 | "chardet", 52 | "icu", 53 | "character detection", 54 | "character encoding", 55 | "language", 56 | "iconv", 57 | "iconv-light", 58 | "UTF-8", 59 | "UTF-16", 60 | "UTF-32", 61 | "ISO-2022-JP", 62 | "ISO-2022-KR", 63 | "ISO-2022-CN", 64 | "Shift_JIS", 65 | "Big5", 66 | "EUC-JP", 67 | "EUC-KR", 68 | "GB18030", 69 | "ISO-8859-1", 70 | "ISO-8859-2", 71 | "ISO-8859-5", 72 | "ISO-8859-6", 73 | "ISO-8859-7", 
74 | "ISO-8859-8", 75 | "ISO-8859-9", 76 | "windows-1250", 77 | "windows-1251", 78 | "windows-1252", 79 | "windows-1253", 80 | "windows-1254", 81 | "windows-1255", 82 | "windows-1256", 83 | "KOI8-R" 84 | ], 85 | "author": "Dmitry Shirokov ", 86 | "contributors": [ 87 | "@spikying", 88 | "@wtgtybhertgeghgtwtg", 89 | "@suisho", 90 | "@seangarner", 91 | "@zevanty" 92 | ], 93 | "browser": { 94 | "./lib/fs/node.js": "./lib/fs/browser.js" 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/encoding/utf8.ts: -------------------------------------------------------------------------------- 1 | import type { Context, Recogniser } from '.'; 2 | import match, { type EncodingName, type Match } from '../match'; 3 | 4 | export default class Utf8 implements Recogniser { 5 | name(): EncodingName { 6 | return 'UTF-8'; 7 | } 8 | 9 | match(det: Context): Match | null { 10 | let hasBOM = false, 11 | numValid = 0, 12 | numInvalid = 0, 13 | trailBytes = 0, 14 | confidence; 15 | const input = det.rawInput; 16 | 17 | if ( 18 | det.rawLen >= 3 && 19 | (input[0] & 0xff) == 0xef && 20 | (input[1] & 0xff) == 0xbb && 21 | (input[2] & 0xff) == 0xbf 22 | ) { 23 | hasBOM = true; 24 | } 25 | 26 | // Scan for multi-byte sequences 27 | for (let i = 0; i < det.rawLen; i++) { 28 | const b = input[i]; 29 | if ((b & 0x80) == 0) continue; // ASCII 30 | 31 | // Hi bit on char found. 
Figure out how long the sequence should be 32 | if ((b & 0x0e0) == 0x0c0) { 33 | trailBytes = 1; 34 | } else if ((b & 0x0f0) == 0x0e0) { 35 | trailBytes = 2; 36 | } else if ((b & 0x0f8) == 0xf0) { 37 | trailBytes = 3; 38 | } else { 39 | numInvalid++; 40 | if (numInvalid > 5) break; 41 | trailBytes = 0; 42 | } 43 | 44 | // Verify that we've got the right number of trail bytes in the sequence 45 | for (;;) { 46 | i++; 47 | if (i >= det.rawLen) break; 48 | 49 | if ((input[i] & 0xc0) != 0x080) { 50 | numInvalid++; 51 | break; 52 | } 53 | if (--trailBytes == 0) { 54 | numValid++; 55 | break; 56 | } 57 | } 58 | } 59 | 60 | // Cook up some sort of confidence score, based on presence of a BOM 61 | // and the existence of valid and/or invalid multi-byte sequences. 62 | confidence = 0; 63 | if (hasBOM && numInvalid == 0) confidence = 100; 64 | else if (hasBOM && numValid > numInvalid * 10) confidence = 80; 65 | else if (numValid > 3 && numInvalid == 0) confidence = 100; 66 | else if (numValid > 0 && numInvalid == 0) confidence = 80; 67 | else if (numValid == 0 && numInvalid == 0) 68 | // Plain ASCII. 69 | confidence = 10; 70 | else if (numValid > numInvalid * 10) 71 | // Probably corrupt utf-8 data. Valid sequences aren't likely by chance. 
72 | confidence = 25; 73 | else return null; 74 | 75 | return match(det, this, confidence); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/encoding/sbcs.test.ts: -------------------------------------------------------------------------------- 1 | import * as chardet from '..'; 2 | 3 | describe('Singlebyte Character Sets', () => { 4 | const base = __dirname + '/../test/data/encodings'; 5 | 6 | it('should return ISO-8859-1 (English)', () => { 7 | expect(chardet.detectFileSync(base + '/iso88591_en')).toBe('ISO-8859-1'); 8 | }); 9 | 10 | it('should return ISO-8859-2 (Czech)', () => { 11 | expect(chardet.detectFileSync(base + '/iso88592_cs')).toBe('ISO-8859-2'); 12 | }); 13 | 14 | test.todo('should return ISO-8859-3'); 15 | test.todo('should return ISO-8859-4'); 16 | 17 | it('should return ISO-8859-5 (Russian)', () => { 18 | expect(chardet.detectFileSync(base + '/iso88595_ru')).toBe('ISO-8859-5'); 19 | }); 20 | 21 | it('should return ISO-8859-6 (Arabic)', () => { 22 | expect(chardet.detectFileSync(base + '/iso88596_ar')).toBe('ISO-8859-6'); 23 | }); 24 | 25 | it('should return ISO-8859-7 (Greek)', () => { 26 | expect(chardet.detectFileSync(base + '/iso88597_el')).toBe('ISO-8859-7'); 27 | }); 28 | 29 | it('should return ISO-8859-8 (Hebrew)', () => { 30 | expect(chardet.detectFileSync(base + '/iso88598_he')).toBe('ISO-8859-8'); 31 | }); 32 | 33 | it('should return ISO-8859-9 (Turkish)', () => { 34 | expect(chardet.detectFileSync(base + '/iso88599_tr')).toBe('ISO-8859-9'); 35 | }); 36 | 37 | test.todo('should return ISO-8859-10'); 38 | test.todo('should return ISO-8859-11'); 39 | // iso-8859-12 is abandoned 40 | test.todo('should return ISO-8859-13'); 41 | test.todo('should return ISO-8859-14'); 42 | test.todo('should return ISO-8859-15'); 43 | test.todo('should return ISO-8859-16'); 44 | 45 | it('should return windows-1250 (Czech)', () => { 46 | expect(chardet.detectFileSync(base + 
'/windows_1250')).toBe('windows-1250'); 47 | }); 48 | 49 | it('should return windows-1251 (Russian)', () => { 50 | expect(chardet.detectFileSync(base + '/windows_1251')).toBe('windows-1251'); 51 | }); 52 | 53 | it('should return windows-1252 (English)', () => { 54 | expect(chardet.detectFileSync(base + '/windows_1252')).toBe('windows-1252'); 55 | }); 56 | 57 | it('should return windows-1253 (Greek)', () => { 58 | expect(chardet.detectFileSync(base + '/windows_1253')).toBe('windows-1253'); 59 | }); 60 | 61 | it('should return windows-1254 (Turkish)', () => { 62 | expect(chardet.detectFileSync(base + '/windows_1254')).toBe('windows-1254'); 63 | }); 64 | 65 | it('should return windows-1255 (Hebrew)', () => { 66 | expect(chardet.detectFileSync(base + '/windows_1255')).toBe('windows-1255'); 67 | }); 68 | 69 | it('should return windows-1256 (Arabic)', () => { 70 | expect(chardet.detectFileSync(base + '/windows_1256')).toBe('windows-1256'); 71 | }); 72 | 73 | it('should return KOI8-R (Russian)', () => { 74 | expect(chardet.detectFileSync(base + '/koi8r')).toBe('KOI8-R'); 75 | }); 76 | }); 77 | -------------------------------------------------------------------------------- /src/test/data/encodings/iso2022jp: -------------------------------------------------------------------------------- 1 | $B%3%s%T%e!<%?!<$O!"K\$l$KHV9f$r3d$j?6$k$3$H$K$h$C$F07$($k$h$&$K$7$^$9!#%f%K%3!<%I$,=PMh$k$^$G$O!"$3$l$i$NHV9f$r3d$j?6$k;EAH$_$,2?I4$NB?$/$N4k6H$K:NMQ$5$l$F$$$^$9!#%f%K%3!<%I$O!"(BXML$B!"(BJava$B!"(BECMAScript(JavaScript)$B!"(BLDAP$B!"(BCORBA 3.0$B$J$I$N:G@hC<$NI8=`$NA0Ds$H$J$C$F$*$j!"%f%K%3!<%I$r$NB?$/$N@=IJ$G%5%]!<%H$5$l$F$$$^$9!#%f%K%3!<%II8=`$N=P8=$H%f%K%3!<%I$r%5%]!<%H$9$k%D!<%kN`$O!":r:#82Cx$K$J$C$F$$$k%=%U%H%&%(%"5;=Q$N%0%m!<%P%k2=$NN.$l$KBP$7$F!"FC$KLr$KN)$C$F$$$^$9!#(B 7 | 8 | 
$B%f%K%3!<%I$r%/%i%$%"%s%H%5!<%P!<7?$N%"%W%j%1!<%7%g%s$d!"B?AX9=B$$r;}$D%"%W%j%1!<%7%g%s!"%&%'%V%5%$%H$J$I$K$KAH$_9~$`$3$H$G!"=>Mh$NJ8;z%3!<%I%;%C%H$rMQ$$$k$h$j$bL@$i$+$J%3%9%H:o8:$,2DG=$G$9!#%f%K%3!<%I$O!"C10l$N%=%U%H%&%(%"@=IJ!"C10l$N%&%'%V%5%$%H$K!"2?$iAw$9$k$3$H$r2DG=$H$9$k$N$G$9!#(B 9 | $B%f%K%3!<%I%3%s%=!<%7%"%`$K$D$$$F(B 10 | 11 | $B%f%K%3!<%I%3%s%=!<%7%"%`$O!":G?7$N%=%U%H%&%(%"@=IJ$HI8=`$K$*$$$F%F%-%9%H$rI=8=$9$k$3$H$r0UL#$9$k!H%f%K%3!<%II8=`!I$N9=C[!"H/E8!"Ia5Z!"MxMQB%?J$rL\E*$H$7$F@_N)$5$l$?Hs1DMxAH?%$G$9!#F1%3%s%=!<%7%"%`$N2q0w$O!"%3%s%T%e!<%?!<$H>pJs=hM}$K78$o$k9-HF$J4k6H$dAH?%$+$i9=@.$5$l$F$$$^$9!#F1%3%s%=!<%7%"%`$O!":b@/E*$K$O!"=c?h$K2qHq$N$_$K$h$C$F1?1D$5$l$F$$$^$9!#%f%K%3!<%II8=`$r;Y;}$7!"$=$N3HD%$H\$7$$$3$H$r$*CN$j$K$J$j$?$$J}$O!"(BGlossary, Unicode-Enabled Products, Technical Introduction $B$*$h$S(B Useful Resources$B$r$4;2>H$/$@$5$$!#(B 14 | -------------------------------------------------------------------------------- /src/test/data/encodings/iso88591_en: -------------------------------------------------------------------------------- 1 | What is Unicode? 2 | 3 | Unicode provides a unique number for every character, 4 | no matter what the platform, 5 | no matter what the program, 6 | no matter what the language. 7 | 8 | Fundamentally, computers just deal with numbers. They store letters and other characters by assigning a number for each one. Before Unicode was invented, there were hundreds of different encoding systems for assigning these numbers. No single encoding could contain enough characters: for example, the European Union alone requires several different encodings to cover all its languages. Even for a single language like English no single encoding was adequate for all the letters, punctuation, and technical symbols in common use. 9 | 10 | These encoding systems also conflict with one another. That is, two encodings can use the same number for two different characters, or use different numbers for the same character. 
Any given computer (especially servers) needs to support many different encodings; yet whenever data is passed between different encodings or platforms, that data always runs the risk of corruption. 11 | Unicode is changing all that! 12 | 13 | Unicode provides a unique number for every character, no matter what the platform, no matter what the program, no matter what the language. The Unicode Standard has been adopted by such industry leaders as Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys and many others. Unicode is required by modern standards such as XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML, etc., and is the official way to implement ISO/IEC 10646. It is supported in many operating systems, all modern browsers, and many other products. The emergence of the Unicode Standard, and the availability of tools supporting it, are among the most significant recent global software technology trends. 14 | 15 | Incorporating Unicode into client-server or multi-tiered applications and websites offers significant cost savings over the use of legacy character sets. Unicode enables a single software product or a single website to be targeted across multiple platforms, languages and countries without re-engineering. It allows data to be transported through many different systems without corruption. 16 | About the Unicode Consortium 17 | 18 | The Unicode Consortium is a non-profit organization founded to develop, extend and promote use of the Unicode Standard, which specifies the representation of text in modern software products and standards. The membership of the consortium represents a broad spectrum of corporations and organizations in the computer and information processing industry. The consortium is supported financially solely through membership dues. 
Membership in the Unicode Consortium is open to organizations and individuals anywhere in the world who support the Unicode Standard and wish to assist in its extension and implementation. 19 | 20 | For more information, see the Glossary, Unicode Enabled Products, Technical Introduction and Useful Resources. 21 | -------------------------------------------------------------------------------- /src/test/data/encodings/iso2022kr: -------------------------------------------------------------------------------- 1 | $)C1b:;@{@87N DDG;EM4B <}@Z88 C38.GU4O4Y. 1[@Z3* 4Y8% 9.@Z?!55 <}@Z8& AvA$GO?) @z@eGU4O4Y. @/4ODZ5e0! 039_5G1b @|?!4B @L7/GQ <}@Z8& AvA$GO1b @'GX x>z=@4O4Y. ?98& 5i>n @/74 ?,GU?!<-88 :84u6s55 8p5g 0" 3*6s:0 >p>n8& C38.GO7A8i ?)7/ 03@G 4Y8% 1bH#H- 9f9}@L GJ?dGU4O4Y. ?5>n?M 00@: 4\@O >p>n@G 0f?l55 0xEk@{@87N ;g?k5G4B 8p5g 1[@Z, 9.@e :NH# 9W EWE)4ODC 1bH#?! 8B4B 4\@O 1bH#H- 9f9}@; 0.0m @VAv 8xGO?4=@4O4Y. 2 | 3 | $)C@L7/GQ 1bH#H- =C=:E[@: 6GGQ 4Y8% 1bH#H- =C=:E[0z Cf59GU4O4Y. Ao 5N 0!Av 1bH#H- 9f9}@L 5N 03@G 4Y8% 9.@Z?! 4kGX 00@: 9xH#8& ;g?kGO0E3* 00@: 9.@Z?! 4kGX 4Y8% 9xH#8& ;g?kGR nAx 8p5g DDG;EM(F/Hw <-9v)4B <-7N 4Y8% ?)7/ 0!Av 1bH#H- 9f9}@; Av?xGX>_ GU4O4Y. 1W7/3*, 5%@LEM8& <-7N 4Y8% 1bH#H- 9f9}@L3* GC7'F{ 0#?! @|4^GR 6'864Y 1W 5%@LEM4B GW;s p>n?! 0|0h>x@L 9.@Z864Y 0m@/GQ <}@Z8& A&0xGU4O4Y. @/4ODZ5e G%AX@: Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys 9W 1bE8 ?)7/ H8;g?M 00@: >w0h <15NAV@Z?! @GGX C$EC5G>z=@4O4Y. @/4ODZ5e4B XML, Java, ECMAScript(JavaScript), LDAP, CORBA 3.0, WML 5n0z 00@L Gv@g 3N8. ;g?k5G4B G%AX?!<- GJ?dGO8g @L4B ISO/IEC 10646@; 18GvGO4B 0x=D@{@N 9f9}@T4O4Y. @L4B 89@: ?n?5 CpF.-<-9v 6G4B 4YA_-?,0a @@?k GA7N1W7%0z @% ;g@LF.?! EkGUGO8i 790E=C 9.@Z <n<- ;s4gGQ :q?k @}0( H?0z0! 3*E8334O4Y. @/4ODZ5e8& EkGX 8.?#Av4O>n85 >x@L 4YA_ GC7'F{, >p>n 9W 190! 0#?! 4\@O n GC7'F{ 6G4B 4\@O @% ;g@LF.8& 8qG%7N ;o@; x@L ?)7/ =C=:E[@; EkGX @|<[GR v?! 4kGX 10 | 11 | $)C@/4ODZ5e D\v@: :q?58. 
A6Aw@87N<- Gv4k n A&G00z G%AX?!<- EX=:F.@G G%Gv@; AvA$GO4B @/4ODZ5e G%AX@G ;g?k@; 039_GO0m H.@eGO8g @e7AGO1b @'GX <v 8b9v=1@: DDG;EM?M A$:8 C38. ;j>w?! A>;gGO0m @V4B 1$9|@'GQ H8;g 9W A6Aw@G 9|@'8& 3*E83@4O4Y. D\v@G @gA$@: @|@{@87N H8:q?! @GGX Cf4g5K4O4Y. @/4ODZ5e DAv?!<-@G 8b9v=1@: @| <<0h >n4@ 0w?!<-3* @/4ODZ5e G%AX@; Av?xGO0m 1W H.@e0z 18Gv@; Av?xGO0m@ZGO4B A6Aw0z 03@N?!0T 039f5G>n @V=@4O4Y. 12 | 13 | $)C4u @Z<nA}, ?9A& @/4ODZ5e ;g?k 0!4I A&G0, 1b { 6 | 7 | const path = __dirname + '/test/data/encodings/utf8'; 8 | const expectedEncodingsFromPath = [ 9 | { 'confidence': 100, 'name': 'UTF-8', 'lang': undefined }, 10 | { 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' }, 11 | { 'confidence': 19, 'name': 'KOI8-R', 'lang': 'ru' }, 12 | { 'confidence': 10, 'name': 'Big5', 'lang': 'zh' }, 13 | { 'confidence': 10, 'name': 'GB18030', 'lang': 'zh' }, // Mandarin 14 | { 'confidence': 10, 'name': 'windows-1253', 'lang': 'el' }, // Greek 15 | { 'confidence': 6, 'name': 'windows-1250', 'lang': 'pl' }, 16 | { 'confidence': 4, 'name': 'windows-1254', 'lang': 'tr' }, 17 | { 'confidence': 2, 'name': 'windows-1251', 'lang': 'ru' }, 18 | { 'confidence': 0, 'name': 'ASCII', 'lang': undefined }, 19 | ]; 20 | 21 | it('has both named and default exports', () => { 22 | expect(defaultChardet.analyse).toBe(chardet.analyse); 23 | expect(defaultChardet.detect).toBe(chardet.detect); 24 | expect(defaultChardet.detectFile).toBe(chardet.detectFile); 25 | expect(defaultChardet.detectFileSync).toBe(chardet.detectFileSync); 26 | }); 27 | 28 | describe('#detect', () => { 29 | it('should detect encoding', () => { 30 | expect(chardet.detect(fs.readFileSync(path))).toBe('UTF-8'); 31 | }); 32 | 33 | it('should not block when non-buffer supplied', () => { 34 | const invalid = [123, '123']; 35 | const error = 'Input must be a byte array, e.g. 
Buffer or Uint8Array'; 36 | // @ts-expect-error Testing invalid inputs 37 | invalid.forEach((input) => expect(() => chardet.detect(input)).toThrow(error)); 38 | }) 39 | }); 40 | 41 | describe('#detectFile', () => { 42 | it('should detect encoding', async () => { 43 | const res = await chardet.detectFile(path); 44 | expect(res).toBe('UTF-8'); 45 | }); 46 | 47 | it('should detect encoding with smaller sample size', async () => { 48 | const res = await chardet.detectFile(path, { sampleSize: 32 }); 49 | expect(res).toBe('UTF-8'); 50 | }); 51 | 52 | it('should detect encoding with smaller sample size and offset', async () => { 53 | const res = await chardet.detectFile(path, { sampleSize: 32, offset: 64 }); 54 | expect(res).toBe('UTF-8'); 55 | }); 56 | 57 | it('should work as expected with sampleSize larger than actual file size (1)', async () => { 58 | const res = await chardet.detectFile(path, { sampleSize: 1024 * 1024 }); 59 | expect(res).toBe('UTF-8'); 60 | }); 61 | 62 | it('should work as expected with sampleSize larger than actual file size (2)', async () => { 63 | const res = await chardet.detectFile(__dirname + '/test/data/encodings/koi8r', { sampleSize: 1024 * 1024 }); 64 | expect(res).toBe('KOI8-R'); 65 | }); 66 | }); 67 | 68 | describe('#detectFileSync', () => { 69 | it('should detect encoding', () => { 70 | expect(chardet.detectFileSync(path)).toBe('UTF-8'); 71 | }); 72 | 73 | it('should detect encoding with smaller sample size', () => { 74 | expect(chardet.detectFileSync(path, { sampleSize: 32 })).toBe('UTF-8'); 75 | }); 76 | 77 | it('should detect encoding with smaller sample size and offset', () => { 78 | expect(chardet.detectFileSync(path, { sampleSize: 32, offset: 64 })).toBe('UTF-8'); 79 | }); 80 | }); 81 | 82 | describe('#analyse', () => { 83 | it('should return a list of encodings, sorted by confidence level in descending order', () => { 84 | const matches = chardet.analyse(fs.readFileSync(path)); 85 | 
expect(matches).toEqual(expectedEncodingsFromPath); 86 | }); 87 | }); 88 | }); 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # chardet 2 | 3 | _Chardet_ is a character detection module written in pure JavaScript (TypeScript). Module uses occurrence analysis to determine the most probable encoding. 4 | 5 | - Packed size is only **22 KB** 6 | - Works in all environments: Node / Browser / Native 7 | - Works on all platforms: Linux / Mac / Windows 8 | - No dependencies 9 | - No native code / bindings 10 | - 100% written in TypeScript 11 | - Extensive code coverage 12 | 13 | ## Installation 14 | 15 | ``` 16 | npm i chardet 17 | ``` 18 | 19 | ## Usage 20 | 21 | To return the encoding with the highest confidence: 22 | 23 | ```javascript 24 | import chardet from 'chardet'; 25 | 26 | const encoding = chardet.detect(Buffer.from('hello there!')); 27 | // or 28 | const encoding = await chardet.detectFile('/path/to/file'); 29 | // or 30 | const encoding = chardet.detectFileSync('/path/to/file'); 31 | ``` 32 | 33 | To return the full list of possible encodings use `analyse` method. 
34 | 35 | ```javascript 36 | import chardet from 'chardet'; 37 | chardet.analyse(Buffer.from('hello there!')); 38 | ``` 39 | 40 | Returned value is an array of objects sorted by confidence value in descending order 41 | 42 | ```javascript 43 | [ 44 | { confidence: 90, name: 'UTF-8' }, 45 | { confidence: 20, name: 'windows-1252', lang: 'fr' }, 46 | ]; 47 | ``` 48 | 49 | In browser, you can use [Uint8Array](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Uint8Array) instead of the `Buffer`: 50 | 51 | ```javascript 52 | import chardet from 'chardet'; 53 | chardet.analyse(new Uint8Array([0x68, 0x65, 0x6c, 0x6c, 0x6f])); 54 | ``` 55 | 56 | ## Working with large data sets 57 | 58 | Sometimes, when data set is huge and you want to optimize performance (with a trade off of less accuracy), 59 | you can sample only the first N bytes of the buffer: 60 | 61 | ```javascript 62 | const encoding = await chardet.detectFile('/path/to/file', { sampleSize: 32 }); 63 | ``` 64 | 65 | You can also specify where to begin reading from in the buffer: 66 | 67 | ```javascript 68 | const encoding = await chardet.detectFile('/path/to/file', { 69 | sampleSize: 32, 70 | offset: 128, 71 | }); 72 | ``` 73 | 74 | ## Working with strings 75 | 76 | In both Node.js and browsers, all strings in memory are represented in UTF-16 encoding. This is a fundamental aspect of the JavaScript language specification. Therefore, you cannot use plain strings directly as input for `chardet.analyse()` or `chardet.detect()`. Instead, you need the original string data in the form of a Buffer or Uint8Array. 77 | 78 | In other words, if you receive a piece of data over the network and want to detect its encoding, use the original data payload, not its string representation. By the time you convert data to a string, it will be in UTF-16 encoding. 
79 | 80 | Note on [TextEncoder](https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/TextEncoder): By default, it returns a UTF-8 encoded buffer, which means the buffer will not be in the original encoding of the string. 81 | 82 | ## Supported Encodings: 83 | 84 | - UTF-8 85 | - UTF-16 LE 86 | - UTF-16 BE 87 | - UTF-32 LE 88 | - UTF-32 BE 89 | - ISO-2022-JP 90 | - ISO-2022-KR 91 | - ISO-2022-CN 92 | - Shift_JIS 93 | - Big5 94 | - EUC-JP 95 | - EUC-KR 96 | - GB18030 97 | - ISO-8859-1 98 | - ISO-8859-2 99 | - ISO-8859-5 100 | - ISO-8859-6 101 | - ISO-8859-7 102 | - ISO-8859-8 103 | - ISO-8859-9 104 | - windows-1250 105 | - windows-1251 106 | - windows-1252 107 | - windows-1253 108 | - windows-1254 109 | - windows-1255 110 | - windows-1256 111 | - KOI8-R 112 | 113 | Currently only these encodings are supported. 114 | 115 | ## TypeScript? 116 | 117 | Yes. Type definitions are included. 118 | 119 | ### References 120 | 121 | - ICU project http://site.icu-project.org/ 122 | -------------------------------------------------------------------------------- /src/encoding/unicode.ts: -------------------------------------------------------------------------------- 1 | import type { Context, Recogniser } from '.'; 2 | import match, { type Match, type EncodingName } from '../match'; 3 | 4 | /** 5 | * This class matches UTF-16 and UTF-32, both big- and little-endian. The 6 | * BOM will be used if it is present. 
7 | */ 8 | export class UTF_16BE implements Recogniser { 9 | name(): EncodingName { 10 | return 'UTF-16BE'; 11 | } 12 | 13 | match(det: Context): Match | null { 14 | const input = det.rawInput; 15 | 16 | if ( 17 | input.length >= 2 && 18 | (input[0] & 0xff) == 0xfe && 19 | (input[1] & 0xff) == 0xff 20 | ) { 21 | return match(det, this, 100); // confidence = 100 22 | } 23 | 24 | // TODO: Do some statistics to check for unsigned UTF-16BE 25 | return null; 26 | } 27 | } 28 | 29 | export class UTF_16LE implements Recogniser { 30 | name(): EncodingName { 31 | return 'UTF-16LE'; 32 | } 33 | 34 | match(det: Context): Match | null { 35 | const input = det.rawInput; 36 | 37 | if ( 38 | input.length >= 2 && 39 | (input[0] & 0xff) == 0xff && 40 | (input[1] & 0xff) == 0xfe 41 | ) { 42 | // LE BOM is present. 43 | if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) { 44 | // It is probably UTF-32 LE, not UTF-16 45 | return null; 46 | } 47 | return match(det, this, 100); // confidence = 100 48 | } 49 | 50 | // TODO: Do some statistics to check for unsigned UTF-16LE 51 | return null; 52 | } 53 | } 54 | 55 | interface WithGetChar { 56 | getChar(input: Uint8Array, index: number): number; 57 | } 58 | 59 | class UTF_32 implements Recogniser, WithGetChar { 60 | name(): EncodingName { 61 | return 'UTF-32'; 62 | } 63 | 64 | getChar(_input: Uint8Array, _index: number): number { 65 | return -1; 66 | } 67 | 68 | match(det: Context): Match | null { 69 | let numValid = 0, 70 | numInvalid = 0, 71 | hasBOM = false, 72 | confidence = 0; 73 | const limit = Math.floor(det.rawLen / 4) * 4; // whole 4-byte units only; plain "/" is fractional in JS 74 | const input = det.rawInput; 75 | 76 | if (limit == 0) { 77 | return null; 78 | } 79 | 80 | if (this.getChar(input, 0) == 0x0000feff) { 81 | hasBOM = true; 82 | } 83 | 84 | for (let i = 0; i < limit; i += 4) { 85 | const ch = this.getChar(input, i); 86 | 87 | if (ch < 0 || ch >= 0x10ffff || (ch >= 0xd800 && ch <= 0xdfff)) { 88 | numInvalid += 1; 89 | } else { 90 | numValid += 1; 91 | } 92 | } 93 | 94 | //
Cook up some sort of confidence score, based on presence of a BOM 95 | // and the existence of valid and/or invalid multi-byte sequences. 96 | if (hasBOM && numInvalid == 0) { 97 | confidence = 100; 98 | } else if (hasBOM && numValid > numInvalid * 10) { 99 | confidence = 80; 100 | } else if (numValid > 3 && numInvalid == 0) { 101 | confidence = 100; 102 | } else if (numValid > 0 && numInvalid == 0) { 103 | confidence = 80; 104 | } else if (numValid > numInvalid * 10) { 105 | // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance. 106 | confidence = 25; 107 | } 108 | 109 | // return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 110 | return confidence == 0 ? null : match(det, this, confidence); 111 | } 112 | } 113 | 114 | export class UTF_32BE extends UTF_32 { 115 | name(): EncodingName { 116 | return 'UTF-32BE'; 117 | } 118 | getChar(input: Uint8Array, index: number) { 119 | return ( 120 | ((input[index + 0] & 0xff) << 24) | 121 | ((input[index + 1] & 0xff) << 16) | 122 | ((input[index + 2] & 0xff) << 8) | 123 | (input[index + 3] & 0xff) 124 | ); 125 | } 126 | } 127 | 128 | export class UTF_32LE extends UTF_32 { 129 | name(): EncodingName { 130 | return 'UTF-32LE'; 131 | } 132 | 133 | getChar(input: Uint8Array, index: number) { 134 | return ( 135 | ((input[index + 3] & 0xff) << 24) | 136 | ((input[index + 2] & 0xff) << 16) | 137 | ((input[index + 1] & 0xff) << 8) | 138 | (input[index + 0] & 0xff) 139 | ); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/encoding/iso2022.ts: -------------------------------------------------------------------------------- 1 | import type { Context, Recogniser } from '.'; 2 | import match, { type Match, type EncodingName } from '../match'; 3 | 4 | /** 5 | * This is a superclass for the individual detectors for 6 | * each of the detectable members of the ISO 2022 family 7 | * of encodings. 
8 | */ 9 | 10 | class ISO_2022 implements Recogniser { 11 | escapeSequences: number[][] = []; 12 | 13 | name(): EncodingName { 14 | return 'ISO_2022'; 15 | } 16 | 17 | match(det: Context): Match | null { 18 | /** 19 | * Matching function shared among the 2022 detectors JP, CN and KR 20 | * Counts up the number of legal and unrecognized escape sequences in 21 | * the sample of text, and computes a score based on the total number & 22 | * the proportion that fit the encoding. 23 | * 24 | * 25 | * @param det the detection context; det.inputBytes is the byte buffer 26 | * to analyse and det.inputLen is its size in bytes. The escape 27 | * sequences to test for are read from this.escapeSequences. 28 | * @return a Match carrying the match quality (at most 100), or null. 29 | */ 30 | 31 | let i, j; 32 | let escN; 33 | let hits = 0; 34 | let misses = 0; 35 | let shifts = 0; 36 | let confidence; 37 | 38 | // TODO: refactor me 39 | const text = det.inputBytes; 40 | const textLen = det.inputLen; 41 | 42 | scanInput: for (i = 0; i < textLen; i++) { 43 | if (text[i] == 0x1b) { 44 | checkEscapes: for ( 45 | escN = 0; 46 | escN < this.escapeSequences.length; 47 | escN++ 48 | ) { 49 | const seq = this.escapeSequences[escN]; 50 | 51 | if (textLen - i < seq.length) continue checkEscapes; 52 | 53 | for (j = 1; j < seq.length; j++) 54 | if (seq[j] != text[i + j]) continue checkEscapes; 55 | 56 | hits++; 57 | i += seq.length - 1; 58 | continue scanInput; 59 | } 60 | 61 | misses++; 62 | } 63 | 64 | // Shift in/out 65 | if (text[i] == 0x0e || text[i] == 0x0f) shifts++; 66 | } 67 | 68 | if (hits == 0) return null; 69 | 70 | // 71 | // Initial quality is based on relative proportion of recognized vs. 72 | // unrecognized escape sequences. 73 | // All good: quality = 100; 74 | // half or less good: quality = 0; 75 | // linear in between. 76 | confidence = (100 * hits - 100 * misses) / (hits + misses); 77 | 78 | // Back off quality if there were too few escape sequences seen.
79 | // Include shifts in this computation, so that KR does not get penalized 80 | // for having only a single Escape sequence, but many shifts. 81 | if (hits + shifts < 5) confidence -= (5 - (hits + shifts)) * 10; 82 | 83 | return confidence <= 0 ? null : match(det, this, confidence); 84 | } 85 | } 86 | 87 | export class ISO_2022_JP extends ISO_2022 { 88 | name(): EncodingName { 89 | return 'ISO-2022-JP'; 90 | } 91 | 92 | language() { 93 | return 'ja'; 94 | } 95 | 96 | escapeSequences = [ 97 | [0x1b, 0x24, 0x28, 0x43], // KS X 1001:1992 98 | [0x1b, 0x24, 0x28, 0x44], // JIS X 212-1990 99 | [0x1b, 0x24, 0x40], // JIS C 6226-1978 100 | [0x1b, 0x24, 0x41], // GB 2312-80 101 | [0x1b, 0x24, 0x42], // JIS X 208-1983 102 | [0x1b, 0x26, 0x40], // JIS X 208 1990, 1997 103 | [0x1b, 0x28, 0x42], // ASCII 104 | [0x1b, 0x28, 0x48], // JIS-Roman 105 | [0x1b, 0x28, 0x49], // Half-width katakana 106 | [0x1b, 0x28, 0x4a], // JIS-Roman 107 | [0x1b, 0x2e, 0x41], // ISO 8859-1 108 | [0x1b, 0x2e, 0x46], // ISO 8859-7 109 | ]; 110 | } 111 | 112 | export class ISO_2022_KR extends ISO_2022 { 113 | name(): EncodingName { 114 | return 'ISO-2022-KR'; 115 | } 116 | language() { 117 | return 'ko'; // ISO 639-1 language code for Korean ('kr' is a country code) 118 | } 119 | escapeSequences = [[0x1b, 0x24, 0x29, 0x43]]; 120 | } 121 | 122 | export class ISO_2022_CN extends ISO_2022 { 123 | name(): EncodingName { 124 | return 'ISO-2022-CN'; 125 | } 126 | language() { 127 | return 'zh'; 128 | } 129 | escapeSequences = [ 130 | [0x1b, 0x24, 0x29, 0x41], // GB 2312-80 131 | [0x1b, 0x24, 0x29, 0x47], // CNS 11643-1992 Plane 1 132 | [0x1b, 0x24, 0x2a, 0x48], // CNS 11643-1992 Plane 2 133 | [0x1b, 0x24, 0x29, 0x45], // ISO-IR-165 134 | [0x1b, 0x24, 0x2b, 0x49], // CNS 11643-1992 Plane 3 135 | [0x1b, 0x24, 0x2b, 0x4a], // CNS 11643-1992 Plane 4 136 | [0x1b, 0x24, 0x2b, 0x4b], // CNS 11643-1992 Plane 5 137 | [0x1b, 0x24, 0x2b, 0x4c], // CNS 11643-1992 Plane 6 138 | [0x1b, 0x24, 0x2b, 0x4d], // CNS 11643-1992 Plane 7 139 | [0x1b, 0x4e], // SS2 140 | [0x1b, 0x4f], //
SS3 141 | ]; 142 | } 143 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import { Match } from './match'; 2 | import { Recogniser, Context } from './encoding'; 3 | 4 | import loadFs from './fs/node'; 5 | 6 | import Ascii from './encoding/ascii'; 7 | import Utf8 from './encoding/utf8'; 8 | import * as unicode from './encoding/unicode'; 9 | import * as mbcs from './encoding/mbcs'; 10 | import * as sbcs from './encoding/sbcs'; 11 | import * as iso2022 from './encoding/iso2022'; 12 | import { isByteArray } from './utils'; 13 | 14 | interface FullOptions { 15 | sampleSize: number; 16 | offset: number; 17 | } 18 | 19 | export type Options = Partial; 20 | 21 | const recognisers: Recogniser[] = [ 22 | new Utf8(), 23 | new unicode.UTF_16BE(), 24 | new unicode.UTF_16LE(), 25 | new unicode.UTF_32BE(), 26 | new unicode.UTF_32LE(), 27 | new mbcs.sjis(), 28 | new mbcs.big5(), 29 | new mbcs.euc_jp(), 30 | new mbcs.euc_kr(), 31 | new mbcs.gb_18030(), 32 | new iso2022.ISO_2022_JP(), 33 | new iso2022.ISO_2022_KR(), 34 | new iso2022.ISO_2022_CN(), 35 | new sbcs.ISO_8859_1(), 36 | new sbcs.ISO_8859_2(), 37 | new sbcs.ISO_8859_5(), 38 | new sbcs.ISO_8859_6(), 39 | new sbcs.ISO_8859_7(), 40 | new sbcs.ISO_8859_8(), 41 | new sbcs.ISO_8859_9(), 42 | new sbcs.windows_1251(), 43 | new sbcs.windows_1256(), 44 | new sbcs.KOI8_R(), 45 | new Ascii(), 46 | ]; 47 | 48 | export type AnalyseResult = Match[]; 49 | export type DetectResult = string | null; 50 | 51 | export const detect = (buffer: Uint8Array): string | null => { 52 | const matches: Match[] = analyse(buffer); 53 | return matches.length > 0 ? matches[0].name : null; 54 | }; 55 | 56 | export const analyse = (buffer: Uint8Array): AnalyseResult => { 57 | if (!isByteArray(buffer)) { 58 | throw new Error('Input must be a byte array, e.g. 
Buffer or Uint8Array'); 59 | } 60 | 61 | // Tally up the byte occurrence statistics. 62 | const byteStats = []; 63 | for (let i = 0; i < 256; i++) byteStats[i] = 0; 64 | 65 | for (let i = buffer.length - 1; i >= 0; i--) byteStats[buffer[i] & 0x00ff]++; 66 | 67 | let c1Bytes = false; 68 | for (let i = 0x80; i <= 0x9f; i += 1) { 69 | if (byteStats[i] !== 0) { 70 | c1Bytes = true; 71 | break; 72 | } 73 | } 74 | 75 | const context: Context = { 76 | byteStats, 77 | c1Bytes, 78 | rawInput: buffer, 79 | rawLen: buffer.length, 80 | inputBytes: buffer, 81 | inputLen: buffer.length, 82 | }; 83 | 84 | const matches = recognisers 85 | .map((rec) => { 86 | return rec.match(context); 87 | }) 88 | .filter((match) => { 89 | return !!match; 90 | }) 91 | .sort((a, b) => { 92 | return b!.confidence - a!.confidence; 93 | }); 94 | 95 | return matches as Match[]; 96 | }; 97 | 98 | export const detectFile = ( 99 | filepath: string, 100 | opts: Options = {} 101 | ): Promise => 102 | new Promise((resolve, reject) => { 103 | let fd: any; 104 | const fs = loadFs(); 105 | 106 | const handler = (err: Error | null, buffer: Buffer | null) => { 107 | if (fd) { 108 | fs.closeSync(fd); 109 | } 110 | 111 | if (err) { 112 | reject(err); 113 | } else if (buffer) { 114 | resolve(detect(buffer)); 115 | } else { 116 | reject(new Error('No error and no buffer received')); 117 | } 118 | }; 119 | 120 | const sampleSize = opts?.sampleSize || 0; 121 | if (sampleSize > 0) { 122 | fd = fs.openSync(filepath, 'r'); 123 | let sample = Buffer.allocUnsafe(sampleSize); 124 | 125 | fs.read(fd, sample, 0, sampleSize, opts.offset, (err: NodeJS.ErrnoException | null, bytesRead: number) => { 126 | if (err) { 127 | handler(err, null); 128 | } else { 129 | if (bytesRead < sampleSize) { 130 | sample = sample.subarray(0, bytesRead); 131 | } 132 | handler(null, sample); 133 | } 134 | }); 135 | return; 136 | } 137 | 138 | fs.readFile(filepath, handler); 139 | }); 140 | 141 | export const detectFileSync = ( 142 | filepath: 
string, 143 | opts: Options = {} 144 | ): DetectResult => { 145 | const fs = loadFs(); 146 | 147 | if (opts && opts.sampleSize) { 148 | const fd = fs.openSync(filepath, 'r'); 149 | let sample = Buffer.allocUnsafe(opts.sampleSize); 150 | 151 | const bytesRead = fs.readSync(fd, sample, 0, opts.sampleSize, opts.offset); 152 | if (bytesRead < opts.sampleSize) { 153 | sample = sample.subarray(0, bytesRead); 154 | } 155 | fs.closeSync(fd); 156 | return detect(sample); 157 | } 158 | 159 | return detect(fs.readFileSync(filepath)); 160 | }; 161 | 162 | export default { 163 | analyse, 164 | detect, 165 | detectFileSync, 166 | detectFile, 167 | }; 168 | 169 | export { Match, EncodingName } from './match'; 170 | -------------------------------------------------------------------------------- /src/encoding/mbcs.ts: -------------------------------------------------------------------------------- 1 | import type { Context, Recogniser } from '.'; 2 | import match, { type Match, type EncodingName } from '../match'; 3 | 4 | /** 5 | * Binary search implementation (recursive) 6 | */ 7 | function binarySearch(arr: number[], searchValue: number) { 8 | const find = ( 9 | arr: number[], 10 | searchValue: number, 11 | left: number, 12 | right: number, 13 | ): number => { 14 | if (right < left) return -1; 15 | 16 | /* 17 | int mid = mid = (left + right) / 2; 18 | There is a bug in the above line; 19 | Joshua Bloch suggests the following replacement: 20 | */ 21 | const mid = Math.floor((left + right) >>> 1); 22 | if (searchValue > arr[mid]) return find(arr, searchValue, mid + 1, right); 23 | 24 | if (searchValue < arr[mid]) return find(arr, searchValue, left, mid - 1); 25 | 26 | return mid; 27 | }; 28 | 29 | return find(arr, searchValue, 0, arr.length - 1); 30 | } 31 | 32 | // 'Character' iterated character class. 
// Recognizers for specific mbcs encodings make their 'characters' available
// by providing a nextChar() function that fills in an instance of iteratedChar
// with the next char from the input.
// The returned characters are not converted to Unicode, but remain as the raw
// bytes (concatenated into an int) from the codepage data.
//
// For Asian charsets, use the raw input rather than the input that has been
// stripped of markup. Detection only considers multi-byte chars, effectively
// stripping markup anyway, and double byte chars do occur in markup too.
//
class IteratedChar {
  charValue: number; // 1-4 bytes from the raw input data
  index: number;
  nextIndex: number;
  error: boolean;
  done: boolean;

  constructor() {
    this.charValue = 0; // 1-4 bytes from the raw input data
    this.index = 0;
    this.nextIndex = 0;
    this.error = false;
    this.done = false;
  }

  /** Rewinds the iterator to the start of the input. */
  reset() {
    this.charValue = 0;
    this.index = -1;
    this.nextIndex = 0;
    this.error = false;
    this.done = false;
  }

  /**
   * Returns the next raw input byte (0-255) and advances, or sets `done`
   * and returns -1 when the input is exhausted.
   */
  nextByte(det: Context) {
    if (this.nextIndex >= det.rawLen) {
      this.done = true;
      return -1;
    }
    const byteValue = det.rawInput[this.nextIndex++] & 0x00ff;
    return byteValue;
  }
}

/**
 * Asian double or multi-byte - charsets.
 * Match is determined mostly by the input data adhering to the
 * encoding scheme for the charset, and, optionally,
 * frequency-of-occurrence of characters.
 */

class mbcs implements Recogniser {
  commonChars: number[] = [];

  name(): EncodingName {
    return 'mbcs';
  }

  /**
   * Test the match of this charset with the input text data.
   *
   * @param det The detection context, which contains the input text
   *            to be checked for being in this charset.
   * @return A match carrying a confidence of at most 100, or null when the
   *         computed confidence is 0.
   */
  match(det: Context): Match | null {
    let doubleByteCharCount = 0;
    let commonCharCount = 0;
    let badCharCount = 0;
    let totalCharCount = 0;
    let confidence = 0;

    const iter = new IteratedChar();

    detectBlock: {
      for (iter.reset(); this.nextChar(iter, det); ) {
        totalCharCount++;
        if (iter.error) {
          badCharCount++;
        } else {
          // `>>> 0` reads the packed bytes as an unsigned 32-bit value.
          // `& 0xffffffff` would keep the sign, so a 4-byte character whose
          // lead byte has the top bit set would come out negative and never
          // be counted as multi-byte.
          const cv = iter.charValue >>> 0;

          if (cv > 0xff) {
            doubleByteCharCount++;
            if (this.commonChars != null) {
              // NOTE: This assumes that there are no 4-byte common chars.
              if (binarySearch(this.commonChars, cv) >= 0) {
                commonCharCount++;
              }
            }
          }
        }
        if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
          // Bail out early if the byte data is not matching the encoding scheme.
          break detectBlock;
        }
      }

      if (doubleByteCharCount <= 10 && badCharCount === 0) {
        // Not many multi-byte chars.
        if (doubleByteCharCount === 0 && totalCharCount < 10) {
          // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
          // We don't have enough data to have any confidence.
          // Statistical analysis of single byte non-ASCII characters would probably help here.
          confidence = 0;
        } else {
          // ASCII or ISO file? It's probably not our encoding,
          // but is not incompatible with our encoding, so don't give it a zero.
          confidence = 10;
        }
        break detectBlock;
      }

      //
      // No match if there are too many characters that don't fit the encoding scheme.
      // (should we have zero tolerance for these?)
      //
      if (doubleByteCharCount < 20 * badCharCount) {
        confidence = 0;
        break detectBlock;
      }

      if (this.commonChars == null) {
        // We have no statistics on frequently occurring characters.
        // Assess confidence purely on having a reasonable number of
        // multi-byte characters (the more the better).
        confidence = 30 + doubleByteCharCount - 20 * badCharCount;
        if (confidence > 100) {
          confidence = 100;
        }
      } else {
        // Frequency of occurrence statistics exist.
        const maxVal = Math.log(doubleByteCharCount / 4);
        const scaleFactor = 90.0 / maxVal;
        confidence = Math.floor(
          Math.log(commonCharCount + 1) * scaleFactor + 10,
        );
        confidence = Math.min(confidence, 100);
      }
    } // end of detectBlock:

    return confidence === 0 ? null : match(det, this, confidence);
  }

  /**
   * Get the next character (however many bytes it is) from the input data
   * Subclasses for specific charset encodings must implement this function
   * to get characters according to the rules of their encoding scheme.
   *
   * This function is not a method of class iteratedChar only because
   * that would require a lot of extra derived classes, which is awkward.
   * @param it The iteratedChar 'struct' into which the returned char is placed.
   * @param det The charset detector, which is needed to get at the input byte data
   *            being iterated over.
   * @return True if a character was returned, false at end of input.
   */
  nextChar(_iter: IteratedChar, _det: Context): boolean {
    return true;
  }
}

/**
 * Shift_JIS charset recognizer.
199 | */ 200 | export class sjis extends mbcs { 201 | name(): EncodingName { 202 | return 'Shift_JIS'; 203 | } 204 | 205 | language() { 206 | return 'ja'; 207 | } 208 | 209 | // TODO: This set of data comes from the character frequency- 210 | // of-occurrence analysis tool. The data needs to be moved 211 | // into a resource and loaded from there. 212 | commonChars = [ 213 | 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 214 | 0x82a0, 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 215 | 0x82b3, 0x82b5, 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 216 | 0x82c8, 0x82c9, 0x82cc, 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 217 | 0x82ea, 0x82f0, 0x82f1, 0x8341, 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 218 | 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 0x838a, 0x838b, 0x838d, 0x8393, 219 | 0x8e96, 0x93fa, 0x95aa, 220 | ]; 221 | 222 | nextChar(iter: IteratedChar, det: Context) { 223 | iter.index = iter.nextIndex; 224 | iter.error = false; 225 | 226 | const firstByte = (iter.charValue = iter.nextByte(det)); 227 | if (firstByte < 0) return false; 228 | 229 | if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf)) 230 | return true; 231 | 232 | const secondByte = iter.nextByte(det); 233 | if (secondByte < 0) return false; 234 | 235 | iter.charValue = (firstByte << 8) | secondByte; 236 | if ( 237 | !( 238 | (secondByte >= 0x40 && secondByte <= 0x7f) || 239 | (secondByte >= 0x80 && secondByte <= 0xff) 240 | ) 241 | ) { 242 | // Illegal second byte value. 243 | iter.error = true; 244 | } 245 | return true; 246 | } 247 | } 248 | 249 | /** 250 | * Big5 charset recognizer. 251 | */ 252 | export class big5 extends mbcs { 253 | name(): EncodingName { 254 | return 'Big5'; 255 | } 256 | 257 | language() { 258 | return 'zh'; 259 | } 260 | // TODO: This set of data comes from the character frequency- 261 | // of-occurrence analysis tool. The data needs to be moved 262 | // into a resource and loaded from there. 
263 | commonChars = [ 264 | 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 265 | 0xa446, 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 266 | 0xa477, 0xa4a3, 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 267 | 0xa4fd, 0xa540, 0xa548, 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 268 | 0xa662, 0xa668, 0xa670, 0xa6a8, 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 269 | 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 270 | 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 0xaa6b, 0xaaba, 0xaabe, 271 | 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 0xaec9, 0xafe0, 272 | 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 0xb5a5, 273 | 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 274 | 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f, 275 | ]; 276 | 277 | nextChar(iter: IteratedChar, det: Context) { 278 | iter.index = iter.nextIndex; 279 | iter.error = false; 280 | 281 | const firstByte = (iter.charValue = iter.nextByte(det)); 282 | 283 | if (firstByte < 0) return false; 284 | 285 | // single byte character. 286 | if (firstByte <= 0x7f || firstByte == 0xff) return true; 287 | 288 | const secondByte = iter.nextByte(det); 289 | 290 | if (secondByte < 0) return false; 291 | 292 | iter.charValue = (iter.charValue << 8) | secondByte; 293 | 294 | if (secondByte < 0x40 || secondByte == 0x7f || secondByte == 0xff) 295 | iter.error = true; 296 | 297 | return true; 298 | } 299 | } 300 | 301 | /** 302 | * EUC charset recognizers. One abstract class that provides the common function 303 | * for getting the next character according to the EUC encoding scheme, 304 | * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. 305 | * 306 | * Get the next character value for EUC based encodings. 307 | * Character 'value' is simply the raw bytes that make up the character 308 | * packed into an int. 
309 | */ 310 | function eucNextChar(iter: IteratedChar, det: Context) { 311 | iter.index = iter.nextIndex; 312 | iter.error = false; 313 | let firstByte = 0; 314 | let secondByte = 0; 315 | let thirdByte = 0; 316 | //int fourthByte = 0; 317 | buildChar: { 318 | firstByte = iter.charValue = iter.nextByte(det); 319 | if (firstByte < 0) { 320 | // Ran off the end of the input data 321 | iter.done = true; 322 | break buildChar; 323 | } 324 | if (firstByte <= 0x8d) { 325 | // single byte char 326 | break buildChar; 327 | } 328 | secondByte = iter.nextByte(det); 329 | iter.charValue = (iter.charValue << 8) | secondByte; 330 | if (firstByte >= 0xa1 && firstByte <= 0xfe) { 331 | // Two byte Char 332 | if (secondByte < 0xa1) { 333 | iter.error = true; 334 | } 335 | break buildChar; 336 | } 337 | if (firstByte == 0x8e) { 338 | // Code Set 2. 339 | // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 340 | // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 341 | // We don't know which we've got. 342 | // Treat it like EUC-JP. If the data really was EUC-TW, the following two 343 | // bytes will look like a well formed 2 byte char. 344 | if (secondByte < 0xa1) { 345 | iter.error = true; 346 | } 347 | break buildChar; 348 | } 349 | if (firstByte == 0x8f) { 350 | // Code set 3. 351 | // Three byte total char size, two bytes of actual char value. 352 | thirdByte = iter.nextByte(det); 353 | iter.charValue = (iter.charValue << 8) | thirdByte; 354 | if (thirdByte < 0xa1) { 355 | iter.error = true; 356 | } 357 | } 358 | } 359 | return iter.done == false; 360 | } 361 | 362 | /** 363 | * The charset recognize for EUC-JP. 
A singleton instance of this class 364 | * is created and kept by the public CharsetDetector class 365 | */ 366 | export class euc_jp extends mbcs { 367 | name(): EncodingName { 368 | return 'EUC-JP'; 369 | } 370 | 371 | language() { 372 | return 'ja'; 373 | } 374 | 375 | // TODO: This set of data comes from the character frequency- 376 | // of-occurrence analysis tool. The data needs to be moved 377 | // into a resource and loaded from there. 378 | commonChars = [ 379 | 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 380 | 0xa4a2, 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 381 | 0xa4b1, 0xa4b3, 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 382 | 0xa4c1, 0xa4c3, 0xa4c4, 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 383 | 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 384 | 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 385 | 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 0xa5b0, 0xa5b3, 0xa5b5, 386 | 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 0xa5c8, 0xa5c9, 387 | 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 0xa5e5, 388 | 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 389 | 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 390 | 0xcdd1, 391 | ]; 392 | 393 | nextChar = eucNextChar; 394 | } 395 | 396 | /** 397 | * The charset recognize for EUC-KR. A singleton instance of this class 398 | * is created and kept by the public CharsetDetector class 399 | */ 400 | export class euc_kr extends mbcs { 401 | name(): EncodingName { 402 | return 'EUC-KR'; 403 | } 404 | 405 | language() { 406 | return 'ko'; 407 | } 408 | 409 | // TODO: This set of data comes from the character frequency- 410 | // of-occurrence analysis tool. The data needs to be moved 411 | // into a resource and loaded from there. 
412 | commonChars = [ 413 | 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 414 | 0xb0fc, 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 415 | 0xb4cf, 0xb4d9, 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 416 | 0xb7af, 0xb7c2, 0xb7ce, 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 417 | 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 418 | 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 419 | 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 0xbef8, 0xbefa, 0xbfa1, 420 | 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 0xc0af, 0xc0b8, 421 | 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 0xc0da, 422 | 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 423 | 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 424 | 0xc8ad, 425 | ]; 426 | 427 | nextChar = eucNextChar; 428 | } 429 | 430 | /** 431 | * GB-18030 recognizer. Uses simplified Chinese statistics. 432 | */ 433 | export class gb_18030 extends mbcs { 434 | name(): EncodingName { 435 | return 'GB18030'; 436 | } 437 | 438 | language() { 439 | return 'zh'; 440 | } 441 | 442 | /* 443 | * Get the next character value for EUC based encodings. 444 | * Character 'value' is simply the raw bytes that make up the character 445 | * packed into an int. 
446 | */ 447 | 448 | nextChar(iter: IteratedChar, det: Context) { 449 | iter.index = iter.nextIndex; 450 | iter.error = false; 451 | let firstByte = 0; 452 | let secondByte = 0; 453 | let thirdByte = 0; 454 | let fourthByte = 0; 455 | buildChar: { 456 | firstByte = iter.charValue = iter.nextByte(det); 457 | if (firstByte < 0) { 458 | // Ran off the end of the input data 459 | iter.done = true; 460 | break buildChar; 461 | } 462 | if (firstByte <= 0x80) { 463 | // single byte char 464 | break buildChar; 465 | } 466 | secondByte = iter.nextByte(det); 467 | iter.charValue = (iter.charValue << 8) | secondByte; 468 | if (firstByte >= 0x81 && firstByte <= 0xfe) { 469 | // Two byte Char 470 | if ( 471 | (secondByte >= 0x40 && secondByte <= 0x7e) || 472 | (secondByte >= 80 && secondByte <= 0xfe) 473 | ) { 474 | break buildChar; 475 | } 476 | // Four byte char 477 | if (secondByte >= 0x30 && secondByte <= 0x39) { 478 | thirdByte = iter.nextByte(det); 479 | if (thirdByte >= 0x81 && thirdByte <= 0xfe) { 480 | fourthByte = iter.nextByte(det); 481 | if (fourthByte >= 0x30 && fourthByte <= 0x39) { 482 | iter.charValue = 483 | (iter.charValue << 16) | (thirdByte << 8) | fourthByte; 484 | break buildChar; 485 | } 486 | } 487 | } 488 | iter.error = true; 489 | break buildChar; 490 | } 491 | } 492 | return iter.done == false; 493 | } 494 | 495 | // TODO: This set of data comes from the character frequency- 496 | // of-occurrence analysis tool. The data needs to be moved 497 | // into a resource and loaded from there. 
498 | commonChars = [ 499 | 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 500 | 0xa3ac, 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 501 | 0xb5bd, 0xb5c4, 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 502 | 0xb7d6, 0xb7dd, 0xb8b4, 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 503 | 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 504 | 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 505 | 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 0xc7f8, 0xc8ab, 0xc8cb, 506 | 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 0xcad0, 0xcad6, 507 | 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 0xcfb5, 508 | 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 509 | 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 510 | 0xd6d0, 511 | ]; 512 | } 513 | -------------------------------------------------------------------------------- /src/encoding/sbcs.ts: -------------------------------------------------------------------------------- 1 | import type { Context, Recogniser } from '.'; 2 | import match, { type EncodingName, type Match } from '../match'; 3 | 4 | /** 5 | * This class recognizes single-byte encodings. Because the encoding scheme is so 6 | * simple, language statistics are used to do the matching. 7 | */ 8 | 9 | const N_GRAM_MASK = 0xffffff; 10 | 11 | class NGramParser { 12 | byteIndex: number = 0; 13 | ngram: number = 0; 14 | 15 | ngramCount: number = 0; 16 | hitCount: number = 0; 17 | 18 | ngramList: number[]; 19 | byteMap: number[]; 20 | 21 | // TODO: is it safe to set it like this? 22 | spaceChar: number = 0x20; 23 | 24 | constructor(theNgramList: number[], theByteMap: number[]) { 25 | this.ngramList = theNgramList; 26 | this.byteMap = theByteMap; 27 | } 28 | 29 | /* 30 | * Binary search for value in table, which must have exactly 64 entries. 
31 | */ 32 | search(table: number[], value: number) { 33 | let index = 0; 34 | 35 | if (table[index + 32] <= value) index += 32; 36 | if (table[index + 16] <= value) index += 16; 37 | if (table[index + 8] <= value) index += 8; 38 | if (table[index + 4] <= value) index += 4; 39 | if (table[index + 2] <= value) index += 2; 40 | if (table[index + 1] <= value) index += 1; 41 | if (table[index] > value) index -= 1; 42 | 43 | if (index < 0 || table[index] != value) return -1; 44 | 45 | return index; 46 | } 47 | 48 | lookup(thisNgram: number) { 49 | this.ngramCount += 1; 50 | if (this.search(this.ngramList, thisNgram) >= 0) { 51 | this.hitCount += 1; 52 | } 53 | } 54 | 55 | addByte(b: number) { 56 | this.ngram = ((this.ngram << 8) + (b & 0xff)) & N_GRAM_MASK; 57 | this.lookup(this.ngram); 58 | } 59 | 60 | nextByte(det: Context) { 61 | if (this.byteIndex >= det.inputLen) return -1; 62 | 63 | return det.inputBytes[this.byteIndex++] & 0xff; 64 | } 65 | 66 | parse(det: Context, spaceCh: number) { 67 | let b, 68 | ignoreSpace = false; 69 | this.spaceChar = spaceCh; 70 | 71 | while ((b = this.nextByte(det)) >= 0) { 72 | const mb = this.byteMap[b]; 73 | 74 | // TODO: 0x20 might not be a space in all character sets... 75 | if (mb != 0) { 76 | if (!(mb == this.spaceChar && ignoreSpace)) { 77 | this.addByte(mb); 78 | } 79 | 80 | ignoreSpace = mb == this.spaceChar; 81 | } 82 | } 83 | 84 | // TODO: Is this OK? The buffer could have ended in the middle of a word... 85 | this.addByte(this.spaceChar); 86 | 87 | const rawPercent = this.hitCount / this.ngramCount; 88 | 89 | // TODO - This is a bit of a hack to take care of a case 90 | // were we were getting a confidence of 135... 
91 | if (rawPercent > 0.33) return 98; 92 | 93 | return Math.floor(rawPercent * 300.0); 94 | } 95 | } 96 | 97 | class NGramsPlusLang { 98 | fLang: string; 99 | fNGrams: number[]; 100 | 101 | constructor(la: string, ng: number[]) { 102 | this.fLang = la; 103 | this.fNGrams = ng; 104 | } 105 | } 106 | 107 | const isFlatNgrams = (val: NGramsPlusLang[] | number[]): val is number[] => 108 | Array.isArray(val) && isFinite(val[0] as number); 109 | 110 | class sbcs implements Recogniser { 111 | spaceChar = 0x20; 112 | 113 | private nGramLang?: string = undefined; 114 | 115 | ngrams(): NGramsPlusLang[] | number[] { 116 | return []; 117 | } 118 | 119 | byteMap(): number[] { 120 | return []; 121 | } 122 | 123 | name(_input: Context): EncodingName { 124 | return 'sbcs'; 125 | } 126 | 127 | language(): string | undefined { 128 | return this.nGramLang; 129 | } 130 | 131 | match(det: Context): Match | null { 132 | // This feels a bit dirty. Simpler alternative would be 133 | // splitting classes ISO_8859_1 etc into language-specific ones 134 | // with hardcoded languages like ISO_8859_9. 135 | this.nGramLang = undefined; 136 | 137 | const ngrams = this.ngrams(); 138 | 139 | if (isFlatNgrams(ngrams)) { 140 | const parser = new NGramParser(ngrams, this.byteMap()); 141 | const confidence = parser.parse(det, this.spaceChar); 142 | return confidence <= 0 ? null : match(det, this, confidence); 143 | } 144 | 145 | let bestConfidence = -1; 146 | 147 | for (let i = ngrams.length - 1; i >= 0; i--) { 148 | const ngl = ngrams[i]; 149 | 150 | const parser = new NGramParser(ngl.fNGrams, this.byteMap()); 151 | const confidence = parser.parse(det, this.spaceChar); 152 | if (confidence > bestConfidence) { 153 | bestConfidence = confidence; 154 | this.nGramLang = ngl.fLang; 155 | } 156 | } 157 | 158 | return bestConfidence <= 0 ? 
null : match(det, this, bestConfidence); 159 | } 160 | } 161 | 162 | export class ISO_8859_1 extends sbcs { 163 | byteMap() { 164 | return [ 165 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 166 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 167 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 168 | 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 169 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 170 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 171 | 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 172 | 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 173 | 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 174 | 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 175 | 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 176 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 177 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 178 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 179 | 0x20, 0x20, 0xaa, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 180 | 0x20, 0xb5, 0x20, 0x20, 0x20, 0x20, 0xba, 0x20, 0x20, 0x20, 0x20, 0x20, 181 | 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 182 | 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x20, 183 | 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 184 | 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 185 | 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x20, 0xf8, 0xf9, 0xfa, 0xfb, 186 | 0xfc, 0xfd, 0xfe, 0xff, 187 | ]; 188 | } 189 | 190 | ngrams() { 191 | return [ 192 | new NGramsPlusLang( 193 | 'da', 194 | [ 195 | 0x206166, 0x206174, 0x206465, 0x20656e, 0x206572, 0x20666f, 0x206861, 196 | 
0x206920, 0x206d65, 0x206f67, 0x2070e5, 0x207369, 0x207374, 0x207469, 197 | 0x207669, 0x616620, 0x616e20, 0x616e64, 0x617220, 0x617420, 0x646520, 198 | 0x64656e, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656e20, 199 | 0x656e64, 0x657220, 0x657265, 0x657320, 0x657420, 0x666f72, 0x676520, 200 | 0x67656e, 0x676572, 0x696765, 0x696c20, 0x696e67, 0x6b6520, 0x6b6b65, 201 | 0x6c6572, 0x6c6967, 0x6c6c65, 0x6d6564, 0x6e6465, 0x6e6520, 0x6e6720, 202 | 0x6e6765, 0x6f6720, 0x6f6d20, 0x6f7220, 0x70e520, 0x722064, 0x722065, 203 | 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696c, 204 | 0x766572, 205 | ], 206 | ), 207 | new NGramsPlusLang( 208 | 'de', 209 | [ 210 | 0x20616e, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 211 | 0x206765, 0x206861, 0x20696e, 0x206d69, 0x207363, 0x207365, 0x20756e, 212 | 0x207665, 0x20766f, 0x207765, 0x207a75, 0x626572, 0x636820, 0x636865, 213 | 0x636874, 0x646173, 0x64656e, 0x646572, 0x646965, 0x652064, 0x652073, 214 | 0x65696e, 0x656974, 0x656e20, 0x657220, 0x657320, 0x67656e, 0x68656e, 215 | 0x687420, 0x696368, 0x696520, 0x696e20, 0x696e65, 0x697420, 0x6c6963, 216 | 0x6c6c65, 0x6e2061, 0x6e2064, 0x6e2073, 0x6e6420, 0x6e6465, 0x6e6520, 217 | 0x6e6720, 0x6e6765, 0x6e7465, 0x722064, 0x726465, 0x726569, 0x736368, 218 | 0x737465, 0x742064, 0x746520, 0x74656e, 0x746572, 0x756e64, 0x756e67, 219 | 0x766572, 220 | ], 221 | ), 222 | new NGramsPlusLang( 223 | 'en', 224 | [ 225 | 0x206120, 0x20616e, 0x206265, 0x20636f, 0x20666f, 0x206861, 0x206865, 226 | 0x20696e, 0x206d61, 0x206f66, 0x207072, 0x207265, 0x207361, 0x207374, 227 | 0x207468, 0x20746f, 0x207768, 0x616964, 0x616c20, 0x616e20, 0x616e64, 228 | 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 229 | 0x652073, 0x652074, 0x656420, 0x656e74, 0x657220, 0x657320, 0x666f72, 230 | 0x686174, 0x686520, 0x686572, 0x696420, 0x696e20, 0x696e67, 0x696f6e, 231 | 0x697320, 0x6e2061, 0x6e2074, 0x6e6420, 0x6e6720, 0x6e7420, 0x6f6620, 232 | 0x6f6e20, 
0x6f7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 233 | 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696f, 0x746f20, 234 | 0x747320, 235 | ], 236 | ), 237 | new NGramsPlusLang( 238 | 'es', 239 | [ 240 | 0x206120, 0x206361, 0x20636f, 0x206465, 0x20656c, 0x20656e, 0x206573, 241 | 0x20696e, 0x206c61, 0x206c6f, 0x207061, 0x20706f, 0x207072, 0x207175, 242 | 0x207265, 0x207365, 0x20756e, 0x207920, 0x612063, 0x612064, 0x612065, 243 | 0x61206c, 0x612070, 0x616369, 0x61646f, 0x616c20, 0x617220, 0x617320, 244 | 0x6369f3, 0x636f6e, 0x646520, 0x64656c, 0x646f20, 0x652064, 0x652065, 245 | 0x65206c, 0x656c20, 0x656e20, 0x656e74, 0x657320, 0x657374, 0x69656e, 246 | 0x69f36e, 0x6c6120, 0x6c6f73, 0x6e2065, 0x6e7465, 0x6f2064, 0x6f2065, 247 | 0x6f6e20, 0x6f7220, 0x6f7320, 0x706172, 0x717565, 0x726120, 0x726573, 248 | 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746f20, 0x756520, 249 | 0xf36e20, 250 | ], 251 | ), 252 | new NGramsPlusLang( 253 | 'fr', 254 | [ 255 | 0x206175, 0x20636f, 0x206461, 0x206465, 0x206475, 0x20656e, 0x206574, 256 | 0x206c61, 0x206c65, 0x207061, 0x20706f, 0x207072, 0x207175, 0x207365, 257 | 0x20736f, 0x20756e, 0x20e020, 0x616e74, 0x617469, 0x636520, 0x636f6e, 258 | 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 259 | 0x65206c, 0x652070, 0x652073, 0x656e20, 0x656e74, 0x657220, 0x657320, 260 | 0x657420, 0x657572, 0x696f6e, 0x697320, 0x697420, 0x6c6120, 0x6c6520, 261 | 0x6c6573, 0x6d656e, 0x6e2064, 0x6e6520, 0x6e7320, 0x6e7420, 0x6f6e20, 262 | 0x6f6e74, 0x6f7572, 0x717565, 0x72206c, 0x726520, 0x732061, 0x732064, 263 | 0x732065, 0x73206c, 0x732070, 0x742064, 0x746520, 0x74696f, 0x756520, 264 | 0x757220, 265 | ], 266 | ), 267 | new NGramsPlusLang( 268 | 'it', 269 | [ 270 | 0x20616c, 0x206368, 0x20636f, 0x206465, 0x206469, 0x206520, 0x20696c, 271 | 0x20696e, 0x206c61, 0x207065, 0x207072, 0x20756e, 0x612063, 0x612064, 272 | 0x612070, 0x612073, 0x61746f, 0x636865, 0x636f6e, 0x64656c, 0x646920, 273 | 0x652061, 0x652063, 
0x652064, 0x652069, 0x65206c, 0x652070, 0x652073, 274 | 0x656c20, 0x656c6c, 0x656e74, 0x657220, 0x686520, 0x692061, 0x692063, 275 | 0x692064, 0x692073, 0x696120, 0x696c20, 0x696e20, 0x696f6e, 0x6c6120, 276 | 0x6c6520, 0x6c6920, 0x6c6c61, 0x6e6520, 0x6e6920, 0x6e6f20, 0x6e7465, 277 | 0x6f2061, 0x6f2064, 0x6f2069, 0x6f2073, 0x6f6e20, 0x6f6e65, 0x706572, 278 | 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746f20, 279 | 0x7a696f, 280 | ], 281 | ), 282 | new NGramsPlusLang( 283 | 'nl', 284 | [ 285 | 0x20616c, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656e, 286 | 0x206765, 0x206865, 0x20696e, 0x206d61, 0x206d65, 0x206f70, 0x207465, 287 | 0x207661, 0x207665, 0x20766f, 0x207765, 0x207a69, 0x61616e, 0x616172, 288 | 0x616e20, 0x616e64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656e, 289 | 0x646572, 0x652062, 0x652076, 0x65656e, 0x656572, 0x656e20, 0x657220, 290 | 0x657273, 0x657420, 0x67656e, 0x686574, 0x696520, 0x696e20, 0x696e67, 291 | 0x697320, 0x6e2062, 0x6e2064, 0x6e2065, 0x6e2068, 0x6e206f, 0x6e2076, 292 | 0x6e6465, 0x6e6720, 0x6f6e64, 0x6f6f72, 0x6f7020, 0x6f7220, 0x736368, 293 | 0x737465, 0x742064, 0x746520, 0x74656e, 0x746572, 0x76616e, 0x766572, 294 | 0x766f6f, 295 | ], 296 | ), 297 | new NGramsPlusLang( 298 | 'no', 299 | [ 300 | 0x206174, 0x206176, 0x206465, 0x20656e, 0x206572, 0x20666f, 0x206861, 301 | 0x206920, 0x206d65, 0x206f67, 0x2070e5, 0x207365, 0x20736b, 0x20736f, 302 | 0x207374, 0x207469, 0x207669, 0x20e520, 0x616e64, 0x617220, 0x617420, 303 | 0x646520, 0x64656e, 0x646574, 0x652073, 0x656420, 0x656e20, 0x656e65, 304 | 0x657220, 0x657265, 0x657420, 0x657474, 0x666f72, 0x67656e, 0x696b6b, 305 | 0x696c20, 0x696e67, 0x6b6520, 0x6b6b65, 0x6c6520, 0x6c6c65, 0x6d6564, 306 | 0x6d656e, 0x6e2073, 0x6e6520, 0x6e6720, 0x6e6765, 0x6e6e65, 0x6f6720, 307 | 0x6f6d20, 0x6f7220, 0x70e520, 0x722073, 0x726520, 0x736f6d, 0x737465, 308 | 0x742073, 0x746520, 0x74656e, 0x746572, 0x74696c, 0x747420, 0x747465, 309 | 0x766572, 310 | ], 311 | ), 312 | 
new NGramsPlusLang( 313 | 'pt', 314 | [ 315 | 0x206120, 0x20636f, 0x206461, 0x206465, 0x20646f, 0x206520, 0x206573, 316 | 0x206d61, 0x206e6f, 0x206f20, 0x207061, 0x20706f, 0x207072, 0x207175, 317 | 0x207265, 0x207365, 0x20756d, 0x612061, 0x612063, 0x612064, 0x612070, 318 | 0x616465, 0x61646f, 0x616c20, 0x617220, 0x617261, 0x617320, 0x636f6d, 319 | 0x636f6e, 0x646120, 0x646520, 0x646f20, 0x646f73, 0x652061, 0x652064, 320 | 0x656d20, 0x656e74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6d656e, 321 | 0x6e7465, 0x6e746f, 0x6f2061, 0x6f2063, 0x6f2064, 0x6f2065, 0x6f2070, 322 | 0x6f7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 323 | 0x732065, 0x732070, 0x737461, 0x746520, 0x746f20, 0x756520, 0xe36f20, 324 | 0xe7e36f, 325 | ], 326 | ), 327 | new NGramsPlusLang( 328 | 'sv', 329 | [ 330 | 0x206174, 0x206176, 0x206465, 0x20656e, 0x2066f6, 0x206861, 0x206920, 331 | 0x20696e, 0x206b6f, 0x206d65, 0x206f63, 0x2070e5, 0x20736b, 0x20736f, 332 | 0x207374, 0x207469, 0x207661, 0x207669, 0x20e472, 0x616465, 0x616e20, 333 | 0x616e64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656e, 0x646572, 334 | 0x646574, 0x656420, 0x656e20, 0x657220, 0x657420, 0x66f672, 0x67656e, 335 | 0x696c6c, 0x696e67, 0x6b6120, 0x6c6c20, 0x6d6564, 0x6e2073, 0x6e6120, 336 | 0x6e6465, 0x6e6720, 0x6e6765, 0x6e696e, 0x6f6368, 0x6f6d20, 0x6f6e20, 337 | 0x70e520, 0x722061, 0x722073, 0x726120, 0x736b61, 0x736f6d, 0x742073, 338 | 0x746120, 0x746520, 0x746572, 0x74696c, 0x747420, 0x766172, 0xe47220, 339 | 0xf67220, 340 | ], 341 | ), 342 | ]; 343 | } 344 | 345 | name(input: Context): EncodingName { 346 | return input && input.c1Bytes ? 
'windows-1252' : 'ISO-8859-1'; 347 | } 348 | } 349 | 350 | export class ISO_8859_2 extends sbcs { 351 | byteMap() { 352 | return [ 353 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 354 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 355 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 356 | 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 357 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 358 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 359 | 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 360 | 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 361 | 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 362 | 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 363 | 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 364 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 365 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 366 | 0x20, 0x20, 0x20, 0x20, 0x20, 0xb1, 0x20, 0xb3, 0x20, 0xb5, 0xb6, 0x20, 367 | 0x20, 0xb9, 0xba, 0xbb, 0xbc, 0x20, 0xbe, 0xbf, 0x20, 0xb1, 0x20, 0xb3, 368 | 0x20, 0xb5, 0xb6, 0xb7, 0x20, 0xb9, 0xba, 0xbb, 0xbc, 0x20, 0xbe, 0xbf, 369 | 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 370 | 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x20, 371 | 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 372 | 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 373 | 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x20, 0xf8, 0xf9, 0xfa, 0xfb, 374 | 0xfc, 0xfd, 0xfe, 0x20, 375 | ]; 376 | } 377 | 378 | ngrams() { 379 | return [ 380 | new NGramsPlusLang( 381 | 'cs', 382 | [ 383 | 0x206120, 0x206279, 0x20646f, 0x206a65, 0x206e61, 0x206e65, 0x206f20, 384 | 0x206f64, 
0x20706f, 0x207072, 0x2070f8, 0x20726f, 0x207365, 0x20736f, 385 | 0x207374, 0x20746f, 0x207620, 0x207679, 0x207a61, 0x612070, 0x636520, 386 | 0x636820, 0x652070, 0x652073, 0x652076, 0x656d20, 0x656eed, 0x686f20, 387 | 0x686f64, 0x697374, 0x6a6520, 0x6b7465, 0x6c6520, 0x6c6920, 0x6e6120, 388 | 0x6ee920, 0x6eec20, 0x6eed20, 0x6f2070, 0x6f646e, 0x6f6a69, 0x6f7374, 389 | 0x6f7520, 0x6f7661, 0x706f64, 0x706f6a, 0x70726f, 0x70f865, 0x736520, 390 | 0x736f75, 0x737461, 0x737469, 0x73746e, 0x746572, 0x746eed, 0x746f20, 391 | 0x752070, 0xbe6520, 0xe16eed, 0xe9686f, 0xed2070, 0xed2073, 0xed6d20, 392 | 0xf86564, 393 | ], 394 | ), 395 | new NGramsPlusLang( 396 | 'hu', 397 | [ 398 | 0x206120, 0x20617a, 0x206265, 0x206567, 0x20656c, 0x206665, 0x206861, 399 | 0x20686f, 0x206973, 0x206b65, 0x206b69, 0x206bf6, 0x206c65, 0x206d61, 400 | 0x206d65, 0x206d69, 0x206e65, 0x20737a, 0x207465, 0x20e973, 0x612061, 401 | 0x61206b, 0x61206d, 0x612073, 0x616b20, 0x616e20, 0x617a20, 0x62616e, 402 | 0x62656e, 0x656779, 0x656b20, 0x656c20, 0x656c65, 0x656d20, 0x656e20, 403 | 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686f67, 0x696e74, 404 | 0x697320, 0x6b2061, 0x6bf67a, 0x6d6567, 0x6d696e, 0x6e2061, 0x6e616b, 405 | 0x6e656b, 0x6e656d, 0x6e7420, 0x6f6779, 0x732061, 0x737a65, 0x737a74, 406 | 0x737ae1, 0x73e967, 0x742061, 0x747420, 0x74e173, 0x7a6572, 0xe16e20, 407 | 0xe97320, 408 | ], 409 | ), 410 | new NGramsPlusLang( 411 | 'pl', 412 | [ 413 | 0x20637a, 0x20646f, 0x206920, 0x206a65, 0x206b6f, 0x206d61, 0x206d69, 414 | 0x206e61, 0x206e69, 0x206f64, 0x20706f, 0x207072, 0x207369, 0x207720, 415 | 0x207769, 0x207779, 0x207a20, 0x207a61, 0x612070, 0x612077, 0x616e69, 416 | 0x636820, 0x637a65, 0x637a79, 0x646f20, 0x647a69, 0x652070, 0x652073, 417 | 0x652077, 0x65207a, 0x65676f, 0x656a20, 0x656d20, 0x656e69, 0x676f20, 418 | 0x696120, 0x696520, 0x69656a, 0x6b6120, 0x6b6920, 0x6b6965, 0x6d6965, 419 | 0x6e6120, 0x6e6961, 0x6e6965, 0x6f2070, 0x6f7761, 0x6f7769, 0x706f6c, 420 | 0x707261, 0x70726f, 
0x70727a, 0x727a65, 0x727a79, 0x7369ea, 0x736b69, 421 | 0x737461, 0x776965, 0x796368, 0x796d20, 0x7a6520, 0x7a6965, 0x7a7920, 422 | 0xf37720, 423 | ], 424 | ), 425 | new NGramsPlusLang( 426 | 'ro', 427 | [ 428 | 0x206120, 0x206163, 0x206361, 0x206365, 0x20636f, 0x206375, 0x206465, 429 | 0x206469, 0x206c61, 0x206d61, 0x207065, 0x207072, 0x207365, 0x2073e3, 430 | 0x20756e, 0x20ba69, 0x20ee6e, 0x612063, 0x612064, 0x617265, 0x617420, 431 | 0x617465, 0x617520, 0x636172, 0x636f6e, 0x637520, 0x63e320, 0x646520, 432 | 0x652061, 0x652063, 0x652064, 0x652070, 0x652073, 0x656120, 0x656920, 433 | 0x656c65, 0x656e74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 434 | 0x696520, 0x696920, 0x696e20, 0x6c6120, 0x6c6520, 0x6c6f72, 0x6c7569, 435 | 0x6e6520, 0x6e7472, 0x6f7220, 0x70656e, 0x726520, 0x726561, 0x727520, 436 | 0x73e320, 0x746520, 0x747275, 0x74e320, 0x756920, 0x756c20, 0xba6920, 437 | 0xee6e20, 438 | ], 439 | ), 440 | ]; 441 | } 442 | 443 | name(det: Context): EncodingName { 444 | return det && det.c1Bytes ? 
'windows-1250' : 'ISO-8859-2'; 445 | } 446 | } 447 | 448 | export class ISO_8859_5 extends sbcs { 449 | byteMap() { 450 | return [ 451 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 452 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 453 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 454 | 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 455 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 456 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 457 | 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 458 | 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 459 | 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 460 | 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 461 | 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 462 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 463 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 464 | 0x20, 0x20, 0x20, 0x20, 0x20, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 465 | 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x20, 0xfe, 0xff, 0xd0, 0xd1, 0xd2, 0xd3, 466 | 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 467 | 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 468 | 0xec, 0xed, 0xee, 0xef, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 469 | 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 470 | 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 471 | 0x20, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 472 | 0xfc, 0x20, 0xfe, 0xff, 473 | ]; 474 | } 475 | 476 | ngrams() { 477 | return [ 478 | 0x20d220, 0x20d2de, 0x20d4de, 0x20d7d0, 0x20d820, 0x20dad0, 0x20dade, 479 | 0x20ddd0, 0x20ddd5, 0x20ded1, 0x20dfde, 0x20dfe0, 0x20e0d0, 
0x20e1de, 480 | 0x20e1e2, 0x20e2de, 0x20e7e2, 0x20ede2, 0xd0ddd8, 0xd0e2ec, 0xd3de20, 481 | 0xd5dbec, 0xd5ddd8, 0xd5e1e2, 0xd5e220, 0xd820df, 0xd8d520, 0xd8d820, 482 | 0xd8ef20, 0xdbd5dd, 0xdbd820, 0xdbecdd, 0xddd020, 0xddd520, 0xddd8d5, 483 | 0xddd8ef, 0xddde20, 0xddded2, 0xde20d2, 0xde20df, 0xde20e1, 0xded220, 484 | 0xded2d0, 0xded3de, 0xded920, 0xdedbec, 0xdedc20, 0xdee1e2, 0xdfdedb, 485 | 0xdfe0d5, 0xdfe0d8, 0xdfe0de, 0xe0d0d2, 0xe0d5d4, 0xe1e2d0, 0xe1e2d2, 486 | 0xe1e2d8, 0xe1ef20, 0xe2d5db, 0xe2de20, 0xe2dee0, 0xe2ec20, 0xe7e2de, 487 | 0xebe520, 488 | ]; 489 | } 490 | 491 | name(): EncodingName { 492 | return 'ISO-8859-5'; 493 | } 494 | 495 | language() { 496 | return 'ru'; 497 | } 498 | } 499 | 500 | export class ISO_8859_6 extends sbcs { 501 | byteMap() { 502 | return [ 503 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 504 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 505 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 506 | 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 507 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 508 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 509 | 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 510 | 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 511 | 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 512 | 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 513 | 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 514 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 515 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 516 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 517 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 518 | 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 519 | 0x20, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 520 | 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 521 | 0xd8, 0xd9, 0xda, 0x20, 0x20, 0x20, 0x20, 0x20, 0xe0, 0xe1, 0xe2, 0xe3, 522 | 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0x20, 0x20, 0x20, 0x20, 0x20, 523 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 524 | 0x20, 0x20, 0x20, 0x20, 525 | ]; 526 | } 527 | 528 | ngrams() { 529 | return [ 530 | 0x20c7e4, 0x20c7e6, 0x20c8c7, 0x20d9e4, 0x20e1ea, 0x20e4e4, 0x20e5e6, 531 | 0x20e8c7, 0xc720c7, 0xc7c120, 0xc7ca20, 0xc7d120, 0xc7e420, 0xc7e4c3, 532 | 0xc7e4c7, 0xc7e4c8, 0xc7e4ca, 0xc7e4cc, 0xc7e4cd, 0xc7e4cf, 0xc7e4d3, 533 | 0xc7e4d9, 0xc7e4e2, 0xc7e4e5, 0xc7e4e8, 0xc7e4ea, 0xc7e520, 0xc7e620, 534 | 0xc7e6ca, 0xc820c7, 0xc920c7, 0xc920e1, 0xc920e4, 0xc920e5, 0xc920e8, 535 | 0xca20c7, 0xcf20c7, 0xcfc920, 0xd120c7, 0xd1c920, 0xd320c7, 0xd920c7, 536 | 0xd9e4e9, 0xe1ea20, 0xe420c7, 0xe4c920, 0xe4e920, 0xe4ea20, 0xe520c7, 537 | 0xe5c720, 0xe5c920, 0xe5e620, 0xe620c7, 0xe720c7, 0xe7c720, 0xe8c7e4, 538 | 0xe8e620, 0xe920c7, 0xea20c7, 0xea20e5, 0xea20e8, 0xeac920, 0xead120, 539 | 0xeae620, 540 | ]; 541 | } 542 | 543 | name(): EncodingName { 544 | return 'ISO-8859-6'; 545 | } 546 | 547 | language() { 548 | return 'ar'; 549 | } 550 | } 551 | 552 | export class ISO_8859_7 extends sbcs { 553 | byteMap() { 554 | return [ 555 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 556 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 557 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 558 | 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 559 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 560 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 561 | 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 562 | 
0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 563 | 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 564 | 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 565 | 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 566 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 567 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 568 | 0x20, 0x20, 0x20, 0x20, 0x20, 0xa1, 0xa2, 0x20, 0x20, 0x20, 0x20, 0x20, 569 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 570 | 0x20, 0x20, 0xdc, 0x20, 0xdd, 0xde, 0xdf, 0x20, 0xfc, 0x20, 0xfd, 0xfe, 571 | 0xc0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 572 | 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0x20, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 573 | 0xf8, 0xf9, 0xfa, 0xfb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 574 | 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 575 | 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 576 | 0xfc, 0xfd, 0xfe, 0x20, 577 | ]; 578 | } 579 | 580 | ngrams() { 581 | return [ 582 | 0x20e1ed, 0x20e1f0, 0x20e3e9, 0x20e4e9, 0x20e5f0, 0x20e720, 0x20eae1, 583 | 0x20ece5, 0x20ede1, 0x20ef20, 0x20f0e1, 0x20f0ef, 0x20f0f1, 0x20f3f4, 584 | 0x20f3f5, 0x20f4e7, 0x20f4ef, 0xdfe120, 0xe120e1, 0xe120f4, 0xe1e920, 585 | 0xe1ed20, 0xe1f0fc, 0xe1f220, 0xe3e9e1, 0xe5e920, 0xe5f220, 0xe720f4, 586 | 0xe7ed20, 0xe7f220, 0xe920f4, 0xe9e120, 0xe9eade, 0xe9f220, 0xeae1e9, 587 | 0xeae1f4, 0xece520, 0xed20e1, 0xed20e5, 0xed20f0, 0xede120, 0xeff220, 588 | 0xeff520, 0xf0eff5, 0xf0f1ef, 0xf0fc20, 0xf220e1, 0xf220e5, 0xf220ea, 589 | 0xf220f0, 0xf220f4, 0xf3e520, 0xf3e720, 0xf3f4ef, 0xf4e120, 0xf4e1e9, 590 | 0xf4e7ed, 0xf4e7f2, 0xf4e9ea, 0xf4ef20, 0xf4eff5, 0xf4f9ed, 0xf9ed20, 591 | 0xfeed20, 592 | ]; 593 | } 594 | 595 | name(det: Context): EncodingName { 596 | return det && det.c1Bytes ? 
'windows-1253' : 'ISO-8859-7'; 597 | } 598 | 599 | language() { 600 | return 'el'; 601 | } 602 | } 603 | 604 | export class ISO_8859_8 extends sbcs { 605 | byteMap() { 606 | return [ 607 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 608 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 609 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 610 | 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 611 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 612 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 613 | 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 614 | 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 615 | 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 616 | 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 617 | 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 618 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 619 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 620 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 621 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 622 | 0x20, 0xb5, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 623 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 624 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 625 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xe0, 0xe1, 0xe2, 0xe3, 626 | 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 627 | 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0x20, 628 | 0x20, 0x20, 0x20, 0x20, 629 | ]; 630 | } 631 | 632 | ngrams() { 633 | return [ 634 | new NGramsPlusLang( 635 | 'he', 636 | [ 637 | 0x20e0e5, 0x20e0e7, 0x20e0e9, 0x20e0fa, 
0x20e1e9, 0x20e1ee, 0x20e4e0, 638 | 0x20e4e5, 0x20e4e9, 0x20e4ee, 0x20e4f2, 0x20e4f9, 0x20e4fa, 0x20ece0, 639 | 0x20ece4, 0x20eee0, 0x20f2ec, 0x20f9ec, 0xe0fa20, 0xe420e0, 0xe420e1, 640 | 0xe420e4, 0xe420ec, 0xe420ee, 0xe420f9, 0xe4e5e0, 0xe5e020, 0xe5ed20, 641 | 0xe5ef20, 0xe5f820, 0xe5fa20, 0xe920e4, 0xe9e420, 0xe9e5fa, 0xe9e9ed, 642 | 0xe9ed20, 0xe9ef20, 0xe9f820, 0xe9fa20, 0xec20e0, 0xec20e4, 0xece020, 643 | 0xece420, 0xed20e0, 0xed20e1, 0xed20e4, 0xed20ec, 0xed20ee, 0xed20f9, 644 | 0xeee420, 0xef20e4, 0xf0e420, 0xf0e920, 0xf0e9ed, 0xf2ec20, 0xf820e4, 645 | 0xf8e9ed, 0xf9ec20, 0xfa20e0, 0xfa20e1, 0xfa20e4, 0xfa20ec, 0xfa20ee, 646 | 0xfa20f9, 647 | ], 648 | ), 649 | new NGramsPlusLang( 650 | 'he', 651 | [ 652 | 0x20e0e5, 0x20e0ec, 0x20e4e9, 0x20e4ec, 0x20e4ee, 0x20e4f0, 0x20e9f0, 653 | 0x20ecf2, 0x20ecf9, 0x20ede5, 0x20ede9, 0x20efe5, 0x20efe9, 0x20f8e5, 654 | 0x20f8e9, 0x20fae0, 0x20fae5, 0x20fae9, 0xe020e4, 0xe020ec, 0xe020ed, 655 | 0xe020fa, 0xe0e420, 0xe0e5e4, 0xe0ec20, 0xe0ee20, 0xe120e4, 0xe120ed, 656 | 0xe120fa, 0xe420e4, 0xe420e9, 0xe420ec, 0xe420ed, 0xe420ef, 0xe420f8, 657 | 0xe420fa, 0xe4ec20, 0xe5e020, 0xe5e420, 0xe7e020, 0xe9e020, 0xe9e120, 658 | 0xe9e420, 0xec20e4, 0xec20ed, 0xec20fa, 0xecf220, 0xecf920, 0xede9e9, 659 | 0xede9f0, 0xede9f8, 0xee20e4, 0xee20ed, 0xee20fa, 0xeee120, 0xeee420, 660 | 0xf2e420, 0xf920e4, 0xf920ed, 0xf920fa, 0xf9e420, 0xfae020, 0xfae420, 661 | 0xfae5e9, 662 | ], 663 | ), 664 | ]; 665 | } 666 | 667 | name(det: Context): EncodingName { 668 | return det && det.c1Bytes ? 
'windows-1255' : 'ISO-8859-8'; 669 | } 670 | 671 | language() { 672 | return 'he'; 673 | } 674 | } 675 | 676 | export class ISO_8859_9 extends sbcs { 677 | byteMap() { 678 | return [ 679 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 680 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 681 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 682 | 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 683 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 684 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 685 | 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 686 | 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 687 | 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 688 | 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 689 | 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 690 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 691 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 692 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 693 | 0x20, 0x20, 0xaa, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 694 | 0x20, 0xb5, 0x20, 0x20, 0x20, 0x20, 0xba, 0x20, 0x20, 0x20, 0x20, 0x20, 695 | 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 696 | 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x20, 697 | 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x69, 0xfe, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 698 | 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 699 | 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x20, 0xf8, 0xf9, 0xfa, 0xfb, 700 | 0xfc, 0xfd, 0xfe, 0xff, 701 | ]; 702 | } 703 | 704 | ngrams() { 705 | return [ 706 | 0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 707 | 0x20696c, 
0x206b61, 0x206b6f, 0x206d61, 0x206f6c, 0x207361, 0x207461, 708 | 0x207665, 0x207961, 0x612062, 0x616b20, 0x616c61, 0x616d61, 0x616e20, 709 | 0x616efd, 0x617220, 0x617261, 0x6172fd, 0x6173fd, 0x617961, 0x626972, 710 | 0x646120, 0x646520, 0x646920, 0x652062, 0x65206b, 0x656469, 0x656e20, 711 | 0x657220, 0x657269, 0x657369, 0x696c65, 0x696e20, 0x696e69, 0x697220, 712 | 0x6c616e, 0x6c6172, 0x6c6520, 0x6c6572, 0x6e2061, 0x6e2062, 0x6e206b, 713 | 0x6e6461, 0x6e6465, 0x6e6520, 0x6e6920, 0x6e696e, 0x6efd20, 0x72696e, 714 | 0x72fd6e, 0x766520, 0x796120, 0x796f72, 0xfd6e20, 0xfd6e64, 0xfd6efd, 715 | 0xfdf0fd, 716 | ]; 717 | } 718 | 719 | name(det: Context): EncodingName { 720 | return det && det.c1Bytes ? 'windows-1254' : 'ISO-8859-9'; 721 | } 722 | 723 | language() { 724 | return 'tr'; 725 | } 726 | } 727 | 728 | export class windows_1251 extends sbcs { 729 | byteMap() { 730 | return [ 731 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 732 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 733 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 734 | 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 735 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 736 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 737 | 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 738 | 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 739 | 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 740 | 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 741 | 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x90, 0x83, 0x20, 0x83, 742 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x9a, 0x20, 0x9c, 0x9d, 0x9e, 0x9f, 743 | 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x9a, 0x20, 744 | 0x9c, 0x9d, 0x9e, 0x9f, 0x20, 0xa2, 0xa2, 0xbc, 0x20, 0xb4, 0x20, 0x20, 745 | 0xb8, 
0x20, 0xba, 0x20, 0x20, 0x20, 0x20, 0xbf, 0x20, 0x20, 0xb3, 0xb3, 746 | 0xb4, 0xb5, 0x20, 0x20, 0xb8, 0x20, 0xba, 0x20, 0xbc, 0xbe, 0xbe, 0xbf, 747 | 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 748 | 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 749 | 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0xe0, 0xe1, 0xe2, 0xe3, 750 | 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 751 | 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 752 | 0xfc, 0xfd, 0xfe, 0xff, 753 | ]; 754 | } 755 | 756 | ngrams() { 757 | return [ 758 | 0x20e220, 0x20e2ee, 0x20e4ee, 0x20e7e0, 0x20e820, 0x20eae0, 0x20eaee, 759 | 0x20ede0, 0x20ede5, 0x20eee1, 0x20efee, 0x20eff0, 0x20f0e0, 0x20f1ee, 760 | 0x20f1f2, 0x20f2ee, 0x20f7f2, 0x20fdf2, 0xe0ede8, 0xe0f2fc, 0xe3ee20, 761 | 0xe5ebfc, 0xe5ede8, 0xe5f1f2, 0xe5f220, 0xe820ef, 0xe8e520, 0xe8e820, 762 | 0xe8ff20, 0xebe5ed, 0xebe820, 0xebfced, 0xede020, 0xede520, 0xede8e5, 763 | 0xede8ff, 0xedee20, 0xedeee2, 0xee20e2, 0xee20ef, 0xee20f1, 0xeee220, 764 | 0xeee2e0, 0xeee3ee, 0xeee920, 0xeeebfc, 0xeeec20, 0xeef1f2, 0xefeeeb, 765 | 0xeff0e5, 0xeff0e8, 0xeff0ee, 0xf0e0e2, 0xf0e5e4, 0xf1f2e0, 0xf1f2e2, 766 | 0xf1f2e8, 0xf1ff20, 0xf2e5eb, 0xf2ee20, 0xf2eef0, 0xf2fc20, 0xf7f2ee, 767 | 0xfbf520, 768 | ]; 769 | } 770 | 771 | name(): EncodingName { 772 | return 'windows-1251'; 773 | } 774 | 775 | language() { 776 | return 'ru'; 777 | } 778 | } 779 | 780 | export class windows_1256 extends sbcs { 781 | byteMap() { 782 | return [ 783 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 784 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 785 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 786 | 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 787 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 788 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 
0x64, 0x65, 0x66, 0x67, 789 | 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 790 | 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 791 | 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 792 | 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 793 | 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x81, 0x20, 0x83, 794 | 0x20, 0x20, 0x20, 0x20, 0x88, 0x20, 0x8a, 0x20, 0x9c, 0x8d, 0x8e, 0x8f, 795 | 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x98, 0x20, 0x9a, 0x20, 796 | 0x9c, 0x20, 0x20, 0x9f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 797 | 0x20, 0x20, 0xaa, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 798 | 0x20, 0xb5, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 799 | 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 800 | 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0x20, 801 | 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 802 | 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 803 | 0x20, 0x20, 0x20, 0x20, 0xf4, 0x20, 0x20, 0x20, 0x20, 0xf9, 0x20, 0xfb, 804 | 0xfc, 0x20, 0x20, 0xff, 805 | ]; 806 | } 807 | 808 | ngrams() { 809 | return [ 810 | 0x20c7e1, 0x20c7e4, 0x20c8c7, 0x20dae1, 0x20dded, 0x20e1e1, 0x20e3e4, 811 | 0x20e6c7, 0xc720c7, 0xc7c120, 0xc7ca20, 0xc7d120, 0xc7e120, 0xc7e1c3, 812 | 0xc7e1c7, 0xc7e1c8, 0xc7e1ca, 0xc7e1cc, 0xc7e1cd, 0xc7e1cf, 0xc7e1d3, 813 | 0xc7e1da, 0xc7e1de, 0xc7e1e3, 0xc7e1e6, 0xc7e1ed, 0xc7e320, 0xc7e420, 814 | 0xc7e4ca, 0xc820c7, 0xc920c7, 0xc920dd, 0xc920e1, 0xc920e3, 0xc920e6, 815 | 0xca20c7, 0xcf20c7, 0xcfc920, 0xd120c7, 0xd1c920, 0xd320c7, 0xda20c7, 816 | 0xdae1ec, 0xdded20, 0xe120c7, 0xe1c920, 0xe1ec20, 0xe1ed20, 0xe320c7, 817 | 0xe3c720, 0xe3c920, 0xe3e420, 0xe420c7, 0xe520c7, 0xe5c720, 0xe6c7e1, 818 | 0xe6e420, 0xec20c7, 0xed20c7, 0xed20e3, 0xed20e6, 0xedc920, 0xedd120, 819 | 0xede420, 820 | ]; 821 | } 822 
| 823 | name(): EncodingName { 824 | return 'windows-1256'; 825 | } 826 | 827 | language() { 828 | return 'ar'; 829 | } 830 | } 831 | 832 | export class KOI8_R extends sbcs { 833 | byteMap() { 834 | return [ 835 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 836 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 837 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 838 | 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 839 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 840 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 841 | 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 842 | 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 843 | 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 844 | 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 845 | 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 846 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 847 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 848 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xa3, 0x20, 0x20, 0x20, 0x20, 849 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xa3, 850 | 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 851 | 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 852 | 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 853 | 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xc0, 0xc1, 0xc2, 0xc3, 854 | 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 855 | 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 856 | 0xdc, 0xdd, 0xde, 0xdf, 857 | ]; 858 | } 859 | 860 | ngrams() { 861 | return [ 862 | 0x20c4cf, 0x20c920, 0x20cbc1, 0x20cbcf, 0x20cec1, 0x20cec5, 
0x20cfc2, 863 | 0x20d0cf, 0x20d0d2, 0x20d2c1, 0x20d3cf, 0x20d3d4, 0x20d4cf, 0x20d720, 864 | 0x20d7cf, 0x20dac1, 0x20dcd4, 0x20ded4, 0xc1cec9, 0xc1d4d8, 0xc5ccd8, 865 | 0xc5cec9, 0xc5d3d4, 0xc5d420, 0xc7cf20, 0xc920d0, 0xc9c520, 0xc9c920, 866 | 0xc9d120, 0xccc5ce, 0xccc920, 0xccd8ce, 0xcec120, 0xcec520, 0xcec9c5, 867 | 0xcec9d1, 0xcecf20, 0xcecfd7, 0xcf20d0, 0xcf20d3, 0xcf20d7, 0xcfc7cf, 868 | 0xcfca20, 0xcfccd8, 0xcfcd20, 0xcfd3d4, 0xcfd720, 0xcfd7c1, 0xd0cfcc, 869 | 0xd0d2c5, 0xd0d2c9, 0xd0d2cf, 0xd2c1d7, 0xd2c5c4, 0xd3d120, 0xd3d4c1, 870 | 0xd3d4c9, 0xd3d4d7, 0xd4c5cc, 0xd4cf20, 0xd4cfd2, 0xd4d820, 0xd9c820, 871 | 0xded4cf, 872 | ]; 873 | } 874 | 875 | name(): EncodingName { 876 | return 'KOI8-R'; 877 | } 878 | 879 | language() { 880 | return 'ru'; 881 | } 882 | } 883 | 884 | /* 885 | module.exports.ISO_8859_7 = function() { 886 | this.byteMap = function() { 887 | return [ 888 | 889 | ]; 890 | }; 891 | 892 | this.ngrams = function() { 893 | return [ 894 | 895 | ]; 896 | }; 897 | 898 | this.name = function(det) { 899 | if (typeof det == 'undefined') 900 | return 'ISO-8859-7'; 901 | return det.c1Bytes ? 'windows-1253' : 'ISO-8859-7'; 902 | }; 903 | 904 | language() { 905 | return 'el'; 906 | }; 907 | }; 908 | util.inherits(module.exports.ISO_8859_7, sbcs); 909 | */ 910 | --------------------------------------------------------------------------------