├── .npmrc
├── .npmignore
├── .prettierrc.json
├── src
    ├── fs
    │   ├── browser.ts
    │   └── node.ts
    ├── test
    │   └── data
    │   │   ├── collation.zip
    │   │   └── encodings
    │   │       ├── big5
    │   │       ├── euc_jp
    │   │       ├── euc_kr
    │   │       ├── koi8r
    │   │       ├── gb18030
    │   │       ├── iso88598
    │   │       ├── shiftjis
    │   │       ├── utf16be
    │   │       ├── utf16le
    │   │       ├── utf32be
    │   │       ├── utf32le
    │   │       ├── iso88592_cs
    │   │       ├── iso88595_ru
    │   │       ├── iso88596_ar
    │   │       ├── iso88597_el
    │   │       ├── iso88598_he
    │   │       ├── iso88599_tr
    │   │       ├── ascii
    │   │       ├── windows_1250
    │   │       ├── windows_1251
    │   │       ├── windows_1252
    │   │       ├── windows_1253
    │   │       ├── windows_1254
    │   │       ├── windows_1255
    │   │       ├── windows_1256
    │   │       ├── lang_chinese
    │   │       ├── utf8
    │   │       ├── lang_russian
    │   │       ├── lang_greek
    │   │       ├── lang_japanese
    │   │       ├── lang_korean
    │   │       ├── lang_czech
    │   │       ├── iso2022cn
    │   │       ├── lang_arabic
    │   │       ├── lang_hebrew
    │   │       ├── lang_turkish
    │   │       ├── iso2022jp
    │   │       ├── iso88591_en
    │   │       └── iso2022kr
    ├── encoding
    │   ├── utf8.test.ts
    │   ├── ascii.test.ts
    │   ├── index.ts
    │   ├── ascii.ts
    │   ├── unicode.test.ts
    │   ├── mbcs.test.ts
    │   ├── iso2022.test.ts
    │   ├── utf8.ts
    │   ├── sbcs.test.ts
    │   ├── unicode.ts
    │   ├── iso2022.ts
    │   ├── mbcs.ts
    │   └── sbcs.ts
    ├── utils.ts
    ├── utils.test.ts
    ├── match.ts
    ├── index.test.ts
    └── index.ts
├── .github
    └── workflows
    │   ├── test-build.sh
    │   ├── build.yml
    │   ├── release.yml
    │   ├── test-build.js
    │   └── test-build.ts
├── .gitignore
├── jest.config.js
├── renovate.json
├── tsconfig.json
├── LICENSE
├── package.json
└── README.md


/.npmrc:
--------------------------------------------------------------------------------
1 | package-lock=false
2 | 


--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | test
2 | scripts
3 | yarn.lock
4 | 


--------------------------------------------------------------------------------
/.prettierrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "singleQuote": true,
3 |   "printWidth": 80
4 | }
5 | 


--------------------------------------------------------------------------------
/src/fs/browser.ts:
--------------------------------------------------------------------------------
1 | // Browser variant of the fs loader: nothing can be returned, so it throws.
2 | const unavailable = () => {
3 |   throw new Error('File system is not available');
4 | };
5 | 
6 | export default unavailable;
7 | 


--------------------------------------------------------------------------------
/src/test/data/collation.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/collation.zip


--------------------------------------------------------------------------------
/src/test/data/encodings/big5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/big5


--------------------------------------------------------------------------------
/src/test/data/encodings/euc_jp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/euc_jp


--------------------------------------------------------------------------------
/src/test/data/encodings/euc_kr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/euc_kr


--------------------------------------------------------------------------------
/src/test/data/encodings/koi8r:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/koi8r


--------------------------------------------------------------------------------
/src/test/data/encodings/gb18030:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/gb18030


--------------------------------------------------------------------------------
/src/test/data/encodings/iso88598:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88598


--------------------------------------------------------------------------------
/src/test/data/encodings/shiftjis:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/shiftjis


--------------------------------------------------------------------------------
/src/test/data/encodings/utf16be:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/utf16be


--------------------------------------------------------------------------------
/src/test/data/encodings/utf16le:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/utf16le


--------------------------------------------------------------------------------
/src/test/data/encodings/utf32be:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/utf32be


--------------------------------------------------------------------------------
/src/test/data/encodings/utf32le:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/utf32le


--------------------------------------------------------------------------------
/src/test/data/encodings/iso88592_cs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88592_cs


--------------------------------------------------------------------------------
/src/test/data/encodings/iso88595_ru:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88595_ru


--------------------------------------------------------------------------------
/src/test/data/encodings/iso88596_ar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88596_ar


--------------------------------------------------------------------------------
/src/test/data/encodings/iso88597_el:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88597_el


--------------------------------------------------------------------------------
/src/test/data/encodings/iso88598_he:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88598_he


--------------------------------------------------------------------------------
/src/test/data/encodings/iso88599_tr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/iso88599_tr


--------------------------------------------------------------------------------
/src/test/data/encodings/ascii:
--------------------------------------------------------------------------------
1 |  !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~


--------------------------------------------------------------------------------
/src/test/data/encodings/windows_1250:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1250


--------------------------------------------------------------------------------
/src/test/data/encodings/windows_1251:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1251


--------------------------------------------------------------------------------
/src/test/data/encodings/windows_1252:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1252


--------------------------------------------------------------------------------
/src/test/data/encodings/windows_1253:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1253


--------------------------------------------------------------------------------
/src/test/data/encodings/windows_1254:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1254


--------------------------------------------------------------------------------
/src/test/data/encodings/windows_1255:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1255


--------------------------------------------------------------------------------
/src/test/data/encodings/windows_1256:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/runk/node-chardet/HEAD/src/test/data/encodings/windows_1256


--------------------------------------------------------------------------------
/.github/workflows/test-build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh -ex
2 | 
3 | node ./.github/workflows/test-build.js # run the JS check script with plain node
4 | npx ts-node ./.github/workflows/test-build.ts # run the TS check script through ts-node
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | testing.js
 2 | .DS_Store
 3 | node_modules
 4 | coverage
 5 | npm-debug.log
 6 | lib
 7 | TODO.md
 8 | package-lock.json
 9 | .vscode
10 | 


--------------------------------------------------------------------------------
/src/encoding/utf8.test.ts:
--------------------------------------------------------------------------------
 1 | import * as chardet from '..';
 2 | 
 3 | describe('UTF-8', () => {
 4 |   // Fixture file read by detectFileSync in the case below.
 5 |   const fixture = __dirname + '/../test/data/encodings/utf8';
 6 | 
 7 |   it('should return UTF-8', () => {
 8 |     const detected = chardet.detectFileSync(fixture);
 9 |     expect(detected).toBe('UTF-8');
10 |   });
11 | });
12 | 


--------------------------------------------------------------------------------
/src/encoding/ascii.test.ts:
--------------------------------------------------------------------------------
 1 | import * as chardet from '..';
 2 | 
 3 | describe('ASCII', () => {
 4 |   // Fixture file read by detectFileSync in the case below.
 5 |   const fixture = __dirname + '/../test/data/encodings/ascii';
 6 | 
 7 |   it('should return ASCII', () => {
 8 |     const detected = chardet.detectFileSync(fixture);
 9 |     expect(detected).toBe('ASCII');
10 |   });
11 | });
12 | 


--------------------------------------------------------------------------------
/src/fs/node.ts:
--------------------------------------------------------------------------------
 1 | // Cached `fs` module; stays undefined until the first successful lookup.
 2 | let fsModule: any;
 3 | 
 4 | /**
 5 |  * Returns the `fs` module, requiring it on first use and handing back the
 6 |  * cached copy on later calls.
 7 |  *
 8 |  * @throws {Error} when `module` / `module.exports` are not objects, which is
 9 |  *   treated as "no file system available".
10 |  */
11 | export default () => {
12 |   const hasCommonJs =
13 |     typeof module === 'object' && typeof module.exports === 'object';
14 |   if (!hasCommonJs) {
15 |     throw new Error('File system is not available');
16 |   }
17 | 
18 |   if (!fsModule) {
19 |     fsModule = require('fs');
20 |   }
21 |   return fsModule;
22 | };
23 | 


--------------------------------------------------------------------------------
/jest.config.js:
--------------------------------------------------------------------------------
 1 | module.exports = {
 2 |   testEnvironment: 'node',
 3 |   testRegex: '.*test.ts$',
 4 |   transform: { '^.+\\.ts?$': 'ts-jest' },
 5 |   moduleFileExtensions: ['ts', 'js', 'json'],
 6 |   rootDir: 'src',
 7 |   collectCoverage: true,
 8 |   coverageDirectory: '<rootDir>/../coverage',
 9 | };
10 | 


--------------------------------------------------------------------------------
/src/utils.ts:
--------------------------------------------------------------------------------
1 | // Loose duck-type check: any non-null object with a finite, non-negative
2 | // numeric `length` passes. Checking that every element is a number <= 255
3 | // would be stricter, but it is a little bit slower, so it is skipped.
4 | export const isByteArray = (input: any): input is Uint8Array => {
5 |   if (input == null || typeof input !== 'object') return false;
6 |   // Number.isFinite never coerces, so `{ length: null }` is rejected.
7 |   return Number.isFinite(input.length) && input.length >= 0;
8 | };
9 | 


--------------------------------------------------------------------------------
/src/test/data/encodings/lang_chinese:
--------------------------------------------------------------------------------
1 | 政府資訊科技總監辦公室和平等機會委員會合辦無障礙網頁嘉許計劃，希望透過表彰採用無障礙網頁設計的網站，推動更多企業和機構在其網站採用無障礙網頁設計，讓社會各階層包括殘疾人士更方便地獲取網上資訊和使用網上服務。無障礙網頁嘉許計劃頒獎典禮將於2013年4月15日舉行，為首次舉辦的「國際IT匯」的精彩活動之一。有關詳情，請瀏覽這裡。
2 | 政府一向致力推動長者在生活上更廣泛應用資訊科技。政府資訊科技總監辦公室已開展一項全港性嘉許計劃「智醒長者嘉許計劃」，表揚在日常生活中積極使用資訊及通訊科技的長者，以鼓勵他們繼續使用資訊及通訊科技。嘉許計劃設有金、銀、銅獎，長者於特定期間完成指定要求，可獲頒贈嘉許證書及獎牌。有關詳情，請瀏覽這裡。


--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "extends": ["config:base"],
 3 |   "dependencyDashboard": false,
 4 |   "schedule": "on the first day of the month",
 5 |   "packageRules": [
 6 |     {
 7 |       "updateTypes": ["minor", "patch", "pin", "digest"],
 8 |       "automerge": true
 9 |     },
10 |     {
11 |       "depTypeList": ["devDependencies"],
12 |       "automerge": true
13 |     }
14 |   ]
15 | }
16 | 


--------------------------------------------------------------------------------
/src/encoding/index.ts:
--------------------------------------------------------------------------------
 1 | import type { EncodingName, Match } from '../match';
 2 | 
 3 | export interface Recogniser { // Contract for detector classes such as Ascii
 4 |   match(input: Context): Match | null; // result for this input, or null
 5 |   name(input?: Context): EncodingName; // label the match helper stores in Match.name
 6 |   language?(): string | undefined; // optional; when defined it fills Match.lang
 7 | }
 8 | 
 9 | export interface Context { // Input handed to Recogniser.match() and name()
10 |   byteStats: number[]; // NOTE(review): presumably per-byte-value counts — confirm
11 |   c1Bytes: boolean; // NOTE(review): meaning not visible here — confirm with the producer
12 |   rawInput: Uint8Array; // bytes the Ascii recogniser scans
13 |   rawLen: number; // upper bound Ascii uses when indexing rawInput
14 |   inputBytes: Uint8Array; // NOTE(review): presumably a preprocessed copy of rawInput — confirm
15 |   inputLen: number; // NOTE(review): presumably the used length of inputBytes — confirm
16 | }
17 | 


--------------------------------------------------------------------------------
/src/encoding/ascii.ts:
--------------------------------------------------------------------------------
 1 | import type { Context, Recogniser } from '.';
 2 | import match, { type EncodingName, type Match } from '../match';
 3 | 
 4 | /**
 5 |  * Recogniser that reports 'ASCII' with full confidence only when no scanned
 6 |  * byte falls outside 32..126.
 7 |  */
 8 | export default class Ascii implements Recogniser {
 9 |   name(): EncodingName {
10 |     return 'ASCII';
11 |   }
12 | 
13 |   /**
14 |    * Walks the first `rawLen` entries of `rawInput`.
15 |    *
16 |    * @returns a match with confidence 0 as soon as a byte below 32 or above
17 |    *   126 is met, otherwise a match with confidence 100 (never `null`).
18 |    */
19 |   match(det: Context): Match | null {
20 |     const { rawInput: bytes, rawLen: limit } = det;
21 | 
22 |     let pos = 0;
23 |     while (pos < limit) {
24 |       const byte = bytes[pos];
25 |       if (byte < 32 || byte > 126) break;
26 |       pos++;
27 |     }
28 | 
29 |     // The scan stops short of `limit` only when an out-of-range byte was hit.
30 |     return match(det, this, pos < limit ? 0 : 100);
31 |   }
32 | }
33 | 


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "forceConsistentCasingInFileNames": true,
 4 |     "outDir": "lib",
 5 |     "rootDir": "src",
 6 |     "allowJs": false,
 7 |     "allowSyntheticDefaultImports": true,
 8 |     "declaration": true,
 9 |     "diagnostics": true,
10 |     "esModuleInterop": true,
11 |     "extendedDiagnostics": false,
12 |     "listEmittedFiles": true,
13 |     "module": "commonjs",
14 |     "removeComments": true,
15 |     "sourceMap": true,
16 |     "strict": true,
17 |     "target": "ES2019",
18 |     "noUnusedLocals": true,
19 |     "noUnusedParameters": true
20 |   },
21 |   "exclude": ["node_modules", "**/*.spec.ts", "**/*.test.ts", "__mocks__", "lib"]
22 | }
23 | 


--------------------------------------------------------------------------------
/src/encoding/unicode.test.ts:
--------------------------------------------------------------------------------
 1 | import * as chardet from '..';
 2 | 
 3 | describe('Unicode', () => {
 4 |   const base = __dirname + '/../test/data/encodings';
 5 | 
 6 |   // One entry per fixture: test title, path under `base`, expected result.
 7 |   const cases = [
 8 |     { title: 'should return UTF-16LE', file: '/utf16le', want: 'UTF-16LE' },
 9 |     { title: 'should return UTF-16BE', file: '/utf16be', want: 'UTF-16BE' },
10 |     { title: 'should return UTF-32LE', file: '/utf32le', want: 'UTF-32LE' },
11 |     { title: 'should return UTF-32BE', file: '/utf32be', want: 'UTF-32BE' },
12 |   ];
13 | 
14 |   for (const { title, file, want } of cases) {
15 |     it(title, () => {
16 |       expect(chardet.detectFileSync(base + file)).toBe(want);
17 |     });
18 |   }
19 | });
20 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | name: Build
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [master]
 6 |   pull_request:
 7 |     branches: ["*"]
 8 | 
 9 | jobs:
10 |   build:
11 |     runs-on: ubuntu-latest
12 | 
13 |     strategy:
14 |       matrix:
15 |         node-version: [18.x, 20.x, 22.x]
16 | 
17 |     steps:
18 |       - name: Checkout
19 |         uses: actions/checkout@v5
20 |         with:
21 |           fetch-depth: 0
22 |       - name: Use Node.js ${{ matrix.node-version }}
23 |         uses: actions/setup-node@v6
24 |         with:
25 |           node-version: ${{ matrix.node-version }}
26 |       - run: npm i
27 |       - run: npm test
28 |       - run: npm run build
29 |       - run: .github/workflows/test-build.sh
30 | 


--------------------------------------------------------------------------------
/src/test/data/encodings/utf8:
--------------------------------------------------------------------------------
 1 | Euro Symbol: €.
 2 | Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα.
 3 | Íslenska / Icelandic: Ég get etið gler án þess að meiða mig.
 4 | Polish: Mogę jeść szkło, i mi nie szkodzi.
 5 | Romanian: Pot să mănânc sticlă și ea nu mă rănește.
 6 | Ukrainian: Я можу їсти шкло, й воно мені не пошкодить.
 7 | Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։
 8 | Georgian: მინას ვჭამ და არა მტკივა.
 9 | Hindi: मैं काँच खा सकता हूँ, मुझे उस से कोई पीडा नहीं होती.
10 | Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי.
11 | Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ.
12 | Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني.
13 | Japanese: 私はガラスを食べられます。それは私を傷つけません。
14 | Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Release
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - master
 6 | jobs:
 7 |   release:
 8 |     name: Release
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - name: Checkout
12 |         uses: actions/checkout@v5
13 |         with:
14 |           fetch-depth: 0
15 |       - name: Setup Node.js
16 |         uses: actions/setup-node@v6
17 |         with:
18 |           node-version: 24
19 |       - name: Install dependencies
20 |         run: npm i
21 |       - name: Build module
22 |         run: npm run build
23 |       - name: Release
24 |         env:
25 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
26 |           NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
27 |         run: npx semantic-release
28 | 


--------------------------------------------------------------------------------
/src/utils.test.ts:
--------------------------------------------------------------------------------
 1 | import { isByteArray } from './utils';
 2 | 
 3 | describe('isByteArray', () => {
 4 |   test('positives', () => {
 5 |     // Values that must be accepted as byte arrays.
 6 |     const accepted = [
 7 |       Buffer.from('hello'),
 8 |       new Uint8Array(0),
 9 |       new Uint8Array(1),
10 |       [],
11 |       [1],
12 |     ];
13 |     for (const sample of accepted) {
14 |       expect(isByteArray(sample)).toBe(true);
15 |     }
16 |   });
17 | 
18 |   test('negatives', () => {
19 |     // Values that must be rejected.
20 |     const rejected = [null, '', 'hello', 123, '123', {}];
21 |     for (const sample of rejected) {
22 |       expect(isByteArray(sample)).toBe(false);
23 |     }
24 |   });
25 | });
26 | 


--------------------------------------------------------------------------------
/src/encoding/mbcs.test.ts:
--------------------------------------------------------------------------------
 1 | import * as chardet from '..';
 2 | 
 3 | describe('Multibyte Character Sets', () => {
 4 |   const base = __dirname + '/../test/data/encodings';
 5 | 
 6 |   // One entry per fixture: test title, path under `base`, expected result.
 7 |   const cases = [
 8 |     { title: 'should return Shift_JIS', file: '/shiftjis', want: 'Shift_JIS' },
 9 |     { title: 'should return GB18030', file: '/gb18030', want: 'GB18030' },
10 |     { title: 'should return Big5', file: '/big5', want: 'Big5' },
11 |     { title: 'should return EUC-JP', file: '/euc_jp', want: 'EUC-JP' },
12 |     { title: 'should return EUC-KR', file: '/euc_kr', want: 'EUC-KR' },
13 |   ];
14 | 
15 |   for (const { title, file, want } of cases) {
16 |     it(title, () => {
17 |       expect(chardet.detectFileSync(base + file)).toBe(want);
18 |     });
19 |   }
20 | });
21 | 


--------------------------------------------------------------------------------
/src/encoding/iso2022.test.ts:
--------------------------------------------------------------------------------
 1 | import * as chardet from '..';
 2 | import fs from 'fs';
 3 | import path from 'path';
 4 | 
 5 | describe('ISO-2022', () => {
 6 |   const base = __dirname + '/../test/data/encodings';
 7 | 
 8 |   // First entry of the list chardet.analyse() returns for a fixture file.
 9 |   const analyse = (asset: string) => {
10 |     const contents = fs.readFileSync(path.join(base, asset));
11 |     return chardet.analyse(contents).shift();
12 |   };
13 | 
14 |   const cases = [
15 |     {
16 |       title: 'should return ISO-2022-JP',
17 |       asset: 'iso2022jp',
18 |       expected: { confidence: 100, lang: 'ja', name: 'ISO-2022-JP' },
19 |     },
20 |     {
21 |       title: 'should return ISO-2022-KR',
22 |       asset: 'iso2022kr',
23 |       expected: { confidence: 100, lang: 'kr', name: 'ISO-2022-KR' },
24 |     },
25 |     {
26 |       title: 'should return ISO-2022-CN',
27 |       asset: 'iso2022cn',
28 |       expected: { confidence: 100, lang: 'zh', name: 'ISO-2022-CN' },
29 |     },
30 |   ];
31 | 
32 |   for (const { title, asset, expected } of cases) {
33 |     it(title, () => {
34 |       expect(analyse(asset)).toEqual(expected);
35 |     });
36 |   }
37 | });
38 | 


--------------------------------------------------------------------------------
/.github/workflows/test-build.js:
--------------------------------------------------------------------------------
 1 | const assert = require('assert');
 2 | 
 3 | const chardet = require(process.cwd());
 4 | 
 5 | assert(typeof chardet.analyse, 'function');
 6 | assert(typeof chardet.detect, 'function');
 7 | assert(typeof chardet.detectFile, 'function');
 8 | assert(typeof chardet.detectFileSync, 'function');
 9 | 
10 | assert.deepStrictEqual(chardet.analyse(Buffer.from('This is a test')), [
11 |   { confidence: 100, name: 'ASCII', lang: undefined },
12 |   { confidence: 98, name: 'ISO-8859-1', lang: 'en' },
13 |   { confidence: 98, name: 'ISO-8859-2', lang: 'hu' },
14 |   { confidence: 10, name: 'UTF-8', lang: undefined },
15 |   { confidence: 10, name: 'Shift_JIS', lang: 'ja' },
16 |   { confidence: 10, name: 'Big5', lang: 'zh' },
17 |   { confidence: 10, name: 'EUC-JP', lang: 'ja' },
18 |   { confidence: 10, name: 'EUC-KR', lang: 'ko' },
19 |   { confidence: 10, name: 'GB18030', lang: 'zh' },
20 | ]);
21 | 
22 | console.log(' > test-build.js OK');
23 | 


--------------------------------------------------------------------------------
/src/test/data/encodings/lang_russian:
--------------------------------------------------------------------------------
1 | Первомай в современном виде возник в конце XIX века в рабочем движении, выдвинувшем в качестве одного из основных требований введение восьмичасового рабочего дня. 1 мая 1886 года социалистические, коммунистические и анархические организации США и Канады устроили ряд митингов и демонстраций. При разгоне такой демонстрации в Чикаго 4 мая погибло шесть демонстрантов. В ходе последовавших за этим массовых выступлений протеста против жестоких действий полиции в результате взрыва бомбы последовавшей перестрелке было убито восемь полицейских и минимум четверо рабочих (по некоторым данным, до пятидесяти убитых и раненых[2]), несколько десятков человек получили ранения. По обвинению в организации взрыва четверо рабочих-анархистов были приговорены к повешению (впоследствии было доказано, что обвинение было ложным)[3]. Именно в память о казнённых Парижский конгресс II Интернационала (июль 1889) объявил 1 мая Днём солидарности рабочих всего мира и предложил ежегодно отмечать его демонстрациями с социальными требованиями.


--------------------------------------------------------------------------------
/src/test/data/encodings/lang_greek:
--------------------------------------------------------------------------------
1 | Η Λαϊκή ή Δημώδης Λατινική (λατ. sermo vulgaris) είναι ένας όρος-ομπρέλα, ο οποίος καλύπτει τις διαλέκτους τής λατινικής γλώσσας που ομιλούνταν κυρίως στις δυτικές επαρχίες τής Ρωμαϊκής Αυτοκρατορίας, μέχρις ότου αυτές οι διάλεκτοι, αποκλίνοντας ακόμη περισσότερο, εξελίχθηκαν στις πρώιμες ρομανικές γλώσσες κατά τον 9ο αιώνα.
2 | Η ομιλουμένη Λατινική διέφερε από τη λογοτεχνική κλασική Λατινική στην προφορά, το λεξιλόγιο και τη γραμματική. Κάποια χαρακτηριστικά της δημώδους Λατινικής δεν εμφανίστηκαν παρά στην ύστερη Αυτοκρατορία. Άλλα χαρακτηριστικά της υπήρχαν πιθανόν στην ομιλουμένη Λατινική, τουλάχιστον στις πρωτογενείς μορφές τους, πολύ νωρίτερα. Οι περισσότεροι ορισμοί τής δημώδους Λατινικής την παρουσιάζουν ως προφορική παρά ως γραπτή γλώσσα, επειδή οι μαρτυρίες οδηγούν στο συμπέρασμα ότι η ομιλουμένη Λατινική διασπάστηκε σε αποκλίνουσες διαλέκτους αυτή την περίοδο. Επειδή κανείς τότε δεν μετέγραψε φωνητικά την καθημερινή ομιλία των Λατίνων, οι μελετητές τής λαϊκής Λατινικής πρέπει να χρησιμοποιούν έμμεσες μεθόδους.


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (C) 2024 Dmitry Shirokov
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/.github/workflows/test-build.ts:
--------------------------------------------------------------------------------
 1 | import assert from 'assert';
 2 | 
 3 | /**
 4 |  * Build check: dynamically imports the package found in the current working
 5 |  * directory and verifies its exported API and one analysis result.
 6 |  * Rejects with the assertion error when any check fails.
 7 |  */
 8 | const main = async () => {
 9 |   const chardet = await import(process.cwd());
10 | 
11 |   // `assert(value, message)` only tests truthiness and `typeof` always
12 |   // yields a non-empty string, so the type has to be compared explicitly.
13 |   assert.strictEqual(typeof chardet.analyse, 'function');
14 |   assert.strictEqual(typeof chardet.detect, 'function');
15 |   assert.strictEqual(typeof chardet.detectFile, 'function');
16 |   assert.strictEqual(typeof chardet.detectFileSync, 'function');
17 | 
18 |   // Full expected analysis of a short ASCII sample, in the order returned.
19 |   assert.deepStrictEqual(chardet.analyse(Buffer.from('This is a test')), [
20 |     { confidence: 100, name: 'ASCII', lang: undefined },
21 |     { confidence: 98, name: 'ISO-8859-1', lang: 'en' },
22 |     { confidence: 98, name: 'ISO-8859-2', lang: 'hu' },
23 |     { confidence: 10, name: 'UTF-8', lang: undefined },
24 |     { confidence: 10, name: 'Shift_JIS', lang: 'ja' },
25 |     { confidence: 10, name: 'Big5', lang: 'zh' },
26 |     { confidence: 10, name: 'EUC-JP', lang: 'ja' },
27 |     { confidence: 10, name: 'EUC-KR', lang: 'ko' },
28 |     { confidence: 10, name: 'GB18030', lang: 'zh' },
29 |   ]);
30 | };
31 | 
32 | // Exit non-zero on failure so the calling workflow step fails too.
33 | main()
34 |   .then(() => console.log(' > test-build.ts OK'))
35 |   .catch((err) => {
36 |     console.error(err);
37 |     process.exit(1);
38 |   });
39 | 


--------------------------------------------------------------------------------
/src/match.ts:
--------------------------------------------------------------------------------
 1 | import { Context, Recogniser } from "./encoding";
 2 | 
 3 | // Union of the encoding labels accepted for `Match.name` and returned by
 4 | // `Recogniser.name()`; each label is listed exactly once.
 5 | export type EncodingName =
 6 |   | 'ASCII'
 7 |   | 'Big5'
 8 |   | 'EUC-JP'
 9 |   | 'EUC-KR'
10 |   | 'GB18030'
11 |   | 'ISO_2022' // TODO: Use hyphen
12 |   | 'ISO-2022-CN'
13 |   | 'ISO-2022-JP'
14 |   | 'ISO-2022-KR'
15 |   | 'ISO-8859-1'
16 |   | 'ISO-8859-2'
17 |   | 'ISO-8859-5'
18 |   | 'ISO-8859-6'
19 |   | 'ISO-8859-7'
20 |   | 'ISO-8859-8'
21 |   | 'ISO-8859-9'
22 |   | 'KOI8-R'
23 |   | 'mbcs'
24 |   | 'sbcs'
25 |   | 'Shift_JIS' // TODO: Use hyphen
26 |   | 'UTF-16BE'
27 |   | 'UTF-16LE'
28 |   | 'UTF-32'
29 |   | 'UTF-32BE'
30 |   | 'UTF-32LE'
31 |   | 'UTF-8'
32 |   | 'windows-1250'
33 |   | 'windows-1251'
34 |   | 'windows-1252'
35 |   | 'windows-1253'
36 |   | 'windows-1254'
37 |   | 'windows-1255'
38 |   | 'windows-1256';
39 | 
40 | export interface Match { // Result record built by this module's default export
41 |   confidence: number; // score passed in by the recogniser
42 |   name: EncodingName; // taken from Recogniser.name()
43 |   lang?: string; // from Recogniser.language() when the recogniser defines it
44 | }
45 | 
46 | /**
47 |  * Assembles the `Match` reported for a recogniser.
48 |  *
49 |  * @param ctx - detection context, forwarded to `rec.name`
50 |  * @param rec - recogniser supplying the encoding name and, when it defines
51 |  *   `language`, the language tag
52 |  * @param confidence - score stored on the result unchanged
53 |  * @returns a new `Match`; `lang` is `undefined` when `rec.language` is not
54 |  *   defined
55 |  */
56 | export default (ctx: Context, rec: Recogniser, confidence: number): Match => {
57 |   const name = rec.name(ctx);
58 |   const lang = rec.language ? rec.language() : undefined;
59 | 
60 |   return { confidence, name, lang };
61 | };
62 | 


--------------------------------------------------------------------------------
/src/test/data/encodings/lang_japanese:
--------------------------------------------------------------------------------
 1 | コンピューターは、本質的には数字しか扱うことができません。コンピューターは、文字や記号などのそれぞれに番号を割り振ることによって扱えるようにします。ユニコードが出来るまでは、これらの番号を割り振る仕組みが何百種類も存在しました。どの一つをとっても、十分な文字を含んではいませんでした。例えば、欧州連合一つを見ても、そのすべての言語をカバーするためには、いくつかの異なる符号化の仕組みが必要でした。英語のような一つの言語に限っても、一つだけの符号化の仕組みでは、一般的に使われるすべての文字、句読点、技術的な記号などを扱うには不十分でした。
 2 | 
 3 | これらの符号化の仕組みは、相互に矛盾するものでもありました。二つの異なる符号化の仕組みが、二つの異なる文字に同一の番号を付けることもできるし、同じ文字に異なる番号を付けることもできるのです。どのようなコンピューターも（特にサーバーは）多くの異なった符号化の仕組みをサポートする必要があります。たとえデータが異なる符号化の仕組みやプラットフォームを通過しても、いつどこでデータが乱れるか分からない危険を冒すことのなるのです。
 4 | ユニコードはすべてを変えます
 5 | 
 6 | ユニコードは、プラットフォームに係わらず、プログラムに係わらず、言語に係わらず、すべての文字に独立した番号を与えます。ユニコード標準は、アップル、ヒューレットパッカード、IBM、ジャストシステム、マイクロソフト、オラクル、SAP、サン、サイベースなどの産業界の主導的企業と他の多くの企業に採用されています。ユニコードは、XML、Java、ECMAScript(JavaScript)、LDAP、CORBA 3.0などの最先端の標準の前提となっており、ユニコードを実装すれば、ISO/IEC 10646に適合することになります。ユニコードは、多くのオペレーティングシステムとすべての最新のブラウザーと他の多くの製品でサポートされています。ユニコード標準の出現とユニコードをサポートするツール類は、昨今顕著になっているソフトウエア技術のグローバル化の流れに対して、特に役に立っています。
 7 | 
 8 | ユニコードをクライアントサーバー型のアプリケーションや、多層構造を持つアプリケーション、ウェブサイトなどにに組み込むことで、従来の文字コードセットを用いるよりも明らかなコスト削減が可能です。ユニコードは、単一のソフトウエア製品、単一のウェブサイトに、何ら手を加えることなく、複数のプラットフォーム、複数の言語、複数の国をカバーすることが出来るのです。ユニコードは、データが多くの異なるシステムの間を、何の乱れもなしに転送することを可能とするのです。
 9 | ユニコードコンソーシアムについて
10 | 
11 | ユニコードコンソーシアムは、最新のソフトウエア製品と標準においてテキストを表現することを意味する“ユニコード標準”の構築、発展、普及、利用促進を目的として設立された非営利組織です。同コンソーシアムの会員は、コンピューターと情報処理に係わる広汎な企業や組織から構成されています。同コンソーシアムは、財政的には、純粋に会費のみによって運営されています。ユニコード標準を支持し、その拡張と実装を支援する世界中の組織や個人は、だれもがユニコードコンソーシアムの会員なることができます。
12 | 


--------------------------------------------------------------------------------
/src/test/data/encodings/lang_korean:
--------------------------------------------------------------------------------
 1 | 기본적으로 컴퓨터는 숫자만 처리합니다. 글자나 다른 문자에도 숫자를 지정하여 저장합니다. 유니코드가 개발되기 전에는 이러한 숫자를 지정하기 위해 수백 가지의 다른 기호화 시스템을 사용했습니다. 단일 기호화 방법으로는 모든 문자를 포함할 수 없었습니다. 예를 들어 유럽 연합에서만 보더라도 모든 각 나라별 언어를 처리하려면 여러 개의 다른 기호화 방법이 필요합니다. 영어와 같은 단일 언어의 경우도 공통적으로 사용되는 모든 글자, 문장 부호 및 테크니컬 기호에 맞는 단일 기호화 방법을 갖고 있지 못하였습니다.
 2 | 
 3 | 이러한 기호화 시스템은 또한 다른 기호화 시스템과 충돌합니다. 즉 두 가지 기호화 방법이 두 개의 다른 문자에 대해 같은 번호를 사용하거나 같은 문자에 대해 다른 번호를 사용할 수 있습니다. 주어진 모든 컴퓨터(특히 서버)는 서로 다른 여러 가지 기호화 방법을 지원해야 합니다. 그러나, 데이터를 서로 다른 기호화 방법이나 플랫폼 간에 전달할 때마다 그 데이터는 항상 손상의 위험을 겪게 됩니다.
 4 | 유니코드로 모든 것을 해결할 수 있습니다!
 5 | 
 6 | 유니코드는 사용 중인 플랫폼, 프로그램, 언어에 관계없이 문자마다 고유한 숫자를 제공합니다. 유니코드 표준은 Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys 및 기타 여러 회사와 같은 업계 선두주자에 의해 채택되었습니다. 유니코드는 XML, Java, ECMAScript(JavaScript), LDAP, CORBA 3.0, WML 등과 같이 현재 널리 사용되는 표준에서 필요하며 이는 ISO/IEC 10646을 구현하는 공식적인 방법입니다. 이는 많은 운영 체제, 요즘 사용되는 모든 브라우저 및 기타 많은 제품에서 지원됩니다. 유니코드 표준의 부상과 이를 지원하는 도구의 가용성은 최근 전 세계에 불고 있는 기술 경향에서 가장 중요한 부분을 차지하고 있습니다.
 7 | 
 8 | 유니코드를 클라이언트-서버 또는 다중-연결 응용 프로그램과 웹 사이트에 통합하면 레거시 문자 세트 사용에 있어서 상당한 비용 절감 효과가 나타납니다. 유니코드를 통해 리엔지니어링 없이 다중 플랫폼, 언어 및 국가 간에 단일 소프트웨어 플랫폼 또는 단일 웹 사이트를 목표로 삼을 수 있습니다. 이를 사용하면 데이터를 손상 없이 여러 시스템을 통해 전송할 수 있습니다.
 9 | 유니코드 콘소시엄에 대해
10 | 
11 | 유니코드 콘소시엄은 비영리 조직으로서 현대 소프트웨어 제품과 표준에서 텍스트의 표현을 지정하는 유니코드 표준의 사용을 개발하고 확장하며 장려하기 위해 세워졌습니다. 콘소시엄 멤버쉽은 컴퓨터와 정보 처리 산업에 종사하고 있는 광범위한 회사 및 조직의 범위를 나타냅니다. 콘소시엄의 재정은 전적으로 회비에 의해 충당됩니다. 유니코드 컨소시엄에서의 멤버쉽은 전 세계 어느 곳에서나 유니코드 표준을 지원하고 그 확장과 구현을 지원하고자하는 조직과 개인에게 개방되어 있습니다.
12 | 
13 | 더 자세한 내용은 용어집, 예제 유니코드 사용 가능 제품, 기술 정보 및 기타 유용한 정보를 참조하십시오.
14 | 


--------------------------------------------------------------------------------
/src/test/data/encodings/lang_czech:
--------------------------------------------------------------------------------
1 | Velký a Malý Tisý je národní přírodní rezervace ev. č. 498 poblíž města Lomnice nad Lužnicí v okrese Jindřichův Hradec ležící na území CHKO Třeboňsko. Řadí se mezi nejvýznamnější rybniční rezervace v Česku a je významná rozsáhlým litorálním porostem na březích rybníků. Oblast spravuje AOPK ČR Správa CHKO Třeboňsko a je evidována i v rámci světové organizace UNESCO jako biosférická rezervace, Natura 2000 a další. Důvodem ochrany je jedna z nejvýznamnějších ornitologických rezervací v Česku. Význam má i z pohledu entomologického.
2 | Součástí rezervace je 11 větších rybníků, mimo jiné i dvojice rybníků Velký a Malý Tisý, které daly lokalitě název. Pro rybníky v rezervaci je charakteristické, že mají velmi členité pobřeží tvořené zarostlými břehy, zátokami, poloostrovy a ostrůvky. Na břehy volně navazují podmáčené louky, lesy, vřesoviště a pole. Vlivem rozmanitosti různých stanovišť se zde nachází bohatá řada druhů z flory i fauny, které zde sídlí. Hlavně ptactvo využívá lokalitu jako důležitou migrační zastávku či shromaždiště před pravidelnými tahy.
3 | I přes to, že je lokalita po desetiletí chráněna, došlo nevhodnými hospodářskými zásahy v podobě nadměrného chovu ryb od 50. let 20. století k postupné degradaci a ústupu litorálních porostů. Od 90. let 20. století se ochranáři snaží snižováním počtu nasazovaných ryb a změnou jejich druhové skladby společně s vodohospodářskými zásahy do výšky vodní hladiny rybníku Velký Tisý podpořit rozvoj rákosových porostů. Výsledky těchto opatření ukázaly, že na obnovu porostů by i za vhodných podmínek byla potřeba doba dosahující až desítek let.


--------------------------------------------------------------------------------
/src/test/data/encodings/iso2022cn:
--------------------------------------------------------------------------------
 1 | $)A;y1>IO#,<FKc;zV;JG4&@mJ}WV!#K|CGV86(R;8vJ}WV#,@44"4fWVD8;rFdK{WV7{!#TZ44TlUnicodeV.G0#,SPJ}0YVVV86(UbP)J}WV5D1`BkO5M3!#C;SPR;8v1`Bk?IRT0|:,Wc9;5DWV7{#:@}Hg#,5%5%E7V]92M,Le>MPhR*:C<8VV2;M,5D1`Bk@40|@(KySP5DSoQT!#<4J9JG5%R;VVSoQT#,@}HgS"So#,R2C;SPDDR;8v1`Bk?IRTJJSCSZKySP5DWVD8#,1j5c7{:E#,:M3#SC5D<<Ju7{:E!#
 2 | 
 3 | $)AUbP)1`BkO5M3R2;a;%O`3eM;!#R2>MJGK5#,A=VV1`Bk?ID\J9SCO`M,5DJ}WV4z1mA=8v2;M,5DWV7{#,;rJ9SC2;M,5DJ}WV4z1mO`M,5DWV7{!#HN:NR;L(LX6(5D<FKc;z(LX1pJG7~NqFw)6<PhR*V'3VPm6`2;M,5D1`Bk#,5+JG#,2;B[J2C4J1:rJ}>]M(9}2;M,5D1`Bk;rF=L(V.<d#,DGP)J}>]W\;aSPKp;55DN#OU!#
 4 | Unicode$)AU}TZ8D1dKySPUbR;GP#!
 5 | 
 6 | Unicode$)A8xC?8vWV7{La9)AKR;8vN(R;5DJ}WV#,2;B[JGJ2C4F=L(#,2;B[JGJ2C43LPr#,2;B[J2C4SoQT!#Unicode1jW<RQ>-1;UbP)9$R5=g5DAl5<CGKy2ISC#,@}Hg#:Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys:MFdK|Pm6`9+K>!#WnPB5D1jW<6<PhR*Unicode#,@}HgXML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML5H5H#,2"GR#,UnicodeJGJ5OVISO/IEC 106465DU}9f7=J=!#Pm6`2YWwO5M3#,KySPWnPB5Dd/@@Fw:MPm6`FdK{2zF76<V'3VK|!#Unicode1jW<5D3vOV:MV'3VK|9$>_5D4fTZ#,JG=|@4H+GrHm<~<<JuWnVXR*5D7"U9GwJF!#
 7 | 
 8 | $)A=+UnicodeSk?M;'7~NqFw;r6`2cS&SC3LPr:MMxU>=a:O#,1HJ9SC4+M3WV7{</=ZJ!7QSC!#UnicodeJ95%R;Hm<~2zF7;r5%R;MxU>D\9;9a4)6`8vF=L(#,SoQT:M9z<R#,6x2;PhR*VX=(!#K|?I=+J}>]4+Jd5=Pm6`2;M,5DO5M3#,6xN^Kp;5!#
 9 | $)A9XSZUnicodeQ'JuQ';a
10 | 
11 | Unicode$)AQ'JuQ';aJGR;8v7GS/@{5DWiV/#,JGN*7"U9#,@)U9:MMF9cJ9SCUnicode1jW<6x=(A"5D#,UnicodeQ'JuQ';aIhA"AKOV4zHm<~2zF7:M1jW<ND1>5D1mJ>7(!#Q'JuQ';a5D;aT14z1mAK9c7:AlSr5D<FKc;z:MWJQ69$R55D9+K>:MWiV/!#Q'JuQ';aV;SI;aT1La9)WJ=p!#UnicodeQ'JuQ';a5D;aT1WJ8q?*7E8xJ@=gIOHN:NV'3VUnicode1jW<:MO#M{P-VzFd@)U9:MV4PP5DWiV/<08vHK!#
12 | 
13 | $)AS{V*8|6`PEO"#,Gk2NTDJuSo4J;c1m#,Unicode2zF7Qy1>#,<<Ju<r=i:M2N?<WJAO!#
14 | 


--------------------------------------------------------------------------------
/src/test/data/encodings/lang_arabic:
--------------------------------------------------------------------------------
1 | قبل حرب 1948 كانت المنطقة جزءاً من الانتداب البريطاني على فلسطين. بقيت أراضي الضفة الغربية في أيادي الجيش الأردني بعد التوقيع على اتفاقيات الهدنة (اتفاقيات رودس) التي أنهت الحرب عام 1949 ورسمت الحدود الفاصلة بين الضفة الغربية والأراضي التي أقيمت عليها دولة إسرائيل. هذه الحدود (التي هي جزء من الخط الأخضر) ضمت إلى الضفة الغربية الجزء الشرقي لمدينة القدس، بما في ذلك البلدة القديمة، ما عدا جبل المشارف.
2 | تمت الوحدة بين الضفتين الشرقية (الأردنية) والغربية (الفلسطينية) بعد مؤتمر أريحا عام 1951م الذي طالب بالوحدة. ظلت هذه الوحدة قائمة مع الضفة الشرقية واعتبار أهالي الضفة الغربية مواطنيين أردنيين حتى عام 1988 عندما قرر الملك حسين الراحل فك الارتباط القانوني والإداري والمالي (قرار فك الارتباط) بناءا على طلب منظمة التحرير الفلسطينية ماعدا الأوقاف التي بقيت مرتبطة مع الحكومة الأردنية حتى اليوم من إشراف وتعيينات وصيانة للأوقاف المسيحية والإسلامية والتزامات مالية.
3 | في 5 حزيران 1967 احتلت إسرائيل أراضي الضفة الغربية (وأراضٍ أخرى) إبان حرب الأيام الستة (النكسة) ولا تزال الضفة خاضعة لأحكام اتفاقية جنيف الرابعة للأراضي المحتلة. على الرغم من ذلك قامت إسرائيل ببناء العديد من الـمستوطنات في الضفة. كما قامت إسرائيل بضم القدس الشرقية وضواحيها بشكل أحادي الجانب لم يعترف بشرعيتة المجتمع الدولي. تطلق الحكومة الإسرائيلية على المنطقة اسم "يهودا وشومرون" (أي "يهوذا والسامرة")، حيث تذكر بهذا الاسم في الوثائق الإسرائيلية الرسمية.
4 | في عام 1993 وقعت إسرائيل ومنظمة التحرير الفلسطينية اتفاقية أوسلو التي نصت على إقامة حكومة ذاتية فلسطينية تدير الحياة المدنية في الضفة الغربية وقطاع غزة لفترة انتقالية، على أن تستأنف المفاوضات في القضايا المتبقية، كالقدس واللاجئين. وبالفعل وفي عام 1994 أقيمت السلطة الوطنية الفلسطينية في بعض المدن والقرى الفلسطينية بالتدريج، ولكنها منذ انتفاضة الأقصى لا تستطيع القيام بواجباتها بشكل ناجح. لا يزال هناك مفاوضات بين الإسرائليين والفلسطينيين ولكنها كثيراً ما تتعثر بسبب إصرار إسرائيل على متابعة استيطانها في الضفة الغربية.
5 | 
6 | 
7 | الجدار الفاصل جرف الكثير من أراضي الضفة الغربية
8 | في أبريل 2002 شرعت الحكومة الإسرائيلية برئاسة أرييل شارون ببناء جدار فاصل بينها وبين الفلسطينيين داخل أراضي الضفة الغربية قالت أنه بهدف حماية إسرائيل من العمليات العسكرية الفلسطينية. لكنه اقتضم الكثير من الأراضي الفلسطينية وساهم في إحكام الحصار على الشعب الفلسطيني وإفقار اقتصاده الوطني بشكل شبه كامل. كما تم عزل مدن وبلدات بكاملها عن محيطها الفلسطيني.


--------------------------------------------------------------------------------
/src/test/data/encodings/lang_hebrew:
--------------------------------------------------------------------------------
 1 | מה זה יוניקוד (Unicode)?
 2 | 
 3 | יוניקוד מקצה מספר ייחודי לכל תו,
 4 | לא משנה על איזו פלטפורמה,
 5 | לא משנה באיזו תוכנית,
 6 | ולא משנה באיזו שפה.
 7 | 
 8 | באופן בסיסי, מחשבים עוסקים רק במספרים. הם מאחסנים אותיות ותווים אחרים על-ידי הקצאת מספר לכל אחד מהם. בטרם הומצא היוניקוד, היו מאות מערכות קידוד שונות להקצאת המספרים הללו. אף לא אחת מהן יכלה להכיל כמות תווים מספקת. לדוגמא: רק לאיחוד האירופאי נדרשים כמה סוגי קידודים שונים על מנת לכסות את כל השפות המדוברות בו. יתירה מזאת אף לשפה בודדת, כמו אנגלית למשל, לא היה די במערכת קידוד אחת בעבור כל האותיות, סימני הפיסוק והסמלים הטכניים שבשימוש שוטף.
 9 | 
10 | מערכות קידוד אלו אף סותרות זו את זו. כלומר, שני קידודים יכולים להשתמש באותו מספר לשני תוים נבדלים, או להשתמש במספרים שונים לאותו תו. על כל מחשב (ובמיוחד שרתים) לתמוך במספר רב של מערכות קידוד שונות; אולם כל אימת שנתונים עוברים בין מערכות קידוד או פלטפורמות שונות קיים הסיכון שייפגמו.
11 | יוניקוד משנה את כל זה!
12 | 
13 | יוניקוד מקצה מספר ייחודי לכל תו, ללא תלות בפלטפורמה, בתוכנית, או בשפה. תקן היוניקוד אומץ על-ידי המובילים בתעשייה כמו Apple‏, HP‏, IBM‏, JustSystem‏, Microsoft‏, Oracle‏, SAP‏, Sun‏, Sybase‏, Unisys‏ ורבים אחרים. יוניקוד נדרש על-ידי תקנים מודרניים כמו XML‏, Java‏, ECMAScript (JavaScript)‎‏, LDAP‏, CORBA 3.0‎‏, WML‏ וכדומה, ומהווה למעשה את היישום הרשמי של תקן ISO/IEC 10646. הוא נתמך על ידי מערכות הפעלה רבות, כל הדפדפנים החדישים, ומוצרים רבים אחרים. הופעת תקן היוניקוד וזמינות הכלים התומכים בו נמנות עם המגמות הכלל-עולמיות החשובות ביותר, אשר מסתמנות לאחרונה בטכנולוגיית התוכנה.
14 | 
15 | שילוב יוניקוד ביישומי שרת-לקוח או ביישומים רבי-שכבות ובאתרי אינטרנט מאפשר חיסכון ניכר בעלויות לעומת השימוש בסדרות התווים המסורתיות. הודות ליוניקוד, מוצר תוכנה אחד או אתר יחיד ברשת יכול להרחיב את יעדיו למגוון פלטפורמות, ארצות ושפות ללא צורך בשינויים מרחיקים. יוניקוד מאפשר מעבר נתונים דרך מערכות רבות ושונות מבלי שייפגמו.
16 | פרטים אודות הקונסורציום של יוניקוד (Unicode Consortium)
17 | 
18 | הקונסורציום של יוניקוד הוא ארגון ללא מטרת רווח שנוסד כדי לפתח, להרחיב ולקדם את השימוש בתקן יוניקוד, אשר מגדיר את ייצוג הטקסט במוצרי תוכנה ותקנים מודרניים. חברים בקונסורציום מגוון רחב של תאגידים וארגונים בתעשיית המחשבים ועיבוד המידע. הקונסורציום ממומן על-ידי דמי-חבר בלבד. החברות בקונסורציום יוניקוד פתוחה לארגונים ולאנשים פרטיים, בכל רחבי העולם, אשר תומכים בתקן יוניקוד ומעוניינים לסייע בהתפתחותו והטמעתו.
19 | 
20 | למידע נוסף, ראה מילון מונחים, רשימה חלקית של מוצרים מותאמים ליוניקוד, מבוא טכני ו- חומרי עזר [קישורים באנגלית].


--------------------------------------------------------------------------------
/src/test/data/encodings/lang_turkish:
--------------------------------------------------------------------------------
1 | Leylek (Ciconia ciconia), leylekgiller (Ciconiidae) familyasından büyük ve uzun bacaklı bir kuş türü. Siyah kanat uçuş tüylerinin dışında tamamen beyazdır, gagası ve bacakları erişkinlerde kırmızı, yavrularda ise siyahtır. Cüssesi biraz farklı olan iki alttürü ise Avrupa'da (kuzeyde Finlandiya'ya kadar), kuzeybatı Afrika'da ve güneybatı Asya'da (doğuda Kazakistan'ın güneyine kadar) bulunur. Leylekler uzun mesafelere göç ederler. Çoğunlukla tropikal Sahraaltı Afrika'dan Güney Afrika'nın güneyine ve hatta Hindistan altkıtasının güneyine kadar olan bölgede kışı geçirirler. Avrupa'dan Afrika'ya göç ederken Akdeniz üzerinden değil, doğuda Levant üzerinden, batıda da Cebelitarık Boğazı'ndan geçerler. Bunun nedeni uçmak için gereksinim duydukları hava termallerinin deniz üzerinde oluşmamasıdır. Yerde yürürken durmadan, yavaşça hareket ederler. Leylekgiller ailesinin diğer üyeleri gibi boynu tamamen gerilmiş şekilde uçarlar.
2 | Etçil olan leylek, böcekler, balık, amfibiler, sürüngenler, küçük memeliler ve küçük kuşlar gibi çok geniş bir yelpazede beslenir. Besinlerinin çoğunu yerden, kısa bitki örtüsü içinden ve sığ sulardan toplar. Tekeşli olarak ürerler ancak yaşam boyunca sürecek bir çift bağı kurmazlar. Hem erkeği hem de dişisi, çubuklardan oluşan ve birkaç yıl kullanılabilen büyük bir yuva yapar. Dişi leylek her yıl bir kereliğine olmak üzere dört yumurta yumurtlar ve yavrular 33-34 gün sonra aynı anda olmamak üzere yumurtadan çıkar. Çifti oluşturan kuşların ikisi de kuluçkaya yatar ve birlikte yavruları beslerler. Yavrular yumurtadan çıktıktan 58-64 gün sonra yuvadan ayrılır ve 7 ila 20 gün daha ebeveynler tarafından beslenir.
3 | Leylek, Dünya Doğa ve Doğal Kaynakları Koruma Birliği (IUCN) tarafından asgari endişe altındaki türler arasında sınıflandırılmıştır. Orta Çağ boyunca ormanların azalması leyleklerin yararına olmuştur ancak tarım pratiklerinin değişmesi ve sanayileşme 19. yüzyılda ve 20. yüzyılın başlarında Avrupa'nın bazı bölgelerinde popülasyonlarının azalmasına ve hatta yok olmasına neden olmuştur. Avrupa çapındaki koruma programlarının sonucunda leyleklerin tekrar Hollanda, Belçika, İsviçre ve İsveç'te üremeleri sağlanmıştır. Doğal düşmanlarının sayısı azdır ancak çeşitli parazitler taşıyabilir. Dikkat çekici bir tür olan leylek tarih boyunca bulunduğu bölgelerde çeşitli söylencelere konu olmuştur. Bunların en bilineni, bebeklerin leylekler tarafından getirildiği söylencesidir.
4 | € ‚  ƒ  „  …  †  ‡ Š


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "chardet",
 3 |   "version": "0.0.0-development",
 4 |   "homepage": "https://github.com/runk/node-chardet",
 5 |   "description": "Character encoding detector",
 6 |   "license": "MIT",
 7 |   "repository": {
 8 |     "type": "git",
 9 |     "url": "https://github.com/runk/node-chardet.git"
10 |   },
11 |   "bugs": {
12 |     "email": "deadrunk@gmail.com",
13 |     "url": "https://github.com/runk/node-chardet/issues"
14 |   },
15 |   "scripts": {
16 |     "build": "rm -rf lib/* && tsc",
17 |     "format": "prettier --write ./src/**/*.ts",
18 |     "format:check": "prettier --list-different ./src/**/*.ts",
19 |     "test": "jest",
20 |     "prepublish": "npm run build",
21 |     "semantic-release": "semantic-release",
22 |     "typecheck": "tsc"
23 |   },
24 |   "files": [
25 |     "lib"
26 |   ],
27 |   "main": "lib/index.js",
28 |   "typings": "lib/index.d.ts",
29 |   "engines": {
30 |     "node": ">=4"
31 |   },
32 |   "readmeFilename": "README.md",
33 |   "directories": {
34 |     "test": "test"
35 |   },
36 |   "devDependencies": {
37 |     "@types/jest": "^30.0.0",
38 |     "@types/node": "^24.0.0",
39 |     "jest": "^30.0.0",
40 |     "prettier": "^3.0.0",
41 |     "semantic-release": "^25.0.0",
42 |     "ts-jest": "^29.0.0",
43 |     "ts-node": "^10.9.1",
44 |     "typescript": "^5.0.0"
45 |   },
46 |   "keywords": [
47 |     "encoding",
48 |     "character",
49 |     "utf8",
50 |     "detector",
51 |     "chardet",
52 |     "icu",
53 |     "character detection",
54 |     "character encoding",
55 |     "language",
56 |     "iconv",
57 |     "iconv-light",
58 |     "UTF-8",
59 |     "UTF-16",
60 |     "UTF-32",
61 |     "ISO-2022-JP",
62 |     "ISO-2022-KR",
63 |     "ISO-2022-CN",
64 |     "Shift_JIS",
65 |     "Big5",
66 |     "EUC-JP",
67 |     "EUC-KR",
68 |     "GB18030",
69 |     "ISO-8859-1",
70 |     "ISO-8859-2",
71 |     "ISO-8859-5",
72 |     "ISO-8859-6",
73 |     "ISO-8859-7",
74 |     "ISO-8859-8",
75 |     "ISO-8859-9",
76 |     "windows-1250",
77 |     "windows-1251",
78 |     "windows-1252",
79 |     "windows-1253",
80 |     "windows-1254",
81 |     "windows-1255",
82 |     "windows-1256",
83 |     "KOI8-R"
84 |   ],
85 |   "author": "Dmitry Shirokov <deadrunk@gmail.com>",
86 |   "contributors": [
87 |     "@spikying",
88 |     "@wtgtybhertgeghgtwtg",
89 |     "@suisho",
90 |     "@seangarner",
91 |     "@zevanty"
92 |   ],
93 |   "browser": {
94 |     "./lib/fs/node.js": "./lib/fs/browser.js"
95 |   }
96 | }
97 | 


--------------------------------------------------------------------------------
/src/encoding/utf8.ts:
--------------------------------------------------------------------------------
 1 | import type { Context, Recogniser } from '.';
 2 | import match, { type EncodingName, type Match } from '../match';
 3 | 
 4 | /**
 5 |  * Recogniser for UTF-8 input. Confidence comes from the presence of a byte
 6 |  * order mark (EF BB BF) and from how many well-formed versus malformed
 7 |  * multi-byte sequences the input holds.
 8 |  */
 9 | export default class Utf8 implements Recogniser {
10 |   /** Label reported for every match produced by this recogniser. */
11 |   name(): EncodingName {
12 |     return 'UTF-8';
13 |   }
14 | 
15 |   /**
16 |    * Rates how likely the raw input is to be UTF-8.
17 |    *
18 |    * @param det Detection context; only `rawInput` and `rawLen` are read.
19 |    * @returns A match with confidence 100, 80, 25 or 10, or `null` when malformed
20 |    *   sequences are too frequent compared with well-formed ones.
21 |    */
22 |   match(det: Context): Match | null {
23 |     let hasBOM = false;
24 |     let numValid = 0;
25 |     let numInvalid = 0;
26 |     let trailBytes = 0;
27 |     let confidence: number;
28 |     const input = det.rawInput;
29 | 
30 |     if (
31 |       det.rawLen >= 3 &&
32 |       (input[0] & 0xff) === 0xef &&
33 |       (input[1] & 0xff) === 0xbb &&
34 |       (input[2] & 0xff) === 0xbf
35 |     ) {
36 |       hasBOM = true;
37 |     }
38 | 
39 |     // Scan for multi-byte sequences
40 |     for (let i = 0; i < det.rawLen; i++) {
41 |       const b = input[i];
42 |       if ((b & 0x80) === 0) continue; // ASCII
43 | 
44 |       // Hi bit on char found.  Figure out how long the sequence should be
45 |       if ((b & 0xe0) === 0xc0) {
46 |         trailBytes = 1;
47 |       } else if ((b & 0xf0) === 0xe0) {
48 |         trailBytes = 2;
49 |       } else if ((b & 0xf8) === 0xf0) {
50 |         trailBytes = 3;
51 |       } else {
52 |         // Not a lead byte (bare continuation byte or 0xf8-0xff). Count it once
53 |         // and resume at the next byte; entering the trail-byte check here would
54 |         // swallow that byte and count a second error for the same stray byte.
55 |         numInvalid++;
56 |         if (numInvalid > 5) break; // Enough errors seen; stop scanning early.
57 |         continue;
58 |       }
59 | 
60 |       // Verify that we've got the right number of trail bytes in the sequence
61 |       for (;;) {
62 |         i++;
63 |         if (i >= det.rawLen) break; // Input ended mid-sequence: counted neither way.
64 | 
65 |         if ((input[i] & 0xc0) !== 0x80) {
66 |           numInvalid++;
67 |           break;
68 |         }
69 |         if (--trailBytes === 0) {
70 |           numValid++;
71 |           break;
72 |         }
73 |       }
74 |     }
75 | 
76 |     // Cook up some sort of confidence score, based on presence of a BOM
77 |     //    and the existence of valid and/or invalid multi-byte sequences.
78 |     if (hasBOM && numInvalid === 0) {
79 |       confidence = 100;
80 |     } else if (hasBOM && numValid > numInvalid * 10) {
81 |       confidence = 80;
82 |     } else if (numValid > 3 && numInvalid === 0) {
83 |       confidence = 100;
84 |     } else if (numValid > 0 && numInvalid === 0) {
85 |       confidence = 80;
86 |     } else if (numValid === 0 && numInvalid === 0) {
87 |       // Plain ASCII.
88 |       confidence = 10;
89 |     } else if (numValid > numInvalid * 10) {
90 |       // Probably corrupt utf-8 data.  Valid sequences aren't likely by chance.
91 |       confidence = 25;
92 |     } else {
93 |       return null;
94 |     }
95 | 
96 |     return match(det, this, confidence);
97 |   }
98 | }
99 | 


--------------------------------------------------------------------------------
/src/encoding/sbcs.test.ts:
--------------------------------------------------------------------------------
 1 | import * as chardet from '..';
 2 | 
 3 | describe('Singlebyte Character Sets', () => {
 4 |   const base = __dirname + '/../test/data/encodings';
 5 | 
 6 |   // Registers a test asserting that the fixture at `file` (appended to `base`)
 7 |   // is detected as `encoding`.
 8 |   const detects = (title: string, file: string, encoding: string) =>
 9 |     it(title, () => {
10 |       expect(chardet.detectFileSync(base + file)).toBe(encoding);
11 |     });
12 | 
13 |   // ISO-8859 family
14 |   detects('should return ISO-8859-1 (English)', '/iso88591_en', 'ISO-8859-1');
15 |   detects('should return ISO-8859-2 (Czech)', '/iso88592_cs', 'ISO-8859-2');
16 | 
17 |   test.todo('should return ISO-8859-3');
18 |   test.todo('should return ISO-8859-4');
19 | 
20 |   detects('should return ISO-8859-5 (Russian)', '/iso88595_ru', 'ISO-8859-5');
21 |   detects('should return ISO-8859-6 (Arabic)', '/iso88596_ar', 'ISO-8859-6');
22 |   detects('should return ISO-8859-7 (Greek)', '/iso88597_el', 'ISO-8859-7');
23 |   detects('should return ISO-8859-8 (Hebrew)', '/iso88598_he', 'ISO-8859-8');
24 |   detects('should return ISO-8859-9 (Turkish)', '/iso88599_tr', 'ISO-8859-9');
25 | 
26 |   test.todo('should return ISO-8859-10');
27 |   test.todo('should return ISO-8859-11');
28 |   // iso-8859-12 is abandoned
29 |   test.todo('should return ISO-8859-13');
30 |   test.todo('should return ISO-8859-14');
31 |   test.todo('should return ISO-8859-15');
32 |   test.todo('should return ISO-8859-16');
33 | 
34 |   // Windows code pages
35 |   detects('should return windows-1250 (Czech)', '/windows_1250', 'windows-1250');
36 |   detects('should return windows-1251 (Russian)', '/windows_1251', 'windows-1251');
37 |   detects('should return windows-1252 (English)', '/windows_1252', 'windows-1252');
38 |   detects('should return windows-1253 (Greek)', '/windows_1253', 'windows-1253');
39 |   detects('should return windows-1254 (Turkish)', '/windows_1254', 'windows-1254');
40 |   detects('should return windows-1255 (Hebrew)', '/windows_1255', 'windows-1255');
41 |   detects('should return windows-1256 (Arabic)', '/windows_1256', 'windows-1256');
42 | 
43 |   detects('should return KOI8-R (Russian)', '/koi8r', 'KOI8-R');
44 | });
45 | 


--------------------------------------------------------------------------------
/src/test/data/encodings/iso2022jp:
--------------------------------------------------------------------------------
 1 | $B%3%s%T%e!<%?!<$O!"K\<AE*$K$O?t;z$7$+07$&$3$H$,$G$-$^$;$s!#%3%s%T%e!<%?!<$O!"J8;z$d5-9f$J$I$N$=$l$>$l$KHV9f$r3d$j?6$k$3$H$K$h$C$F07$($k$h$&$K$7$^$9!#%f%K%3!<%I$,=PMh$k$^$G$O!"$3$l$i$NHV9f$r3d$j?6$k;EAH$_$,2?I4<oN`$bB8:_$7$^$7$?!#$I$N0l$D$r$H$C$F$b!"==J,$JJ8;z$r4^$s$G$O$$$^$;$s$G$7$?!#Nc$($P!"2$=#O"9g0l$D$r8+$F$b!"$=$N$9$Y$F$N8@8l$r%+%P!<$9$k$?$a$K$O!"$$$/$D$+$N0[$J$kId9f2=$N;EAH$_$,I,MW$G$7$?!#1Q8l$N$h$&$J0l$D$N8@8l$K8B$C$F$b!"0l$D$@$1$NId9f2=$N;EAH$_$G$O!"0lHLE*$K;H$o$l$k$9$Y$F$NJ8;z!"6gFIE@!"5;=QE*$J5-9f$J$I$r07$&$K$OIT==J,$G$7$?!#(B
 2 | 
 3 | $B$3$l$i$NId9f2=$N;EAH$_$O!"Aj8_$KL7=b$9$k$b$N$G$b$"$j$^$7$?!#Fs$D$N0[$J$kId9f2=$N;EAH$_$,!"Fs$D$N0[$J$kJ8;z$KF10l$NHV9f$rIU$1$k$3$H$b$G$-$k$7!"F1$8J8;z$K0[$J$kHV9f$rIU$1$k$3$H$b$G$-$k$N$G$9!#$I$N$h$&$J%3%s%T%e!<%?!<$b!JFC$K%5!<%P!<$O!KB?$/$N0[$J$C$?Id9f2=$N;EAH$_$r%5%]!<%H$9$kI,MW$,$"$j$^$9!#$?$H$(%G!<%?$,0[$J$kId9f2=$N;EAH$_$d%W%i%C%H%U%)!<%`$rDL2a$7$F$b!"$$$D$I$3$G%G!<%?$,Mp$l$k$+J,$+$i$J$$4m81$rKA$9$3$H$N$J$k$N$G$9!#(B
 4 | $B%f%K%3!<%I$O$9$Y$F$rJQ$($^$9(B
 5 | 
 6 | $B%f%K%3!<%I$O!"%W%i%C%H%U%)!<%`$K78$o$i$:!"%W%m%0%i%`$K78$o$i$:!"8@8l$K78$o$i$:!"$9$Y$F$NJ8;z$KFHN)$7$?HV9f$rM?$($^$9!#%f%K%3!<%II8=`$O!"%"%C%W%k!"%R%e!<%l%C%H%Q%C%+!<%I!"(BIBM$B!"%8%c%9%H%7%9%F%`!"%^%$%/%m%=%U%H!"%*%i%/%k!"(BSAP$B!"%5%s!"%5%$%Y!<%9$J$I$N;:6H3&$N<gF3E*4k6H$HB>$NB?$/$N4k6H$K:NMQ$5$l$F$$$^$9!#%f%K%3!<%I$O!"(BXML$B!"(BJava$B!"(BECMAScript(JavaScript)$B!"(BLDAP$B!"(BCORBA 3.0$B$J$I$N:G@hC<$NI8=`$NA0Ds$H$J$C$F$*$j!"%f%K%3!<%I$r<BAu$9$l$P!"(BISO/IEC 10646$B$KE,9g$9$k$3$H$K$J$j$^$9!#%f%K%3!<%I$O!"B?$/$N%*%Z%l!<%F%#%s%0%7%9%F%`$H$9$Y$F$N:G?7$N%V%i%&%6!<$HB>$NB?$/$N@=IJ$G%5%]!<%H$5$l$F$$$^$9!#%f%K%3!<%II8=`$N=P8=$H%f%K%3!<%I$r%5%]!<%H$9$k%D!<%kN`$O!":r:#82Cx$K$J$C$F$$$k%=%U%H%&%(%"5;=Q$N%0%m!<%P%k2=$NN.$l$KBP$7$F!"FC$KLr$KN)$C$F$$$^$9!#(B
 7 | 
 8 | $B%f%K%3!<%I$r%/%i%$%"%s%H%5!<%P!<7?$N%"%W%j%1!<%7%g%s$d!"B?AX9=B$$r;}$D%"%W%j%1!<%7%g%s!"%&%'%V%5%$%H$J$I$K$KAH$_9~$`$3$H$G!"=>Mh$NJ8;z%3!<%I%;%C%H$rMQ$$$k$h$j$bL@$i$+$J%3%9%H:o8:$,2DG=$G$9!#%f%K%3!<%I$O!"C10l$N%=%U%H%&%(%"@=IJ!"C10l$N%&%'%V%5%$%H$K!"2?$i<j$r2C$($k$3$H$J$/!"J#?t$N%W%i%C%H%U%)!<%`!"J#?t$N8@8l!"J#?t$N9q$r%+%P!<$9$k$3$H$,=PMh$k$N$G$9!#%f%K%3!<%I$O!"%G!<%?$,B?$/$N0[$J$k%7%9%F%`$N4V$r!"2?$NMp$l$b$J$7$KE>Aw$9$k$3$H$r2DG=$H$9$k$N$G$9!#(B
 9 | $B%f%K%3!<%I%3%s%=!<%7%"%`$K$D$$$F(B
10 | 
11 | $B%f%K%3!<%I%3%s%=!<%7%"%`$O!":G?7$N%=%U%H%&%(%"@=IJ$HI8=`$K$*$$$F%F%-%9%H$rI=8=$9$k$3$H$r0UL#$9$k!H%f%K%3!<%II8=`!I$N9=C[!"H/E8!"Ia5Z!"MxMQB%?J$rL\E*$H$7$F@_N)$5$l$?Hs1DMxAH?%$G$9!#F1%3%s%=!<%7%"%`$N2q0w$O!"%3%s%T%e!<%?!<$H>pJs=hM}$K78$o$k9-HF$J4k6H$dAH?%$+$i9=@.$5$l$F$$$^$9!#F1%3%s%=!<%7%"%`$O!":b@/E*$K$O!"=c?h$K2qHq$N$_$K$h$C$F1?1D$5$l$F$$$^$9!#%f%K%3!<%II8=`$r;Y;}$7!"$=$N3HD%$H<BAu$r;Y1g$9$k@$3&Cf$NAH?%$d8D?M$O!"$@$l$b$,%f%K%3!<%I%3%s%=!<%7%"%`$N2q0w$J$k$3$H$,$G$-$^$9!#(B
12 | 
13 | $B$h$j>\$7$$$3$H$r$*CN$j$K$J$j$?$$J}$O!"(BGlossary, Unicode-Enabled Products, Technical Introduction $B$*$h$S(B Useful Resources$B$r$4;2>H$/$@$5$$!#(B
14 | 


--------------------------------------------------------------------------------
/src/test/data/encodings/iso88591_en:
--------------------------------------------------------------------------------
 1 | What is Unicode?
 2 | 
 3 | Unicode provides a unique number for every character,
 4 | no matter what the platform,
 5 | no matter what the program,
 6 | no matter what the language.
 7 | 
 8 | Fundamentally, computers just deal with numbers. They store letters and other characters by assigning a number for each one. Before Unicode was invented, there were hundreds of different encoding systems for assigning these numbers. No single encoding could contain enough characters: for example, the European Union alone requires several different encodings to cover all its languages. Even for a single language like English no single encoding was adequate for all the letters, punctuation, and technical symbols in common use.
 9 | 
10 | These encoding systems also conflict with one another. That is, two encodings can use the same number for two different characters, or use different numbers for the same character. Any given computer (especially servers) needs to support many different encodings; yet whenever data is passed between different encodings or platforms, that data always runs the risk of corruption.
11 | Unicode is changing all that!
12 | 
13 | Unicode provides a unique number for every character, no matter what the platform, no matter what the program, no matter what the language. The Unicode Standard has been adopted by such industry leaders as Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys and many others. Unicode is required by modern standards such as XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML, etc., and is the official way to implement ISO/IEC 10646. It is supported in many operating systems, all modern browsers, and many other products. The emergence of the Unicode Standard, and the availability of tools supporting it, are among the most significant recent global software technology trends.
14 | 
15 | Incorporating Unicode into client-server or multi-tiered applications and websites offers significant cost savings over the use of legacy character sets. Unicode enables a single software product or a single website to be targeted across multiple platforms, languages and countries without re-engineering. It allows data to be transported through many different systems without corruption.
16 | About the Unicode Consortium
17 | 
18 | The Unicode Consortium is a non-profit organization founded to develop, extend and promote use of the Unicode Standard, which specifies the representation of text in modern software products and standards. The membership of the consortium represents a broad spectrum of corporations and organizations in the computer and information processing industry. The consortium is supported financially solely through membership dues. Membership in the Unicode Consortium is open to organizations and individuals anywhere in the world who support the Unicode Standard and wish to assist in its extension and implementation.
19 | 
20 | For more information, see the Glossary, Unicode Enabled Products, Technical Introduction and Useful Resources.
21 | 


--------------------------------------------------------------------------------
/src/test/data/encodings/iso2022kr:
--------------------------------------------------------------------------------
 1 | $)C1b:;@{@87N DDG;EM4B <}@Z88 C38.GU4O4Y. 1[@Z3* 4Y8% 9.@Z?!55 <}@Z8& AvA$GO?) @z@eGU4O4Y. @/4ODZ5e0! 039_5G1b @|?!4B @L7/GQ <}@Z8& AvA$GO1b @'GX <v9i 0!Av@G 4Y8% 1bH#H- =C=:E[@; ;g?kG_=@4O4Y. 4\@O 1bH#H- 9f9}@87N4B 8p5g 9.@Z8& FwGTGR <v >x>z=@4O4Y. ?98& 5i>n @/74 ?,GU?!<-88 :84u6s55 8p5g 0" 3*6s:0 >p>n8& C38.GO7A8i ?)7/ 03@G 4Y8% 1bH#H- 9f9}@L GJ?dGU4O4Y. ?5>n?M 00@: 4\@O >p>n@G 0f?l55 0xEk@{@87N ;g?k5G4B 8p5g 1[@Z, 9.@e :NH# 9W EWE)4ODC 1bH#?! 8B4B 4\@O 1bH#H- 9f9}@; 0.0m @VAv 8xGO?4=@4O4Y.
 2 | 
 3 | $)C@L7/GQ 1bH#H- =C=:E[@: 6GGQ 4Y8% 1bH#H- =C=:E[0z Cf59GU4O4Y. Ao 5N 0!Av 1bH#H- 9f9}@L 5N 03@G 4Y8% 9.@Z?! 4kGX 00@: 9xH#8& ;g?kGO0E3* 00@: 9.@Z?! 4kGX 4Y8% 9xH#8& ;g?kGR <v @V=@4O4Y. AV>nAx 8p5g DDG;EM(F/Hw <-9v)4B <-7N 4Y8% ?)7/ 0!Av 1bH#H- 9f9}@; Av?xGX>_ GU4O4Y. 1W7/3*, 5%@LEM8& <-7N 4Y8% 1bH#H- 9f9}@L3* GC7'F{ 0#?! @|4^GR 6'864Y 1W 5%@LEM4B GW;s <U;s@G @'Gh@; 0^0T 5K4O4Y.
 4 | $)C@/4ODZ5e7N 8p5g 0M@; GX0aGR <v @V=@4O4Y!
 5 | 
 6 | $)C@/4ODZ5e4B ;g?k A_@N GC7'F{, GA7N1W7%, >p>n?! 0|0h>x@L 9.@Z864Y 0m@/GQ <}@Z8& A&0xGU4O4Y. @/4ODZ5e G%AX@: Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys 9W 1bE8 ?)7/ H8;g?M 00@: >w0h <15NAV@Z?! @GGX C$EC5G>z=@4O4Y. @/4ODZ5e4B XML, Java, ECMAScript(JavaScript), LDAP, CORBA 3.0, WML 5n0z 00@L Gv@g 3N8. ;g?k5G4B G%AX?!<- GJ?dGO8g @L4B ISO/IEC 10646@; 18GvGO4B 0x=D@{@N 9f9}@T4O4Y. @L4B 89@: ?n?5 C<A&, ?dAr ;g?k5G4B 8p5g :j6s?l@z 9W 1bE8 89@: A&G0?!<- Av?x5K4O4Y. @/4ODZ5e G%AX@G :N;s0z @L8& Av?xGO4B 5518@G 0!?k<:@: CV1Y @| <<0h?! :R0m @V4B 1b<z 0fGb?!<- 0!@e A_?dGQ :N:P@; BwAvGO0m @V=@4O4Y.
 7 | 
 8 | $)C@/4ODZ5e8& E,6s@L>pF.-<-9v 6G4B 4YA_-?,0a @@?k GA7N1W7%0z @% ;g@LF.?! EkGUGO8i 790E=C 9.@Z <<F. ;g?k?! @V>n<- ;s4gGQ :q?k @}0( H?0z0! 3*E8334O4Y. @/4ODZ5e8& EkGX 8.?#Av4O>n85 >x@L 4YA_ GC7'F{, >p>n 9W 190! 0#?! 4\@O <RGAF.?~>n GC7'F{ 6G4B 4\@O @% ;g@LF.8& 8qG%7N ;o@; <v @V=@4O4Y. @L8& ;g?kGO8i 5%@LEM8& <U;s >x@L ?)7/ =C=:E[@; EkGX @|<[GR <v @V=@4O4Y.
 9 | $)C@/4ODZ5e D\<R=C>v?! 4kGX
10 | 
11 | $)C@/4ODZ5e D\<R=C>v@: :q?58. A6Aw@87N<- Gv4k <RGAF.?~>n A&G00z G%AX?!<- EX=:F.@G G%Gv@; AvA$GO4B @/4ODZ5e G%AX@G ;g?k@; 039_GO0m H.@eGO8g @e7AGO1b @'GX <<?vA3=@4O4Y. D\<R=C>v 8b9v=1@: DDG;EM?M A$:8 C38. ;j>w?! A>;gGO0m @V4B 1$9|@'GQ H8;g 9W A6Aw@G 9|@'8& 3*E83@4O4Y. D\<R=C>v@G @gA$@: @|@{@87N H8:q?! @GGX Cf4g5K4O4Y. @/4ODZ5e DA<R=C>v?!<-@G 8b9v=1@: @| <<0h >n4@ 0w?!<-3* @/4ODZ5e G%AX@; Av?xGO0m 1W H.@e0z 18Gv@; Av?xGO0m@ZGO4B A6Aw0z 03@N?!0T 039f5G>n @V=@4O4Y.
12 | 
13 | $)C4u @Z<<GQ 3;?k@: ?k>nA}, ?9A& @/4ODZ5e ;g?k 0!4I A&G0, 1b<z A$:8 9W 1bE8 @/?kGQ A$:88& B|A6GO=J=C?@.
14 | 


--------------------------------------------------------------------------------
/src/index.test.ts:
--------------------------------------------------------------------------------
 1 | import * as chardet from '.';
 2 | import defaultChardet from '.';
 3 | import fs from 'fs';
 4 | 
 5 | describe('chardet', () => {
 6 | 
 7 |   const path = __dirname + '/test/data/encodings/utf8';
 8 |   const expectedEncodingsFromPath = [
 9 |     { 'confidence': 100, 'name': 'UTF-8', 'lang': undefined },
10 |     { 'confidence': 32, 'name': 'windows-1252', 'lang': 'fr' },
11 |     { 'confidence': 19, 'name': 'KOI8-R', 'lang': 'ru' },
12 |     { 'confidence': 10, 'name': 'Big5', 'lang': 'zh' },
13 |     { 'confidence': 10, 'name': 'GB18030', 'lang': 'zh' }, // Mandarin
14 |     { 'confidence': 10, 'name': 'windows-1253', 'lang': 'el' }, // Greek
15 |     { 'confidence': 6, 'name': 'windows-1250', 'lang': 'pl' },
16 |     { 'confidence': 4, 'name': 'windows-1254', 'lang': 'tr' },
17 |     { 'confidence': 2, 'name': 'windows-1251', 'lang': 'ru' },
18 |     { 'confidence': 0, 'name': 'ASCII', 'lang': undefined },
19 |   ];
20 | 
21 |   it('has both named and default exports', () => {
22 |     expect(defaultChardet.analyse).toBe(chardet.analyse);
23 |     expect(defaultChardet.detect).toBe(chardet.detect);
24 |     expect(defaultChardet.detectFile).toBe(chardet.detectFile);
25 |     expect(defaultChardet.detectFileSync).toBe(chardet.detectFileSync);
26 |   });
27 | 
28 |   describe('#detect', () => {
29 |     it('should detect encoding', () => {
30 |       expect(chardet.detect(fs.readFileSync(path))).toBe('UTF-8');
31 |     });
32 | 
33 |     it('should not block when non-buffer supplied', () => {
34 |       const invalid = [123, '123'];
35 |       const error = 'Input must be a byte array, e.g. Buffer or Uint8Array';
36 |       // @ts-expect-error Testing invalid inputs
37 |       invalid.forEach((input) => expect(() => chardet.detect(input)).toThrow(error));
38 |     })
39 |   });
40 | 
41 |   describe('#detectFile', () => {
42 |     it('should detect encoding', async () => {
43 |       const res = await chardet.detectFile(path);
44 |       expect(res).toBe('UTF-8');
45 |     });
46 | 
47 |     it('should detect encoding with smaller sample size', async () => {
48 |       const res = await chardet.detectFile(path, { sampleSize: 32 });
49 |       expect(res).toBe('UTF-8');
50 |     });
51 | 
52 |     it('should detect encoding with smaller sample size and offset', async () => {
53 |       const res = await chardet.detectFile(path, { sampleSize: 32, offset: 64 });
54 |       expect(res).toBe('UTF-8');
55 |     });
56 | 
57 |     it('should work as expected with sampleSize larger than actual file size (1)', async () => {
58 |       const res = await chardet.detectFile(path, { sampleSize: 1024 * 1024 });
59 |       expect(res).toBe('UTF-8');
60 |     });
61 | 
62 |     it('should work as expected with sampleSize larger than actual file size (2)', async () => {
63 |       const res = await chardet.detectFile(__dirname + '/test/data/encodings/koi8r', { sampleSize: 1024 * 1024 });
64 |       expect(res).toBe('KOI8-R');
65 |     });
66 |   });
67 | 
68 |   describe('#detectFileSync', () => {
69 |     it('should detect encoding', () => {
70 |       expect(chardet.detectFileSync(path)).toBe('UTF-8');
71 |     });
72 | 
73 |     it('should detect encoding with smaller sample size', () => {
74 |       expect(chardet.detectFileSync(path, { sampleSize: 32 })).toBe('UTF-8');
75 |     });
76 | 
77 |     it('should detect encoding with smaller sample size and offset', () => {
78 |       expect(chardet.detectFileSync(path, { sampleSize: 32, offset: 64 })).toBe('UTF-8');
79 |     });
80 |   });
81 | 
82 |   describe('#analyse', () => {
83 |     it('should return a list of encodings, sorted by confidence level in descending order', () => {
84 |       const matches = chardet.analyse(fs.readFileSync(path));
85 |       expect(matches).toEqual(expectedEncodingsFromPath);
86 |     });
87 |   });
88 | });
89 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # chardet
  2 | 
_Chardet_ is a character encoding detection module written in pure JavaScript (TypeScript). The module uses occurrence analysis to determine the most probable encoding.
  4 | 
  5 | - Packed size is only **22 KB**
  6 | - Works in all environments: Node / Browser / Native
  7 | - Works on all platforms: Linux / Mac / Windows
  8 | - No dependencies
  9 | - No native code / bindings
 10 | - 100% written in TypeScript
 11 | - Extensive code coverage
 12 | 
 13 | ## Installation
 14 | 
 15 | ```
 16 | npm i chardet
 17 | ```
 18 | 
 19 | ## Usage
 20 | 
 21 | To return the encoding with the highest confidence:
 22 | 
 23 | ```javascript
 24 | import chardet from 'chardet';
 25 | 
 26 | const encoding = chardet.detect(Buffer.from('hello there!'));
 27 | // or
 28 | const encoding = await chardet.detectFile('/path/to/file');
 29 | // or
 30 | const encoding = chardet.detectFileSync('/path/to/file');
 31 | ```
 32 | 
To return the full list of possible encodings, use the `analyse` method:
 34 | 
 35 | ```javascript
 36 | import chardet from 'chardet';
 37 | chardet.analyse(Buffer.from('hello there!'));
 38 | ```
 39 | 
The returned value is an array of objects sorted by confidence value in descending order:
 41 | 
 42 | ```javascript
 43 | [
 44 |   { confidence: 90, name: 'UTF-8' },
 45 |   { confidence: 20, name: 'windows-1252', lang: 'fr' },
 46 | ];
 47 | ```
 48 | 
 49 | In browser, you can use [Uint8Array](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Uint8Array) instead of the `Buffer`:
 50 | 
 51 | ```javascript
 52 | import chardet from 'chardet';
 53 | chardet.analyse(new Uint8Array([0x68, 0x65, 0x6c, 0x6c, 0x6f]));
 54 | ```
 55 | 
 56 | ## Working with large data sets
 57 | 
Sometimes, when the data set is huge and you want to optimize performance (at the cost of some accuracy),
you can sample only the first N bytes of the file:
 60 | 
 61 | ```javascript
 62 | const encoding = await chardet.detectFile('/path/to/file', { sampleSize: 32 });
 63 | ```
 64 | 
You can also specify the offset at which to begin reading the file:
 66 | 
 67 | ```javascript
 68 | const encoding = await chardet.detectFile('/path/to/file', {
 69 |   sampleSize: 32,
 70 |   offset: 128,
 71 | });
 72 | ```
 73 | 
 74 | ## Working with strings
 75 | 
 76 | In both Node.js and browsers, all strings in memory are represented in UTF-16 encoding. This is a fundamental aspect of the JavaScript language specification. Therefore, you cannot use plain strings directly as input for `chardet.analyse()` or `chardet.detect()`. Instead, you need the original string data in the form of a Buffer or Uint8Array.
 77 | 
 78 | In other words, if you receive a piece of data over the network and want to detect its encoding, use the original data payload, not its string representation. By the time you convert data to a string, it will be in UTF-16 encoding.
 79 | 
 80 | Note on [TextEncoder](https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/TextEncoder): By default, it returns a UTF-8 encoded buffer, which means the buffer will not be in the original encoding of the string.
 81 | 
 82 | ## Supported Encodings:
 83 | 
 84 | - UTF-8
 85 | - UTF-16 LE
 86 | - UTF-16 BE
 87 | - UTF-32 LE
 88 | - UTF-32 BE
 89 | - ISO-2022-JP
 90 | - ISO-2022-KR
 91 | - ISO-2022-CN
 92 | - Shift_JIS
 93 | - Big5
 94 | - EUC-JP
 95 | - EUC-KR
 96 | - GB18030
 97 | - ISO-8859-1
 98 | - ISO-8859-2
 99 | - ISO-8859-5
100 | - ISO-8859-6
101 | - ISO-8859-7
102 | - ISO-8859-8
103 | - ISO-8859-9
104 | - windows-1250
105 | - windows-1251
106 | - windows-1252
107 | - windows-1253
108 | - windows-1254
109 | - windows-1255
110 | - windows-1256
111 | - KOI8-R
112 | 
113 | Currently only these encodings are supported.
114 | 
115 | ## TypeScript?
116 | 
117 | Yes. Type definitions are included.
118 | 
119 | ### References
120 | 
121 | - ICU project http://site.icu-project.org/
122 | 


--------------------------------------------------------------------------------
/src/encoding/unicode.ts:
--------------------------------------------------------------------------------
  1 | import type { Context, Recogniser } from '.';
  2 | import match, { type Match, type EncodingName } from '../match';
  3 | 
  4 | /**
  5 |  * This class matches UTF-16 and UTF-32, both big- and little-endian. The
  6 |  * BOM will be used if it is present.
  7 |  */
  8 | export class UTF_16BE implements Recogniser {
  9 |   name(): EncodingName {
 10 |     return 'UTF-16BE';
 11 |   }
 12 | 
 13 |   match(det: Context): Match | null {
 14 |     const input = det.rawInput;
 15 | 
 16 |     if (
 17 |       input.length >= 2 &&
 18 |       (input[0] & 0xff) == 0xfe &&
 19 |       (input[1] & 0xff) == 0xff
 20 |     ) {
 21 |       return match(det, this, 100); // confidence = 100
 22 |     }
 23 | 
 24 |     // TODO: Do some statistics to check for unsigned UTF-16BE
 25 |     return null;
 26 |   }
 27 | }
 28 | 
 29 | export class UTF_16LE implements Recogniser {
 30 |   name(): EncodingName {
 31 |     return 'UTF-16LE';
 32 |   }
 33 | 
 34 |   match(det: Context): Match | null {
 35 |     const input = det.rawInput;
 36 | 
 37 |     if (
 38 |       input.length >= 2 &&
 39 |       (input[0] & 0xff) == 0xff &&
 40 |       (input[1] & 0xff) == 0xfe
 41 |     ) {
 42 |       // LE BOM is present.
 43 |       if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) {
 44 |         // It is probably UTF-32 LE, not UTF-16
 45 |         return null;
 46 |       }
 47 |       return match(det, this, 100); // confidence = 100
 48 |     }
 49 | 
 50 |     // TODO: Do some statistics to check for unsigned UTF-16LE
 51 |     return null;
 52 |   }
 53 | }
 54 | 
/**
 * Implemented by the UTF-32 recognisers: decodes the 4-byte unit starting at
 * `index` of `input` into a single number.
 */
interface WithGetChar {
  getChar(input: Uint8Array, index: number): number;
}
 58 | 
 59 | class UTF_32 implements Recogniser, WithGetChar {
 60 |   name(): EncodingName {
 61 |     return 'UTF-32';
 62 |   }
 63 | 
 64 |   getChar(_input: Uint8Array, _index: number): number {
 65 |     return -1;
 66 |   }
 67 | 
 68 |   match(det: Context): Match | null {
 69 |     let numValid = 0,
 70 |       numInvalid = 0,
 71 |       hasBOM = false,
 72 |       confidence = 0;
 73 |     const limit = (det.rawLen / 4) * 4;
 74 |     const input = det.rawInput;
 75 | 
 76 |     if (limit == 0) {
 77 |       return null;
 78 |     }
 79 | 
 80 |     if (this.getChar(input, 0) == 0x0000feff) {
 81 |       hasBOM = true;
 82 |     }
 83 | 
 84 |     for (let i = 0; i < limit; i += 4) {
 85 |       const ch = this.getChar(input, i);
 86 | 
 87 |       if (ch < 0 || ch >= 0x10ffff || (ch >= 0xd800 && ch <= 0xdfff)) {
 88 |         numInvalid += 1;
 89 |       } else {
 90 |         numValid += 1;
 91 |       }
 92 |     }
 93 | 
 94 |     // Cook up some sort of confidence score, based on presence of a BOM
 95 |     //    and the existence of valid and/or invalid multi-byte sequences.
 96 |     if (hasBOM && numInvalid == 0) {
 97 |       confidence = 100;
 98 |     } else if (hasBOM && numValid > numInvalid * 10) {
 99 |       confidence = 80;
100 |     } else if (numValid > 3 && numInvalid == 0) {
101 |       confidence = 100;
102 |     } else if (numValid > 0 && numInvalid == 0) {
103 |       confidence = 80;
104 |     } else if (numValid > numInvalid * 10) {
105 |       // Probably corrupt UTF-32BE data.  Valid sequences aren't likely by chance.
106 |       confidence = 25;
107 |     }
108 | 
109 |     // return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
110 |     return confidence == 0 ? null : match(det, this, confidence);
111 |   }
112 | }
113 | 
/**
 * UTF-32 recogniser reading each 4-byte unit most-significant byte first.
 */
export class UTF_32BE extends UTF_32 {
  name(): EncodingName {
    return 'UTF-32BE';
  }

  /** Combine the four bytes at `index` with the first byte in the top bits. */
  getChar(input: Uint8Array, index: number) {
    const b0 = input[index + 0] & 0xff;
    const b1 = input[index + 1] & 0xff;
    const b2 = input[index + 2] & 0xff;
    const b3 = input[index + 3] & 0xff;

    return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
  }
}
127 | 
/**
 * UTF-32 recogniser reading each 4-byte unit least-significant byte first.
 */
export class UTF_32LE extends UTF_32 {
  name(): EncodingName {
    return 'UTF-32LE';
  }

  /** Combine the four bytes at `index` with the last byte in the top bits. */
  getChar(input: Uint8Array, index: number) {
    const b0 = input[index + 0] & 0xff;
    const b1 = input[index + 1] & 0xff;
    const b2 = input[index + 2] & 0xff;
    const b3 = input[index + 3] & 0xff;

    return (b3 << 24) | (b2 << 16) | (b1 << 8) | b0;
  }
}
142 | 


--------------------------------------------------------------------------------
/src/encoding/iso2022.ts:
--------------------------------------------------------------------------------
  1 | import type { Context, Recogniser } from '.';
  2 | import match, { type Match, type EncodingName } from '../match';
  3 | 
  4 | /**
  5 |  * This is a superclass for the individual detectors for
  6 |  * each of the detectable members of the ISO 2022 family
  7 |  * of encodings.
  8 |  */
  9 | 
 10 | class ISO_2022 implements Recogniser {
 11 |   escapeSequences: number[][] = [];
 12 | 
 13 |   name(): EncodingName {
 14 |     return 'ISO_2022';
 15 |   }
 16 | 
 17 |   match(det: Context): Match | null {
 18 |     /**
 19 |      * Matching function shared among the 2022 detectors JP, CN and KR
 20 |      * Counts up the number of legal an unrecognized escape sequences in
 21 |      * the sample of text, and computes a score based on the total number &
 22 |      * the proportion that fit the encoding.
 23 |      *
 24 |      *
 25 |      * @param text the byte buffer containing text to analyse
 26 |      * @param textLen  the size of the text in the byte.
 27 |      * @param escapeSequences the byte escape sequences to test for.
 28 |      * @return match quality, in the range of 0-100.
 29 |      */
 30 | 
 31 |     let i, j;
 32 |     let escN;
 33 |     let hits = 0;
 34 |     let misses = 0;
 35 |     let shifts = 0;
 36 |     let confidence;
 37 | 
 38 |     // TODO: refactor me
 39 |     const text = det.inputBytes;
 40 |     const textLen = det.inputLen;
 41 | 
 42 |     scanInput: for (i = 0; i < textLen; i++) {
 43 |       if (text[i] == 0x1b) {
 44 |         checkEscapes: for (
 45 |           escN = 0;
 46 |           escN < this.escapeSequences.length;
 47 |           escN++
 48 |         ) {
 49 |           const seq = this.escapeSequences[escN];
 50 | 
 51 |           if (textLen - i < seq.length) continue checkEscapes;
 52 | 
 53 |           for (j = 1; j < seq.length; j++)
 54 |             if (seq[j] != text[i + j]) continue checkEscapes;
 55 | 
 56 |           hits++;
 57 |           i += seq.length - 1;
 58 |           continue scanInput;
 59 |         }
 60 | 
 61 |         misses++;
 62 |       }
 63 | 
 64 |       // Shift in/out
 65 |       if (text[i] == 0x0e || text[i] == 0x0f) shifts++;
 66 |     }
 67 | 
 68 |     if (hits == 0) return null;
 69 | 
 70 |     //
 71 |     // Initial quality is based on relative proportion of recognized vs.
 72 |     //   unrecognized escape sequences.
 73 |     //   All good:  quality = 100;
 74 |     //   half or less good: quality = 0;
 75 |     //   linear in between.
 76 |     confidence = (100 * hits - 100 * misses) / (hits + misses);
 77 | 
 78 |     // Back off quality if there were too few escape sequences seen.
 79 |     //   Include shifts in this computation, so that KR does not get penalized
 80 |     //   for having only a single Escape sequence, but many shifts.
 81 |     if (hits + shifts < 5) confidence -= (5 - (hits + shifts)) * 10;
 82 | 
 83 |     return confidence <= 0 ? null : match(det, this, confidence);
 84 |   }
 85 | }
 86 | 
 87 | export class ISO_2022_JP extends ISO_2022 {
 88 |   name(): EncodingName {
 89 |     return 'ISO-2022-JP';
 90 |   }
 91 | 
 92 |   language() {
 93 |     return 'ja';
 94 |   }
 95 | 
 96 |   escapeSequences = [
 97 |     [0x1b, 0x24, 0x28, 0x43], // KS X 1001:1992
 98 |     [0x1b, 0x24, 0x28, 0x44], // JIS X 212-1990
 99 |     [0x1b, 0x24, 0x40], // JIS C 6226-1978
100 |     [0x1b, 0x24, 0x41], // GB 2312-80
101 |     [0x1b, 0x24, 0x42], // JIS X 208-1983
102 |     [0x1b, 0x26, 0x40], // JIS X 208 1990, 1997
103 |     [0x1b, 0x28, 0x42], // ASCII
104 |     [0x1b, 0x28, 0x48], // JIS-Roman
105 |     [0x1b, 0x28, 0x49], // Half-width katakana
106 |     [0x1b, 0x28, 0x4a], // JIS-Roman
107 |     [0x1b, 0x2e, 0x41], // ISO 8859-1
108 |     [0x1b, 0x2e, 0x46], // ISO 8859-7
109 |   ];
110 | }
111 | 
/**
 * ISO-2022-KR recogniser: a single designator escape sequence is scanned
 * for by the shared ISO 2022 matcher.
 */
export class ISO_2022_KR extends ISO_2022 {
  name(): EncodingName {
    return 'ISO-2022-KR';
  }

  /**
   * @return The ISO 639-1 language code for Korean ('ko'; 'kr' is the
   *         country code, not a language code).
   */
  language() {
    return 'ko';
  }

  escapeSequences = [[0x1b, 0x24, 0x29, 0x43]];
}
121 | 
/**
 * ISO-2022-CN recogniser: supplies the escape sequences scanned for by the
 * shared ISO 2022 matcher and reports Chinese as the language.
 */
export class ISO_2022_CN extends ISO_2022 {
  // Each entry starts with ESC (0x1b); the trailing comment names what the
  // sequence designates.
  escapeSequences = [
    [0x1b, 0x24, 0x29, 0x41], // GB 2312-80
    [0x1b, 0x24, 0x29, 0x47], // CNS 11643-1992 Plane 1
    [0x1b, 0x24, 0x2a, 0x48], // CNS 11643-1992 Plane 2
    [0x1b, 0x24, 0x29, 0x45], // ISO-IR-165
    [0x1b, 0x24, 0x2b, 0x49], // CNS 11643-1992 Plane 3
    [0x1b, 0x24, 0x2b, 0x4a], // CNS 11643-1992 Plane 4
    [0x1b, 0x24, 0x2b, 0x4b], // CNS 11643-1992 Plane 5
    [0x1b, 0x24, 0x2b, 0x4c], // CNS 11643-1992 Plane 6
    [0x1b, 0x24, 0x2b, 0x4d], // CNS 11643-1992 Plane 7
    [0x1b, 0x4e], // SS2
    [0x1b, 0x4f], // SS3
  ];

  name(): EncodingName {
    return 'ISO-2022-CN';
  }

  language() {
    return 'zh';
  }
}
143 | 


--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
  1 | import { Match } from './match';
  2 | import { Recogniser, Context } from './encoding';
  3 | 
  4 | import loadFs from './fs/node';
  5 | 
  6 | import Ascii from './encoding/ascii';
  7 | import Utf8 from './encoding/utf8';
  8 | import * as unicode from './encoding/unicode';
  9 | import * as mbcs from './encoding/mbcs';
 10 | import * as sbcs from './encoding/sbcs';
 11 | import * as iso2022 from './encoding/iso2022';
 12 | import { isByteArray } from './utils';
 13 | 
/**
 * Sampling options accepted by `detectFile` and `detectFileSync`.
 */
interface FullOptions {
  // Number of bytes to read from the file. When missing or 0 the whole file
  // is read instead.
  sampleSize: number;
  // Position handed to the file read call; only used together with a
  // non-zero `sampleSize`.
  offset: number;
}

// Every option is optional for callers.
export type Options = Partial<FullOptions>;
 20 | 
// All recognisers consulted by `analyse()`. Each one is asked to match every
// analysed input; the resulting matches are then sorted by confidence.
const recognisers: Recogniser[] = [
  new Utf8(),
  new unicode.UTF_16BE(),
  new unicode.UTF_16LE(),
  new unicode.UTF_32BE(),
  new unicode.UTF_32LE(),
  new mbcs.sjis(),
  new mbcs.big5(),
  new mbcs.euc_jp(),
  new mbcs.euc_kr(),
  new mbcs.gb_18030(),
  new iso2022.ISO_2022_JP(),
  new iso2022.ISO_2022_KR(),
  new iso2022.ISO_2022_CN(),
  new sbcs.ISO_8859_1(),
  new sbcs.ISO_8859_2(),
  new sbcs.ISO_8859_5(),
  new sbcs.ISO_8859_6(),
  new sbcs.ISO_8859_7(),
  new sbcs.ISO_8859_8(),
  new sbcs.ISO_8859_9(),
  new sbcs.windows_1251(),
  new sbcs.windows_1256(),
  new sbcs.KOI8_R(),
  new Ascii(),
];
 47 | 
// Result of `analyse()`: every match found, highest confidence first.
export type AnalyseResult = Match[];
// Result of the `detect*` functions: the best match's encoding name, or null
// when nothing matched.
export type DetectResult = string | null;
 50 | 
 51 | export const detect = (buffer: Uint8Array): string | null => {
 52 |   const matches: Match[] = analyse(buffer);
 53 |   return matches.length > 0 ? matches[0].name : null;
 54 | };
 55 | 
 56 | export const analyse = (buffer: Uint8Array): AnalyseResult => {
 57 |   if (!isByteArray(buffer)) {
 58 |     throw new Error('Input must be a byte array, e.g. Buffer or Uint8Array');
 59 |   }
 60 | 
 61 |   // Tally up the byte occurrence statistics.
 62 |   const byteStats = [];
 63 |   for (let i = 0; i < 256; i++) byteStats[i] = 0;
 64 | 
 65 |   for (let i = buffer.length - 1; i >= 0; i--) byteStats[buffer[i] & 0x00ff]++;
 66 | 
 67 |   let c1Bytes = false;
 68 |   for (let i = 0x80; i <= 0x9f; i += 1) {
 69 |     if (byteStats[i] !== 0) {
 70 |       c1Bytes = true;
 71 |       break;
 72 |     }
 73 |   }
 74 | 
 75 |   const context: Context = {
 76 |     byteStats,
 77 |     c1Bytes,
 78 |     rawInput: buffer,
 79 |     rawLen: buffer.length,
 80 |     inputBytes: buffer,
 81 |     inputLen: buffer.length,
 82 |   };
 83 | 
 84 |   const matches = recognisers
 85 |     .map((rec) => {
 86 |       return rec.match(context);
 87 |     })
 88 |     .filter((match) => {
 89 |       return !!match;
 90 |     })
 91 |     .sort((a, b) => {
 92 |       return b!.confidence - a!.confidence;
 93 |     });
 94 | 
 95 |   return matches as Match[];
 96 | };
 97 | 
 98 | export const detectFile = (
 99 |   filepath: string,
100 |   opts: Options = {}
101 | ): Promise<DetectResult> =>
102 |   new Promise((resolve, reject) => {
103 |     let fd: any;
104 |     const fs = loadFs();
105 | 
106 |     const handler = (err: Error | null, buffer: Buffer | null) => {
107 |       if (fd) {
108 |         fs.closeSync(fd);
109 |       }
110 | 
111 |       if (err) {
112 |         reject(err);
113 |       } else if (buffer) {
114 |         resolve(detect(buffer));
115 |       } else {
116 |         reject(new Error('No error and no buffer received'));
117 |       }
118 |     };
119 | 
120 |     const sampleSize = opts?.sampleSize || 0;
121 |     if (sampleSize > 0) {
122 |       fd = fs.openSync(filepath, 'r');
123 |       let sample = Buffer.allocUnsafe(sampleSize);
124 | 
125 |       fs.read(fd, sample, 0, sampleSize, opts.offset, (err: NodeJS.ErrnoException | null, bytesRead: number) => {
126 |         if (err) {
127 |           handler(err, null);
128 |         } else {
129 |           if (bytesRead < sampleSize) {
130 |             sample = sample.subarray(0, bytesRead);
131 |           }
132 |           handler(null, sample);
133 |         }
134 |       });
135 |       return;
136 |     }
137 | 
138 |     fs.readFile(filepath, handler);
139 |   });
140 | 
/**
 * Detect the encoding of a file synchronously.
 *
 * When `opts.sampleSize` is set (and non-zero), only that many bytes are
 * read, starting at `opts.offset`; otherwise the whole file is read.
 *
 * @param filepath Path of the file to inspect.
 * @param opts Optional sampling options.
 * @return The detected encoding name, or null when nothing matched.
 */
export const detectFileSync = (
  filepath: string,
  opts: Options = {}
): DetectResult => {
  const fs = loadFs();

  if (!(opts && opts.sampleSize)) {
    return detect(fs.readFileSync(filepath));
  }

  const sampleSize = opts.sampleSize;
  // Allocate before opening so that an allocation failure cannot leak
  // an open descriptor.
  let sample = Buffer.allocUnsafe(sampleSize);
  const fd = fs.openSync(filepath, 'r');

  try {
    const bytesRead = fs.readSync(fd, sample, 0, sampleSize, opts.offset);
    if (bytesRead < sampleSize) {
      // Drop the uninitialised tail when the file is shorter than the sample.
      sample = sample.subarray(0, bytesRead);
    }
  } finally {
    // Close the descriptor even when the read throws.
    fs.closeSync(fd);
  }

  return detect(sample);
};
161 | 
// Default export bundling the same functions that are exported by name above.
export default {
  analyse,
  detect,
  detectFileSync,
  detectFile,
};
168 | 
169 | export { Match, EncodingName } from './match';
170 | 


--------------------------------------------------------------------------------
/src/encoding/mbcs.ts:
--------------------------------------------------------------------------------
  1 | import type { Context, Recogniser } from '.';
  2 | import match, { type Match, type EncodingName } from '../match';
  3 | 
  4 | /**
  5 |  * Binary search implementation (recursive)
  6 |  */
  7 | function binarySearch(arr: number[], searchValue: number) {
  8 |   const find = (
  9 |     arr: number[],
 10 |     searchValue: number,
 11 |     left: number,
 12 |     right: number,
 13 |   ): number => {
 14 |     if (right < left) return -1;
 15 | 
 16 |     /*
 17 |     int mid = mid = (left + right) / 2;
 18 |     There is a bug in the above line;
 19 |     Joshua Bloch suggests the following replacement:
 20 |     */
 21 |     const mid = Math.floor((left + right) >>> 1);
 22 |     if (searchValue > arr[mid]) return find(arr, searchValue, mid + 1, right);
 23 | 
 24 |     if (searchValue < arr[mid]) return find(arr, searchValue, left, mid - 1);
 25 | 
 26 |     return mid;
 27 |   };
 28 | 
 29 |   return find(arr, searchValue, 0, arr.length - 1);
 30 | }
 31 | 
 32 | // 'Character'  iterated character class.
 33 | //    Recognizers for specific mbcs encodings make their 'characters' available
 34 | //    by providing a nextChar() function that fills in an instance of iteratedChar
 35 | //    with the next char from the input.
 36 | //    The returned characters are not converted to Unicode, but remain as the raw
 37 | //    bytes (concatenated into an int) from the codepage data.
 38 | //
 39 | //  For Asian charsets, use the raw input rather than the input that has been
 40 | //   stripped of markup.  Detection only considers multi-byte chars, effectively
 41 | //   stripping markup anyway, and double byte chars do occur in markup too.
 42 | //
 43 | class IteratedChar {
 44 |   charValue: number; // 1-4 bytes from the raw input data
 45 |   index: number;
 46 |   nextIndex: number;
 47 |   error: boolean;
 48 |   done: boolean;
 49 | 
 50 |   constructor() {
 51 |     this.charValue = 0; // 1-4 bytes from the raw input data
 52 |     this.index = 0;
 53 |     this.nextIndex = 0;
 54 |     this.error = false;
 55 |     this.done = false;
 56 |   }
 57 | 
 58 |   reset() {
 59 |     this.charValue = 0;
 60 |     this.index = -1;
 61 |     this.nextIndex = 0;
 62 |     this.error = false;
 63 |     this.done = false;
 64 |   }
 65 | 
 66 |   nextByte(det: Context) {
 67 |     if (this.nextIndex >= det.rawLen) {
 68 |       this.done = true;
 69 |       return -1;
 70 |     }
 71 |     const byteValue = det.rawInput[this.nextIndex++] & 0x00ff;
 72 |     return byteValue;
 73 |   }
 74 | }
 75 | 
 76 | /**
 77 |  * Asian double or multi-byte - charsets.
 78 |  * Match is determined mostly by the input data adhering to the
 79 |  * encoding scheme for the charset, and, optionally,
 80 |  * frequency-of-occurrence of characters.
 81 |  */
 82 | 
 83 | class mbcs implements Recogniser {
 84 |   commonChars: number[] = [];
 85 | 
 86 |   name(): EncodingName {
 87 |     return 'mbcs';
 88 |   }
 89 | 
 90 |   /**
 91 |    * Test the match of this charset with the input text data
 92 |    *      which is obtained via the CharsetDetector object.
 93 |    *
 94 |    * @param det  The CharsetDetector, which contains the input text
 95 |    *             to be checked for being in this charset.
 96 |    * @return     Two values packed into one int  (Damn java, anyhow)
 97 |    *             bits 0-7:  the match confidence, ranging from 0-100
 98 |    *             bits 8-15: The match reason, an enum-like value.
 99 |    */
100 |   match(det: Context): Match | null {
101 |     let doubleByteCharCount = 0,
102 |       commonCharCount = 0,
103 |       badCharCount = 0,
104 |       totalCharCount = 0,
105 |       confidence = 0;
106 | 
107 |     const iter = new IteratedChar();
108 | 
109 |     detectBlock: {
110 |       for (iter.reset(); this.nextChar(iter, det); ) {
111 |         totalCharCount++;
112 |         if (iter.error) {
113 |           badCharCount++;
114 |         } else {
115 |           const cv = iter.charValue & 0xffffffff;
116 | 
117 |           if (cv > 0xff) {
118 |             doubleByteCharCount++;
119 |             if (this.commonChars != null) {
120 |               // NOTE: This assumes that there are no 4-byte common chars.
121 |               if (binarySearch(this.commonChars, cv) >= 0) {
122 |                 commonCharCount++;
123 |               }
124 |             }
125 |           }
126 |         }
127 |         if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
128 |           // console.log('its here!')
129 |           // Bail out early if the byte data is not matching the encoding scheme.
130 |           break detectBlock;
131 |         }
132 |       }
133 | 
134 |       if (doubleByteCharCount <= 10 && badCharCount == 0) {
135 |         // Not many multi-byte chars.
136 |         if (doubleByteCharCount == 0 && totalCharCount < 10) {
137 |           // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
138 |           // We don't have enough data to have any confidence.
139 |           // Statistical analysis of single byte non-ASCII characters would probably help here.
140 |           confidence = 0;
141 |         } else {
142 |           //   ASCII or ISO file?  It's probably not our encoding,
143 |           //   but is not incompatible with our encoding, so don't give it a zero.
144 |           confidence = 10;
145 |         }
146 |         break detectBlock;
147 |       }
148 | 
149 |       //
150 |       //  No match if there are too many characters that don't fit the encoding scheme.
151 |       //    (should we have zero tolerance for these?)
152 |       //
153 |       if (doubleByteCharCount < 20 * badCharCount) {
154 |         confidence = 0;
155 |         break detectBlock;
156 |       }
157 | 
158 |       if (this.commonChars == null) {
159 |         // We have no statistics on frequently occurring characters.
160 |         //  Assess confidence purely on having a reasonable number of
161 |         //  multi-byte characters (the more the better
162 |         confidence = 30 + doubleByteCharCount - 20 * badCharCount;
163 |         if (confidence > 100) {
164 |           confidence = 100;
165 |         }
166 |       } else {
167 |         // Frequency of occurrence statistics exist.
168 |         const maxVal = Math.log(doubleByteCharCount / 4);
169 |         const scaleFactor = 90.0 / maxVal;
170 |         confidence = Math.floor(
171 |           Math.log(commonCharCount + 1) * scaleFactor + 10,
172 |         );
173 |         confidence = Math.min(confidence, 100);
174 |       }
175 |     } // end of detectBlock:
176 | 
177 |     return confidence == 0 ? null : match(det, this, confidence);
178 |   }
179 | 
180 |   /**
181 |    * Get the next character (however many bytes it is) from the input data
182 |    *    Subclasses for specific charset encodings must implement this function
183 |    *    to get characters according to the rules of their encoding scheme.
184 |    *
185 |    *  This function is not a method of class iteratedChar only because
186 |    *   that would require a lot of extra derived classes, which is awkward.
187 |    * @param it  The iteratedChar 'struct' into which the returned char is placed.
188 |    * @param det The charset detector, which is needed to get at the input byte data
189 |    *            being iterated over.
190 |    * @return    True if a character was returned, false at end of input.
191 |    */
192 |   nextChar(_iter: IteratedChar, _det: Context): boolean {
193 |     return true;
194 |   }
195 | }
196 | 
/**
 * Shift_JIS charset recognizer.
 */
export class sjis extends mbcs {
  name(): EncodingName {
    return 'Shift_JIS';
  }

  language() {
    return 'ja';
  }

  // TODO:  This set of data comes from the character frequency-
  //        of-occurrence analysis tool.  The data needs to be moved
  //        into a resource and loaded from there.
  commonChars = [
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176,
    0x82a0, 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1,
    0x82b3, 0x82b5, 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6,
    0x82c8, 0x82c9, 0x82cc, 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9,
    0x82ea, 0x82f0, 0x82f1, 0x8341, 0x8343, 0x834e, 0x834f, 0x8358, 0x835e,
    0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 0x838a, 0x838b, 0x838d, 0x8393,
    0x8e96, 0x93fa, 0x95aa,
  ];

  /**
   * Read the next one- or two-byte character into `iter`.
   *
   * @return False at end of input (including input that ends after a lead
   *         byte), true otherwise; `iter.error` is set when the second
   *         byte is below 0x40.
   */
  nextChar(iter: IteratedChar, det: Context) {
    iter.index = iter.nextIndex;
    iter.error = false;

    const lead = iter.nextByte(det);
    iter.charValue = lead;
    if (lead < 0) {
      return false;
    }

    // Bytes up to 0x7f and in the range 0xa1-0xdf stand alone.
    const isSingleByte = lead <= 0x7f || (lead > 0xa0 && lead <= 0xdf);
    if (isSingleByte) {
      return true;
    }

    const trail = iter.nextByte(det);
    if (trail < 0) {
      return false;
    }

    iter.charValue = (lead << 8) | trail;

    const isLegalTrail =
      (trail >= 0x40 && trail <= 0x7f) || (trail >= 0x80 && trail <= 0xff);
    if (!isLegalTrail) {
      // Illegal second byte value.
      iter.error = true;
    }
    return true;
  }
}
248 | 
249 | /**
250 |  *   Big5 charset recognizer: reports name 'Big5' and language 'zh', and reads one- or two-byte characters.
251 |  */
252 | export class big5 extends mbcs {
253 |   name(): EncodingName {
254 |     return 'Big5';
255 |   }
256 | 
257 |   language() {
258 |     return 'zh';
259 |   }
260 |   // TODO:  This set of data comes from the character frequency-
261 |   //        of-occurrence analysis tool.  The data needs to be moved
262 |   //        into a resource and loaded from there.
263 |   commonChars = [ // two-byte values packed as (lead byte << 8) | trail byte, the same form nextChar stores in iter.charValue
264 |     0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440,
265 |     0xa446, 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c,
266 |     0xa477, 0xa4a3, 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8,
267 |     0xa4fd, 0xa540, 0xa548, 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661,
268 |     0xa662, 0xa668, 0xa670, 0xa6a8, 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6,
269 |     0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1,
270 |     0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 0xaa6b, 0xaaba, 0xaabe,
271 |     0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 0xaec9, 0xafe0,
272 |     0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 0xb5a5,
273 |     0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
274 |     0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f,
275 |   ];
276 | 
277 |   nextChar(iter: IteratedChar, det: Context) { // returns false when the input runs out, true otherwise; iter.error flags a bad trail byte
278 |     iter.index = iter.nextIndex; // record iter.nextIndex as the index of this character
279 |     iter.error = false;
280 | 
281 |     const firstByte = (iter.charValue = iter.nextByte(det));
282 | 
283 |     if (firstByte < 0) return false; // a negative value from nextByte means no byte was available
284 | 
285 |     // single byte character: 0x00-0x7f, or the lone value 0xff.
286 |     if (firstByte <= 0x7f || firstByte == 0xff) return true;
287 | 
288 |     const secondByte = iter.nextByte(det);
289 | 
290 |     if (secondByte < 0) return false; // input ended after the lead byte
291 | 
292 |     iter.charValue = (iter.charValue << 8) | secondByte; // pack lead and trail byte into one value
293 | 
294 |     if (secondByte < 0x40 || secondByte == 0x7f || secondByte == 0xff) // trail bytes below 0x40, 0x7f and 0xff are rejected
295 |       iter.error = true;
296 | 
297 |     return true; // the character is still returned when iter.error is set
298 |   }
299 | }
300 | 
301 | /**
302 |  *  EUC charset recognizers.  One shared function that provides the common logic
303 |  *  for getting the next character according to the EUC encoding scheme;
304 |  *  the euc_jp and euc_kr classes below install it as their nextChar.
305 |  *
306 |  *  Get the next character value for EUC based encodings.
307 |  *  Character 'value' is simply the raw bytes that make up the character
308 |  *     packed into an int.  Returns true as long as iter.done is false.
309 |  */
310 | function eucNextChar(iter: IteratedChar, det: Context) {
311 |   iter.index = iter.nextIndex; // record iter.nextIndex as the index of this character
312 |   iter.error = false;
313 |   let firstByte = 0;
314 |   let secondByte = 0;
315 |   let thirdByte = 0;
316 |   // No fourthByte: the four-byte EUC-TW form is not decoded (see Code Set 2 below).
317 |   buildChar: {
318 |     firstByte = iter.charValue = iter.nextByte(det);
319 |     if (firstByte < 0) {
320 |       // Ran off the end of the input data
321 |       iter.done = true;
322 |       break buildChar;
323 |     }
324 |     if (firstByte <= 0x8d) {
325 |       // single byte char
326 |       break buildChar;
327 |     }
328 |     secondByte = iter.nextByte(det); // NOTE(review): a negative end-of-input value is not tested separately here - presumably nextByte has already set iter.done; confirm
329 |     iter.charValue = (iter.charValue << 8) | secondByte;
330 |     if (firstByte >= 0xa1 && firstByte <= 0xfe) {
331 |       // Two byte Char; the trail byte must be 0xa1 or above.
332 |       if (secondByte < 0xa1) {
333 |         iter.error = true;
334 |       }
335 |       break buildChar;
336 |     }
337 |     if (firstByte == 0x8e) {
338 |       // Code Set 2.
339 |       //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
340 |       //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
341 |       // We don't know which we've got.
342 |       // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
343 |       //   bytes will look like a well formed 2 byte char.
344 |       if (secondByte < 0xa1) {
345 |         iter.error = true;
346 |       }
347 |       break buildChar;
348 |     }
349 |     if (firstByte == 0x8f) {
350 |       // Code set 3.
351 |       // Three byte total char size, two bytes of actual char value.
352 |       thirdByte = iter.nextByte(det);
353 |       iter.charValue = (iter.charValue << 8) | thirdByte;
354 |       if (thirdByte < 0xa1) { // only the third byte is range-checked in this branch
355 |         iter.error = true;
356 |       }
357 |     }
358 |   } // lead bytes that match none of the branches above (0x90-0xa0, or above 0xfe) end up here as two-byte values with no validity check
359 |   return iter.done == false;
360 | }
361 | 
362 | /**
363 |  * The charset recognizer for EUC-JP (language 'ja').  Characters are read
364 |  *    with the shared eucNextChar function, assigned below as nextChar.
365 |  */
366 | export class euc_jp extends mbcs {
367 |   name(): EncodingName {
368 |     return 'EUC-JP';
369 |   }
370 | 
371 |   language() {
372 |     return 'ja';
373 |   }
374 | 
375 |   // TODO:  This set of data comes from the character frequency-
376 |   //        of-occurrence analysis tool.  The data needs to be moved
377 |   //        into a resource and loaded from there.
378 |   commonChars = [ // two-byte values packed as (first byte << 8) | second byte, the form eucNextChar stores in iter.charValue
379 |     0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7,
380 |     0xa4a2, 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af,
381 |     0xa4b1, 0xa4b3, 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0,
382 |     0xa4c1, 0xa4c3, 0xa4c4, 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb,
383 |     0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8,
384 |     0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3,
385 |     0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 0xa5b0, 0xa5b3, 0xa5b5,
386 |     0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 0xa5c8, 0xa5c9,
387 |     0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 0xa5e5,
388 |     0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
389 |     0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc,
390 |     0xcdd1,
391 |   ];
392 | 
393 |   nextChar = eucNextChar; // instance property holding the shared EUC reader
394 | }
395 | 
396 | /**
397 |  * The charset recognizer for EUC-KR (language 'ko').  Characters are read
398 |  *    with the shared eucNextChar function, assigned below as nextChar.
399 |  */
400 | export class euc_kr extends mbcs {
401 |   name(): EncodingName {
402 |     return 'EUC-KR';
403 |   }
404 | 
405 |   language() {
406 |     return 'ko';
407 |   }
408 | 
409 |   // TODO:  This set of data comes from the character frequency-
410 |   //        of-occurrence analysis tool.  The data needs to be moved
411 |   //        into a resource and loaded from there.
412 |   commonChars = [ // two-byte values packed as (first byte << 8) | second byte, the form eucNextChar stores in iter.charValue
413 |     0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa,
414 |     0xb0fc, 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2,
415 |     0xb4cf, 0xb4d9, 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3,
416 |     0xb7af, 0xb7c2, 0xb7ce, 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9,
417 |     0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1,
418 |     0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0,
419 |     0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 0xbef8, 0xbefa, 0xbfa1,
420 |     0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 0xc0af, 0xc0b8,
421 |     0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 0xc0da,
422 |     0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
423 |     0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5,
424 |     0xc8ad,
425 |   ];
426 | 
427 |   nextChar = eucNextChar; // instance property holding the shared EUC reader
428 | }
429 | 
430 | /**
431 |  *   GB-18030 recognizer (name 'GB18030', language 'zh'). Uses simplified Chinese statistics.
432 |  */
433 | export class gb_18030 extends mbcs {
434 |   name(): EncodingName {
435 |     return 'GB18030';
436 |   }
437 | 
438 |   language() {
439 |     return 'zh';
440 |   }
441 | 
442 |   /*
443 |    *  Get the next character value for GB18030 input (one, two or four bytes).
444 |    *  Character 'value' is simply the raw bytes that make up the character
445 |    *     packed into an int.  Returns true as long as iter.done is false.
446 |    */
447 | 
448 |   nextChar(iter: IteratedChar, det: Context) {
449 |     iter.index = iter.nextIndex; // record iter.nextIndex as the index of this character
450 |     iter.error = false;
451 |     let firstByte = 0;
452 |     let secondByte = 0;
453 |     let thirdByte = 0;
454 |     let fourthByte = 0;
455 |     buildChar: {
456 |       firstByte = iter.charValue = iter.nextByte(det);
457 |       if (firstByte < 0) {
458 |         // Ran off the end of the input data
459 |         iter.done = true;
460 |         break buildChar;
461 |       }
462 |       if (firstByte <= 0x80) {
463 |         // single byte char
464 |         break buildChar;
465 |       }
466 |       secondByte = iter.nextByte(det);
467 |       iter.charValue = (iter.charValue << 8) | secondByte;
468 |       if (firstByte >= 0x81 && firstByte <= 0xfe) {
469 |         // Two byte char: the trail byte is 0x40-0x7e or 0x80-0xfe (0x7f is not a legal trail byte).
470 |         if (
471 |           (secondByte >= 0x40 && secondByte <= 0x7e) ||
472 |           (secondByte >= 0x80 && secondByte <= 0xfe)
473 |         ) {
474 |           break buildChar;
475 |         }
476 |         // Four byte char: digit (0x30-0x39), then 0x81-0xfe, then another digit.
477 |         if (secondByte >= 0x30 && secondByte <= 0x39) {
478 |           thirdByte = iter.nextByte(det);
479 |           if (thirdByte >= 0x81 && thirdByte <= 0xfe) {
480 |             fourthByte = iter.nextByte(det);
481 |             if (fourthByte >= 0x30 && fourthByte <= 0x39) {
482 |               iter.charValue =
483 |                 (iter.charValue << 16) | (thirdByte << 8) | fourthByte; // NOTE(review): << yields a signed 32-bit int, so with a lead byte >= 0x81 this value is negative - confirm the caller copes
484 |               break buildChar;
485 |             }
486 |           }
487 |         }
488 |         iter.error = true; // neither a valid two-byte nor a valid four-byte sequence
489 |         break buildChar;
490 |       } // a lead byte above 0xfe skips this branch: two bytes are consumed and error stays false
491 |     }
492 |     return iter.done == false;
493 |   }
494 | 
495 |   // TODO:  This set of data comes from the character frequency-
496 |   //        of-occurrence analysis tool.  The data needs to be moved
497 |   //        into a resource and loaded from there.
498 |   commonChars = [ // two-byte values packed as (lead byte << 8) | trail byte, the form nextChar stores in iter.charValue
499 |     0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1,
500 |     0xa3ac, 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3,
501 |     0xb5bd, 0xb5c4, 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd,
502 |     0xb7d6, 0xb7dd, 0xb8b4, 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa,
503 |     0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe,
504 |     0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb,
505 |     0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 0xc7f8, 0xc8ab, 0xc8cb,
506 |     0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 0xcad0, 0xcad6,
507 |     0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 0xcfb5,
508 |     0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
509 |     0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2,
510 |     0xd6d0,
511 |   ];
512 | }
513 | 


--------------------------------------------------------------------------------
/src/encoding/sbcs.ts:
--------------------------------------------------------------------------------
  1 | import type { Context, Recogniser } from '.';
  2 | import match, { type EncodingName, type Match } from '../match';
  3 | 
  4 | /**
  5 |  * This class recognizes single-byte encodings. Because the encoding scheme is so
  6 |  * simple, language statistics are used to do the matching.
  7 |  */
  8 | 
  9 | const N_GRAM_MASK = 0xffffff; // keeps the low 24 bits, i.e. the three most recent bytes of an n-gram
 10 | 
 11 | class NGramParser { // scans the input once and counts how many of its three-byte n-grams occur in ngramList
 12 |   byteIndex: number = 0; // index of the next input byte to read
 13 |   ngram: number = 0; // most recent n-gram, masked with N_GRAM_MASK
 14 | 
 15 |   ngramCount: number = 0; // n-grams looked up so far
 16 |   hitCount: number = 0; // lookups that were found in ngramList
 17 | 
 18 |   ngramList: number[];
 19 |   byteMap: number[];
 20 | 
 21 |   // TODO: is it safe to set it like this?
 22 |   spaceChar: number = 0x20;
 23 | 
 24 |   constructor(theNgramList: number[], theByteMap: number[]) {
 25 |     this.ngramList = theNgramList;
 26 |     this.byteMap = theByteMap;
 27 |   }
 28 | 
 29 |   /*
 30 |    * Binary search for value in table, which must have exactly 64 entries. Returns the index of value, or -1 when it is not found.
 31 |    */
 32 |   search(table: number[], value: number) {
 33 |     let index = 0;
 34 | 
 35 |     if (table[index + 32] <= value) index += 32; // halving steps 32, 16, ... 1 narrow the search to one slot
 36 |     if (table[index + 16] <= value) index += 16;
 37 |     if (table[index + 8] <= value) index += 8;
 38 |     if (table[index + 4] <= value) index += 4;
 39 |     if (table[index + 2] <= value) index += 2;
 40 |     if (table[index + 1] <= value) index += 1;
 41 |     if (table[index] > value) index -= 1; // can only happen while index is still 0, giving -1
 42 | 
 43 |     if (index < 0 || table[index] != value) return -1;
 44 | 
 45 |     return index;
 46 |   }
 47 | 
 48 |   lookup(thisNgram: number) { // counts the n-gram and, when it is in ngramList, counts a hit
 49 |     this.ngramCount += 1;
 50 |     if (this.search(this.ngramList, thisNgram) >= 0) {
 51 |       this.hitCount += 1;
 52 |     }
 53 |   }
 54 | 
 55 |   addByte(b: number) { // shifts b into the rolling n-gram and looks the result up
 56 |     this.ngram = ((this.ngram << 8) + (b & 0xff)) & N_GRAM_MASK;
 57 |     this.lookup(this.ngram);
 58 |   }
 59 | 
 60 |   nextByte(det: Context) { // returns the next input byte (0-255), or -1 once byteIndex reaches det.inputLen
 61 |     if (this.byteIndex >= det.inputLen) return -1;
 62 | 
 63 |     return det.inputBytes[this.byteIndex++] & 0xff;
 64 |   }
 65 | 
 66 |   parse(det: Context, spaceCh: number) { // consumes the remaining input and returns a confidence between 0 and 99
 67 |     let b,
 68 |       ignoreSpace = false; // true right after a space was seen, so runs of spaces collapse into one
 69 |     this.spaceChar = spaceCh;
 70 | 
 71 |     while ((b = this.nextByte(det)) >= 0) {
 72 |       const mb = this.byteMap[b]; // mapped byte; a mapping of 0 means the byte is skipped
 73 | 
 74 |       // TODO: 0x20 might not be a space in all character sets...
 75 |       if (mb != 0) {
 76 |         if (!(mb == this.spaceChar && ignoreSpace)) {
 77 |           this.addByte(mb);
 78 |         }
 79 | 
 80 |         ignoreSpace = mb == this.spaceChar;
 81 |       }
 82 |     }
 83 | 
 84 |     // TODO: Is this OK? The buffer could have ended in the middle of a word...
 85 |     this.addByte(this.spaceChar); // this lookup also guarantees ngramCount is at least 1 for the division below
 86 | 
 87 |     const rawPercent = this.hitCount / this.ngramCount; // a fraction in 0..1 despite the name
 88 | 
 89 |     // TODO - This is a bit of a hack to take care of a case
 90 |     // were we were getting a confidence of 135...
 91 |     if (rawPercent > 0.33) return 98;
 92 | 
 93 |     return Math.floor(rawPercent * 300.0);
 94 |   }
 95 | }
 96 | 
 97 | class NGramsPlusLang { // pairs a language code with the n-gram list used to recognise it
 98 |   fLang: string; // language code, e.g. 'da' or 'de' in the tables below
 99 |   fNGrams: number[]; // n-gram values for that language
100 | 
101 |   constructor(la: string, ng: number[]) {
102 |     this.fLang = la;
103 |     this.fNGrams = ng;
104 |   }
105 | }
106 | 
107 | const isFlatNgrams = (val: NGramsPlusLang[] | number[]): val is number[] => // type guard: true when the first element is a finite number
108 |   Array.isArray(val) && isFinite(val[0] as number); // an empty array yields false because isFinite(undefined) is false
109 | 
110 | class sbcs implements Recogniser { // base recogniser for single-byte charsets; subclasses supply ngrams(), byteMap() and name()
111 |   spaceChar = 0x20;
112 | 
113 |   private nGramLang?: string = undefined; // language of the best-scoring n-gram set from the latest match() call
114 | 
115 |   ngrams(): NGramsPlusLang[] | number[] { // default: no n-gram data
116 |     return [];
117 |   }
118 | 
119 |   byteMap(): number[] { // default: empty byte map
120 |     return [];
121 |   }
122 | 
123 |   name(_input: Context): EncodingName { // placeholder name; the subclasses shown below override it
124 |     return 'sbcs';
125 |   }
126 | 
127 |   language(): string | undefined { // language picked by the latest match() call, or undefined
128 |     return this.nGramLang;
129 |   }
130 | 
131 |   match(det: Context): Match | null { // returns a Match built from the best confidence, or null when that confidence is not above 0
132 |     // This feels a bit dirty. Simpler alternative would be
133 |     // splitting classes ISO_8859_1 etc into language-specific ones
134 |     // with hardcoded languages like ISO_8859_9.
135 |     this.nGramLang = undefined;
136 | 
137 |     const ngrams = this.ngrams();
138 | 
139 |     if (isFlatNgrams(ngrams)) { // a single flat n-gram list: one parse, nGramLang stays undefined
140 |       const parser = new NGramParser(ngrams, this.byteMap());
141 |       const confidence = parser.parse(det, this.spaceChar);
142 |       return confidence <= 0 ? null : match(det, this, confidence);
143 |     }
144 | 
145 |     let bestConfidence = -1;
146 | 
147 |     for (let i = ngrams.length - 1; i >= 0; i--) { // walks the language sets last-to-first; only a strictly higher confidence replaces the best
148 |       const ngl = ngrams[i];
149 | 
150 |       const parser = new NGramParser(ngl.fNGrams, this.byteMap()); // NOTE(review): byteMap() is called again for every language set
151 |       const confidence = parser.parse(det, this.spaceChar);
152 |       if (confidence > bestConfidence) {
153 |         bestConfidence = confidence;
154 |         this.nGramLang = ngl.fLang;
155 |       }
156 |     }
157 | 
158 |     return bestConfidence <= 0 ? null : match(det, this, bestConfidence);
159 |   }
160 | }
161 | 
162 | export class ISO_8859_1 extends sbcs { // recogniser with n-gram sets for da, de, en, es, fr, it, nl, no, pt and sv
163 |   byteMap() { // 256-entry table indexed by input byte: A-Z fold to a-z, the apostrophe (0x27) maps to 0x00, other ASCII non-letters map to 0x20
164 |     return [
165 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
166 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
167 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
168 |       0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
169 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
170 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
171 |       0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
172 |       0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20,
173 |       0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
174 |       0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
175 |       0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
176 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
177 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
178 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
179 |       0x20, 0x20, 0xaa, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
180 |       0x20, 0xb5, 0x20, 0x20, 0x20, 0x20, 0xba, 0x20, 0x20, 0x20, 0x20, 0x20,
181 |       0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb,
182 |       0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x20,
183 |       0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3,
184 |       0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
185 |       0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x20, 0xf8, 0xf9, 0xfa, 0xfb,
186 |       0xfc, 0xfd, 0xfe, 0xff,
187 |     ];
188 |   }
189 | 
190 |   ngrams() { // one NGramsPlusLang per language, each with 64 n-gram values (the table size NGramParser.search expects)
191 |     return [
192 |       new NGramsPlusLang(
193 |         'da',
194 |         [
195 |           0x206166, 0x206174, 0x206465, 0x20656e, 0x206572, 0x20666f, 0x206861,
196 |           0x206920, 0x206d65, 0x206f67, 0x2070e5, 0x207369, 0x207374, 0x207469,
197 |           0x207669, 0x616620, 0x616e20, 0x616e64, 0x617220, 0x617420, 0x646520,
198 |           0x64656e, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656e20,
199 |           0x656e64, 0x657220, 0x657265, 0x657320, 0x657420, 0x666f72, 0x676520,
200 |           0x67656e, 0x676572, 0x696765, 0x696c20, 0x696e67, 0x6b6520, 0x6b6b65,
201 |           0x6c6572, 0x6c6967, 0x6c6c65, 0x6d6564, 0x6e6465, 0x6e6520, 0x6e6720,
202 |           0x6e6765, 0x6f6720, 0x6f6d20, 0x6f7220, 0x70e520, 0x722064, 0x722065,
203 |           0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696c,
204 |           0x766572,
205 |         ],
206 |       ),
207 |       new NGramsPlusLang(
208 |         'de',
209 |         [
210 |           0x20616e, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569,
211 |           0x206765, 0x206861, 0x20696e, 0x206d69, 0x207363, 0x207365, 0x20756e,
212 |           0x207665, 0x20766f, 0x207765, 0x207a75, 0x626572, 0x636820, 0x636865,
213 |           0x636874, 0x646173, 0x64656e, 0x646572, 0x646965, 0x652064, 0x652073,
214 |           0x65696e, 0x656974, 0x656e20, 0x657220, 0x657320, 0x67656e, 0x68656e,
215 |           0x687420, 0x696368, 0x696520, 0x696e20, 0x696e65, 0x697420, 0x6c6963,
216 |           0x6c6c65, 0x6e2061, 0x6e2064, 0x6e2073, 0x6e6420, 0x6e6465, 0x6e6520,
217 |           0x6e6720, 0x6e6765, 0x6e7465, 0x722064, 0x726465, 0x726569, 0x736368,
218 |           0x737465, 0x742064, 0x746520, 0x74656e, 0x746572, 0x756e64, 0x756e67,
219 |           0x766572,
220 |         ],
221 |       ),
222 |       new NGramsPlusLang(
223 |         'en',
224 |         [
225 |           0x206120, 0x20616e, 0x206265, 0x20636f, 0x20666f, 0x206861, 0x206865,
226 |           0x20696e, 0x206d61, 0x206f66, 0x207072, 0x207265, 0x207361, 0x207374,
227 |           0x207468, 0x20746f, 0x207768, 0x616964, 0x616c20, 0x616e20, 0x616e64,
228 |           0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061,
229 |           0x652073, 0x652074, 0x656420, 0x656e74, 0x657220, 0x657320, 0x666f72,
230 |           0x686174, 0x686520, 0x686572, 0x696420, 0x696e20, 0x696e67, 0x696f6e,
231 |           0x697320, 0x6e2061, 0x6e2074, 0x6e6420, 0x6e6720, 0x6e7420, 0x6f6620,
232 |           0x6f6e20, 0x6f7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169,
233 |           0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696f, 0x746f20,
234 |           0x747320,
235 |         ],
236 |       ),
237 |       new NGramsPlusLang(
238 |         'es',
239 |         [
240 |           0x206120, 0x206361, 0x20636f, 0x206465, 0x20656c, 0x20656e, 0x206573,
241 |           0x20696e, 0x206c61, 0x206c6f, 0x207061, 0x20706f, 0x207072, 0x207175,
242 |           0x207265, 0x207365, 0x20756e, 0x207920, 0x612063, 0x612064, 0x612065,
243 |           0x61206c, 0x612070, 0x616369, 0x61646f, 0x616c20, 0x617220, 0x617320,
244 |           0x6369f3, 0x636f6e, 0x646520, 0x64656c, 0x646f20, 0x652064, 0x652065,
245 |           0x65206c, 0x656c20, 0x656e20, 0x656e74, 0x657320, 0x657374, 0x69656e,
246 |           0x69f36e, 0x6c6120, 0x6c6f73, 0x6e2065, 0x6e7465, 0x6f2064, 0x6f2065,
247 |           0x6f6e20, 0x6f7220, 0x6f7320, 0x706172, 0x717565, 0x726120, 0x726573,
248 |           0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746f20, 0x756520,
249 |           0xf36e20,
250 |         ],
251 |       ),
252 |       new NGramsPlusLang(
253 |         'fr',
254 |         [
255 |           0x206175, 0x20636f, 0x206461, 0x206465, 0x206475, 0x20656e, 0x206574,
256 |           0x206c61, 0x206c65, 0x207061, 0x20706f, 0x207072, 0x207175, 0x207365,
257 |           0x20736f, 0x20756e, 0x20e020, 0x616e74, 0x617469, 0x636520, 0x636f6e,
258 |           0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065,
259 |           0x65206c, 0x652070, 0x652073, 0x656e20, 0x656e74, 0x657220, 0x657320,
260 |           0x657420, 0x657572, 0x696f6e, 0x697320, 0x697420, 0x6c6120, 0x6c6520,
261 |           0x6c6573, 0x6d656e, 0x6e2064, 0x6e6520, 0x6e7320, 0x6e7420, 0x6f6e20,
262 |           0x6f6e74, 0x6f7572, 0x717565, 0x72206c, 0x726520, 0x732061, 0x732064,
263 |           0x732065, 0x73206c, 0x732070, 0x742064, 0x746520, 0x74696f, 0x756520,
264 |           0x757220,
265 |         ],
266 |       ),
267 |       new NGramsPlusLang(
268 |         'it',
269 |         [
270 |           0x20616c, 0x206368, 0x20636f, 0x206465, 0x206469, 0x206520, 0x20696c,
271 |           0x20696e, 0x206c61, 0x207065, 0x207072, 0x20756e, 0x612063, 0x612064,
272 |           0x612070, 0x612073, 0x61746f, 0x636865, 0x636f6e, 0x64656c, 0x646920,
273 |           0x652061, 0x652063, 0x652064, 0x652069, 0x65206c, 0x652070, 0x652073,
274 |           0x656c20, 0x656c6c, 0x656e74, 0x657220, 0x686520, 0x692061, 0x692063,
275 |           0x692064, 0x692073, 0x696120, 0x696c20, 0x696e20, 0x696f6e, 0x6c6120,
276 |           0x6c6520, 0x6c6920, 0x6c6c61, 0x6e6520, 0x6e6920, 0x6e6f20, 0x6e7465,
277 |           0x6f2061, 0x6f2064, 0x6f2069, 0x6f2073, 0x6f6e20, 0x6f6e65, 0x706572,
278 |           0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746f20,
279 |           0x7a696f,
280 |         ],
281 |       ),
282 |       new NGramsPlusLang(
283 |         'nl',
284 |         [
285 |           0x20616c, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656e,
286 |           0x206765, 0x206865, 0x20696e, 0x206d61, 0x206d65, 0x206f70, 0x207465,
287 |           0x207661, 0x207665, 0x20766f, 0x207765, 0x207a69, 0x61616e, 0x616172,
288 |           0x616e20, 0x616e64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656e,
289 |           0x646572, 0x652062, 0x652076, 0x65656e, 0x656572, 0x656e20, 0x657220,
290 |           0x657273, 0x657420, 0x67656e, 0x686574, 0x696520, 0x696e20, 0x696e67,
291 |           0x697320, 0x6e2062, 0x6e2064, 0x6e2065, 0x6e2068, 0x6e206f, 0x6e2076,
292 |           0x6e6465, 0x6e6720, 0x6f6e64, 0x6f6f72, 0x6f7020, 0x6f7220, 0x736368,
293 |           0x737465, 0x742064, 0x746520, 0x74656e, 0x746572, 0x76616e, 0x766572,
294 |           0x766f6f,
295 |         ],
296 |       ),
297 |       new NGramsPlusLang(
298 |         'no',
299 |         [
300 |           0x206174, 0x206176, 0x206465, 0x20656e, 0x206572, 0x20666f, 0x206861,
301 |           0x206920, 0x206d65, 0x206f67, 0x2070e5, 0x207365, 0x20736b, 0x20736f,
302 |           0x207374, 0x207469, 0x207669, 0x20e520, 0x616e64, 0x617220, 0x617420,
303 |           0x646520, 0x64656e, 0x646574, 0x652073, 0x656420, 0x656e20, 0x656e65,
304 |           0x657220, 0x657265, 0x657420, 0x657474, 0x666f72, 0x67656e, 0x696b6b,
305 |           0x696c20, 0x696e67, 0x6b6520, 0x6b6b65, 0x6c6520, 0x6c6c65, 0x6d6564,
306 |           0x6d656e, 0x6e2073, 0x6e6520, 0x6e6720, 0x6e6765, 0x6e6e65, 0x6f6720,
307 |           0x6f6d20, 0x6f7220, 0x70e520, 0x722073, 0x726520, 0x736f6d, 0x737465,
308 |           0x742073, 0x746520, 0x74656e, 0x746572, 0x74696c, 0x747420, 0x747465,
309 |           0x766572,
310 |         ],
311 |       ),
312 |       new NGramsPlusLang(
313 |         'pt',
314 |         [
315 |           0x206120, 0x20636f, 0x206461, 0x206465, 0x20646f, 0x206520, 0x206573,
316 |           0x206d61, 0x206e6f, 0x206f20, 0x207061, 0x20706f, 0x207072, 0x207175,
317 |           0x207265, 0x207365, 0x20756d, 0x612061, 0x612063, 0x612064, 0x612070,
318 |           0x616465, 0x61646f, 0x616c20, 0x617220, 0x617261, 0x617320, 0x636f6d,
319 |           0x636f6e, 0x646120, 0x646520, 0x646f20, 0x646f73, 0x652061, 0x652064,
320 |           0x656d20, 0x656e74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6d656e,
321 |           0x6e7465, 0x6e746f, 0x6f2061, 0x6f2063, 0x6f2064, 0x6f2065, 0x6f2070,
322 |           0x6f7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064,
323 |           0x732065, 0x732070, 0x737461, 0x746520, 0x746f20, 0x756520, 0xe36f20,
324 |           0xe7e36f,
325 |         ],
326 |       ),
327 |       new NGramsPlusLang(
328 |         'sv',
329 |         [
330 |           0x206174, 0x206176, 0x206465, 0x20656e, 0x2066f6, 0x206861, 0x206920,
331 |           0x20696e, 0x206b6f, 0x206d65, 0x206f63, 0x2070e5, 0x20736b, 0x20736f,
332 |           0x207374, 0x207469, 0x207661, 0x207669, 0x20e472, 0x616465, 0x616e20,
333 |           0x616e64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656e, 0x646572,
334 |           0x646574, 0x656420, 0x656e20, 0x657220, 0x657420, 0x66f672, 0x67656e,
335 |           0x696c6c, 0x696e67, 0x6b6120, 0x6c6c20, 0x6d6564, 0x6e2073, 0x6e6120,
336 |           0x6e6465, 0x6e6720, 0x6e6765, 0x6e696e, 0x6f6368, 0x6f6d20, 0x6f6e20,
337 |           0x70e520, 0x722061, 0x722073, 0x726120, 0x736b61, 0x736f6d, 0x742073,
338 |           0x746120, 0x746520, 0x746572, 0x74696c, 0x747420, 0x766172, 0xe47220,
339 |           0xf67220,
340 |         ],
341 |       ),
342 |     ];
343 |   }
344 | 
345 |   name(input: Context): EncodingName { // 'windows-1252' when input.c1Bytes is truthy, otherwise 'ISO-8859-1'
346 |     return input && input.c1Bytes ? 'windows-1252' : 'ISO-8859-1';
347 |   }
348 | }
349 | 
350 | export class ISO_8859_2 extends sbcs {
351 |   byteMap() { // 256-entry table indexed by input byte: A-Z fold to a-z, the apostrophe (0x27) maps to 0x00, other ASCII non-letters map to 0x20
352 |     return [
353 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
354 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
355 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
356 |       0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
357 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
358 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
359 |       0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
360 |       0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20,
361 |       0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
362 |       0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
363 |       0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
364 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
365 |       0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
366 |       0x20, 0x20, 0x20, 0x20, 0x20, 0xb1, 0x20, 0xb3, 0x20, 0xb5, 0xb6, 0x20,
367 |       0x20, 0xb9, 0xba, 0xbb, 0xbc, 0x20, 0xbe, 0xbf, 0x20, 0xb1, 0x20, 0xb3,
368 |       0x20, 0xb5, 0xb6, 0xb7, 0x20, 0xb9, 0xba, 0xbb, 0xbc, 0x20, 0xbe, 0xbf,
369 |       0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb,
370 |       0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x20,
371 |       0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3,
372 |       0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
373 |       0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x20, 0xf8, 0xf9, 0xfa, 0xfb,
374 |       0xfc, 0xfd, 0xfe, 0x20,
375 |     ];
376 |   }
377 | 
378 |   ngrams() { // one NGramsPlusLang per language (cs, hu, pl, ro), each with 64 n-gram values (the table size NGramParser.search expects)
379 |     return [
380 |       new NGramsPlusLang(
381 |         'cs',
382 |         [
383 |           0x206120, 0x206279, 0x20646f, 0x206a65, 0x206e61, 0x206e65, 0x206f20,
384 |           0x206f64, 0x20706f, 0x207072, 0x2070f8, 0x20726f, 0x207365, 0x20736f,
385 |           0x207374, 0x20746f, 0x207620, 0x207679, 0x207a61, 0x612070, 0x636520,
386 |           0x636820, 0x652070, 0x652073, 0x652076, 0x656d20, 0x656eed, 0x686f20,
387 |           0x686f64, 0x697374, 0x6a6520, 0x6b7465, 0x6c6520, 0x6c6920, 0x6e6120,
388 |           0x6ee920, 0x6eec20, 0x6eed20, 0x6f2070, 0x6f646e, 0x6f6a69, 0x6f7374,
389 |           0x6f7520, 0x6f7661, 0x706f64, 0x706f6a, 0x70726f, 0x70f865, 0x736520,
390 |           0x736f75, 0x737461, 0x737469, 0x73746e, 0x746572, 0x746eed, 0x746f20,
391 |           0x752070, 0xbe6520, 0xe16eed, 0xe9686f, 0xed2070, 0xed2073, 0xed6d20,
392 |           0xf86564,
393 |         ],
394 |       ),
395 |       new NGramsPlusLang(
396 |         'hu',
397 |         [
398 |           0x206120, 0x20617a, 0x206265, 0x206567, 0x20656c, 0x206665, 0x206861,
399 |           0x20686f, 0x206973, 0x206b65, 0x206b69, 0x206bf6, 0x206c65, 0x206d61,
400 |           0x206d65, 0x206d69, 0x206e65, 0x20737a, 0x207465, 0x20e973, 0x612061,
401 |           0x61206b, 0x61206d, 0x612073, 0x616b20, 0x616e20, 0x617a20, 0x62616e,
402 |           0x62656e, 0x656779, 0x656b20, 0x656c20, 0x656c65, 0x656d20, 0x656e20,
403 |           0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686f67, 0x696e74,
404 |           0x697320, 0x6b2061, 0x6bf67a, 0x6d6567, 0x6d696e, 0x6e2061, 0x6e616b,
405 |           0x6e656b, 0x6e656d, 0x6e7420, 0x6f6779, 0x732061, 0x737a65, 0x737a74,
406 |           0x737ae1, 0x73e967, 0x742061, 0x747420, 0x74e173, 0x7a6572, 0xe16e20,
407 |           0xe97320,
408 |         ],
409 |       ),
410 |       new NGramsPlusLang(
411 |         'pl',
412 |         [
413 |           0x20637a, 0x20646f, 0x206920, 0x206a65, 0x206b6f, 0x206d61, 0x206d69,
414 |           0x206e61, 0x206e69, 0x206f64, 0x20706f, 0x207072, 0x207369, 0x207720,
415 |           0x207769, 0x207779, 0x207a20, 0x207a61, 0x612070, 0x612077, 0x616e69,
416 |           0x636820, 0x637a65, 0x637a79, 0x646f20, 0x647a69, 0x652070, 0x652073,
417 |           0x652077, 0x65207a, 0x65676f, 0x656a20, 0x656d20, 0x656e69, 0x676f20,
418 |           0x696120, 0x696520, 0x69656a, 0x6b6120, 0x6b6920, 0x6b6965, 0x6d6965,
419 |           0x6e6120, 0x6e6961, 0x6e6965, 0x6f2070, 0x6f7761, 0x6f7769, 0x706f6c,
420 |           0x707261, 0x70726f, 0x70727a, 0x727a65, 0x727a79, 0x7369ea, 0x736b69,
421 |           0x737461, 0x776965, 0x796368, 0x796d20, 0x7a6520, 0x7a6965, 0x7a7920,
422 |           0xf37720,
423 |         ],
424 |       ),
425 |       new NGramsPlusLang(
426 |         'ro',
427 |         [
428 |           0x206120, 0x206163, 0x206361, 0x206365, 0x20636f, 0x206375, 0x206465,
429 |           0x206469, 0x206c61, 0x206d61, 0x207065, 0x207072, 0x207365, 0x2073e3,
430 |           0x20756e, 0x20ba69, 0x20ee6e, 0x612063, 0x612064, 0x617265, 0x617420,
431 |           0x617465, 0x617520, 0x636172, 0x636f6e, 0x637520, 0x63e320, 0x646520,
432 |           0x652061, 0x652063, 0x652064, 0x652070, 0x652073, 0x656120, 0x656920,
433 |           0x656c65, 0x656e74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070,
434 |           0x696520, 0x696920, 0x696e20, 0x6c6120, 0x6c6520, 0x6c6f72, 0x6c7569,
435 |           0x6e6520, 0x6e7472, 0x6f7220, 0x70656e, 0x726520, 0x726561, 0x727520,
436 |           0x73e320, 0x746520, 0x747275, 0x74e320, 0x756920, 0x756c20, 0xba6920,
437 |           0xee6e20,
438 |         ],
439 |       ),
440 |     ];
441 |   }
442 | 
443 |   name(det: Context): EncodingName {
444 |     return det && det.c1Bytes ? 'windows-1250' : 'ISO-8859-2';
445 |   }
446 | }
447 | 
/**
 * Single-byte recognizer that reports the encoding 'ISO-8859-5' and the
 * language code 'ru'.
 */
export class ISO_8859_5 extends sbcs {
  /**
   * Returns the 256-entry byte translation table, indexed by raw byte value.
   *
   * In the 0x00-0x7F half, A-Z and a-z both map to 0x61-0x7a, the
   * apostrophe (0x27) maps to 0x00 and every other byte maps to 0x20. The
   * 0x80-0xFF half is specific to this code page.
   */
  byteMap() {
    const table = [
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
      0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
      0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
      0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
      0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
      0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x20, 0xfe, 0xff, 0xd0, 0xd1, 0xd2, 0xd3,
      0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
      0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb,
      0xec, 0xed, 0xee, 0xef, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
      0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3,
      0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
      0x20, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb,
      0xfc, 0x20, 0xfe, 0xff,
    ];
    return table;
  }

  /**
   * Returns the 64 numeric trigrams for this recognizer; e.g. 0x20d220
   * corresponds to the byte triple 0x20 0xd2 0x20.
   * NOTE(review): presumably matched by the sbcs base class against
   * byteMap()-folded input — confirm there.
   */
  ngrams() {
    const trigrams = [
      0x20d220, 0x20d2de, 0x20d4de, 0x20d7d0, 0x20d820, 0x20dad0, 0x20dade,
      0x20ddd0, 0x20ddd5, 0x20ded1, 0x20dfde, 0x20dfe0, 0x20e0d0, 0x20e1de,
      0x20e1e2, 0x20e2de, 0x20e7e2, 0x20ede2, 0xd0ddd8, 0xd0e2ec, 0xd3de20,
      0xd5dbec, 0xd5ddd8, 0xd5e1e2, 0xd5e220, 0xd820df, 0xd8d520, 0xd8d820,
      0xd8ef20, 0xdbd5dd, 0xdbd820, 0xdbecdd, 0xddd020, 0xddd520, 0xddd8d5,
      0xddd8ef, 0xddde20, 0xddded2, 0xde20d2, 0xde20df, 0xde20e1, 0xded220,
      0xded2d0, 0xded3de, 0xded920, 0xdedbec, 0xdedc20, 0xdee1e2, 0xdfdedb,
      0xdfe0d5, 0xdfe0d8, 0xdfe0de, 0xe0d0d2, 0xe0d5d4, 0xe1e2d0, 0xe1e2d2,
      0xe1e2d8, 0xe1ef20, 0xe2d5db, 0xe2de20, 0xe2dee0, 0xe2ec20, 0xe7e2de,
      0xebe520,
    ];
    return trigrams;
  }

  /** @returns The fixed encoding label 'ISO-8859-5'. */
  name(): EncodingName {
    return 'ISO-8859-5';
  }

  /** @returns The fixed language code 'ru'. */
  language() {
    return 'ru';
  }
}
499 | 
/**
 * Single-byte recognizer that reports the encoding 'ISO-8859-6' and the
 * language code 'ar'.
 */
export class ISO_8859_6 extends sbcs {
  /**
   * Returns the 256-entry byte translation table, indexed by raw byte value.
   *
   * In the 0x00-0x7F half, A-Z and a-z both map to 0x61-0x7a, the
   * apostrophe (0x27) maps to 0x00 and every other byte maps to 0x20. In the
   * upper half only 0xC1-0xDA and 0xE0-0xEA keep their own value; all other
   * bytes map to 0x20.
   */
  byteMap() {
    const table = [
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
      0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
      0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
      0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
      0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
      0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
      0xd8, 0xd9, 0xda, 0x20, 0x20, 0x20, 0x20, 0x20, 0xe0, 0xe1, 0xe2, 0xe3,
      0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20,
    ];
    return table;
  }

  /**
   * Returns the 64 numeric trigrams for this recognizer; e.g. 0x20c7e4
   * corresponds to the byte triple 0x20 0xc7 0xe4.
   * NOTE(review): presumably matched by the sbcs base class against
   * byteMap()-folded input — confirm there.
   */
  ngrams() {
    const trigrams = [
      0x20c7e4, 0x20c7e6, 0x20c8c7, 0x20d9e4, 0x20e1ea, 0x20e4e4, 0x20e5e6,
      0x20e8c7, 0xc720c7, 0xc7c120, 0xc7ca20, 0xc7d120, 0xc7e420, 0xc7e4c3,
      0xc7e4c7, 0xc7e4c8, 0xc7e4ca, 0xc7e4cc, 0xc7e4cd, 0xc7e4cf, 0xc7e4d3,
      0xc7e4d9, 0xc7e4e2, 0xc7e4e5, 0xc7e4e8, 0xc7e4ea, 0xc7e520, 0xc7e620,
      0xc7e6ca, 0xc820c7, 0xc920c7, 0xc920e1, 0xc920e4, 0xc920e5, 0xc920e8,
      0xca20c7, 0xcf20c7, 0xcfc920, 0xd120c7, 0xd1c920, 0xd320c7, 0xd920c7,
      0xd9e4e9, 0xe1ea20, 0xe420c7, 0xe4c920, 0xe4e920, 0xe4ea20, 0xe520c7,
      0xe5c720, 0xe5c920, 0xe5e620, 0xe620c7, 0xe720c7, 0xe7c720, 0xe8c7e4,
      0xe8e620, 0xe920c7, 0xea20c7, 0xea20e5, 0xea20e8, 0xeac920, 0xead120,
      0xeae620,
    ];
    return trigrams;
  }

  /** @returns The fixed encoding label 'ISO-8859-6'. */
  name(): EncodingName {
    return 'ISO-8859-6';
  }

  /** @returns The fixed language code 'ar'. */
  language() {
    return 'ar';
  }
}
551 | 
/**
 * Single-byte recognizer that reports 'ISO-8859-7' (or 'windows-1253' when
 * the detection context flags C1 bytes) and the language code 'el'.
 */
export class ISO_8859_7 extends sbcs {
  /**
   * Returns the 256-entry byte translation table, indexed by raw byte value.
   *
   * In the 0x00-0x7F half, A-Z and a-z both map to 0x61-0x7a, the
   * apostrophe (0x27) maps to 0x00 and every other byte maps to 0x20. The
   * 0x80-0xFF half is specific to this code page.
   */
  byteMap() {
    const table = [
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
      0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
      0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
      0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
      0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0xa1, 0xa2, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0xdc, 0x20, 0xdd, 0xde, 0xdf, 0x20, 0xfc, 0x20, 0xfd, 0xfe,
      0xc0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb,
      0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0x20, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
      0xf8, 0xf9, 0xfa, 0xfb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3,
      0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
      0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb,
      0xfc, 0xfd, 0xfe, 0x20,
    ];
    return table;
  }

  /**
   * Returns the 64 numeric trigrams for this recognizer; e.g. 0x20e1ed
   * corresponds to the byte triple 0x20 0xe1 0xed.
   * NOTE(review): presumably matched by the sbcs base class against
   * byteMap()-folded input — confirm there.
   */
  ngrams() {
    const trigrams = [
      0x20e1ed, 0x20e1f0, 0x20e3e9, 0x20e4e9, 0x20e5f0, 0x20e720, 0x20eae1,
      0x20ece5, 0x20ede1, 0x20ef20, 0x20f0e1, 0x20f0ef, 0x20f0f1, 0x20f3f4,
      0x20f3f5, 0x20f4e7, 0x20f4ef, 0xdfe120, 0xe120e1, 0xe120f4, 0xe1e920,
      0xe1ed20, 0xe1f0fc, 0xe1f220, 0xe3e9e1, 0xe5e920, 0xe5f220, 0xe720f4,
      0xe7ed20, 0xe7f220, 0xe920f4, 0xe9e120, 0xe9eade, 0xe9f220, 0xeae1e9,
      0xeae1f4, 0xece520, 0xed20e1, 0xed20e5, 0xed20f0, 0xede120, 0xeff220,
      0xeff520, 0xf0eff5, 0xf0f1ef, 0xf0fc20, 0xf220e1, 0xf220e5, 0xf220ea,
      0xf220f0, 0xf220f4, 0xf3e520, 0xf3e720, 0xf3f4ef, 0xf4e120, 0xf4e1e9,
      0xf4e7ed, 0xf4e7f2, 0xf4e9ea, 0xf4ef20, 0xf4eff5, 0xf4f9ed, 0xf9ed20,
      0xfeed20,
    ];
    return trigrams;
  }

  /**
   * Picks the encoding label to report.
   *
   * @param det Detection context; may be falsy, in which case the ISO label
   *   is used.
   * @returns 'windows-1253' when `det.c1Bytes` is truthy, otherwise
   *   'ISO-8859-7'.
   */
  name(det: Context): EncodingName {
    if (det && det.c1Bytes) {
      return 'windows-1253';
    }
    return 'ISO-8859-7';
  }

  /** @returns The fixed language code 'el'. */
  language() {
    return 'el';
  }
}
603 | 
/**
 * Single-byte recognizer that reports 'ISO-8859-8' (or 'windows-1255' when
 * the detection context flags C1 bytes) and the language code 'he'.
 */
export class ISO_8859_8 extends sbcs {
  /**
   * Returns the 256-entry byte translation table, indexed by raw byte value.
   *
   * In the 0x00-0x7F half, A-Z and a-z both map to 0x61-0x7a, the
   * apostrophe (0x27) maps to 0x00 and every other byte maps to 0x20. In the
   * upper half only 0xB5 and 0xE0-0xFA keep their own value; all other bytes
   * map to 0x20.
   */
  byteMap() {
    const table = [
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
      0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
      0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
      0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
      0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0xb5, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xe0, 0xe1, 0xe2, 0xe3,
      0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
      0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0x20,
      0x20, 0x20, 0x20, 0x20,
    ];
    return table;
  }

  /**
   * Builds the two trigram tables for this recognizer, both tagged 'he'.
   *
   * Each table holds 64 numeric trigrams; e.g. 0x20e0e5 corresponds to the
   * byte triple 0x20 0xe0 0xe5.
   * NOTE(review): the two tables differ but share one language tag —
   * presumably they cover two byte orderings of Hebrew text; confirm
   * against the upstream data these were taken from.
   */
  ngrams() {
    const firstHebrewSet = [
      0x20e0e5, 0x20e0e7, 0x20e0e9, 0x20e0fa, 0x20e1e9, 0x20e1ee, 0x20e4e0,
      0x20e4e5, 0x20e4e9, 0x20e4ee, 0x20e4f2, 0x20e4f9, 0x20e4fa, 0x20ece0,
      0x20ece4, 0x20eee0, 0x20f2ec, 0x20f9ec, 0xe0fa20, 0xe420e0, 0xe420e1,
      0xe420e4, 0xe420ec, 0xe420ee, 0xe420f9, 0xe4e5e0, 0xe5e020, 0xe5ed20,
      0xe5ef20, 0xe5f820, 0xe5fa20, 0xe920e4, 0xe9e420, 0xe9e5fa, 0xe9e9ed,
      0xe9ed20, 0xe9ef20, 0xe9f820, 0xe9fa20, 0xec20e0, 0xec20e4, 0xece020,
      0xece420, 0xed20e0, 0xed20e1, 0xed20e4, 0xed20ec, 0xed20ee, 0xed20f9,
      0xeee420, 0xef20e4, 0xf0e420, 0xf0e920, 0xf0e9ed, 0xf2ec20, 0xf820e4,
      0xf8e9ed, 0xf9ec20, 0xfa20e0, 0xfa20e1, 0xfa20e4, 0xfa20ec, 0xfa20ee,
      0xfa20f9,
    ];

    const secondHebrewSet = [
      0x20e0e5, 0x20e0ec, 0x20e4e9, 0x20e4ec, 0x20e4ee, 0x20e4f0, 0x20e9f0,
      0x20ecf2, 0x20ecf9, 0x20ede5, 0x20ede9, 0x20efe5, 0x20efe9, 0x20f8e5,
      0x20f8e9, 0x20fae0, 0x20fae5, 0x20fae9, 0xe020e4, 0xe020ec, 0xe020ed,
      0xe020fa, 0xe0e420, 0xe0e5e4, 0xe0ec20, 0xe0ee20, 0xe120e4, 0xe120ed,
      0xe120fa, 0xe420e4, 0xe420e9, 0xe420ec, 0xe420ed, 0xe420ef, 0xe420f8,
      0xe420fa, 0xe4ec20, 0xe5e020, 0xe5e420, 0xe7e020, 0xe9e020, 0xe9e120,
      0xe9e420, 0xec20e4, 0xec20ed, 0xec20fa, 0xecf220, 0xecf920, 0xede9e9,
      0xede9f0, 0xede9f8, 0xee20e4, 0xee20ed, 0xee20fa, 0xeee120, 0xeee420,
      0xf2e420, 0xf920e4, 0xf920ed, 0xf920fa, 0xf9e420, 0xfae020, 0xfae420,
      0xfae5e9,
    ];

    // Order of the two entries is kept exactly as before.
    return [
      new NGramsPlusLang('he', firstHebrewSet),
      new NGramsPlusLang('he', secondHebrewSet),
    ];
  }

  /**
   * Picks the encoding label to report.
   *
   * @param det Detection context; may be falsy, in which case the ISO label
   *   is used.
   * @returns 'windows-1255' when `det.c1Bytes` is truthy, otherwise
   *   'ISO-8859-8'.
   */
  name(det: Context): EncodingName {
    if (det && det.c1Bytes) {
      return 'windows-1255';
    }
    return 'ISO-8859-8';
  }

  /** @returns The fixed language code 'he'. */
  language() {
    return 'he';
  }
}
675 | 
/**
 * Single-byte recognizer that reports 'ISO-8859-9' (or 'windows-1254' when
 * the detection context flags C1 bytes) and the language code 'tr'.
 */
export class ISO_8859_9 extends sbcs {
  /**
   * Returns the 256-entry byte translation table, indexed by raw byte value.
   *
   * In the 0x00-0x7F half, A-Z and a-z both map to 0x61-0x7a, the
   * apostrophe (0x27) maps to 0x00 and every other byte maps to 0x20. The
   * 0x80-0xFF half is specific to this code page; note that byte 0xDD maps
   * to 0x69 (ASCII 'i') rather than to a high-range value.
   */
  byteMap() {
    const table = [
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
      0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
      0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
      0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
      0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0xaa, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0xb5, 0x20, 0x20, 0x20, 0x20, 0xba, 0x20, 0x20, 0x20, 0x20, 0x20,
      0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb,
      0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x20,
      0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x69, 0xfe, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3,
      0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
      0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x20, 0xf8, 0xf9, 0xfa, 0xfb,
      0xfc, 0xfd, 0xfe, 0xff,
    ];
    return table;
  }

  /**
   * Returns the 64 numeric trigrams for this recognizer; e.g. 0x206261
   * corresponds to the byte triple 0x20 0x62 0x61.
   * NOTE(review): presumably matched by the sbcs base class against
   * byteMap()-folded input — confirm there.
   */
  ngrams() {
    const trigrams = [
      0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861,
      0x20696c, 0x206b61, 0x206b6f, 0x206d61, 0x206f6c, 0x207361, 0x207461,
      0x207665, 0x207961, 0x612062, 0x616b20, 0x616c61, 0x616d61, 0x616e20,
      0x616efd, 0x617220, 0x617261, 0x6172fd, 0x6173fd, 0x617961, 0x626972,
      0x646120, 0x646520, 0x646920, 0x652062, 0x65206b, 0x656469, 0x656e20,
      0x657220, 0x657269, 0x657369, 0x696c65, 0x696e20, 0x696e69, 0x697220,
      0x6c616e, 0x6c6172, 0x6c6520, 0x6c6572, 0x6e2061, 0x6e2062, 0x6e206b,
      0x6e6461, 0x6e6465, 0x6e6520, 0x6e6920, 0x6e696e, 0x6efd20, 0x72696e,
      0x72fd6e, 0x766520, 0x796120, 0x796f72, 0xfd6e20, 0xfd6e64, 0xfd6efd,
      0xfdf0fd,
    ];
    return trigrams;
  }

  /**
   * Picks the encoding label to report.
   *
   * @param det Detection context; may be falsy, in which case the ISO label
   *   is used.
   * @returns 'windows-1254' when `det.c1Bytes` is truthy, otherwise
   *   'ISO-8859-9'.
   */
  name(det: Context): EncodingName {
    if (det && det.c1Bytes) {
      return 'windows-1254';
    }
    return 'ISO-8859-9';
  }

  /** @returns The fixed language code 'tr'. */
  language() {
    return 'tr';
  }
}
727 | 
/**
 * Single-byte recognizer that reports the encoding 'windows-1251' and the
 * language code 'ru'.
 */
export class windows_1251 extends sbcs {
  /**
   * Returns the 256-entry byte translation table, indexed by raw byte value.
   *
   * In the 0x00-0x7F half, A-Z and a-z both map to 0x61-0x7a, the
   * apostrophe (0x27) maps to 0x00 and every other byte maps to 0x20. The
   * 0x80-0xFF half is specific to this code page; 0xC0-0xDF and 0xE0-0xFF
   * both map onto 0xe0-0xff.
   */
  byteMap() {
    const table = [
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
      0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
      0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
      0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
      0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x90, 0x83, 0x20, 0x83,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x9a, 0x20, 0x9c, 0x9d, 0x9e, 0x9f,
      0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x9a, 0x20,
      0x9c, 0x9d, 0x9e, 0x9f, 0x20, 0xa2, 0xa2, 0xbc, 0x20, 0xb4, 0x20, 0x20,
      0xb8, 0x20, 0xba, 0x20, 0x20, 0x20, 0x20, 0xbf, 0x20, 0x20, 0xb3, 0xb3,
      0xb4, 0xb5, 0x20, 0x20, 0xb8, 0x20, 0xba, 0x20, 0xbc, 0xbe, 0xbe, 0xbf,
      0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb,
      0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
      0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0xe0, 0xe1, 0xe2, 0xe3,
      0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
      0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb,
      0xfc, 0xfd, 0xfe, 0xff,
    ];
    return table;
  }

  /**
   * Returns the 64 numeric trigrams for this recognizer; e.g. 0x20e220
   * corresponds to the byte triple 0x20 0xe2 0x20.
   * NOTE(review): presumably matched by the sbcs base class against
   * byteMap()-folded input — confirm there.
   */
  ngrams() {
    const trigrams = [
      0x20e220, 0x20e2ee, 0x20e4ee, 0x20e7e0, 0x20e820, 0x20eae0, 0x20eaee,
      0x20ede0, 0x20ede5, 0x20eee1, 0x20efee, 0x20eff0, 0x20f0e0, 0x20f1ee,
      0x20f1f2, 0x20f2ee, 0x20f7f2, 0x20fdf2, 0xe0ede8, 0xe0f2fc, 0xe3ee20,
      0xe5ebfc, 0xe5ede8, 0xe5f1f2, 0xe5f220, 0xe820ef, 0xe8e520, 0xe8e820,
      0xe8ff20, 0xebe5ed, 0xebe820, 0xebfced, 0xede020, 0xede520, 0xede8e5,
      0xede8ff, 0xedee20, 0xedeee2, 0xee20e2, 0xee20ef, 0xee20f1, 0xeee220,
      0xeee2e0, 0xeee3ee, 0xeee920, 0xeeebfc, 0xeeec20, 0xeef1f2, 0xefeeeb,
      0xeff0e5, 0xeff0e8, 0xeff0ee, 0xf0e0e2, 0xf0e5e4, 0xf1f2e0, 0xf1f2e2,
      0xf1f2e8, 0xf1ff20, 0xf2e5eb, 0xf2ee20, 0xf2eef0, 0xf2fc20, 0xf7f2ee,
      0xfbf520,
    ];
    return trigrams;
  }

  /** @returns The fixed encoding label 'windows-1251'. */
  name(): EncodingName {
    return 'windows-1251';
  }

  /** @returns The fixed language code 'ru'. */
  language() {
    return 'ru';
  }
}
779 | 
/**
 * Single-byte recognizer that reports the encoding 'windows-1256' and the
 * language code 'ar'.
 */
export class windows_1256 extends sbcs {
  /**
   * Returns the 256-entry byte translation table, indexed by raw byte value.
   *
   * In the 0x00-0x7F half, A-Z and a-z both map to 0x61-0x7a, the
   * apostrophe (0x27) maps to 0x00 and every other byte maps to 0x20. The
   * 0x80-0xFF half is specific to this code page.
   */
  byteMap() {
    const table = [
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
      0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
      0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
      0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
      0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x81, 0x20, 0x83,
      0x20, 0x20, 0x20, 0x20, 0x88, 0x20, 0x8a, 0x20, 0x9c, 0x8d, 0x8e, 0x8f,
      0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x98, 0x20, 0x9a, 0x20,
      0x9c, 0x20, 0x20, 0x9f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0xaa, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0xb5, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
      0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0x20,
      0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3,
      0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
      0x20, 0x20, 0x20, 0x20, 0xf4, 0x20, 0x20, 0x20, 0x20, 0xf9, 0x20, 0xfb,
      0xfc, 0x20, 0x20, 0xff,
    ];
    return table;
  }

  /**
   * Returns the 64 numeric trigrams for this recognizer; e.g. 0x20c7e1
   * corresponds to the byte triple 0x20 0xc7 0xe1.
   * NOTE(review): presumably matched by the sbcs base class against
   * byteMap()-folded input — confirm there.
   */
  ngrams() {
    const trigrams = [
      0x20c7e1, 0x20c7e4, 0x20c8c7, 0x20dae1, 0x20dded, 0x20e1e1, 0x20e3e4,
      0x20e6c7, 0xc720c7, 0xc7c120, 0xc7ca20, 0xc7d120, 0xc7e120, 0xc7e1c3,
      0xc7e1c7, 0xc7e1c8, 0xc7e1ca, 0xc7e1cc, 0xc7e1cd, 0xc7e1cf, 0xc7e1d3,
      0xc7e1da, 0xc7e1de, 0xc7e1e3, 0xc7e1e6, 0xc7e1ed, 0xc7e320, 0xc7e420,
      0xc7e4ca, 0xc820c7, 0xc920c7, 0xc920dd, 0xc920e1, 0xc920e3, 0xc920e6,
      0xca20c7, 0xcf20c7, 0xcfc920, 0xd120c7, 0xd1c920, 0xd320c7, 0xda20c7,
      0xdae1ec, 0xdded20, 0xe120c7, 0xe1c920, 0xe1ec20, 0xe1ed20, 0xe320c7,
      0xe3c720, 0xe3c920, 0xe3e420, 0xe420c7, 0xe520c7, 0xe5c720, 0xe6c7e1,
      0xe6e420, 0xec20c7, 0xed20c7, 0xed20e3, 0xed20e6, 0xedc920, 0xedd120,
      0xede420,
    ];
    return trigrams;
  }

  /** @returns The fixed encoding label 'windows-1256'. */
  name(): EncodingName {
    return 'windows-1256';
  }

  /** @returns The fixed language code 'ar'. */
  language() {
    return 'ar';
  }
}
831 | 
/**
 * Single-byte recognizer that reports the encoding 'KOI8-R' and the
 * language code 'ru'.
 */
export class KOI8_R extends sbcs {
  /**
   * Returns the 256-entry byte translation table, indexed by raw byte value.
   *
   * In the 0x00-0x7F half, A-Z and a-z both map to 0x61-0x7a, the
   * apostrophe (0x27) maps to 0x00 and every other byte maps to 0x20. In the
   * upper half, 0xA3 and 0xB3 both map to 0xa3, and 0xC0-0xDF and 0xE0-0xFF
   * both map onto 0xc0-0xdf; all other bytes map to 0x20.
   */
  byteMap() {
    const table = [
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
      0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
      0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
      0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
      0x78, 0x79, 0x7a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xa3, 0x20, 0x20, 0x20, 0x20,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xa3,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
      0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
      0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
      0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xc0, 0xc1, 0xc2, 0xc3,
      0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
      0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
      0xdc, 0xdd, 0xde, 0xdf,
    ];
    return table;
  }

  /**
   * Returns the 64 numeric trigrams for this recognizer; e.g. 0x20c4cf
   * corresponds to the byte triple 0x20 0xc4 0xcf.
   * NOTE(review): presumably matched by the sbcs base class against
   * byteMap()-folded input — confirm there.
   */
  ngrams() {
    const trigrams = [
      0x20c4cf, 0x20c920, 0x20cbc1, 0x20cbcf, 0x20cec1, 0x20cec5, 0x20cfc2,
      0x20d0cf, 0x20d0d2, 0x20d2c1, 0x20d3cf, 0x20d3d4, 0x20d4cf, 0x20d720,
      0x20d7cf, 0x20dac1, 0x20dcd4, 0x20ded4, 0xc1cec9, 0xc1d4d8, 0xc5ccd8,
      0xc5cec9, 0xc5d3d4, 0xc5d420, 0xc7cf20, 0xc920d0, 0xc9c520, 0xc9c920,
      0xc9d120, 0xccc5ce, 0xccc920, 0xccd8ce, 0xcec120, 0xcec520, 0xcec9c5,
      0xcec9d1, 0xcecf20, 0xcecfd7, 0xcf20d0, 0xcf20d3, 0xcf20d7, 0xcfc7cf,
      0xcfca20, 0xcfccd8, 0xcfcd20, 0xcfd3d4, 0xcfd720, 0xcfd7c1, 0xd0cfcc,
      0xd0d2c5, 0xd0d2c9, 0xd0d2cf, 0xd2c1d7, 0xd2c5c4, 0xd3d120, 0xd3d4c1,
      0xd3d4c9, 0xd3d4d7, 0xd4c5cc, 0xd4cf20, 0xd4cfd2, 0xd4d820, 0xd9c820,
      0xded4cf,
    ];
    return trigrams;
  }

  /** @returns The fixed encoding label 'KOI8-R'. */
  name(): EncodingName {
    return 'KOI8-R';
  }

  /** @returns The fixed language code 'ru'. */
  language() {
    return 'ru';
  }
}
883 | 
884 | /*
885 | module.exports.ISO_8859_7 = function() {
886 |   this.byteMap = function() {
887 |     return [
888 | 
889 |     ];
890 |   };
891 | 
892 |   this.ngrams = function() {
893 |     return [
894 | 
895 |     ];
896 |   };
897 | 
898 |   this.name = function(det) {
899 |     if (typeof det == 'undefined')
900 |       return 'ISO-8859-7';
901 |     return det.c1Bytes ? 'windows-1253' : 'ISO-8859-7';
902 |   };
903 | 
  this.language = function() {
    return 'el';
  };
907 | };
908 | util.inherits(module.exports.ISO_8859_7, sbcs);
909 | */
910 | 


--------------------------------------------------------------------------------