├── .prettierignore ├── .gitignore ├── src ├── hash │ ├── index.ts │ ├── hash.ts │ └── crc32.ts ├── model │ ├── index.ts │ ├── types.ts │ └── model.ts ├── predict │ ├── index.ts │ ├── types.ts │ ├── proba.ts │ └── predict.ts ├── feature │ ├── index.ts │ ├── types.ts │ ├── char.ts │ ├── regexp.ts │ └── feature.ts ├── index.ts ├── util.ts └── tokenize.ts ├── docs ├── image │ ├── github.png │ ├── npm.svg │ └── pypi.svg ├── style │ └── reset.css └── index.html ├── tsconfig.cjs.json ├── test ├── tokenize │ └── tokenize.ts ├── feature │ ├── char.ts │ └── features.ts ├── predict │ ├── proba.ts │ └── predict.ts ├── hash │ └── hash.ts └── util │ └── ngram.ts ├── .prettierrc.json ├── jest.config.js ├── tsconfig.json ├── LICENSE ├── badges ├── badge-lines.svg ├── badge-branches.svg ├── badge-functions.svg └── badge-statements.svg ├── package.json └── README.md /.prettierignore: -------------------------------------------------------------------------------- 1 | /lib 2 | /coverage 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules/ 3 | /lib 4 | /coverage 5 | -------------------------------------------------------------------------------- /src/hash/index.ts: -------------------------------------------------------------------------------- 1 | export * from './crc32' 2 | export * from './hash' 3 | -------------------------------------------------------------------------------- /src/model/index.ts: -------------------------------------------------------------------------------- 1 | export * from './types' 2 | export * from './model' 3 | -------------------------------------------------------------------------------- /docs/image/github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuhsak/wakachigaki/HEAD/docs/image/github.png 
-------------------------------------------------------------------------------- /src/predict/index.ts: -------------------------------------------------------------------------------- 1 | export * from './types' 2 | export * from './proba' 3 | export * from './predict' 4 | -------------------------------------------------------------------------------- /src/feature/index.ts: -------------------------------------------------------------------------------- 1 | export * from './types' 2 | export * from './regexp' 3 | export * from './char' 4 | export * from './feature' 5 | -------------------------------------------------------------------------------- /src/predict/types.ts: -------------------------------------------------------------------------------- 1 | import type { NgramFeature } from '../feature' 2 | 3 | export type NgramFeatureWithDistance = NgramFeature & { distance: number } 4 | -------------------------------------------------------------------------------- /src/hash/hash.ts: -------------------------------------------------------------------------------- 1 | import { crc32 } from './crc32' 2 | 3 | export const hash = (nBuckets: number) => (text: string) => 4 | (crc32(new TextEncoder().encode(text)) % nBuckets).toString(16).toLowerCase() 5 | -------------------------------------------------------------------------------- /src/feature/types.ts: -------------------------------------------------------------------------------- 1 | export type NgramFeature = { 2 | char: string 3 | features: { 4 | kind: 'type' | 'hash' 5 | size: number 6 | offset: number 7 | value: string 8 | }[] 9 | } 10 | -------------------------------------------------------------------------------- /tsconfig.cjs.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "compilerOptions": { 4 | "module": "commonjs", 5 | "outDir": "./lib/cjs", 6 | "tsBuildInfoFile": "./lib/cjs/.tsbuildinfo", 7 | "declaration": true, 8 | 
"declarationMap": true, 9 | "declarationDir": "./lib/types" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/model/types.ts: -------------------------------------------------------------------------------- 1 | type Weights = Record>> 2 | 3 | export type Weight = { 4 | hash: Weights 5 | type: Weights 6 | distance: number 7 | bias: number 8 | } 9 | 10 | export type Model = { 11 | version: number 12 | config: { 13 | nBuckets: number 14 | size: number 15 | offset: number 16 | scale: number 17 | } 18 | weight: Weight 19 | } 20 | -------------------------------------------------------------------------------- /test/tokenize/tokenize.ts: -------------------------------------------------------------------------------- 1 | import { tokenize, segment } from '../../src/tokenize' 2 | 3 | describe('tokenize functions', () => { 4 | const tokens = tokenize('This is a Test') 5 | test('tokenize', () => { 6 | expect(Array.isArray(tokens)).toBe(true) 7 | expect(Array.isArray(tokenize('あいうえお'))).toBe(true) 8 | }) 9 | test('segment', () => { 10 | expect(segment('This is a Test')).toStrictEqual(tokens) 11 | }) 12 | }) 13 | -------------------------------------------------------------------------------- /src/feature/char.ts: -------------------------------------------------------------------------------- 1 | import * as R from './regexp' 2 | 3 | const rules = [ 4 | { 5 | fn: R.isKanji, 6 | rep: 'C', 7 | }, 8 | { fn: R.isNumeralKanji, rep: 'S' }, 9 | { fn: R.isHiragana, rep: 'H' }, 10 | { fn: R.isKatakana, rep: 'K' }, 11 | { fn: R.isAlphabet, rep: 'A' }, 12 | { fn: R.isNumeral, rep: 'N' }, 13 | ] 14 | 15 | export const getCharType = (char: string) => 16 | rules.reduce((rep, rule) => (rule.fn(char) ? 
rule.rep : rep), 'O') 17 | -------------------------------------------------------------------------------- /test/feature/char.ts: -------------------------------------------------------------------------------- 1 | import { getCharType } from '../../src/feature' 2 | 3 | describe('char functions', () => { 4 | test('getCharType', () => { 5 | expect(getCharType('a')).toBe('A') 6 | expect(getCharType('0')).toBe('N') 7 | expect(getCharType('あ')).toBe('H') 8 | expect(getCharType('ア')).toBe('K') 9 | expect(getCharType('漢')).toBe('C') 10 | expect(getCharType('百')).toBe('S') 11 | expect(getCharType('!')).toBe('O') 12 | }) 13 | }) 14 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | export { tokenize, segment } from './tokenize' 2 | export { predictProba, predict } from './predict' 3 | export { 4 | regexp, 5 | isKanji, 6 | isNumeralKanji, 7 | isHiragana, 8 | isKatakana, 9 | isNumeral, 10 | isAlphabet, 11 | features, 12 | } from './feature' 13 | export { crc32, hash } from './hash' 14 | export { threshold, model } from './model' 15 | export { ngram, sigmoid } from './util' 16 | export type { NgramFeature } from './feature' 17 | -------------------------------------------------------------------------------- /docs/image/npm.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 7 | 9 | 10 | -------------------------------------------------------------------------------- /.prettierrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "arrowParens": "always", 3 | "bracketSpacing": true, 4 | "jsxSingleQuote": true, 5 | "printWidth": 80, 6 | "quoteProps": "as-needed", 7 | "semi": false, 8 | "singleQuote": true, 9 | "tabWidth": 2, 10 | "trailingComma": "all", 11 | "useTabs": false, 12 | "overrides": [ 13 | { 14 | "files": ["*.md", "README"], 15 | "options": { 
"parser": "markdown-nocjsp" } 16 | }, 17 | { 18 | "files": ["*.mdx"], 19 | "options": { "parser": "mdx-nocjsp" } 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | clearMocks: true, 3 | roots: ['/test', '/src'], 4 | testMatch: [ 5 | '**/test/**/*.(ts|tsx|js|jsx)', 6 | '**/*.(spec|test).(ts|tsx|js|jsx)', 7 | ], 8 | transform: { 9 | '^.+\\.(t|j)sx?$': [ 10 | '@swc/jest', 11 | { 12 | sourceMaps: true, 13 | }, 14 | ], 15 | }, 16 | collectCoverage: true, 17 | coverageDirectory: 'coverage', 18 | coverageProvider: 'babel', 19 | coverageReporters: ['clover', 'json', 'lcov', 'text', 'json-summary'], 20 | verbose: true, 21 | } 22 | -------------------------------------------------------------------------------- /src/util.ts: -------------------------------------------------------------------------------- 1 | export const ngram = (chars: string[]) => (index: number) => { 2 | const get = (size: number, offset: number): string => { 3 | if (size === 1) { 4 | return chars[index + offset] || '' 5 | } 6 | return get(size - 1, offset) + get(1, offset + (size - 1)) 7 | } 8 | return get 9 | } 10 | 11 | export const range = (start: number, end: number) => { 12 | const tmp: number[] = [] 13 | for (let i = start; i < end; i++) { 14 | tmp.push(i) 15 | } 16 | return tmp 17 | } 18 | 19 | export const sigmoid = (n: number) => 1 / (1 + Math.exp(-1 * n)) 20 | -------------------------------------------------------------------------------- /test/feature/features.ts: -------------------------------------------------------------------------------- 1 | import { featurer } from '../../src/feature' 2 | 3 | describe('feature functions', () => { 4 | test('features', () => { 5 | const text = 'aあ0漢カ百bhjかいオ' 6 | const feats = featurer(262144, 3, 3)(text) 7 | expect(feats).toHaveLength(text.length) 8 | feats.forEach((f) => 9 | 
f.features.forEach((f) => { 10 | expect(['type', 'hash'].includes(f.kind)).toEqual(true) 11 | expect(typeof f.size).toEqual('number') 12 | expect(typeof f.offset).toEqual('number') 13 | expect(typeof f.value).toEqual('string') 14 | }), 15 | ) 16 | }) 17 | }) 18 | -------------------------------------------------------------------------------- /src/predict/proba.ts: -------------------------------------------------------------------------------- 1 | import type { NgramFeatureWithDistance } from './types' 2 | import { Weight } from '../model' 3 | import { sigmoid } from '../util' 4 | 5 | export const proba = (weight: Weight, scale: number) => { 6 | const { bias } = weight 7 | 8 | return (feature: NgramFeatureWithDistance) => { 9 | const features = feature.features.reduce((score, f) => { 10 | return score + (weight[f.kind][f.size]?.[f.offset]?.[f.value] || 0) 11 | }, 0) 12 | 13 | const distance = feature.distance * weight.distance 14 | 15 | return sigmoid((bias + features + distance) / scale) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /test/predict/proba.ts: -------------------------------------------------------------------------------- 1 | import { proba } from '../../src/predict/proba' 2 | 3 | describe('probability functions', () => { 4 | describe('proba', () => { 5 | test('works even if empty weights are given', () => { 6 | const p = proba({ type: {}, hash: {}, distance: 0, bias: 0 }, 1) 7 | const v = p({ 8 | char: 'a', 9 | features: [ 10 | { 11 | kind: 'type', 12 | size: 1, 13 | offset: 0, 14 | value: 'A', 15 | }, 16 | ], 17 | distance: 0, 18 | }) 19 | expect(v).toEqual(0.5) 20 | }) 21 | }) 22 | }) 23 | -------------------------------------------------------------------------------- /test/hash/hash.ts: -------------------------------------------------------------------------------- 1 | import { crc32, hash } from '../../src/hash' 2 | 3 | describe('hash functions', () => { 4 | describe('crc32', () => { 5 | 
test('provides correct value', () => { 6 | expect(crc32(new TextEncoder().encode('abcdef'))).toEqual(1267612143) 7 | expect(crc32(new TextEncoder().encode('CRC32関数のテスト'))).toEqual( 8 | 549135931, 9 | ) 10 | }) 11 | }) 12 | 13 | describe('hash', () => { 14 | test('provides correct hex', () => { 15 | expect(hash(262144)('abcdef')).toEqual('239ef') 16 | expect(hash(262144)('CRC32関数のテスト')).toEqual('3263b') 17 | }) 18 | }) 19 | }) 20 | -------------------------------------------------------------------------------- /docs/style/reset.css: -------------------------------------------------------------------------------- 1 | *, 2 | *::before, 3 | *::after { 4 | box-sizing: border-box; 5 | } 6 | body, 7 | h1, 8 | h2, 9 | h3, 10 | h4, 11 | p, 12 | figure, 13 | blockquote, 14 | dl, 15 | dd { 16 | margin: 0; 17 | } 18 | ul[role='list'], 19 | ol[role='list'] { 20 | list-style: none; 21 | } 22 | html:focus-within { 23 | scroll-behavior: smooth; 24 | } 25 | body { 26 | /* min-height: 100vh; */ 27 | text-rendering: optimizeSpeed; 28 | line-height: 1.5; 29 | } 30 | a:not([class]) { 31 | text-decoration-skip-ink: auto; 32 | } 33 | img, 34 | picture { 35 | max-width: 100%; 36 | display: block; 37 | } 38 | input, 39 | button, 40 | textarea, 41 | select { 42 | font: inherit; 43 | } 44 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "include": ["./src"], 3 | "compilerOptions": { 4 | "module": "esnext", 5 | "moduleResolution": "node", 6 | "target": "es6", 7 | "outDir": "./lib/esm", 8 | "tsBuildInfoFile": "./lib/esm/.tsbuildinfo", 9 | "rootDir": "./src", 10 | "sourceMap": true, 11 | "incremental": true, 12 | "isolatedModules": true, 13 | "strict": true, 14 | "noImplicitReturns": true, 15 | "noFallthroughCasesInSwitch": true, 16 | "noUncheckedIndexedAccess": true, 17 | "noImplicitOverride": true, 18 | "downlevelIteration": true, 19 | 
"esModuleInterop": true, 20 | "skipLibCheck": true, 21 | "forceConsistentCasingInFileNames": true 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /test/predict/predict.ts: -------------------------------------------------------------------------------- 1 | import { predictor, probaPredictor } from '../../src/predict/predict' 2 | 3 | const weight = { type: {}, hash: {}, distance: 0, bias: 0 } 4 | 5 | const predictProba = probaPredictor(weight, 1) 6 | 7 | const predict = predictor(weight, 1) 8 | 9 | const features = [{ char: 'a', features: [] }] 10 | 11 | describe('prediction functions', () => { 12 | describe('predictProba', () => { 13 | test('works', () => { 14 | const probas = predictProba(features) 15 | expect(probas[0]).toEqual(0.5) 16 | }) 17 | }) 18 | 19 | describe('predict', () => { 20 | test('works with threshold', () => { 21 | expect(predict(features, 0.6)[0]).toEqual(false) 22 | expect(predict(features, 0.4)[0]).toEqual(true) 23 | }) 24 | }) 25 | }) 26 | -------------------------------------------------------------------------------- /src/feature/regexp.ts: -------------------------------------------------------------------------------- 1 | export const regexp = { 2 | Kanji: 3 | /^[々〇〻\u2E80-\u2FDF\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]|[\uD840-\uD87F][\uDC00-\uDFFF]+$/, 4 | NumeralKanji: /^[一二三四五六七八九十百千万億兆]+$/, 5 | Hiragana: /^[ぁ-ん]+$/, 6 | Katakana: /^[ァ-ヴーァ-ン゙゚]+$/, 7 | Alphabet: /^[a-zA-Za-zA-Z]+$/, 8 | Numeral: /^[0-90-9]+$/, 9 | } 10 | 11 | export const isKanji = (text: string) => regexp.Kanji.test(text) 12 | export const isNumeralKanji = (text: string) => regexp.NumeralKanji.test(text) 13 | export const isHiragana = (text: string) => regexp.Hiragana.test(text) 14 | export const isKatakana = (text: string) => regexp.Katakana.test(text) 15 | export const isAlphabet = (text: string) => regexp.Alphabet.test(text) 16 | export const isNumeral = (text: string) => regexp.Numeral.test(text) 17 | 
-------------------------------------------------------------------------------- /src/tokenize.ts: -------------------------------------------------------------------------------- 1 | import { featurer } from './feature' 2 | import { predict } from './predict' 3 | import { model } from './model' 4 | 5 | const { nBuckets, size, offset } = model.config 6 | 7 | export const tokenizer = (nBuckets: number, size: number, offset: number) => { 8 | const f = featurer(nBuckets, size, offset) 9 | 10 | return (text: string) => { 11 | const chars = f(text) 12 | return predict(chars) 13 | .reduce( 14 | (acc, willBreak, i) => { 15 | acc[acc.length - 1] += chars[i]!.char 16 | if (willBreak) acc.push('') 17 | return acc 18 | }, 19 | [''], 20 | ) 21 | .filter((c) => !!c) 22 | } 23 | } 24 | 25 | export const tokenize = tokenizer(nBuckets, size, offset) 26 | 27 | export { tokenize as segment } 28 | -------------------------------------------------------------------------------- /test/util/ngram.ts: -------------------------------------------------------------------------------- 1 | import { ngram, range } from '../../src/util' 2 | 3 | describe('utility Functions', () => { 4 | describe('ngram', () => { 5 | const chars = [...'abcdefghijklmn'] 6 | const last = chars.length - 1 7 | test('works', () => { 8 | expect(ngram(chars)(4)(3, -2)).toBe('cde') 9 | expect(ngram(chars)(2)(3, 2)).toBe('efg') 10 | }) 11 | test('works even if given index is out of range', () => { 12 | expect(ngram(chars)(-2)(5, -2)).toBe('a') 13 | expect(ngram(chars)(last + 1)(1, -1)).toBe('n') 14 | }) 15 | test('works even if given position is out of range', () => { 16 | expect(ngram(chars)(0)(4, -2)).toBe('ab') 17 | expect(ngram(chars)(last)(1, 2)).toBe('') 18 | }) 19 | test('works even if given size is out of range', () => { 20 | expect(ngram(chars)(0)(999, -2)).toBe(chars.join('')) 21 | }) 22 | }) 23 | 24 | test('range', () => { 25 | expect(range(0, 3)).toStrictEqual([0, 1, 2]) 26 | expect(range(-2, 
1)).toStrictEqual([-2, -1, 0]) 27 | }) 28 | }) 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Yuhsak Inoue 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /badges/badge-lines.svg: -------------------------------------------------------------------------------- 1 | Coverage:lines: 100%Coverage:lines100% -------------------------------------------------------------------------------- /badges/badge-branches.svg: -------------------------------------------------------------------------------- 1 | Coverage:branches: 100%Coverage:branches100% -------------------------------------------------------------------------------- /badges/badge-functions.svg: -------------------------------------------------------------------------------- 1 | Coverage:functions: 100%Coverage:functions100% -------------------------------------------------------------------------------- /badges/badge-statements.svg: -------------------------------------------------------------------------------- 1 | Coverage:statements: 100%Coverage:statements100% -------------------------------------------------------------------------------- /src/predict/predict.ts: -------------------------------------------------------------------------------- 1 | import type { NgramFeature } from '../feature' 2 | import { threshold as t, model, Weight } from '../model' 3 | import { proba } from './proba' 4 | 5 | const { 6 | weight, 7 | config: { scale }, 8 | } = model 9 | 10 | export const probaPredictor = (weight: Weight, scale: number) => { 11 | const p = proba(weight, scale) 12 | 13 | return (features: NgramFeature[], threshold = 0.5) => { 14 | return features.reduce<{ value: number[]; distance: number }>( 15 | (acc, feature) => { 16 | const _p = p({ ...feature, distance: acc.distance }) 17 | const willBreak = _p > threshold 18 | const distance = willBreak ? 
0 : acc.distance + 1 19 | return { value: [...acc.value, _p], distance } 20 | }, 21 | { value: [], distance: 0 }, 22 | ).value 23 | } 24 | } 25 | 26 | export const predictProba = probaPredictor(weight, scale) 27 | 28 | export const predictor = (weight: Weight, scale: number) => { 29 | const predictProba = probaPredictor(weight, scale) 30 | 31 | return (features: NgramFeature[], threshold = t) => 32 | predictProba(features, threshold).map((p) => p > threshold) 33 | } 34 | 35 | export const predict = predictor(weight, scale) 36 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wakachigaki", 3 | "version": "1.3.2", 4 | "description": "Minimal japanese sentence tokenizer written in 100% pure TypeScript.", 5 | "author": { 6 | "name": "Yuhsak Inoue", 7 | "email": "yuhsak.inoue@gmail.com", 8 | "url": "https://github.com/yuhsak" 9 | }, 10 | "license": "MIT", 11 | "repository": { 12 | "type": "git", 13 | "url": "https://github.com/yuhsak/wakachigaki.git" 14 | }, 15 | "main": "lib/cjs/index.js", 16 | "types": "lib/types/index.d.ts", 17 | "module": "lib/esm/index.js", 18 | "files": [ 19 | "lib", 20 | "src", 21 | "!lib/**/*.tsbuildinfo", 22 | "tsconfig.json", 23 | "tsconfig.cjs.json" 24 | ], 25 | "scripts": { 26 | "clean:dist": "rimraf lib", 27 | "clean:test": "rimraf coverage", 28 | "clean": "run-p clean:*", 29 | "check:tsc": "tsc --noEmit", 30 | "check:prettier": "prettier --check .", 31 | "check": "run-s check:*", 32 | "test": "jest", 33 | "build:watch": "npm run build -- --watch", 34 | "build": "tsc -b tsconfig.json tsconfig.cjs.json", 35 | "prebuild": "run-p clean:dist", 36 | "start": "npm run build:watch", 37 | "prepublishOnly": "npm run build" 38 | }, 39 | "devDependencies": { 40 | "@swc/core": "^1.2.107", 41 | "@swc/jest": "^0.2.5", 42 | "@types/jest": "^27.0.2", 43 | "jest": "27.3.1", 44 | "jest-coverage-badges": "^1.1.2", 45 | 
"npm-run-all": "^4.1.5", 46 | "prettier": "^2.4.1", 47 | "prettier-plugin-md-nocjsp": "^1.2.0", 48 | "rimraf": "^3.0.2", 49 | "typescript": "^4.4.4" 50 | }, 51 | "sideEffects": false, 52 | "unpkg": "lib/esm/index.js" 53 | } 54 | -------------------------------------------------------------------------------- /src/feature/feature.ts: -------------------------------------------------------------------------------- 1 | import type { NgramFeature } from './types' 2 | import { ngram, range } from '../util' 3 | import { hash } from '../hash' 4 | import { getCharType } from './char' 5 | import { model } from '../model' 6 | 7 | const { 8 | config: { nBuckets, size, offset }, 9 | } = model 10 | 11 | const markers = [ 12 | 'B', 13 | 'D', 14 | 'E', 15 | 'F', 16 | 'G', 17 | 'I', 18 | 'J', 19 | 'L', 20 | 'M', 21 | 'P', 22 | 'Q', 23 | 'R', 24 | 'T', 25 | 'U', 26 | 'V', 27 | 'W', 28 | 'X', 29 | 'Y', 30 | 'Z', 31 | ] 32 | 33 | export const featurer = (nBuckets: number, size: number, offset: number) => { 34 | const prefix = markers.slice(0, offset) 35 | const suffix = markers.slice().reverse().slice(0, offset) 36 | const h = hash(nBuckets) 37 | 38 | return (text: string) => { 39 | const source = text.normalize() 40 | const chars = [...source] 41 | 42 | const ngramByChars = ngram([...prefix, ...source.toLowerCase(), ...suffix]) 43 | const ngramByTypes = ngram([ 44 | ...prefix, 45 | ...chars.map(getCharType), 46 | ...suffix, 47 | ]) 48 | 49 | return chars.map((char, i): NgramFeature => { 50 | const index = i + offset 51 | const ngramByCharsAt = ngramByChars(index) 52 | const ngramByTypesAt = ngramByTypes(index) 53 | 54 | return range(1, size + 1).reduce( 55 | (acc, s) => { 56 | return range(-1 * offset, offset + 1 + 1 - s).reduce( 57 | (acc, o) => { 58 | const _t = ngramByTypesAt(s, o) 59 | const _h = h(ngramByCharsAt(s, o)) 60 | return { 61 | ...acc, 62 | features: [ 63 | ...acc.features, 64 | { kind: 'type', size: s, offset: o, value: _t }, 65 | { kind: 'hash', size: s, offset: o, 
value: _h }, 66 | ], 67 | } 68 | }, 69 | acc, 70 | ) 71 | }, 72 | { char, features: [] }, 73 | ) 74 | }) 75 | } 76 | } 77 | 78 | export const features = featurer(nBuckets, size, offset) 79 | -------------------------------------------------------------------------------- /docs/image/pypi.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/hash/crc32.ts: -------------------------------------------------------------------------------- 1 | const xor = (a: number, b: number) => (a ^ b) >>> 0 2 | 3 | const and = (a: number, b: number) => (a & b) >>> 0 4 | 5 | const CRC32_TABLE = [ 6 | 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 7 | 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 8 | 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, 9 | 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 10 | 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 11 | 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 12 | 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, 13 | 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 14 | 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 15 | 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 16 | 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, 17 | 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, 18 | 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 19 | 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 20 | 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, 21 | 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, 22 | 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 
0x3dd895d7, 23 | 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 24 | 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 25 | 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, 26 | 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 27 | 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 28 | 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, 29 | 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 30 | 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, 31 | 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 32 | 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, 33 | 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 34 | 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 35 | 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 36 | 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, 37 | 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, 38 | 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, 39 | 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 40 | 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, 41 | 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, 42 | 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 43 | 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 44 | 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 45 | 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, 46 | 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 47 | 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 48 | 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, 49 | ] 50 
| 51 | export const crc32 = (buffer: ArrayBuffer | Uint8Array) => { 52 | let crc = 0xffffffff 53 | const view = new Uint8Array(buffer) 54 | for (let i = 0; i < view.byteLength; i++) { 55 | crc = xor(crc >>> 8, CRC32_TABLE[and(xor(crc, view[i]!), 0xff)]!) 56 | } 57 | return xor(crc, 0xffffffff) 58 | } 59 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 9 | wakachigaki | わずか6.2Kbの日本語分かち書きライブラリ 10 | 11 | 32 | 174 | 175 | 176 |
177 |

178 | wakachigaki 179 |

180 |
181 |

わずか6.2Kbの日本語分かち書きライブラリ

182 |
183 | 189 | 195 | 201 |
202 |
203 |
204 | 209 | 210 |
211 |
212 | 213 | 214 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # wakachigaki 2 | 3 | ![](./badges/badge-statements.svg) 4 | ![](./badges/badge-branches.svg) 5 | ![](./badges/badge-functions.svg) 6 | ![](./badges/badge-lines.svg) 7 | 8 | 6.2Kbの軽量日本語分かち書きライブラリ 9 | 10 | [動作確認用のデモサイト](https://yuhsak.github.io/wakachigaki/) 11 | 12 | Python版は[こちら](https://github.com/yuhsak/wakachigaki-py) 13 | 14 | ## 紹介 15 | 16 | `wakachigaki` は辞書を使わない軽量の日本語分かち書き用ライブラリです。 17 | 18 | ピュアなJavaScriptなのでNode.jsやDeno, ブラウザなど環境を問わず動作し、TypeScriptやES Module[^1]にも対応しています。 19 | 20 | 予め分かち書きされた大量の日本語テキストから作成した機械学習モデルを内包することで辞書不要の分かち書きを実現しています。 21 | 22 | 学習には[Wikipedia日本語版のダンプデータ](https://dumps.wikimedia.org/jawiki/)全量を用いました。[MeCab](https://taku910.github.io/mecab/) + [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) で得られる分かち書き結果を約90%の精度で再現することが出来ています。 23 | 24 | 単語境界の判定には文中に出現する文字の種類や並び順の情報のみを用いるようになっており、文字や単語単位で固有の情報を一切利用していないため未知語に非常に強いのが特徴です。 25 | 26 | 辞書を用いる [kuromoji.js](https://github.com/takuyaa/kuromoji.js/) などと異なり品詞の推定機能はありませんが、その分インストールも実行も軽量で環境を問わず動作します。 27 | 28 | [^1]: ブラウザ用にES Module形式のコードを配布していますが、パッケージ自体が厳密にNode.jsのNative ESMに対応しているわけではありません。Node.jsでは従来通りCommonJS形式のパッケージとして読み込まれます。TypeScriptが正式にNative ESMに対応した段階でDual Package化する予定です。 29 | 30 | ## 使い方 31 | 32 | ### インストール 33 | 34 | ```sh 35 | npm install wakachigaki 36 | ``` 37 | 38 | ### 分かち書きの実行 39 | 40 | ```ts 41 | import { tokenize } from 'wakachigaki' 42 | 43 | /** CommonJSの場合 */ 44 | // const { tokenize } = require('wakachigaki') 45 | 46 | tokenize('非常に効果的な機械学習モデル') 47 | // => [ '非常', 'に', '効果', '的', 'な', '機械学習', 'モデル' ] 48 | ``` 49 | 50 | ### 境界確率の取得 51 | 52 | 分かち書きされた結果を直接取得するだけでなく、テキスト内の各文字についてその文字の直後に単語境界がある確率を0~1の範囲の数値で取得することが出来ます。 53 | 54 | ```ts 55 | import { features, predictProba } from 'wakachigaki' 56 | 57 | // 特徴量を取得 58 | const feats = features('非常に効果的な機械学習モデル') 59 
| 60 | predictProba(feats) 61 | /* 62 | => [ 63 | 0.0327992634970502, // 非 64 | 0.9901071412275622, // 常 65 | 0.9742190417471894, // に 66 | 0.04298367736033199, // 効 67 | 0.7249201317135311, // 果 68 | 0.9920294555733393, // 的 69 | 0.904908994982585, // な 70 | 0.10174356598870479, // 機 71 | 0.3827245071932094, // 械 72 | 0.11608892206899486, // 学 73 | 0.6410674063348171, // 習 74 | 0.0045548383234342614, // モ 75 | 0.00006214363582036111, // デ 76 | 0.9720230891240956 // ル 77 | ] 78 | */ 79 | ``` 80 | 81 | `tokenize()` 関数は内部でこの数値が予め定義された閾値を超えているかどうかを判定の基準にしていて、閾値もまた `threshold` という変数でexportされています。 82 | 83 | ```ts 84 | import { features, predictProba, predict, threshold } from 'wakachigaki' 85 | 86 | const feats = features('非常に効果的な機械学習モデル') 87 | 88 | const probas = predictProba(feats) 89 | 90 | // probas.map(p => p > threshold) と同じ結果 91 | predict(feats) 92 | /* 93 | => [ 94 | false, // 非 95 | true, // 常 96 | true, // に 97 | false, // 効 98 | true, // 果 99 | true, // 的 100 | true, // な 101 | false, // 機 102 | false, // 械 103 | false, // 学 104 | true, // 習 105 | false, // モ 106 | false, // デ 107 | true // ル 108 | ] 109 | */ 110 | ``` 111 | 112 | ### ユーティリティ 113 | 114 | 内部で文字種の判定に利用している正規表現と判定用の関数も利用可能です。 115 | 116 | 正規表現の内容は [src/feature/regexp.ts](./src/feature/regexp.ts) を参照して下さい。 117 | 118 | ```ts 119 | import { 120 | regexp, 121 | isHiragana, 122 | isKatakana, 123 | isKanji, 124 | isNumeralKanji, 125 | isAlphabet, 126 | isNumeral, 127 | } from 'wakachigaki' 128 | 129 | /** 130 | * 与えられた文字列が文字種判定用の正規表現を満たすかどうかチェックする関数。入力は複数文字でもOK 131 | * 以下は全てtrueになる例 132 | **/ 133 | 134 | // ひらがな 135 | isHiragana('あ') 136 | 137 | // カタカナ 138 | isKatakana('カ') 139 | 140 | // 漢字 141 | isKanji('漢字') 142 | 143 | // 漢数字 144 | isNumeralKanji('一二三四五六七八九十百千万億兆') 145 | 146 | // アルファベット (半角, 全角を無視) 147 | isAlphabet('aa') 148 | 149 | // 数字 (半角, 全角を無視) 150 | isNumeral('99') 151 | ``` 152 | 153 | ### ブラウザ対応 154 | 155 | `wakachigaki` 自体では特にブラウザ用にビルドしたコードを配布していませんが、元々他パッケージへの依存がなくES Module形式に対応しているため 
[unpkg.com](https://unpkg.com) などのCDNを経由すればすぐに動作させることが出来ます。 156 | 157 | 下記のコードをhtmlに貼り付ければ多くのブラウザでそのまま動くはずです。 158 | 159 | ```html 160 | 165 | ``` 166 | 167 | ### Deno対応 168 | 169 | `wakachigaki` はNode.js固有のAPIを使用していないためDeno環境でも動作します。 170 | 171 | ブラウザ同様にCDNとしてunpkgを利用することも出来ますが、TypeScriptの型定義の配布に対応した [Skypack](https://www.skypack.dev) を利用するのがおすすめです。 172 | 173 | ```ts 174 | import { tokenize } from 'https://cdn.skypack.dev/wakachigaki@1.3.2?dts' 175 | 176 | console.log(tokenize('Denoで分かち書きのテスト')) 177 | // => ['Deno', 'で', '分かち', '書き', 'の', 'テスト'] 178 | ``` 179 | 180 | ## 精度の比較 181 | 182 | 下記の表はJS環境で利用可能な類似のライブラリと併せて精度の比較を行ったものです。 183 | 184 | 比較用のコーパスにはNHN Japan株式会社提供の[livedoor ニュースコーパス](https://www.rondhuit.com/download.html#ldcc)を利用しました。 185 | 186 | 各記事を行単位に分解し適宜URLのみのものを取り除くなど前処理をして得られた約10万の文章について、[MeCab](https://taku910.github.io/mecab/) + [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd)で行った分かち書き結果を正解として各数値を算出しました。 187 | 188 | 従来の手法と比べても遜色ない結果が得られていることがわかります。 189 | 190 | ただし`wakachigaki`がそもそも学習に`mecab-ipadic-NEologd`を用いているために新語や複合語に強くなっており、同列の条件での比較にはなっていない点に注意して下さい。あくまで参考程度の結果です。 191 | 192 | 特に高く見える適合率については、他ライブラリと比べて`判定した単語境界の数`が少なく`実際の単語境界の数`に近い数字になっていることからも`mecab-ipadic-NEologd`から学習した複合語の判定傾向の影響が強く出ていると考えられます。 193 | 194 | | ライブラリ | 単語境界の数 | 判定数 | 正答数 | 一致率 | 適合率 | 再現率 | F1スコア | 195 | | :--------------------------- | -----------: | ------: | ------: | -----: | -----: | -----: | -------: | 196 | | wakachigaki | 4611683 | 4589286 | 4234587 | 0.919 | 0.923 | 0.918 | 0.920 | 197 | | TinySegmenter | 4611683 | 5055596 | 4170853 | 0.824 | 0.825 | 0.904 | 0.863 | 198 | | kuromoji.js (デフォルト辞書) | 4611683 | 5015672 | 4312946 | 0.872 | 0.860 | 0.935 | 0.896 | 199 | 200 | **項目の意味** 201 | 202 | | 項目名 | 説明 | 203 | | :----------- | :---------------------------------------------------- | 204 | | 単語境界の数 | コーパス全体に対してMeCabが出力した単語境界の総数 | 205 | | 判定数 | ライブラリが出力した単語境界の総数 | 206 | | 正答数 | ライブラリが出力した単語境界のうち正解だったものの数 | 207 | | 一致率 | 非単語境界と判定したものも含む全体の一致率 (Accuracy)
| 208 | | 適合率 | 正答数 ÷ 判定数 (Precision) | 209 | | 再現率 | 正答数 ÷ 単語境界の数 (Recall) | 210 | | F1スコア | 適合率と再現率の調和平均 | 211 | 212 | ## 開発の動機 213 | 214 | JSでアプリケーションを開発していると検索やレコメンド機能の実装などで日本語の分かち書きを行いたい時があります。 215 | 216 | そんな時に `npm install` したらサクっと使えてブラウザやDenoでも動いてESM, TypeScriptにも対応しているものがあれば嬉しいと思ったのが直接の動機です。 217 | 218 | 特に最近はサーバとブラウザどちらでも動く処理を書く機会が多かったりサーバレス系のインフラの出番が増えて取り回しの良い軽量なライブラリが求められます。 219 | 220 | そこで `wakachigaki` は **動作に環境依存がないこと**、**バンドルサイズがごく軽量なこと**、**ES Module, TypeScriptに対応していること**を目指して開発されました。 221 | 222 | ## モデルの学習方法 223 | 224 | 機械学習モデルを利用した分かち書きの学習方法を検討している際 [TinySegmenter](http://chasen.org/~taku/software/TinySegmenter/) がほぼ同様の構成になっているのを発見し大いに参考にさせて頂きました。 225 | 226 | データの加工と学習にはPythonを使い、[Wikipedia日本語版のダンプデータ](https://dumps.wikimedia.org/jawiki/)を全量使ってモデルの学習を行っています。 227 | 228 | 教師データの分かち書きには [MeCab](https://taku910.github.io/mecab/) + [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) を利用しました。 229 | 230 | コーパス内の各文字に対して複数のパラメータでNgramを取得し、漢字、ひらがな、カタカナなど文字の種類と文字のハッシュ値から算出した文字グループを特徴量として抽出、その文字の直後が単語境界かどうかの二値分類を学習しています。 231 | 232 | TinySegmenterでも解説されていますが、クライアントに配布することも考えると学習結果として取り出すモデルのパラメータは出来る限り軽量にしたかったのでモデルはL1ノルム正則化ロジスティック回帰を採用し、確率的勾配降下法によるミニバッチ学習を行いました。 233 | 234 | 結果、モデルを表現するパラメータのJSONファイルはgzip後でわずか3KBという軽量サイズになっています。 235 | 236 | (学習用のコードもGitHubで公開する予定です) 237 | 238 | ## クレジット 239 | 240 | 機械学習モデルの構築にあたり非常に優秀な機械学習エンジニアの友人たちに多大な助力を頂きました。感謝します🙌 241 | 242 | [@mski-iksm (GitHub)](https://github.com/mski-iksm) 243 | 244 | [@Gashin_Learning (Twitter)](https://twitter.com/Gashin_Learning) 245 | -------------------------------------------------------------------------------- /src/model/model.ts: -------------------------------------------------------------------------------- 1 | import type { Model } from './types' 2 | 3 | export const threshold = 0.5 4 | 5 | export const model: Model = { 6 | version: 2, 7 | config: { nBuckets: 262144, size: 3, offset: 3, scale: 1000 }, 8 | weight: { 9 | type: { 10 | '1': { 11 | '-3': { C: 86, K: 39, A:
125, N: 103, B: -73, D: -1, E: -90 }, 12 | '-2': { N: 358, O: -75, D: -73, E: -1 }, 13 | '-1': { S: 651, C: 40, H: -508, O: -400, E: -73 }, 14 | '0': { C: 237, H: 2347, O: 1581 }, 15 | '1': { S: -30, C: -704, O: 2188, Z: 224 }, 16 | '2': { S: 0, C: 204, A: -63, O: -1, Y: 224 }, 17 | '3': { H: 44, N: 147, O: -10, Z: -72, X: 224 }, 18 | }, 19 | '2': { 20 | '-3': { CH: 10, HC: 78, KK: 165, BD: -73, DE: -1 }, 21 | '-2': { HC: 153, HO: -23, KC: 208, NC: -159, NN: 148, OH: 0, DE: -73 }, 22 | '-1': { 23 | SC: 26, 24 | CC: 853, 25 | CN: -311, 26 | CO: 750, 27 | HC: -17, 28 | HO: 1542, 29 | KK: 204, 30 | AA: 478, 31 | AO: -208, 32 | NC: 858, 33 | NN: -357, 34 | NO: -31, 35 | OA: -35, 36 | OO: 894, 37 | }, 38 | '0': { 39 | SC: -401, 40 | CS: -552, 41 | CC: -1815, 42 | CN: 171, 43 | CO: 720, 44 | HH: -3399, 45 | HN: 8421, 46 | KH: 9425, 47 | KK: -10891, 48 | AC: 805, 49 | AH: 327, 50 | AA: -3803, 51 | NC: -336, 52 | NN: -3235, 53 | OC: 1211, 54 | OH: 501, 55 | OO: -4040, 56 | }, 57 | '1': { CC: 321, CO: -160, KH: -13044, OC: 4, ZY: 224 }, 58 | '2': { HK: -1, KK: 8, YX: 224 }, 59 | }, 60 | '3': { 61 | '-3': { 62 | CCC: 316, 63 | CCH: -35, 64 | HHH: 292, 65 | HOC: 145, 66 | HOH: -175, 67 | KKC: 2, 68 | KKK: 255, 69 | NCN: -203, 70 | OCC: -62, 71 | BDE: -73, 72 | }, 73 | '-2': { 74 | SCC: -4, 75 | CCC: -345, 76 | CCH: 459, 77 | CCO: 155, 78 | CHH: 732, 79 | HCC: 358, 80 | HHH: 534, 81 | HHO: 420, 82 | HOH: -591, 83 | KHC: -22, 84 | KKK: 705, 85 | AAA: 72, 86 | NCC: -83, 87 | NCN: -107, 88 | OCC: 131, 89 | OCH: -163, 90 | OHH: -5, 91 | }, 92 | '-1': { 93 | SCC: 283, 94 | CCS: -345, 95 | CCH: 870, 96 | CCK: -1126, 97 | CCO: 247, 98 | CHC: -927, 99 | CHH: 669, 100 | CHN: 8835, 101 | HCC: -698, 102 | HCH: -1216, 103 | HCN: 725, 104 | HHC: 150, 105 | HHH: -66, 106 | HNN: -2984, 107 | KCC: -472, 108 | KKH: 11749, 109 | KKK: 908, 110 | KOK: -2409, 111 | AAA: -1543, 112 | AOA: -322, 113 | NCC: 746, 114 | NCH: 788, 115 | NCN: -486, 116 | NNN: -2, 117 | OCC: -217, 118 | OCH: -655, 119 | 
OCN: 388, 120 | OHC: 0, 121 | OHH: -412, 122 | OAA: -6, 123 | OOO: -61, 124 | }, 125 | '0': { 126 | SCC: -176, 127 | CCC: 285, 128 | CCH: -52, 129 | CCK: -87, 130 | CHH: -704, 131 | HCC: 1613, 132 | HHC: -55, 133 | KCC: -90, 134 | KCH: 84, 135 | KHH: 6955, 136 | KKH: -14071, 137 | KKK: 5059, 138 | KOK: -2315, 139 | AAA: -66, 140 | AOA: -1259, 141 | NCN: -43, 142 | NNC: -7229, 143 | NNN: -208, 144 | OCC: 67, 145 | }, 146 | '1': { 147 | CCC: -359, 148 | CCH: 235, 149 | CCO: 69, 150 | CHC: 34, 151 | CKK: -162, 152 | HHC: -211, 153 | HHH: 364, 154 | HHK: -2, 155 | HHO: 167, 156 | KHH: -6718, 157 | KKK: 784, 158 | NNC: -301, 159 | OCC: 11, 160 | ZYX: 224, 161 | }, 162 | }, 163 | }, 164 | hash: { 165 | '1': { 166 | '-3': { 167 | '5a92': -90, 168 | a098: -60, 169 | cf31: -73, 170 | '16a1d': 0, 171 | '1bf4a': 250, 172 | '2755a': 0, 173 | '28187': -192, 174 | '36a04': -1, 175 | '3900e': -11, 176 | '3a93d': -415, 177 | }, 178 | '-2': { 179 | '353b': -1, 180 | '5a92': -1, 181 | a098: -185, 182 | '16481': -239, 183 | '19034': -13, 184 | '25417': 41, 185 | '2585d': 0, 186 | '28187': -301, 187 | '36a04': -73, 188 | '3dc1f': 204, 189 | }, 190 | '-1': { 191 | '353b': -740, 192 | '3695': -65, 193 | '5a92': -73, 194 | '74df': 0, 195 | '83c4': 145, 196 | a098: -192, 197 | a429: 0, 198 | cdac: -423, 199 | '157b9': -109, 200 | '16481': -680, 201 | '19034': 21, 202 | '19c16': -400, 203 | '1b3f8': 2006, 204 | '1bf4a': -901, 205 | '206fe': 174, 206 | '2152a': -113, 207 | '2178a': -222, 208 | '23eba': -350, 209 | '25417': 396, 210 | '28229': 180, 211 | '2aef9': 114, 212 | '2af2e': -423, 213 | '2ecb3': 10, 214 | '2fb50': -19, 215 | '3024d': 0, 216 | '319cd': -299, 217 | '34449': 339, 218 | '34509': 4, 219 | '34630': 257, 220 | '3900e': 12, 221 | '394bf': 171, 222 | '3d03d': 487, 223 | '3d1ea': -576, 224 | '3d393': 1, 225 | '3dc1f': 619, 226 | '3f118': -736, 227 | }, 228 | '0': { 229 | '1667': -765, 230 | '183c': 4001, 231 | '353b': 438, 232 | '371d': -533, 233 | '542d': 32, 234 | '59d8': 
-49, 235 | '74df': -1159, 236 | '7771': 14, 237 | '7bf3': 68, 238 | '806a': -1226, 239 | '8764': 448, 240 | '8d9f': -103, 241 | a098: 1831, 242 | a429: 189, 243 | cdac: 4162, 244 | cf45: 1445, 245 | d0e4: 1, 246 | e0ab: 474, 247 | e17c: -414, 248 | e587: 0, 249 | '1271c': -116, 250 | '16481': 671, 251 | '184ff': -217, 252 | '19034': 161, 253 | '19c16': 11555, 254 | '1b3f8': -485, 255 | '1b7ee': 399, 256 | '1b8d3': 124, 257 | '1bb04': -1036, 258 | '1bf4a': 8956, 259 | '1d26f': 22, 260 | '1ecca': -277, 261 | '206fe': -150, 262 | '215f3': 86, 263 | '2178a': 1867, 264 | '2244d': -261, 265 | '238f4': 394, 266 | '25417': -1292, 267 | '26fbc': 23, 268 | '274e5': -679, 269 | '28187': 115, 270 | '28229': 1545, 271 | '2aef9': -3861, 272 | '2b3af': -82, 273 | '2ecb3': 368, 274 | '30603': 28, 275 | '328aa': 907, 276 | '33a9e': 947, 277 | '34449': -27, 278 | '345f6': 0, 279 | '34630': -168, 280 | '3817a': -244, 281 | '38e48': 36, 282 | '3900e': 773, 283 | '394bf': 468, 284 | '3a93d': 809, 285 | '3d1ea': 1438, 286 | '3dc1f': -2113, 287 | '3dd5f': 225, 288 | '3f118': -2566, 289 | '3fced': -1246, 290 | '3fd0d': 0, 291 | }, 292 | '1': { 293 | f20: -554, 294 | '1667': -1376, 295 | '183c': 557, 296 | '244f': -504, 297 | '353b': 1777, 298 | '3a80': -184, 299 | '5767': 224, 300 | '580f': -3866, 301 | '59d8': -1220, 302 | '76a6': -925, 303 | '7895': 0, 304 | '792a': 499, 305 | '83c4': -2879, 306 | '8764': 354, 307 | '8d9f': -1529, 308 | a098: 2740, 309 | a429: 956, 310 | bdce: -390, 311 | bede: 357, 312 | cdac: 360, 313 | cf45: 1708, 314 | e17c: -2210, 315 | e242: -6, 316 | '10597': -264, 317 | '10a1b': 850, 318 | '1249b': -384, 319 | '1271c': -18580, 320 | '157b9': -39, 321 | '15a8b': 741, 322 | '16481': 2401, 323 | '19034': 2069, 324 | '19c16': 1530, 325 | '19d61': 0, 326 | '1bb04': -2232, 327 | '1bd33': -2423, 328 | '1bf4a': 13687, 329 | '1e617': 112, 330 | '1fcd7': -4962, 331 | '20311': 1025, 332 | '20de5': -224, 333 | '2152a': -380, 334 | '215f3': -1190, 335 | '2178a': -43779, 336 
| '2244d': -113, 337 | '22a6f': 5, 338 | '238f4': -653, 339 | '23bca': -806, 340 | '23eba': -695, 341 | '24a51': -804, 342 | '25417': 1793, 343 | '274e5': 118, 344 | '27910': -1378, 345 | '28050': -2071, 346 | '28fdc': -1055, 347 | '29ab9': -142, 348 | '2af2e': 973, 349 | '2ecb3': -8687, 350 | '305ad': -614, 351 | '3255f': 830, 352 | '328aa': 2761, 353 | '33a9e': 427, 354 | '34449': -213, 355 | '34d65': -121, 356 | '364bb': 292, 357 | '3817a': 1244, 358 | '3833b': 23, 359 | '38d21': 0, 360 | '3900e': 2337, 361 | '39e55': -131, 362 | '3a65f': 2, 363 | '3a93d': -2185, 364 | '3b0fc': -1239, 365 | '3b22a': -9, 366 | '3bd09': 1256, 367 | '3d03d': 22, 368 | '3d1ea': 2051, 369 | '3d393': -711, 370 | '3d511': 0, 371 | '3dc1f': 398, 372 | '3f118': 832, 373 | '3fc75': 2, 374 | '3fced': 2525, 375 | '3fe94': -2285, 376 | }, 377 | '2': { 378 | f20: -145, 379 | '353b': 119, 380 | '59d8': 143, 381 | '74df': 244, 382 | '792a': -23, 383 | cdac: -177, 384 | e17c: 283, 385 | '106dd': 224, 386 | '16481': -348, 387 | '184ff': -8, 388 | '19034': 267, 389 | '1b7ee': -203, 390 | '1bb04': 501, 391 | '1bf4a': -145, 392 | '1e2f9': 163, 393 | '20de5': -211, 394 | '2178a': 197, 395 | '21806': -325, 396 | '219b9': 0, 397 | '23501': -398, 398 | '23eba': -911, 399 | '26458': 89, 400 | '2ecb3': -80, 401 | '328aa': -71, 402 | '33a9e': -205, 403 | '34449': 28, 404 | '3d1ea': -29, 405 | '3fced': 282, 406 | }, 407 | '3': { 408 | '183c': -564, 409 | '5767': -72, 410 | '792a': 6, 411 | cdac: 412, 412 | '1271c': 151, 413 | '16481': -31, 414 | '1b111': -4, 415 | '21806': 78, 416 | '2364b': 224, 417 | '23eba': -833, 418 | '2ecb3': 65, 419 | '3900e': -14, 420 | '3a93d': -495, 421 | '3dda0': 0, 422 | }, 423 | }, 424 | '2': { 425 | '-3': { 426 | '10ce0': 0, 427 | '17db9': 0, 428 | '1c4ea': 0, 429 | '1e501': -1322, 430 | '1fb90': 0, 431 | '2190f': 0, 432 | '22de1': -1, 433 | '33238': 0, 434 | '34c04': 1387, 435 | '356c6': 0, 436 | '3baf1': -73, 437 | }, 438 | '-2': { 439 | '21c8': 203, 440 | ad00: 0, 441 | 
f3ce: 57, 442 | '159c6': 0, 443 | '16fab': -91, 444 | '1d279': 0, 445 | '22de1': -81, 446 | '25424': 0, 447 | '28c6e': -339, 448 | '2c552': 0, 449 | '2d7c9': 0, 450 | '36264': -12, 451 | '369e1': 0, 452 | '39545': 0, 453 | }, 454 | '-1': { 455 | '21c8': -505, 456 | '7473': 873, 457 | a50f: 0, 458 | dd0c: 0, 459 | f3ce: 472, 460 | '13b7f': 0, 461 | '18738': -4, 462 | '197be': 627, 463 | '1c06d': 0, 464 | '1cf23': 493, 465 | '1e54a': 435, 466 | '21bac': 0, 467 | '24ca7': -456, 468 | '28008': -363, 469 | '2ae60': 475, 470 | '2fd84': 0, 471 | '30239': 0, 472 | '32806': 275, 473 | '36ef7': 0, 474 | '39334': 0, 475 | '3b5e9': 0, 476 | '3c196': 390, 477 | '3ca75': 0, 478 | '3cba2': 0, 479 | '3cdcc': -122, 480 | '3ef8f': -403, 481 | }, 482 | '0': { 483 | '167b': -58, 484 | '21c8': -4261, 485 | '34ae': 647, 486 | '508b': -2429, 487 | '7473': -480, 488 | bc2a: 0, 489 | c3af: -223, 490 | d793: 0, 491 | e98f: 0, 492 | ec66: -3884, 493 | ed42: 236, 494 | f3ce: -2708, 495 | '11b00': 3580, 496 | '11bd5': 0, 497 | '12122': -977, 498 | '1246c': 0, 499 | '129b8': 0, 500 | '13455': 0, 501 | '1440c': 455, 502 | '1482a': -1121, 503 | '1487f': -124, 504 | '161ed': -546, 505 | '164d9': 0, 506 | '16f5f': -29, 507 | '19215': -395, 508 | '197be': 650, 509 | '1b57b': -4388, 510 | '1c00f': 39, 511 | '1cf0a': 0, 512 | '1cf23': -4628, 513 | '1d62b': 1428, 514 | '1e34f': -2630, 515 | '1e4ca': 0, 516 | '21cd1': -3514, 517 | '2499e': 37, 518 | '289f7': -801, 519 | '28c6e': -110, 520 | '2a6a2': 0, 521 | '2ae60': 245, 522 | '2b1ef': -27, 523 | '2c570': 0, 524 | '2cbde': -896, 525 | '2d9d5': -1, 526 | '2ea4c': 0, 527 | '2fbe3': 0, 528 | '2ff65': 0, 529 | '30e87': -465, 530 | '32b7a': 0, 531 | '32ec3': 0, 532 | '3311b': -7386, 533 | '339e7': 151, 534 | '344d5': 0, 535 | '356fb': -593, 536 | '36264': -2124, 537 | '399bc': -3824, 538 | '3b99a': 0, 539 | '3c196': 440, 540 | '3cdcc': 0, 541 | '3ef0a': 0, 542 | }, 543 | '1': { 544 | '167b': -572, 545 | '3826': -3877, 546 | '4cad': 0, 547 | '7a9f': -539, 
548 | a153: 665, 549 | be59: 452, 550 | f3ce: 1553, 551 | faa4: 0, 552 | '10e08': 0, 553 | '11569': 381, 554 | '11941': 1574, 555 | '11b00': 616, 556 | '16efe': -1, 557 | '17e39': 0, 558 | '197be': 2983, 559 | '19d6d': 0, 560 | '1bab9': 60, 561 | '1cf23': 2859, 562 | '1f9cf': 0, 563 | '2068a': 0, 564 | '21cd1': 425, 565 | '24e71': 199, 566 | '2544f': 581, 567 | '2549e': 0, 568 | '289f7': 2461, 569 | '2ae60': -2323, 570 | '2b4e1': 3494, 571 | '2e033': 0, 572 | '31d2a': 2423, 573 | '3311b': 1154, 574 | '35463': 393, 575 | '388ed': 0, 576 | '399bc': 1085, 577 | '3b623': 0, 578 | '3c196': 481, 579 | '3c89c': 0, 580 | '3fdbd': 0, 581 | }, 582 | '2': { 583 | '2d24': 222, 584 | '1149d': 985, 585 | '17420': 0, 586 | '197be': 336, 587 | '19bd4': 0, 588 | '25627': 0, 589 | '2b1ef': -475, 590 | '2d94a': 0, 591 | '3e185': 401, 592 | }, 593 | }, 594 | '3': { 595 | '-3': { bfa2: -72 }, 596 | '-2': { '8dcb': -7, '38afb': -411 }, 597 | '-1': { 598 | '879': -493, 599 | bba3: -222, 600 | '1604f': -1, 601 | '18934': -1, 602 | '359f0': -1415, 603 | '3fb76': -1500, 604 | }, 605 | '0': { '10615': -418, '359f0': -2722, '3c742': 2648 }, 606 | '1': { '34daf': 221, '3526b': 427, '359f0': 0, '3ef72': 213 }, 607 | }, 608 | }, 609 | distance: -140, 610 | bias: 147, 611 | }, 612 | } 613 | --------------------------------------------------------------------------------