├── .gitattributes ├── logo.png ├── banner.png ├── .eslintignore ├── .gitignore ├── playground ├── src │ ├── style.css │ ├── main.js │ └── App.vue ├── .postcssrc.cjs ├── vite.config.js ├── tailwind.config.cjs ├── index.html └── package.json ├── .prettierrc ├── .prettierignore ├── .editorconfig ├── tests-integration ├── node │ ├── package.json │ └── index.js ├── deno │ └── index.ts ├── node-esm │ ├── index.js │ └── package.json └── run.sh ├── .npmignore ├── index.html ├── src ├── benchmark │ ├── tinyld.ts │ ├── tinyld_heavy.ts │ ├── tinyld_light.ts │ ├── languagedetect.ts │ ├── langdetect.ts │ ├── franc.ts │ ├── franc-all.ts │ ├── franc-min.ts │ ├── cld.ts │ └── bench.ts ├── clean │ └── index.ts ├── index.ts ├── index_heavy.ts ├── index_light.ts ├── train │ └── splitter.ts ├── tokenizer.ts ├── train.ts └── core.ts ├── .github └── workflows │ ├── main.yml │ └── playground.yml ├── docs ├── light.md ├── install.md ├── api.md ├── dev.md ├── cli.md ├── langs.md ├── algorithm.md ├── faq.md ├── benchmark.md ├── overall.svg └── language.svg ├── tsconfig.json ├── tests ├── light.js ├── locale.js ├── clean.js └── detect.js ├── bin ├── tinyld.js ├── tinyld-heavy.js └── tinyld-light.js ├── .eslintrc ├── license ├── utils ├── overall.js ├── index.js ├── exectime.js ├── language.js ├── length.js └── mkdown.js ├── data └── bench │ ├── langdetect.json │ ├── tinyld-light.json │ ├── tinyld.json │ ├── franc-all.json │ ├── tinyld-heavy.json │ ├── franc.json │ ├── cld.json │ ├── languagedetect.json │ └── franc-min.json ├── Readme.md └── package.json /.gitattributes: -------------------------------------------------------------------------------- 1 | package-lock.json binary 2 | yarn.lock binary 3 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komodojp/tinyld/HEAD/logo.png 
-------------------------------------------------------------------------------- /banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komodojp/tinyld/HEAD/banner.png -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | bin 3 | dist 4 | tests 5 | tests-integration 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | data/tmp 3 | data/udhr 4 | data/tatoeba.csv 5 | node_modules 6 | -------------------------------------------------------------------------------- /playground/src/style.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | -------------------------------------------------------------------------------- /playground/.postcssrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: { 3 | tailwindcss: {} 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "printWidth": 120, 3 | "trailingComma": "none", 4 | "singleQuote": true, 5 | "semi": false 6 | } 7 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | .quasar 2 | node_modules 3 | dist 4 | public 5 | coverage 6 | build 7 | SteamCI 8 | config 9 | *.log 10 | package.json 11 | package-lock.json 12 | tests-integration 13 | -------------------------------------------------------------------------------- 
/.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = space 6 | indent_size = 2 7 | end_of_line = lf 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | -------------------------------------------------------------------------------- /tests-integration/node/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "language-detect-node", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "author": "", 7 | "license": "ISC" 8 | } 9 | -------------------------------------------------------------------------------- /playground/vite.config.js: -------------------------------------------------------------------------------- 1 | import { defineConfig } from 'vite' 2 | import vue from '@vitejs/plugin-vue' 3 | 4 | // https://vitejs.dev/config/ 5 | export default defineConfig({ 6 | plugins: [vue()] 7 | }) 8 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | .git 2 | .github 3 | data 4 | docs 5 | src 6 | tests 7 | utils 8 | 9 | .editorconfig 10 | .eslintignore 11 | .eslintrc 12 | .prettierignore 13 | .prettierrc 14 | 15 | tsconfig.json 16 | .gitattributes 17 | -------------------------------------------------------------------------------- /tests-integration/deno/index.ts: -------------------------------------------------------------------------------- 1 | import { detect } from '../../dist/tinyld.normal.node.mjs' 2 | 3 | const language = detect('これは日本語です.') 4 | console.log(`Detect Language ${language}`) 5 | Deno.exit(language === 'ja' ? 
#!/bin/sh
# Integration smoke tests: run the detection script for each runtime
# (Deno, Node CommonJS, Node ESM) against the built bundles.

# Abort on the first failing command. Without this the script's exit status is
# only that of the last command, so a failing Deno or Node (CommonJS) check
# would go unnoticed.
set -e

# Resolve the runtime folders relative to this script, not the caller's cwd.
cd "$(dirname "$0")"

echo "> Check DENO"
deno run ./deno/index.ts

echo "> Check NODE"
node ./node/index.js

echo "> Check NODE ESM"
node ./node-esm/index.js
// Run the benchmark against the default tinyld `detect` and write the result
// to ./data/bench/tinyld.json as pretty-printed JSON.
;(async () => {
  const res = await benchmark(detect)
  // `recursive: true` creates missing parent folders and is a no-op when the
  // directory already exists, so no separate (racy) existence check is needed.
  fs.mkdirSync('./data/bench', { recursive: true })
  fs.writeFileSync('./data/bench/tinyld.json', JSON.stringify(res, null, 2))
})()
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Tinyld Playground 8 | 9 | 10 |
11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: [ develop ] 6 | pull_request: 7 | branches: [ develop ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Use Node.js 15 | uses: actions/setup-node@v2 16 | with: 17 | node-version: '18.x' 18 | - run: | 19 | yarn 20 | yarn build 21 | yarn test 22 | -------------------------------------------------------------------------------- /docs/light.md: -------------------------------------------------------------------------------- 1 | # **TinyLD** (Light Flavor, for web usage) 2 | 3 | The normal library can be a bit massive (mostly caused by the language profile database), which can be problematic for web usage. 4 | 5 | For this usage we also provide a lighter version (a tradeoff between disk size and accuracy) 6 | 7 | - import with: `import { detect } from 'tinyld/dist/tinyld.light.cjs'` 8 | - normal version ~900KB, light version is only ~100KB (~25KB with gzip) 9 | - only 24 languages supported 10 | - slightly less accurate, only ~95% 11 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ESNext", 4 | "module": "commonjs", 5 | "moduleResolution": "node", 6 | "esModuleInterop": true, 7 | "strict": true, 8 | "noImplicitAny": true, 9 | "noImplicitThis": true, 10 | "alwaysStrict": true, 11 | "strictBindCallApply": true, 12 | "strictNullChecks": true, 13 | "strictFunctionTypes": true, 14 | "strictPropertyInitialization": true, 15 | "sourceMap": false, 16 | "resolveJsonModule": true, 17 | "rootDir": "src", 18 | "outDir": "dist" 19 | }, 20 | "include": ["src/**/*"] 21 | } 22 | 
/**
 * Assert that the light build detects `val` as `locale`.
 *
 * When the top-ranked candidate is not the expected locale, detection is run
 * once more with `verbose: true` before the assertion (presumably so the
 * library prints diagnostics for the failing sample; confirm against the
 * library's verbose mode).
 *
 * @param {string} locale expected language code
 * @param {string} val text to run detection on
 */
function assertLocale(locale, val) {
  const res = light.detectAll(val)
  // Strict comparison: both sides are language-code strings, no coercion wanted.
  if (res.length > 0 && res[0].lang !== locale) light.detectAll(val, { verbose: true })
  assert.is(light.detect(val), locale, `is ${locale} : ${val}`)
}
/**
 * Adapter around the `langdetect` package for the benchmark harness.
 *
 * Returns the `lang` of the first candidate reported by `detect`, folding the
 * regional Chinese codes `zh-cn` / `zh-tw` into plain `zh`. Returns '' when
 * there is no candidate or the candidate carries no language.
 */
function langdetect(val: string): string {
  const candidates = detect(val)
  if (!candidates || !(candidates.length > 0)) return ''

  const top = candidates[0].lang || ''
  return top === 'zh-cn' || top === 'zh-tw' ? 'zh' : top
}
/**
 * CLI entry point: run `detectAll` on the text given on the command line and
 * print the result.
 *
 * Recognised arguments:
 *   --only=<a,b,...>   list of language codes passed to `detectAll` as `only`
 *   --verbose..., -v   passes `verbose: true` to `detectAll`
 * Every other argument is treated as part of the text; the parts are joined
 * with single spaces.
 */
function main() {
  const args = process.argv.slice(2)

  let onlyLangs = []
  let verbose = false
  const texts = []

  for (const arg of args) {
    if (arg.startsWith('--only=')) {
      // Drop empty entries so `--only=` or a stray comma does not end up as a
      // '' language code in the filter.
      onlyLangs = arg.slice('--only='.length).split(',').filter(Boolean)
      continue
    }

    // `-v` has to match exactly: a prefix match would swallow any word of the
    // text that merely starts with "-v" instead of passing it to detection.
    if (arg.startsWith('--verbose') || arg === '-v') {
      verbose = true
      continue
    }

    texts.push(arg)
  }

  const message = texts.join(' ')
  console.log(detectAll(message, { only: onlyLangs, verbose }))
}
/**
 * CLI entry point (light build): run `detectAll` on the text given on the
 * command line and print the result.
 *
 * Recognised arguments:
 *   --only=<a,b,...>   list of language codes passed to `detectAll` as `only`
 *   --verbose..., -v   passes `verbose: true` to `detectAll`
 * Every other argument is treated as part of the text; the parts are joined
 * with single spaces.
 */
function main() {
  const args = process.argv.slice(2)

  let onlyLangs = []
  let verbose = false
  const texts = []

  for (const arg of args) {
    if (arg.startsWith('--only=')) {
      // Drop empty entries so `--only=` or a stray comma does not end up as a
      // '' language code in the filter.
      onlyLangs = arg.slice('--only='.length).split(',').filter(Boolean)
      continue
    }

    // `-v` has to match exactly: a prefix match would swallow any word of the
    // text that merely starts with "-v" instead of passing it to detection.
    if (arg.startsWith('--verbose') || arg === '-v') {
      verbose = true
      continue
    }

    texts.push(arg)
  }

  const message = texts.join(' ')
  console.log(detectAll(message, { only: onlyLangs, verbose }))
}
/**
 * Adapter around `franc-all` for the benchmark harness.
 *
 * franc's 'und' result is reported as ''. Any other code is first translated
 * through `langMap` when it has an entry there, then converted with `toISO2`.
 * An empty/falsy code yields ''.
 */
function detect(val: string): string {
  const raw = franc(val)
  if (raw === 'und') return ''

  const code = raw in langMap ? langMap[raw] : raw
  if (!code) return ''
  return toISO2(code)
}
toISO2(res) : '' 19 | } 20 | 21 | ;(async () => { 22 | const res = await benchmark(detect) 23 | if (!fs.existsSync('./data/bench')) fs.mkdirSync('./data/bench') 24 | fs.writeFileSync('./data/bench/franc-min.json', JSON.stringify(res, null, 2)) 25 | })() 26 | -------------------------------------------------------------------------------- /playground/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tinyld-playground", 3 | "private": true, 4 | "version": "0.0.0", 5 | "type": "module", 6 | "scripts": { 7 | "dev": "vite", 8 | "build": "vite build", 9 | "preview": "vite preview" 10 | }, 11 | "dependencies": { 12 | "@fortawesome/fontawesome-svg-core": "^6.2.0", 13 | "@fortawesome/free-brands-svg-icons": "^6.2.0", 14 | "@fortawesome/free-regular-svg-icons": "^6.2.0", 15 | "@fortawesome/free-solid-svg-icons": "^6.2.0", 16 | "@fortawesome/vue-fontawesome": "^3.0.2", 17 | "@vueuse/core": "^9.5.0", 18 | "tailwindcss": "^3.2.3", 19 | "tinyld": "^1.3.1", 20 | "vue": "^3.2.41" 21 | }, 22 | "devDependencies": { 23 | "@vitejs/plugin-vue": "^3.2.0", 24 | "postcss": "^8.4.12", 25 | "vite": "^3.2.3" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "extends": [ 4 | "eslint:recommended" 5 | ], 6 | "env": { 7 | "node": true, 8 | "es2021": true 9 | }, 10 | "parserOptions": { 11 | "ecmaVersion": 12, 12 | "sourceType": "module" 13 | }, 14 | "overrides": [ 15 | { 16 | "files": ["**/*.ts", "**/*.tsx"], 17 | "parser": "@typescript-eslint/parser", 18 | "plugins": [ 19 | "@typescript-eslint" 20 | ], 21 | "extends": [ 22 | "eslint:recommended", 23 | "plugin:@typescript-eslint/eslint-recommended", 24 | "plugin:@typescript-eslint/recommended" 25 | ], 26 | "parserOptions": { 27 | "project": ["./tsconfig.json"] 28 | }, 29 | "rules": { 30 | 
/**
 * Adapter around `cld` for the benchmark harness.
 *
 * Resolves to the code of the first language reported by `cld.detect`,
 * translated through `langMap` when it has an entry there. Any error raised
 * while detecting or reading the result is deliberately ignored and reported
 * as '' (no language).
 */
async function detect(val: string) {
  let code = ''
  try {
    const { languages } = await cld.detect(val)
    const detected = languages[0].code
    code = detected in langMap ? langMap[detected] : detected
  } catch (err) {
    // best effort: a failed detection is reported as "no language"
  }
  return code
}
// Character classes used by the text clean-up helpers below.
const REGEXP_PUNCTUATIONS = /[,.。,、#%&/\\+*¡!¿?[\]!?;:…„“«»”"“_–—~]/gi
const REGEXP_NUMBERS = /[0-9]/g
const REGEXP_FULLWIDTH_NUMBERS = /[\uFF10-\uFF19]/g
const REGEXP_SPACES = /\s\s+/g
const REGEXP_APOSTROPHE = /’/gi
const REGEXP_NORMALIZE = /[\u0300-\u036f]/g

/** True for both primitive strings and `String` wrapper objects. */
export function isString(value: unknown): boolean {
  if (typeof value === 'string') return true
  return value instanceof String
}

/**
 * Prepare raw text for language detection.
 *
 * Steps, in order: lowercase, turn the typographic apostrophe into "'",
 * replace punctuation with spaces, map full-width digits onto ASCII digits,
 * remove ASCII digits, collapse runs of two or more whitespace characters
 * into one space, and trim both ends.
 */
export function cleanString(value: string): string {
  const lowered = value.toLowerCase()
  const plainApostrophes = lowered.replace(REGEXP_APOSTROPHE, "'")
  const withoutPunctuation = plainApostrophes.replace(REGEXP_PUNCTUATIONS, ' ')
  // U+FF10..U+FF19 sit exactly 0xFEE0 above '0'..'9'.
  const asciiDigits = withoutPunctuation.replace(REGEXP_FULLWIDTH_NUMBERS, (digit) =>
    String.fromCharCode(digit.charCodeAt(0) - 0xfee0)
  )
  const withoutDigits = asciiDigits.replace(REGEXP_NUMBERS, '')
  return withoutDigits.replace(REGEXP_SPACES, ' ').trim()
}

/** Strip combining diacritical marks (U+0300..U+036F) after NFD decomposition. */
export function normalize(value: string): string {
  const decomposed = value.normalize('NFD')
  return decomposed.replace(REGEXP_NORMALIZE, '')
}
| check('¿Dónde vives?', 'dónde vives') 15 | 16 | check('那是一张近照吗?', '那是一张近照吗') 17 | check('那就表示有問題...', '那就表示有問題') 18 | check('要变得完美,她就是少了一个缺点。', '要变得完美 她就是少了一个缺点') 19 | check( 20 | '"Daran habe ich nie gedacht", sagte der alte Mann. "Was sollen wir tun?"', 21 | 'daran habe ich nie gedacht sagte der alte mann was sollen wir tun' 22 | ) 23 | check( 24 | '„Wann wirst du zurückkommen?“ – „Das hängt ganz vom Wetter ab.“', 25 | 'wann wirst du zurückkommen das hängt ganz vom wetter ab' 26 | ) 27 | }) 28 | 29 | test.run() 30 | -------------------------------------------------------------------------------- /license: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Komodo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | ## NodeJS 4 | 5 | ```sh 6 | # for npm users 7 | npm install --save tinyld 8 | 9 | # for yarn users 10 | yarn add tinyld 11 | ``` 12 | 13 | Then usage 14 | 15 | ```ts 16 | const { detect } = require('tinyld') 17 | // or ESM 18 | import { detect } from 'tinyld' 19 | ``` 20 | 21 | ## Browser Usage (CDN) 22 | 23 | ```html 24 | 28 | ``` 29 | 30 | ## Deno (Pika CDN) 31 | 32 | ```ts 33 | import { detect } from 'https://cdn.skypack.dev/tinyld' 34 | ``` 35 | 36 | --- 37 | 38 | ## API 39 | 40 | ```js 41 | import { detect, detectAll } from 'tinyld' 42 | // or node: `const { detect } = require('tinyld')` 43 | 44 | // Detect 45 | detect('ceci est un text en francais.') // fr 46 | detect('これは日本語です.') // ja 47 | detect('and this is english.') // en 48 | 49 | // DetectAll 50 | detectAll('ceci est un text en francais.') 51 | // [ { lang: 'fr', accuracy: 0.5238 }, { lang: 'ro', accuracy: 0.3802 }, ... 
] 52 | ``` 53 | 54 | --- 55 | 56 | [More about the API Documentation](./api.md) 57 | -------------------------------------------------------------------------------- /tests/detect.js: -------------------------------------------------------------------------------- 1 | const { test } = require('uvu') 2 | const assert = require('uvu/assert') 3 | const { detect, detectAll } = require('../dist/tinyld.normal.node.js') 4 | 5 | function assertLocale(locale, val) { 6 | const res = detectAll(val) 7 | if (res.length > 0 && res[0].lang != locale) detectAll(val, { verbose: true }) 8 | assert.is(detect(val), locale, `is ${locale} : ${val}`) 9 | } 10 | 11 | test('Check input', () => { 12 | assert.is(detect(''), '') 13 | assert.is(detect(1), '') 14 | }) 15 | 16 | test('Detect French', () => { 17 | assertLocale('fr', 'Bonjour les gens') 18 | assertLocale('fr', 'Bonne après-midi') 19 | assertLocale('fr', 'Ceci est un texte en francais.') 20 | // assertLocale('fr', 'reste cool sac a merde') 21 | }) 22 | 23 | test('Detect Japanese', () => { 24 | assertLocale('ja', 'モリーンです。') 25 | assertLocale('ja', '本は面白いです') 26 | assertLocale('ja', 'これは日本語です.') 27 | }) 28 | 29 | test('Detect Korean', () => { 30 | assertLocale('ko', '저는 7년 동안 한국에서 살았어요') 31 | assertLocale('ko', '한국인') 32 | }) 33 | 34 | test('Detect English', () => { 35 | assertLocale('en', 'I’m still learning English, so please speak slowly.') 36 | assertLocale('en', 'I just started working here') 37 | assertLocale('en', 'Good morning') 38 | assertLocale('en', 'and this is english.') 39 | }) 40 | 41 | test.run() 42 | -------------------------------------------------------------------------------- /utils/overall.js: -------------------------------------------------------------------------------- 1 | const chartistSvg = require('chartist-svg') 2 | 3 | module.exports = (data) => { 4 | const libraries = Object.keys(data) 5 | 6 | var graph = { 7 | title: 'NodeJS Language Detection - Overall Accuracy', 8 | subtitle: ' (green: Success, 
orange: Unidentified, red: Error)', 9 | labels: libraries, 10 | series: [ 11 | Object.values(data).map((x) => x.stats.success_rate), 12 | Object.values(data).map((x) => x.stats.unindentified_rate), 13 | Object.values(data).map((x) => x.stats.error_rate) 14 | ] 15 | } 16 | 17 | var options = { 18 | options: { 19 | low: 30, 20 | high: 100, 21 | onlyInteger: true, 22 | width: 1200, 23 | height: 600, 24 | stackBars: true 25 | }, 26 | css: ` 27 | svg { background: #FFF; } 28 | 29 | .ct-series-a .ct-bar, .ct-series-a .ct-line, .ct-series-a .ct-point, .ct-series-a .ct-slice-donut { 30 | stroke: #468966; 31 | stroke-width: 40px !important; 32 | } 33 | .ct-series-b .ct-bar, .ct-series-b .ct-line, .ct-series-b .ct-point, .ct-series-b .ct-slice-donut { 34 | stroke: #FEC771; 35 | stroke-width: 40px !important; 36 | } 37 | .ct-series-c .ct-bar, .ct-series-c .ct-line, .ct-series-c .ct-point, .ct-series-c .ct-slice-donut { 38 | stroke: #EB7070; 39 | stroke-width: 40px !important; 40 | } 41 | ` 42 | } 43 | 44 | return chartistSvg('bar', graph, options) 45 | } 46 | -------------------------------------------------------------------------------- /.github/workflows/playground.yml: -------------------------------------------------------------------------------- 1 | # Simple workflow for deploying static content to GitHub Pages 2 | name: Deploy static content to Pages 3 | 4 | on: 5 | # Runs on pushes targeting the default branch 6 | push: 7 | branches: ["develop"] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 13 | permissions: 14 | contents: read 15 | pages: write 16 | id-token: write 17 | 18 | # Allow one concurrent deployment 19 | concurrency: 20 | group: "pages" 21 | cancel-in-progress: true 22 | 23 | jobs: 24 | # Single deploy job since we're just deploying 25 | deploy: 26 | environment: 27 | name: github-pages 28 | url: ${{ 
steps.deployment.outputs.page_url }} 29 | runs-on: ubuntu-latest 30 | steps: 31 | - name: Checkout 32 | uses: actions/checkout@v3 33 | - uses: actions/setup-node@v3 34 | with: 35 | node-version: 18 36 | cache: 'npm' 37 | - run: | 38 | cd playground 39 | yarn 40 | yarn build --base=/tinyld/ 41 | - name: Setup Pages 42 | uses: actions/configure-pages@v2 43 | - name: Upload artifact 44 | uses: actions/upload-pages-artifact@v1 45 | with: 46 | # Upload entire repository 47 | path: './playground/dist' 48 | - name: Deploy to GitHub Pages 49 | id: deployment 50 | uses: actions/deploy-pages@v1 51 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import { isString } from './clean' 2 | import { DetectOption, ILangCompressedProfiles, ILangProfiles, langFromId, parseDetectOption } from './core' 3 | import data from './profiles/normal.json' 4 | import { detectAllStats } from './tokenizer' 5 | 6 | const compressed = data as ILangCompressedProfiles 7 | const profiles: ILangProfiles = { 8 | uniques: Object.fromEntries( 9 | Object.entries(compressed.uniques).map((x) => { 10 | return [x[0], langFromId[parseInt(x[1].toString(), 36)]] 11 | }) 12 | ), 13 | multiples: Object.fromEntries( 14 | Object.entries(compressed.multiples).map((x) => { 15 | const entry = Object.fromEntries( 16 | x[1].match(/(.{1,4})/g)?.map((y) => { 17 | const [country, val] = y.match(/(.{1,2})/g) as string[] 18 | return [langFromId[parseInt(country, 36)], parseInt(val, 36)] 19 | }) || [] 20 | ) 21 | return [x[0], entry] 22 | }) 23 | ) 24 | } 25 | const uniqueKeys = new Set(Object.keys(data.uniques)) 26 | 27 | export function detect(text: string, opts?: Partial): string { 28 | const res = detectAll(text, opts) 29 | return res.length > 0 ? 
res[0].lang : '' 30 | } 31 | 32 | export function detectAll(text: string, opts?: Partial): { lang: string; accuracy: number }[] { 33 | const options = parseDetectOption(opts) 34 | if (!isString(text)) return [] 35 | 36 | return detectAllStats(text, options, profiles, uniqueKeys) 37 | } 38 | 39 | export { cleanString } from './clean' 40 | export { toISO2, toISO3, langName, langRegion, validateISO2, supportedLanguages } from './core' 41 | -------------------------------------------------------------------------------- /src/index_heavy.ts: -------------------------------------------------------------------------------- 1 | import { isString } from './clean' 2 | import { DetectOption, ILangCompressedProfiles, ILangProfiles, langFromId, parseDetectOption } from './core' 3 | import data from './profiles/heavy.json' 4 | import { detectAllStats } from './tokenizer' 5 | 6 | const compressed = data as ILangCompressedProfiles 7 | const profiles: ILangProfiles = { 8 | uniques: Object.fromEntries( 9 | Object.entries(compressed.uniques).map((x) => { 10 | return [x[0], langFromId[parseInt(x[1].toString(), 36)]] 11 | }) 12 | ), 13 | multiples: Object.fromEntries( 14 | Object.entries(compressed.multiples).map((x) => { 15 | const entry = Object.fromEntries( 16 | x[1].match(/(.{1,4})/g)?.map((y) => { 17 | const [country, val] = y.match(/(.{1,2})/g) as string[] 18 | return [langFromId[parseInt(country, 36)], parseInt(val, 36)] 19 | }) || [] 20 | ) 21 | return [x[0], entry] 22 | }) 23 | ) 24 | } 25 | const uniqueKeys = new Set(Object.keys(data.uniques)) 26 | 27 | export function detect(text: string, opts?: Partial): string { 28 | const res = detectAll(text, opts) 29 | return res.length > 0 ? 
res[0].lang : '' 30 | } 31 | 32 | export function detectAll(text: string, opts?: Partial): { lang: string; accuracy: number }[] { 33 | const options = parseDetectOption(opts) 34 | if (!isString(text)) return [] 35 | 36 | return detectAllStats(text, options, profiles, uniqueKeys) 37 | } 38 | 39 | export { cleanString } from './clean' 40 | export { toISO2, toISO3, langName, langRegion, validateISO2, supportedLanguages } from './core' 41 | -------------------------------------------------------------------------------- /src/index_light.ts: -------------------------------------------------------------------------------- 1 | import { isString } from './clean' 2 | import { DetectOption, ILangProfiles, parseDetectOption, ILangCompressedProfiles, langFromId } from './core' 3 | import data from './profiles/light.json' 4 | import { detectAllStats } from './tokenizer' 5 | 6 | const compressed = data as ILangCompressedProfiles 7 | const profiles: ILangProfiles = { 8 | uniques: Object.fromEntries( 9 | Object.entries(compressed.uniques).map((x) => { 10 | return [x[0], langFromId[parseInt(x[1].toString(), 36)]] 11 | }) 12 | ), 13 | multiples: Object.fromEntries( 14 | Object.entries(compressed.multiples).map((x) => { 15 | const entry = Object.fromEntries( 16 | x[1].match(/(.{1,3})/g)?.map((y) => { 17 | const country = y.slice(0, 1) 18 | const val = y.slice(1) 19 | return [langFromId[parseInt(country, 36)], parseInt(val, 36)] 20 | }) || [] 21 | ) 22 | return [x[0], entry] 23 | }) 24 | ) 25 | } 26 | const uniqueKeys = new Set(Object.keys(data.uniques)) 27 | 28 | export function detect(text: string, opts?: Partial): string { 29 | const res = detectAll(text, opts) 30 | return res.length > 0 ? 
res[0].lang : '' 31 | } 32 | 33 | export function detectAll(text: string, opts?: Partial): { lang: string; accuracy: number }[] { 34 | const options = parseDetectOption(opts) 35 | if (!isString(text)) return [] 36 | 37 | return detectAllStats(text, options, profiles, uniqueKeys) 38 | } 39 | 40 | export { cleanString } from './clean' 41 | export { toISO2, toISO3, langName, langRegion, validateISO2, supportedLanguages } from './core' 42 | -------------------------------------------------------------------------------- /utils/index.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const graphOverall = require('./overall') 3 | const graphLanguage = require('./language') 4 | const graphLength = require('./length') 5 | const graphExecution = require('./exectime') 6 | 7 | function getJSON(filepath) { 8 | return JSON.parse(fs.readFileSync(filepath)) 9 | } 10 | 11 | ;(async () => { 12 | const data = { 13 | 'tinyld-heavy': getJSON('./data/bench/tinyld-heavy.json'), 14 | tinyld: getJSON('./data/bench/tinyld.json'), 15 | 'tinyld-light': getJSON('./data/bench/tinyld-light.json'), 16 | langdetect: getJSON('./data/bench/langdetect.json'), 17 | cld: getJSON('./data/bench/cld.json'), 18 | franc: getJSON('./data/bench/franc.json'), 19 | 'franc-min': getJSON('./data/bench/franc-min.json'), 20 | 'franc-all': getJSON('./data/bench/franc-all.json'), 21 | languagedetect: getJSON('./data/bench/languagedetect.json') 22 | } 23 | 24 | const overall = await graphOverall(data) 25 | fs.writeFileSync('./docs/overall.svg', overall.replace(' { 12 | const wordRank = new Map() 13 | const fileStream = fs.createReadStream(fileIn) 14 | const rl = readline.createInterface({ 15 | input: fileStream, 16 | crlfDelay: Infinity 17 | }) 18 | 19 | for await (const line of rl) { 20 | const words = wordTokenizer(cleanString(line)) 21 | words.forEach((x) => { 22 | if (!x) return 23 | wordRank.set(x, (wordRank.get(x) || 0) + 1) 24 | }) 25 | } 26 | 
27 | const values = [...wordRank.entries()] 28 | 29 | return values.map((x) => { 30 | return { word: x[0], count: x[1] } as FreqWord 31 | }) 32 | } 33 | 34 | export async function processFrequencyLineByLine(fileIn: string): Promise { 35 | const wordRank = new Map() 36 | 37 | const fileStream = fs.createReadStream(fileIn) 38 | const rl = readline.createInterface({ 39 | input: fileStream, 40 | crlfDelay: Infinity 41 | }) 42 | 43 | for await (const line of rl) { 44 | const [text, count] = line.split(' ') 45 | const str = cleanString(text) 46 | if (!str || str.startsWith("'")) continue 47 | wordRank.set(str, parseInt(count)) 48 | } 49 | 50 | const values = [...wordRank.entries()] 51 | return values.map((x) => { 52 | return { word: x[0], count: x[1] } as FreqWord 53 | }) 54 | } 55 | -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | # API 2 | 3 | ## Language Detection 4 | 5 | ### Detect 6 | 7 | ```js 8 | // basic detection 9 | detect('this is the text') // => 'en' 10 | 11 | // verbose mode 12 | detect('this is the text', { verbose: true }) // => 'en' 13 | 14 | // only in a subset of languages 15 | detect('this is the text', { only: ['fr', 'en', 'nl'] }) // => 'en' 16 | ``` 17 | 18 | ### Detect All 19 | 20 | ```js 21 | detectAll('this is the text') 22 | /* 23 | [ 24 | { lang: 'en', accuracy: 0.958076923076923 }, 25 | { lang: 'nl', accuracy: 0.15384615384615385 }, 26 | { lang: 'ga', accuracy: 0.14555384615384614 }, 27 | { lang: 'lt', accuracy: 0.03804615384615384 }, 28 | { lang: 'vo', accuracy: 0.03303076923076923 }, 29 | { lang: 'hu', accuracy: 0.022338461538461536 }, 30 | { lang: 'la', accuracy: 0.006738461538461531 }, 31 | { lang: 'fr', accuracy: 0.0025153846153846203 } 32 | ] 33 | */ 34 | ``` 35 | 36 | --- 37 | 38 | ## Language Code Conversion 39 | 40 | This library also exposes some language code conversion functions, to switch between iso2
(`ISO 639-1`) and iso3 (`ISO 639-3`) and stay compatible with a range of APIs and tools. 41 | 42 | ```js 43 | import { toISO2, toISO3 } from 'tinyld' 44 | 45 | toISO2('jpn') // ja 46 | toISO3('jp') // jpn 47 | toISO3('ja') // jpn 48 | ``` 49 | 50 | It also contains some aliases for deprecated or commonly mistaken codes (`jp` is an alias of `ja`, `cn` is an alias of `zh`, ...) 51 | 52 | --- 53 | 54 | ## Language Helpers 55 | 56 | ```js 57 | import { supportedLanguages, langName, langRegion } from 'tinyld' 58 | 59 | // all supported languages (ISO3 format) 60 | supportedLanguages // ['jpn', 'cmn', ...] 61 | 62 | // and a few utils about langs 63 | langName('jpn') // Japanese 64 | langRegion('jpn') // east-asia 65 | ``` 66 | -------------------------------------------------------------------------------- /utils/exectime.js: -------------------------------------------------------------------------------- 1 | const chartistSvg = require('chartist-svg') 2 | 3 | module.exports = (data) => { 4 | const length = Object.keys(data.tinyld.size) 5 | var graph = { 6 | title: 'NodeJS Language Detection - Execution Time', 7 | subtitle: 'in milliseconds (lower is better)', 8 | labels: length, 9 | series: ['tinyld', 'langdetect', 'cld', 'franc'].map((lib) => { 10 | return length.map((len) => { 11 | return data[lib].size[len].execution_time 12 | }) 13 | }) 14 | } 15 | 16 | var options = { 17 | options: { 18 | low: 0, 19 | seriesBarDistance: 16, 20 | onlyInteger: true, 21 | // reverseData: true, 22 | // horizontalBars: true, 23 | width: 1200, 24 | height: 600 25 | }, 26 | css: ` 27 | svg { background: #FFF; } 28 | 29 | .ct-series-a .ct-bar, .ct-series-a .ct-line, .ct-series-a .ct-point, .ct-series-a .ct-slice-donut { 30 | stroke: #468966; 31 | stroke-width: 16px !important; 32 | } 33 | .ct-series-b .ct-bar, .ct-series-b .ct-line, .ct-series-b .ct-point, .ct-series-b .ct-slice-donut { 34 | stroke: #98BAE7; 35 | stroke-width: 8px !important; 36 | } 37 | .ct-series-c .ct-bar, .ct-series-c .ct-line,
.ct-series-c .ct-point, .ct-series-c .ct-slice-donut { 38 | stroke: #FEC771; 39 | stroke-width: 8px !important; 40 | } 41 | .ct-series-d .ct-bar, .ct-series-d .ct-line, .ct-series-d .ct-point, .ct-series-d .ct-slice-donut { 42 | stroke: #F38181; 43 | stroke-width: 8px !important; 44 | } 45 | .ct-series-e .ct-bar, .ct-series-e .ct-line, .ct-series-e .ct-point, .ct-series-e .ct-slice-donut { 46 | stroke: #D47AE8; 47 | stroke-width: 8px !important; 48 | } 49 | ` 50 | } 51 | 52 | return chartistSvg('line', graph, options) 53 | } 54 | -------------------------------------------------------------------------------- /utils/language.js: -------------------------------------------------------------------------------- 1 | const chartistSvg = require('chartist-svg') 2 | 3 | module.exports = (data, langs) => { 4 | var graph = { 5 | title: 'NodeJS Language Detection - Per Language', 6 | subtitle: 'Tinyld vs Langdetect vs Cld vs Franc', 7 | labels: langs.map((x) => x.toUpperCase()), 8 | series: ['tinyld', 'langdetect', 'cld', 'franc'].map((lib) => { 9 | return langs.map((lang) => { 10 | return data[lib].languages[lang] 11 | }) 12 | }) 13 | } 14 | 15 | var options = { 16 | options: { 17 | high: 100, 18 | low: 30, 19 | seriesBarDistance: 16, 20 | onlyInteger: true, 21 | // reverseData: true, 22 | // horizontalBars: true, 23 | width: 1200, 24 | height: 600 25 | }, 26 | css: ` 27 | svg { background: #FFF; } 28 | 29 | .ct-series-a .ct-bar, .ct-series-a .ct-line, .ct-series-a .ct-point, .ct-series-a .ct-slice-donut { 30 | stroke: #468966; 31 | stroke-width: 16px !important; 32 | } 33 | .ct-series-b .ct-bar, .ct-series-b .ct-line, .ct-series-b .ct-point, .ct-series-b .ct-slice-donut { 34 | stroke: #98BAE7; 35 | stroke-width: 16px !important; 36 | } 37 | .ct-series-c .ct-bar, .ct-series-c .ct-line, .ct-series-c .ct-point, .ct-series-c .ct-slice-donut { 38 | stroke: #FEC771; 39 | stroke-width: 16px !important; 40 | } 41 | .ct-series-d .ct-bar, .ct-series-d .ct-line, .ct-series-d 
.ct-point, .ct-series-d .ct-slice-donut { 42 | stroke: #F38181; 43 | stroke-width: 16px !important; 44 | } 45 | .ct-series-e .ct-bar, .ct-series-e .ct-line, .ct-series-e .ct-point, .ct-series-e .ct-slice-donut { 46 | stroke: #D47AE8; 47 | stroke-width: 16px !important; 48 | } 49 | ` 50 | } 51 | 52 | return chartistSvg('bar', graph, options) 53 | } 54 | -------------------------------------------------------------------------------- /utils/length.js: -------------------------------------------------------------------------------- 1 | const chartistSvg = require('chartist-svg') 2 | 3 | module.exports = (data) => { 4 | const length = Object.keys(data.tinyld.size) 5 | var graph = { 6 | title: 'NodeJS Language Detection - Text Length', 7 | subtitle: 'Accuracy in % (higher is better)', 8 | labels: length, 9 | series: ['tinyld', 'langdetect', 'cld', 'franc'].map((lib) => { 10 | return length.map((len) => { 11 | return data[lib].size[len].success_rate 12 | }) 13 | }) 14 | } 15 | 16 | var options = { 17 | options: { 18 | high: 100, 19 | low: 10, 20 | seriesBarDistance: 16, 21 | onlyInteger: true, 22 | // reverseData: true, 23 | // horizontalBars: true, 24 | width: 1200, 25 | height: 600 26 | }, 27 | css: ` 28 | svg { background: #FFF; } 29 | 30 | .ct-series-a .ct-bar, .ct-series-a .ct-line, .ct-series-a .ct-point, .ct-series-a .ct-slice-donut { 31 | stroke: #468966; 32 | stroke-width: 16px !important; 33 | } 34 | .ct-series-b .ct-bar, .ct-series-b .ct-line, .ct-series-b .ct-point, .ct-series-b .ct-slice-donut { 35 | stroke: #98BAE7; 36 | stroke-width: 8px !important; 37 | } 38 | .ct-series-c .ct-bar, .ct-series-c .ct-line, .ct-series-c .ct-point, .ct-series-c .ct-slice-donut { 39 | stroke: #FEC771; 40 | stroke-width: 8px !important; 41 | } 42 | .ct-series-d .ct-bar, .ct-series-d .ct-line, .ct-series-d .ct-point, .ct-series-d .ct-slice-donut { 43 | stroke: #F38181; 44 | stroke-width: 8px !important; 45 | } 46 | .ct-series-e .ct-bar, .ct-series-e .ct-line, .ct-series-e 
.ct-point, .ct-series-e .ct-slice-donut { 47 | stroke: #D47AE8; 48 | stroke-width: 8px !important; 49 | } 50 | ` 51 | } 52 | 53 | return chartistSvg('line', graph, options) 54 | } 55 | -------------------------------------------------------------------------------- /docs/dev.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | ## Commands 4 | 5 | ```sh 6 | # Install 7 | yarn 8 | 9 | # Build 10 | yarn build 11 | 12 | # Test 13 | yarn test 14 | 15 | # Lint / Auto-fix code style problems 16 | yarn lint 17 | ``` 18 | 19 | --- 20 | 21 | ## Install issues 22 | 23 | For the moment the library has a lot of dev-dependencies purely for the benchmark process. 24 | Some of those libraries need to compile native code, which can be problematic (gcc, gyp, python, ...) 25 | 26 | If you run into those issues, one of the easiest solutions is to remove the problematic dependencies from `package.json` then try to install again. 27 | 28 | [like here](https://github.com/komodojp/tinyld/issues/10#issuecomment-1019085476) 29 | 30 | It will only cause issues with `yarn bench`, but everything else should still work normally 31 | 32 | --- 33 | 34 | ## Optional 35 | 36 | ### 1. Generate profiles (`yarn train`) 37 | 38 | This step requires a lot of data and time, so it's optional and the results are stored directly in git.
39 | 40 | This will analyse a lot of text in different languages and build statistics to be able to identify the best features for each language 41 | 42 | To be able to train the model, you will first need to have the datasets locally 43 | 44 | ``` 45 | Download Datasets 46 | - Download the [Tatoeba sentence export](https://downloads.tatoeba.org/exports/sentences.tar.bz2) 47 | - Extract in `data/tatoeba.csv` 48 | - Download the [UDHR](https://unicode.org/udhr/assemblies/udhr_txt.zip) 49 | - Extract in `data/udhr/` 50 | 51 | Run yarn train 52 | - For each language, it will build statistics for words and n-grams 53 | - This goes through a massive amount of data and will take time, prepare a few coffees 54 | 55 | When your profile files are generated, you can run `yarn build` and you will have a build with this new data 56 | ``` 57 | 58 | ### 2. Generate benchmark data (`yarn bench`) 59 | 60 | This step requires a bit of time: it will run a lot of different tests for a set of libraries to generate the benchmark page and diagrams.
61 | -------------------------------------------------------------------------------- /data/bench/langdetect.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": { 3 | "10": { 4 | "success_rate": 60.6875, 5 | "error_rate": 39.3125, 6 | "unindentified_rate": 0, 7 | "execution_time": 0.5959 8 | }, 9 | "16": { 10 | "success_rate": 77.4375, 11 | "error_rate": 22.5625, 12 | "unindentified_rate": 0, 13 | "execution_time": 0.4825 14 | }, 15 | "24": { 16 | "success_rate": 89, 17 | "error_rate": 11, 18 | "unindentified_rate": 0, 19 | "execution_time": 0.4178 20 | }, 21 | "36": { 22 | "success_rate": 94.2813, 23 | "error_rate": 5.7188, 24 | "unindentified_rate": 0, 25 | "execution_time": 0.3856 26 | }, 27 | "48": { 28 | "success_rate": 97.5938, 29 | "error_rate": 2.4063, 30 | "unindentified_rate": 0, 31 | "execution_time": 0.3764 32 | }, 33 | "64": { 34 | "success_rate": 99, 35 | "error_rate": 1, 36 | "unindentified_rate": 0, 37 | "execution_time": 0.3808 38 | }, 39 | "128": { 40 | "success_rate": 99.7813, 41 | "error_rate": 0.2188, 42 | "unindentified_rate": 0, 43 | "execution_time": 0.479 44 | }, 45 | "256": { 46 | "success_rate": 100, 47 | "error_rate": 0, 48 | "unindentified_rate": 0, 49 | "execution_time": 0.762 50 | }, 51 | "512": { 52 | "success_rate": 100, 53 | "error_rate": 0, 54 | "unindentified_rate": 0, 55 | "execution_time": 1.3573 56 | }, 57 | "1024": { 58 | "success_rate": 100, 59 | "error_rate": 0, 60 | "unindentified_rate": 0, 61 | "execution_time": 2.5523 62 | } 63 | }, 64 | "stats": { 65 | "min": 87.36, 66 | "max": 100, 67 | "success_rate": 95.675, 68 | "error_rate": 4.325, 69 | "unindentified_rate": 0, 70 | "execution_time": 0.3647 71 | }, 72 | "languages": { 73 | "heb": 100, 74 | "jpn": 99.99, 75 | "kor": 99.7267, 76 | "ara": 99.63, 77 | "cmn": 98.2, 78 | "fin": 97.99, 79 | "tur": 97.86, 80 | "deu": 97.8, 81 | "fra": 96.16, 82 | "rus": 95.52, 83 | "eng": 95.35, 84 | "ita": 94.03, 85 | "hin": 92.35, 86 | 
"por": 91.1, 87 | "spa": 89.71, 88 | "nld": 87.36 89 | } 90 | } -------------------------------------------------------------------------------- /data/bench/tinyld-light.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": { 3 | "10": { 4 | "success_rate": 73, 5 | "error_rate": 15.3125, 6 | "unindentified_rate": 11.6875, 7 | "execution_time": 0.0584 8 | }, 9 | "16": { 10 | "success_rate": 88.8438, 11 | "error_rate": 7.5625, 12 | "unindentified_rate": 3.5937, 13 | "execution_time": 0.0635 14 | }, 15 | "24": { 16 | "success_rate": 95.4688, 17 | "error_rate": 4.125, 18 | "unindentified_rate": 0.4063, 19 | "execution_time": 0.0672 20 | }, 21 | "36": { 22 | "success_rate": 97.6563, 23 | "error_rate": 2.25, 24 | "unindentified_rate": 0.0938, 25 | "execution_time": 0.087 26 | }, 27 | "48": { 28 | "success_rate": 99.0938, 29 | "error_rate": 0.875, 30 | "unindentified_rate": 0.0313, 31 | "execution_time": 0.1128 32 | }, 33 | "64": { 34 | "success_rate": 99.5625, 35 | "error_rate": 0.4375, 36 | "unindentified_rate": 0, 37 | "execution_time": 0.1362 38 | }, 39 | "128": { 40 | "success_rate": 99.9375, 41 | "error_rate": 0.0625, 42 | "unindentified_rate": 0, 43 | "execution_time": 0.2514 44 | }, 45 | "256": { 46 | "success_rate": 100, 47 | "error_rate": 0, 48 | "unindentified_rate": 0, 49 | "execution_time": 0.466 50 | }, 51 | "512": { 52 | "success_rate": 100, 53 | "error_rate": 0, 54 | "unindentified_rate": 0, 55 | "execution_time": 0.5923 56 | }, 57 | "1024": { 58 | "success_rate": 100, 59 | "error_rate": 0, 60 | "unindentified_rate": 0, 61 | "execution_time": 0.7342 62 | } 63 | }, 64 | "stats": { 65 | "min": 93.61, 66 | "max": 100, 67 | "success_rate": 97.8778, 68 | "error_rate": 1.9842, 69 | "unindentified_rate": 0.138, 70 | "execution_time": 0.0947 71 | }, 72 | "languages": { 73 | "kor": 100, 74 | "hin": 100, 75 | "rus": 100, 76 | "heb": 100, 77 | "ara": 100, 78 | "jpn": 99.99, 79 | "cmn": 99.31, 80 | "tur": 98.38, 81 | 
"fin": 98.34, 82 | "nld": 98, 83 | "fra": 97.37, 84 | "deu": 97.04, 85 | "eng": 96.04, 86 | "por": 95.01, 87 | "ita": 93.99, 88 | "spa": 93.61 89 | } 90 | } -------------------------------------------------------------------------------- /data/bench/tinyld.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": { 3 | "10": { 4 | "success_rate": 69.9375, 5 | "error_rate": 17.6563, 6 | "unindentified_rate": 12.4063, 7 | "execution_time": 0.0828 8 | }, 9 | "16": { 10 | "success_rate": 90.2188, 11 | "error_rate": 7.75, 12 | "unindentified_rate": 2.0313, 13 | "execution_time": 0.0973 14 | }, 15 | "24": { 16 | "success_rate": 96.1875, 17 | "error_rate": 3.3438, 18 | "unindentified_rate": 0.4688, 19 | "execution_time": 0.0988 20 | }, 21 | "36": { 22 | "success_rate": 98.3438, 23 | "error_rate": 1.5938, 24 | "unindentified_rate": 0.0625, 25 | "execution_time": 0.1094 26 | }, 27 | "48": { 28 | "success_rate": 99.5, 29 | "error_rate": 0.5, 30 | "unindentified_rate": 0, 31 | "execution_time": 0.1448 32 | }, 33 | "64": { 34 | "success_rate": 99.6875, 35 | "error_rate": 0.3125, 36 | "unindentified_rate": 0, 37 | "execution_time": 0.1822 38 | }, 39 | "128": { 40 | "success_rate": 99.9375, 41 | "error_rate": 0.0625, 42 | "unindentified_rate": 0, 43 | "execution_time": 0.2983 44 | }, 45 | "256": { 46 | "success_rate": 99.9688, 47 | "error_rate": 0.0313, 48 | "unindentified_rate": 0, 49 | "execution_time": 0.5834 50 | }, 51 | "512": { 52 | "success_rate": 100, 53 | "error_rate": 0, 54 | "unindentified_rate": 0, 55 | "execution_time": 0.697 56 | }, 57 | "1024": { 58 | "success_rate": 100, 59 | "error_rate": 0, 60 | "unindentified_rate": 0, 61 | "execution_time": 0.8079 62 | } 63 | }, 64 | "stats": { 65 | "min": 96.13, 66 | "max": 100, 67 | "success_rate": 98.5231, 68 | "error_rate": 1.3712, 69 | "unindentified_rate": 0.1057, 70 | "execution_time": 0.1191 71 | }, 72 | "languages": { 73 | "kor": 100, 74 | "hin": 100, 75 | "jpn": 99.99, 
76 | "heb": 99.73, 77 | "ara": 99.59, 78 | "cmn": 99.57, 79 | "rus": 99.27, 80 | "deu": 99.25, 81 | "fra": 99, 82 | "fin": 98.69, 83 | "eng": 98.11, 84 | "nld": 97.66, 85 | "tur": 97.42, 86 | "ita": 96.52, 87 | "por": 96.16, 88 | "spa": 96.13 89 | } 90 | } -------------------------------------------------------------------------------- /data/bench/franc-all.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": { 3 | "10": { 4 | "success_rate": 0.125, 5 | "error_rate": 0.0625, 6 | "unindentified_rate": 99.8125, 7 | "execution_time": 0.0017 8 | }, 9 | "16": { 10 | "success_rate": 41.0938, 11 | "error_rate": 58.9063, 12 | "unindentified_rate": 0, 13 | "execution_time": 0.264 14 | }, 15 | "24": { 16 | "success_rate": 50, 17 | "error_rate": 50, 18 | "unindentified_rate": 0, 19 | "execution_time": 0.3342 20 | }, 21 | "36": { 22 | "success_rate": 60.5938, 23 | "error_rate": 39.4063, 24 | "unindentified_rate": 0, 25 | "execution_time": 0.3994 26 | }, 27 | "48": { 28 | "success_rate": 69.5625, 29 | "error_rate": 30.4375, 30 | "unindentified_rate": 0, 31 | "execution_time": 0.509 32 | }, 33 | "64": { 34 | "success_rate": 73.5938, 35 | "error_rate": 26.4062, 36 | "unindentified_rate": 0, 37 | "execution_time": 0.6104 38 | }, 39 | "128": { 40 | "success_rate": 86.1563, 41 | "error_rate": 13.8438, 42 | "unindentified_rate": 0, 43 | "execution_time": 0.925 44 | }, 45 | "256": { 46 | "success_rate": 93.6875, 47 | "error_rate": 6.3125, 48 | "unindentified_rate": 0, 49 | "execution_time": 1.6346 50 | }, 51 | "512": { 52 | "success_rate": 96.25, 53 | "error_rate": 3.75, 54 | "unindentified_rate": 0, 55 | "execution_time": 2.8385 56 | }, 57 | "1024": { 58 | "success_rate": 98.4331, 59 | "error_rate": 1.5669, 60 | "unindentified_rate": 0, 61 | "execution_time": 4.78 62 | } 63 | }, 64 | "stats": { 65 | "min": 29.47, 66 | "max": 99.93, 67 | "success_rate": 66.7081, 68 | "error_rate": 33.2919, 69 | "unindentified_rate": 0, 70 | 
"execution_time": 0.4763 71 | }, 72 | "languages": { 73 | "jpn": 99.93, 74 | "kor": 99.8633, 75 | "cmn": 99.35, 76 | "heb": 98.18, 77 | "ara": 91.72, 78 | "deu": 80.77, 79 | "fin": 70.79, 80 | "fra": 67.27, 81 | "hin": 60.42, 82 | "nld": 59.65, 83 | "rus": 51.96, 84 | "eng": 49.92, 85 | "por": 49.39, 86 | "ita": 42.55, 87 | "tur": 32.27, 88 | "spa": 29.47 89 | } 90 | } -------------------------------------------------------------------------------- /data/bench/tinyld-heavy.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": { 3 | "10": { 4 | "success_rate": 79.2188, 5 | "error_rate": 17.8125, 6 | "unindentified_rate": 2.9688, 7 | "execution_time": 0.0777 8 | }, 9 | "16": { 10 | "success_rate": 95.0313, 11 | "error_rate": 4.6563, 12 | "unindentified_rate": 0.3125, 13 | "execution_time": 0.0797 14 | }, 15 | "24": { 16 | "success_rate": 98.125, 17 | "error_rate": 1.8438, 18 | "unindentified_rate": 0.0313, 19 | "execution_time": 0.0754 20 | }, 21 | "36": { 22 | "success_rate": 99.2813, 23 | "error_rate": 0.6875, 24 | "unindentified_rate": 0.0313, 25 | "execution_time": 0.0881 26 | }, 27 | "48": { 28 | "success_rate": 99.4688, 29 | "error_rate": 0.5313, 30 | "unindentified_rate": 0, 31 | "execution_time": 0.1106 32 | }, 33 | "64": { 34 | "success_rate": 99.8125, 35 | "error_rate": 0.1875, 36 | "unindentified_rate": 0, 37 | "execution_time": 0.1348 38 | }, 39 | "128": { 40 | "success_rate": 99.875, 41 | "error_rate": 0.125, 42 | "unindentified_rate": 0, 43 | "execution_time": 0.233 44 | }, 45 | "256": { 46 | "success_rate": 99.9375, 47 | "error_rate": 0.0625, 48 | "unindentified_rate": 0, 49 | "execution_time": 0.4433 50 | }, 51 | "512": { 52 | "success_rate": 100, 53 | "error_rate": 0, 54 | "unindentified_rate": 0, 55 | "execution_time": 0.509 56 | }, 57 | "1024": { 58 | "success_rate": 100, 59 | "error_rate": 0, 60 | "unindentified_rate": 0, 61 | "execution_time": 0.5808 62 | } 63 | }, 64 | "stats": { 65 | "min": 
97.52, 66 | "max": 100, 67 | "success_rate": 99.249, 68 | "error_rate": 0.7478, 69 | "unindentified_rate": 0.0032, 70 | "execution_time": 0.096 71 | }, 72 | "languages": { 73 | "kor": 100, 74 | "hin": 100, 75 | "jpn": 99.99, 76 | "heb": 99.88, 77 | "ara": 99.87, 78 | "deu": 99.72, 79 | "cmn": 99.66, 80 | "fra": 99.64, 81 | "rus": 99.52, 82 | "fin": 99.2, 83 | "eng": 99.11, 84 | "tur": 99.01, 85 | "ita": 98.66, 86 | "nld": 98.44, 87 | "spa": 98.13, 88 | "por": 97.52 89 | } 90 | } -------------------------------------------------------------------------------- /data/bench/franc.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": { 3 | "10": { 4 | "success_rate": 0.125, 5 | "error_rate": 0.0625, 6 | "unindentified_rate": 99.8125, 7 | "execution_time": 0.0022 8 | }, 9 | "16": { 10 | "success_rate": 45.6875, 11 | "error_rate": 54.3125, 12 | "unindentified_rate": 0, 13 | "execution_time": 0.1249 14 | }, 15 | "24": { 16 | "success_rate": 56.5938, 17 | "error_rate": 43.4063, 18 | "unindentified_rate": 0, 19 | "execution_time": 0.1481 20 | }, 21 | "36": { 22 | "success_rate": 69.0938, 23 | "error_rate": 30.9063, 24 | "unindentified_rate": 0, 25 | "execution_time": 0.1847 26 | }, 27 | "48": { 28 | "success_rate": 77.0625, 29 | "error_rate": 22.9375, 30 | "unindentified_rate": 0, 31 | "execution_time": 0.2374 32 | }, 33 | "64": { 34 | "success_rate": 80.9688, 35 | "error_rate": 19.0313, 36 | "unindentified_rate": 0, 37 | "execution_time": 0.2791 38 | }, 39 | "128": { 40 | "success_rate": 91.2813, 41 | "error_rate": 8.7188, 42 | "unindentified_rate": 0, 43 | "execution_time": 0.4306 44 | }, 45 | "256": { 46 | "success_rate": 96.7188, 47 | "error_rate": 3.2813, 48 | "unindentified_rate": 0, 49 | "execution_time": 0.7921 50 | }, 51 | "512": { 52 | "success_rate": 98.9063, 53 | "error_rate": 1.0938, 54 | "unindentified_rate": 0, 55 | "execution_time": 1.3237 56 | }, 57 | "1024": { 58 | "success_rate": 99.6866, 59 | 
"error_rate": 0.3134, 60 | "unindentified_rate": 0, 61 | "execution_time": 2.33 62 | } 63 | }, 64 | "stats": { 65 | "min": 48.96, 66 | "max": 99.93, 67 | "success_rate": 74.2577, 68 | "error_rate": 25.7423, 69 | "unindentified_rate": 0, 70 | "execution_time": 0.2242 71 | }, 72 | "languages": { 73 | "jpn": 99.93, 74 | "kor": 99.8633, 75 | "cmn": 99.35, 76 | "heb": 98.23, 77 | "ara": 91.84, 78 | "deu": 83.87, 79 | "fra": 79.36, 80 | "fin": 78.52, 81 | "hin": 68.25, 82 | "ita": 64.2, 83 | "por": 62.29, 84 | "nld": 62, 85 | "eng": 60.39, 86 | "rus": 52.32, 87 | "spa": 51.24, 88 | "tur": 48.96 89 | } 90 | } -------------------------------------------------------------------------------- /data/bench/cld.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": { 3 | "10": { 4 | "success_rate": 26.5, 5 | "error_rate": 2.4375, 6 | "unindentified_rate": 71.0625, 7 | "execution_time": 0.1076 8 | }, 9 | "16": { 10 | "success_rate": 57.5625, 11 | "error_rate": 4.0625, 12 | "unindentified_rate": 38.375, 13 | "execution_time": 0.0891 14 | }, 15 | "24": { 16 | "success_rate": 81.6563, 17 | "error_rate": 3.1563, 18 | "unindentified_rate": 15.1875, 19 | "execution_time": 0.0754 20 | }, 21 | "36": { 22 | "success_rate": 92.8125, 23 | "error_rate": 1.75, 24 | "unindentified_rate": 5.4375, 25 | "execution_time": 0.0758 26 | }, 27 | "48": { 28 | "success_rate": 96.75, 29 | "error_rate": 1.0938, 30 | "unindentified_rate": 2.1563, 31 | "execution_time": 0.0778 32 | }, 33 | "64": { 34 | "success_rate": 98.3125, 35 | "error_rate": 0.5, 36 | "unindentified_rate": 1.1875, 37 | "execution_time": 0.0747 38 | }, 39 | "128": { 40 | "success_rate": 99.5625, 41 | "error_rate": 0.0938, 42 | "unindentified_rate": 0.3438, 43 | "execution_time": 0.08 44 | }, 45 | "256": { 46 | "success_rate": 99.9375, 47 | "error_rate": 0.0313, 48 | "unindentified_rate": 0.0313, 49 | "execution_time": 0.086 50 | }, 51 | "512": { 52 | "success_rate": 99.9688, 53 | 
"error_rate": 0, 54 | "unindentified_rate": 0.0313, 55 | "execution_time": 0.1195 56 | }, 57 | "1024": { 58 | "success_rate": 100, 59 | "error_rate": 0, 60 | "unindentified_rate": 0, 61 | "execution_time": 0.1449 62 | } 63 | }, 64 | "stats": { 65 | "min": 78.4, 66 | "max": 100, 67 | "success_rate": 92.3654, 68 | "error_rate": 1.6213, 69 | "unindentified_rate": 6.0133, 70 | "execution_time": 0.0711 71 | }, 72 | "languages": { 73 | "jpn": 100, 74 | "kor": 100, 75 | "hin": 99.43, 76 | "eng": 99.18, 77 | "deu": 97.58, 78 | "fin": 96.3, 79 | "cmn": 94.52, 80 | "fra": 94.05, 81 | "tur": 93.2, 82 | "por": 91.89, 83 | "rus": 89.69, 84 | "nld": 89.58, 85 | "spa": 87.79, 86 | "ita": 85.9, 87 | "heb": 84.06, 88 | "ara": 78.4 89 | } 90 | } -------------------------------------------------------------------------------- /data/bench/languagedetect.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": { 3 | "10": { 4 | "success_rate": 24.75, 5 | "error_rate": 43.25, 6 | "unindentified_rate": 32, 7 | "execution_time": 0.0616 8 | }, 9 | "16": { 10 | "success_rate": 37.2188, 11 | "error_rate": 33.5, 12 | "unindentified_rate": 29.2812, 13 | "execution_time": 0.088 14 | }, 15 | "24": { 16 | "success_rate": 50.1875, 17 | "error_rate": 22.4063, 18 | "unindentified_rate": 27.4063, 19 | "execution_time": 0.1142 20 | }, 21 | "36": { 22 | "success_rate": 59.5, 23 | "error_rate": 14.375, 24 | "unindentified_rate": 26.125, 25 | "execution_time": 0.164 26 | }, 27 | "48": { 28 | "success_rate": 66.2813, 29 | "error_rate": 8.2813, 30 | "unindentified_rate": 25.4375, 31 | "execution_time": 0.2038 32 | }, 33 | "64": { 34 | "success_rate": 69.0938, 35 | "error_rate": 6, 36 | "unindentified_rate": 24.9063, 37 | "execution_time": 0.2542 38 | }, 39 | "128": { 40 | "success_rate": 72.75, 41 | "error_rate": 3.6563, 42 | "unindentified_rate": 23.5938, 43 | "execution_time": 0.4172 44 | }, 45 | "256": { 46 | "success_rate": 74.625, 47 | "error_rate": 
3.6875, 48 | "unindentified_rate": 21.6875, 49 | "execution_time": 0.7966 50 | }, 51 | "512": { 52 | "success_rate": 74.9063, 53 | "error_rate": 4.5313, 54 | "unindentified_rate": 20.5625, 55 | "execution_time": 1.4548 56 | }, 57 | "1024": { 58 | "success_rate": 75.2115, 59 | "error_rate": 5.6095, 60 | "unindentified_rate": 19.1789, 61 | "execution_time": 2.1704 62 | } 63 | }, 64 | "stats": { 65 | "min": 0.01, 66 | "max": 98.58, 67 | "success_rate": 65.2835, 68 | "error_rate": 11.2808, 69 | "unindentified_rate": 23.4357, 70 | "execution_time": 0.1896 71 | }, 72 | "languages": { 73 | "hin": 98.58, 74 | "ara": 95, 75 | "nld": 93.53, 76 | "deu": 90.36, 77 | "fra": 88.44, 78 | "fin": 87.57, 79 | "ita": 85.03, 80 | "eng": 84, 81 | "tur": 78.5, 82 | "por": 76.5, 83 | "spa": 71.75, 84 | "rus": 63.43, 85 | "kor": 0.0195, 86 | "jpn": 0.01, 87 | "cmn": 0.01, 88 | "heb": 0.01 89 | } 90 | } -------------------------------------------------------------------------------- /data/bench/franc-min.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": { 3 | "10": { 4 | "success_rate": 0.125, 5 | "error_rate": 0.0625, 6 | "unindentified_rate": 99.8125, 7 | "execution_time": 0.0019 8 | }, 9 | "16": { 10 | "success_rate": 45.3438, 11 | "error_rate": 48.4063, 12 | "unindentified_rate": 6.25, 13 | "execution_time": 0.0428 14 | }, 15 | "24": { 16 | "success_rate": 55.375, 17 | "error_rate": 38.375, 18 | "unindentified_rate": 6.25, 19 | "execution_time": 0.052 20 | }, 21 | "36": { 22 | "success_rate": 66.9375, 23 | "error_rate": 26.8125, 24 | "unindentified_rate": 6.25, 25 | "execution_time": 0.0688 26 | }, 27 | "48": { 28 | "success_rate": 73.375, 29 | "error_rate": 20.375, 30 | "unindentified_rate": 6.25, 31 | "execution_time": 0.0835 32 | }, 33 | "64": { 34 | "success_rate": 75.9063, 35 | "error_rate": 17.875, 36 | "unindentified_rate": 6.2188, 37 | "execution_time": 0.1015 38 | }, 39 | "128": { 40 | "success_rate": 82.5625, 41 | 
"error_rate": 11.25, 42 | "unindentified_rate": 6.1875, 43 | "execution_time": 0.1628 44 | }, 45 | "256": { 46 | "success_rate": 85.1563, 47 | "error_rate": 8.6875, 48 | "unindentified_rate": 6.1563, 49 | "execution_time": 0.3008 50 | }, 51 | "512": { 52 | "success_rate": 86.5313, 53 | "error_rate": 7.4375, 54 | "unindentified_rate": 6.0313, 55 | "execution_time": 0.5538 56 | }, 57 | "1024": { 58 | "success_rate": 87.1514, 59 | "error_rate": 6.9257, 60 | "unindentified_rate": 5.9229, 61 | "execution_time": 0.9752 62 | } 63 | }, 64 | "stats": { 65 | "min": 0.01, 66 | "max": 99.93, 67 | "success_rate": 70.3891, 68 | "error_rate": 23.1888, 69 | "unindentified_rate": 6.422, 70 | "execution_time": 0.084 71 | }, 72 | "languages": { 73 | "jpn": 99.93, 74 | "kor": 99.8633, 75 | "cmn": 99.35, 76 | "deu": 94.18, 77 | "ara": 91.88, 78 | "fra": 87.32, 79 | "nld": 87.21, 80 | "eng": 81.42, 81 | "por": 76.8, 82 | "ita": 74.31, 83 | "hin": 68.25, 84 | "spa": 67.38, 85 | "tur": 58.11, 86 | "rus": 54.6, 87 | "fin": 0.01, 88 | "heb": 0.01 89 | } 90 | } -------------------------------------------------------------------------------- /docs/cli.md: -------------------------------------------------------------------------------- 1 | # **TinyLD CLI** 2 | 3 | Time to time, it can be easier to use the library from a terminal _(Example: testing or debugging)_ 4 | 5 | ```sh 6 | tinyld This is the text that I want to check 7 | # [ { lang: 'en', accuracy: 1 } ] 8 | 9 | tinyld これはテストです 10 | # [ { lang: 'ja', accuracy: 1 } ] 11 | 12 | tinyld Єсть на світі доля 13 | # [ { lang: 'uk', accuracy: 1 } ] 14 | ``` 15 | 16 | _Options_ 17 | 18 | - `--verbose` : Get an explanation of why **TinyLD** pick a language 19 | - `--only=en,ja,fr` : Restrict the detection to a subset of languages 20 | 21 | Can also be run with: 22 | 23 | - Npx: `npx tinyld [message]` 24 | - Yarn: `yarn tinyld [message]` 25 | - Bash: `./node_modules/.bin/tinyld [message]` 26 | 27 | ## Verbose mode (debugging) 28 | 29 | ```sh 30 | > 
yarn tinyld --verbose this is a text 31 | 32 | [Pass 1] detectUniqueGrams of 1-grams [ 33 | 't', 'h', 'i', 's', 34 | 'i', 's', 'a', 't', 35 | 'e', 'x', 't' 36 | ] 37 | [Pass 1] detectUniqueGrams of 2-grams [ 38 | ' t', 'th', 'hi', 'is', 39 | 's ', ' i', 'is', 's ', 40 | ' a', 'a ', ' t', 'te', 41 | 'ex', 'xt', 't ' 42 | ] 43 | 44 | # ... 45 | 46 | Gram 'a t' [ 47 | 'ind = 43.830000000000005%', 48 | 'tgl = 15.5%', 49 | 'epo = 41.199999999999996%', 50 | 'spa = 90.59%', 51 | 'por = 53.47%', 52 | 'ita = 65.4%', 53 | 'srp = 30.320000000000004%', 54 | 'fin = 94.69999999999999%', 55 | 'hun = 100%', 56 | 'pol = 31.680000000000003%' 57 | ] 58 | Gram ' te' [ 59 | 'ind = 18.060000000000002%', 60 | 'epo = 10.31%', 61 | 'eng = 9.44%', 62 | 'por = 97.13000000000001%', 63 | 'ita = 13.65%', 64 | 'nld = 100%', 65 | 'lat = 37.85%', 66 | 'srp = 3.6700000000000004%', 67 | 'fin = 22.67%', 68 | 'ron = 6.59%' 69 | ] 70 | Gram 'ext' [ 'eng = 59.14%', 'spa = 100%' ] 71 | Gram 'xt ' [ 'eng = 100%' ] 72 | Result this is a text [ 73 | { lang: 'en', accuracy: 0.7667, score: 2274.35 }, 74 | { lang: 'eo', accuracy: 0.3133, score: 6695.6 }, 75 | { lang: 'nl', accuracy: 0.3104, score: 6723.8 }, 76 | { lang: 'pt', accuracy: 0.2754, score: 7064.75 }, 77 | { lang: 'la', accuracy: 0.2662, score: 7154.35 } 78 | ] 79 | [ 80 | { lang: 'en', accuracy: 0.7667, score: 2274.35 }, 81 | { lang: 'eo', accuracy: 0.3133, score: 6695.6 }, 82 | { lang: 'nl', accuracy: 0.3104, score: 6723.8 }, 83 | { lang: 'pt', accuracy: 0.2754, score: 7064.75 }, 84 | { lang: 'la', accuracy: 0.2662, score: 7154.35 } 85 | ] 86 | ``` 87 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # TinyLD 2 | 3 | [![npm](https://img.shields.io/npm/v/tinyld)](https://www.npmjs.com/package/tinyld) 4 | [![npm](https://img.shields.io/npm/dm/tinyld)](https://www.npmjs.com/package/tinyld) 5 | [![CDN 
Download](https://data.jsdelivr.com/v1/package/npm/tinyld/badge)](https://www.jsdelivr.com/package/npm/tinyld) 6 | [![License](https://img.shields.io/npm/l/tinyld.svg)](https://npmjs.org/package/tinyld) 7 | 8 | ![logo](./banner.png) 9 | 10 | ## :tada: Description 11 | 12 | **Tiny** **L**anguage **D**etector, simply detect the language of a unicode UTF-8 text: 13 | 14 | - Pure JS, No api call, No dependencies (Node and Browser compatible) 15 | - Blazing fast and low memory footprint (unlike ML methods) 16 | - Train with dataset from [Tatoeba](https://tatoeba.org/en/) and [UDHR](https://unicode.org/udhr/) 17 | - Support [62 languages](./docs/langs.md) (24 for [the web version](./docs/light.md)) 18 | - Reliable even for really short texts (chatbot, keywords, ...) 19 | - Support both ISO-639-1 & ISO-639-2 20 | - Available for NodeJS (`CommonJS` and `ESM`), Deno and Browser 21 | 22 | ## Links 23 | 24 | - [**Playground** - Try the library](https://komodojp.github.io/tinyld/) 25 | - [Play with some code](https://runkit.com/kefniark/tinyld) 26 | - [Getting Started](./docs/install.md) 27 | - [Supported Languages](./docs/langs.md) 28 | - [Algorithm](./docs/algorithm.md) 29 | - [Frequently Asked Questions](./docs/faq.md) 30 | 31 | --- 32 | 33 | ## :floppy_disk: Getting Started 34 | 35 | ### Install 36 | 37 | ```sh 38 | yarn add tinyld # or npm install --save tinyld 39 | ``` 40 | 41 | [Install Documentation](./docs/install.md) 42 | 43 | --- 44 | 45 | ### :page_facing_up: **TinyLD API** 46 | 47 | ```js 48 | import { detect, detectAll } from 'tinyld' 49 | 50 | // Detect 51 | detect('これは日本語です.') // ja 52 | detect('and this is english.') // en 53 | 54 | // DetectAll 55 | detectAll('ceci est un text en francais.') 56 | // [ { lang: 'fr', accuracy: 0.5238 }, { lang: 'ro', accuracy: 0.3802 }, ... 
] 57 | ``` 58 | 59 | [API Documentation](./docs/api.md) 60 | 61 | --- 62 | 63 | ### :paperclip: **TinyLD CLI** 64 | 65 | ```bash 66 | tinyld This is the text that I want to check 67 | # [ { lang: 'en', accuracy: 1 } ] 68 | ``` 69 | 70 | [More Information](./docs/cli.md) 71 | 72 | --- 73 | 74 | ## :chart_with_upwards_trend: Performance 75 | 76 | Here is a comparison of **Tinyld** against other popular libraries. 77 | 78 | ![SVG Graph](./docs/overall.svg) 79 | 80 | To summarize in one sentence: 81 | 82 | > Better, Faster, Smaller 83 | 84 | [More Benchmark Information](./docs/benchmark.md) 85 | 86 | --- 87 | 88 | ## Developer 89 | 90 | If you want to **Contribute** or **Open a PR**, it's recommended to take a look [at the dev documentation](./docs/dev.md) 91 | -------------------------------------------------------------------------------- /docs/langs.md: -------------------------------------------------------------------------------- 1 | # 62 Supported Languages 2 | 3 | This list is auto-generated from the code and up-to-date.
4 | 5 | ## Africa (4) 6 | 7 | - **Afrikaans** (ISO Codes: `af` `afr`) 8 | - **Amharic** (ISO Codes: `am` `amh`) 9 | - **Berber** (ISO Codes: `ber` `ber`) 10 | - **Kirundi** (ISO Codes: `rn` `run`) 11 | 12 | ## Asia (6) 13 | 14 | - **Burmese** (ISO Codes: `my` `mya`) 15 | - **Indonesian** (ISO Codes: `id` `ind`) 16 | - **Khmer** (ISO Codes: `km` `khm`) 17 | - **Tagalog** (ISO Codes: `tl` `tgl`) 18 | - **Thai** (ISO Codes: `th` `tha`) 19 | - **Vietnamese** (ISO Codes: `vi` `vie`) 20 | 21 | ## Asia-east (3) 22 | 23 | - **Chinese** (ISO Codes: `zh` `cmn`) 24 | - **Japanese** (ISO Codes: `ja` `jpn`) 25 | - **Korean** (ISO Codes: `ko` `kor`) 26 | 27 | ## Asia-south (7) 28 | 29 | - **Bengali** (ISO Codes: `bn` `ben`) 30 | - **Gujarati** (ISO Codes: `gu` `guj`) 31 | - **Hindi** (ISO Codes: `hi` `hin`) 32 | - **Kannada** (ISO Codes: `kn` `kan`) 33 | - **Tamil** (ISO Codes: `ta` `tam`) 34 | - **Telugu** (ISO Codes: `te` `tel`) 35 | - **Urdu** (ISO Codes: `ur` `urd`) 36 | 37 | ## Europe (6) 38 | 39 | - **Czech** (ISO Codes: `cs` `ces`) 40 | - **Greek** (ISO Codes: `el` `ell`) 41 | - **Latin** (ISO Codes: `la` `lat`) 42 | - **Macedonian** (ISO Codes: `mk` `mkd`) 43 | - **Serbian** (ISO Codes: `sr` `srp`) 44 | - **Slovak** (ISO Codes: `sk` `slk`) 45 | 46 | ## Europe-east (10) 47 | 48 | - **Belarusian** (ISO Codes: `be` `bel`) 49 | - **Bulgarian** (ISO Codes: `bg` `bul`) 50 | - **Estonian** (ISO Codes: `et` `est`) 51 | - **Hungarian** (ISO Codes: `hu` `hun`) 52 | - **Latvian** (ISO Codes: `lv` `lvs`) 53 | - **Lithuanian** (ISO Codes: `lt` `lit`) 54 | - **Polish** (ISO Codes: `pl` `pol`) 55 | - **Romanian** (ISO Codes: `ro` `ron`) 56 | - **Russian** (ISO Codes: `ru` `rus`) 57 | - **Ukrainian** (ISO Codes: `uk` `ukr`) 58 | 59 | ## Europe-north (5) 60 | 61 | - **Danish** (ISO Codes: `da` `dan`) 62 | - **Finnish** (ISO Codes: `fi` `fin`) 63 | - **Icelandic** (ISO Codes: `is` `isl`) 64 | - **Norwegian** (ISO Codes: `no` `nob`) 65 | - **Swedish** (ISO Codes: `sv` `swe`) 66 | 67 | ## 
Europe-west (8) 68 | 69 | - **Dutch** (ISO Codes: `nl` `nld`) 70 | - **English** (ISO Codes: `en` `eng`) 71 | - **French** (ISO Codes: `fr` `fra`) 72 | - **German** (ISO Codes: `de` `deu`) 73 | - **Irish** (ISO Codes: `ga` `gle`) 74 | - **Italian** (ISO Codes: `it` `ita`) 75 | - **Portuguese** (ISO Codes: `pt` `por`) 76 | - **Spanish** (ISO Codes: `es` `spa`) 77 | 78 | ## Middle-east (10) 79 | 80 | - **Arabic** (ISO Codes: `ar` `ara`) 81 | - **Armenian** (ISO Codes: `hy` `hye`) 82 | - **Hebrew** (ISO Codes: `he` `heb`) 83 | - **Kazakh** (ISO Codes: `kk` `kaz`) 84 | - **Mongolian** (ISO Codes: `mn` `mon`) 85 | - **Persian** (ISO Codes: `fa` `pes`) 86 | - **Tatar** (ISO Codes: `tt` `tat`) 87 | - **Turkish** (ISO Codes: `tr` `tur`) 88 | - **Turkmen** (ISO Codes: `tk` `tuk`) 89 | - **Yiddish** (ISO Codes: `yi` `yid`) 90 | 91 | ## Other (3) 92 | 93 | - **Esperanto** (ISO Codes: `eo` `epo`) 94 | - **Klingon** (ISO Codes: `tlh` `tlh`) 95 | - **Volapuk** (ISO Codes: `vo` `vol`) 96 | -------------------------------------------------------------------------------- /docs/algorithm.md: -------------------------------------------------------------------------------- 1 | # Algorithm 2 | 3 | This library uses a variant of the usual N-gram algorithm, which gives fast and good results. 4 | 5 | Most libraries are directly using a bayesian scoring algorithm to identify a text language. But TinyLD, decided to add few steps before and after, trying to mimic human logic and identify language with their unique character patterns or word usage. 6 | 7 | This is similar to what ML methods use, that's why this library has a training phase too. The goal is to find which "features" or "n-gram" are the more useful for detection without hardcoding any language specific rules. The heavy lifting is done during build time, so at runtime it can be fast and efficient. 8 | 9 | ## How it works ? 10 | 11 | The string will be split into chunks based on punctuation. 
Each chunk will be evaluated separately, and the results are merged later, weighted by chunk size. 12 | 13 | This allows mixed-language content to be handled 14 | 15 | ```js 16 | 'This is a text in english "おはよう" and we can continue to write (and this is english too)' 17 | ``` 18 | 19 | ```js 20 | 'this is a text in english', // => will be detected as EN 21 | 'おはよう', // => will be detected as JA 22 | 'and we can continue to write', 23 | 'and this is english too' 24 | ``` 25 | 26 | Then each chunk will be evaluated with the following method: 27 | 28 | --- 29 | 30 | ### **1) First pass**: Unique Character Detection 31 | 32 | Some languages like Japanese or Korean can be identified right away, just based on their characters or punctuation, and don't even need to reach the scoring algorithm. 33 | 34 | **Example**: 35 | 36 | - `も` is japanese 37 | - `두` is korean 38 | - `où` is french 39 | 40 | This identification is done on different sizes of grams (including 1-gram and 2-gram), which gives better results than other libraries on short texts. 41 | 42 | **This pass is**: 43 | 44 | - really fast (a lookup in a map) 45 | - returns only one locale (locales detected this way are really accurate) 46 | 47 | --- 48 | 49 | ### **2) Second pass**: Gram Detection (2-gram, 3-gram, ...) 50 | 51 | Most of the other libraries are only using this part. 52 | It is the more traditional method of statistical analysis on grams. 53 | Split each word into 4-grams and, for each of them, try to find the languages that match and score them. 54 | 55 | **This pass is**: 56 | 57 | - probabilistic 58 | - returns multiple locales, which have to be scored and sorted 59 | - removes grams already covered by the previous step (to save space) 60 | 61 | --- 62 | 63 | ## Why doing all that ? Is gram analysis not good enough ?
64 | 65 | Individually, the accuracy of each method is not really high 66 | 67 | - Unique character detection: ~65% 68 | - Gram detection: ~85% 69 | 70 | But what allows this library to be so good is that those detection methods are complementary and work together. 71 | 72 | For example: 73 | 74 | - Japanese accuracy is good thanks to character detection (JA ~99% but EN ~15%) 75 | - English accuracy is good thanks to word detection (JA ~1.5% but EN ~98%) 76 | 77 | Which is why together those methods get an overall accuracy > 95% 78 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | - [Language Detection Error](#my-text-is-detected-in-the-wrong-language) 4 | - [Can I have a custom version](#can-i-have-a-version-specific-for-my-app-and-my-needs) 5 | - [Short text detection issues](#can-tinyld-identify-short-strings) 6 | - [Live Chat usage](#can-i-use-tinyld-for-an-application-like-a-chat-even-if-texts-are-short) 7 | 8 | --- 9 | 10 | ## My text is detected in the wrong language 11 | 12 | It's sad to hear, but it's not unusual. 13 | 14 | As we can see [here](https://github.com/komodojp/tinyld/blob/develop/docs/benchmark.md#libraries), **Tinyld** is good but not perfect. Overall 1~2% of the time it will get it wrong. 15 | 16 | The things which usually increase the error rate: 17 | 18 | - short inputs, try to make them longer 19 | - similar languages (like Spanish and Catalan) 20 | - generic names/brands which may appear in multiple language corpora 21 | 22 | --- 23 | 24 | ## Can I have a version specific for my app and my needs 25 | 26 | Everything in life is about tradeoffs. 27 | 28 | Tinyld was designed to be accurate, small and fast.
29 | Based on how much space and resources you are ready to spend, we provide different flavors: 30 | 31 | - **Tinyld** : The general one (~500KB) which detects 64 languages 32 | - **Tinyld Light** : Mostly for browser usage (~70KB) which detects 24 languages 33 | - **Tinyld Heavy (Soon)** : The one for backend usage (few MB) which focuses on accuracy only 34 | 35 | To select the one you want, simply change your import 36 | 37 | ```ts 38 | import { detect } from 'tinyld' 39 | import { detect } from 'tinyld/light' 40 | import { detect } from 'tinyld/heavy' 41 | ``` 42 | 43 | --- 44 | 45 | ## Can Tinyld identify short strings? 46 | 47 | If by short you mean one or two words with a good accuracy, the answer is most likely **No**. 48 | 49 | The key point here is to understand the algorithms behind language detection. 50 | 51 | - How can you detect a language, without embedding and checking a whole dictionary for each language? 52 | - Even just between 2 or 3 languages, how would you do it? Handcraft regexp for specific languages? 53 | - How can you scale up this method easily to more languages? Even to languages you don't speak or understand? 54 | 55 | There are multiple approaches to solve this problem, but the two main ones are AI and statistics. And the general idea is to recognize some patterns or successions of letters that are specific to each language.
([n-gram](https://en.wikipedia.org/wiki/N-gram)) 56 | 57 | **Good part**: 58 | 59 | - We don't need to understand a language's syntax to be able to detect it 60 | - We can extend this method to more languages fairly easily 61 | - The signature of a language can be quite small, only a few KB 62 | 63 | **Bad part**: 64 | 65 | - It requires a certain text size to get a good detection accuracy and valuable n-grams 66 | - Common short words are usually the best for detection ("the", "or", "do", "this"), which leads to better results on sentences than on single words 67 | - It requires a clean corpus for training 68 | - Mixed language content can be hard to detect 69 | 70 | We are always trying to improve our process and detection rate; you can find some benchmarks [related to this](https://github.com/komodojp/tinyld/blob/develop/docs/benchmark.md#accuracy-by-text-length). 71 | But to give some numbers: 72 | 73 | - Tinyld usually passes the ~95% detection accuracy threshold around ~24 characters 74 | - It falls to ~80% for 12 characters (barely usable) 75 | - Below 10 characters, it's just random 76 | 77 | We recommend that you use the [TinyLD Playground](https://komodojp.github.io/tinyld/) to do some tests, and see how the accuracy increases with text length 78 | 79 | --- 80 | 81 | ## Can I use tinyld for an application like a chat, even if texts are short? 82 | 83 | Yes you can, and this is why it was built originally. 84 | 85 | One of the easy ways to work around the size issue is to keep a context: a user is unlikely to change language abruptly in the middle of a discussion. And multiple users usually chat in a common language. 86 | So you can keep some buffer (like the last 256 characters of this user in this channel) and check this and not just the last message. 87 | 88 | This gives stability and more accurate results to the detection.
89 | -------------------------------------------------------------------------------- /docs/benchmark.md: -------------------------------------------------------------------------------- 1 | # NodeJS Language Detection Benchmark :rocket: 2 | 3 | - This kind of benchmark is not perfect and % can vary over time, but it gives a good idea of overall performances 4 | - Language evaluated in this benchmark: 5 | - Asia: `jpn`, `cmn`, `kor`, `hin` 6 | - Europe: `fra`, `spa`, `por`, `ita`, `nld`, `eng`, `deu`, `fin`, `rus` 7 | - Middle east: , `tur`, `heb`, `ara` 8 | - This page and graphs are auto-generated from the code 9 | 10 | --- 11 | 12 | ## Libraries 13 | 14 | Here is the list of libraries in this benchmark 15 | 16 | | Library | Script | Language | Properly Identified | Improperly identified | Not identified | Avg Execution Time | Disk Size | 17 | | ---------------- | --------------------------- | -------- | ------------------- | --------------------- | -------------- | ------------------ | --------- | 18 | | **TinyLD Heavy** | `yarn bench:tinyld-heavy` | 64 | 99.249% | 0.7478% | 0.0032% | 0.096ms. | 2.0MB | 19 | | **TinyLD** | `yarn bench:tinyld` | 64 | 98.5231% | 1.3712% | 0.1057% | 0.1191ms. | 580KB | 20 | | **TinyLD Light** | `yarn bench:tinyld-light` | 24 | 97.8778% | 1.9842% | 0.138% | 0.0947ms. | 68KB | 21 | | \*\*langdetect | `yarn bench:langdetect` | 53 | 95.675% | 4.325% | 0% | 0.3647ms. | 1.8MB | 22 | | node-cld | `yarn bench:cld` | 160 | 92.3654% | 1.6213% | 6.0133% | 0.0711ms. | > 10MB | 23 | | franc | `yarn bench:franc` | 187 | 74.2577% | 25.7423% | 0% | 0.2242ms. | 267KB | 24 | | franc-min | `yarn bench:franc-min` | 82 | 70.3891% | 23.1888% | 6.422% | 0.084ms. | 119KB | 25 | | franc-all | `yarn bench:franc-all` | 403 | 66.7081% | 33.2919% | 0% | 0.4763ms. | 509KB | 26 | | languagedetect | `yarn bench:languagedetect` | 52 | 65.2835% | 11.2808% | 23.4357% | 0.1896ms. 
| 240KB | 27 | 28 | --- 29 | 30 | ## Global Accuracy 31 | 32 | ![Benchmark](./overall.svg) 33 | 34 | We see two group of libraries 35 | 36 | - `tinyld`, `langdetect` and `cld` over 90% accuracy 37 | - `franc` and `languagedetect` under 75% accuracy 38 | 39 | ## Per Language 40 | 41 | ![Language](./language.svg) 42 | 43 | We see big differences between languages: 44 | 45 | - **Japanese** or **Korean** are almost at 100% for every libs (lot of unique characters) 46 | - **Spanish** and **Portuguese** are really close and cause more false-positive and an higher error-rate 47 | 48 | ## Accuracy By Text length 49 | 50 | Most libraries are using statistical analysis, so longer is the input text, better will be the detection. 51 | So we can often see quotes like this in those library documentations. 52 | 53 | > Make sure to pass it big documents to get reliable results. 54 | 55 | Let's see if this statement is true, and how those libraries behave for different input size (from small to long) 56 | ![Size](./length.svg) 57 | 58 | So the previous quote is right, over 512 characters all the libs become accurate enough. 59 | 60 | But for a ~95% accuracy threshold: 61 | 62 | - `tinyld` (green) reaches it around 24 characters 63 | - `langdetect` (cyan) and `cld` (orange) reach it around 48 characters 64 | 65 | ## Execution Time 66 | 67 | ![Size](./exec_time.svg) 68 | 69 | Here we can notice few things about performance: 70 | 71 | - `langdetect` (cyan) and `franc` (pink) seems to slow down at a similar rate 72 | - `tinyld` (green) slow down but at a really flat rate 73 | - `cld` (orange) is definitely the fastest and doesn't show any apparent slow down 74 | 75 | But we've seen previously that some of those libraries need more than 256 characters to be accurate. 76 | It means they start to slow down at the same time they start to give decent results. 
77 | 78 | --- 79 | 80 | ## **Conclusion** 81 | 82 | ### Recommended :thumbsup: 83 | 84 | #### - By platform :computer: 85 | 86 | - For **NodeJS**: `TinyLD`, `langdetect` or `node-cld` (fast and accurate) 87 | - For **Browser**: `TinyLD Light` or `franc-min` (small, decent accuracy, franc is less accurate but support more languages) 88 | 89 | #### - By usage :speech_balloon: 90 | 91 | - Short text (chatbot, keywords, database, ...): `TinyLD` or `langdetect` 92 | - Long text (documents, webpage): `node-cld` or `TinyLD` 93 | 94 | ### Not recommended :thumbsdown: 95 | 96 | - `franc-all` is the worst in terms of accuracy, not a surprise because it tries to detect 400+ languages with only 3-grams. A technical demo to put big numbers but useless for real usage, even a language like english barely reaches ~45% detection rate. 97 | - `languagedetect` is light but just not accurate enough 98 | 99 | --- 100 | 101 | ## Last word :raising_hand: 102 | 103 | Thanks for reading this article, those metrics are really helpful for the development of `tinyld`. 104 | It's used in the development to see the impact of every modification and features. 
// Runs of punctuation (ASCII, CJK and typographic) on which a text is cut into chunks
const chunk_regexp = /([,,、。!¿?!?":;()「」{}„“«»”"“<>⋯《》*]|[.[\]\\])+/
// Runs of ASCII spaces on which a chunk is cut into words
const word_regexp = /[ ]+/

/**
 * Split a text into chunks on runs of punctuation.
 *
 * `chunk_regexp` contains a capturing group, so `String.prototype.split` also inserts the last captured
 * separator of every punctuation run into the output (`'a,b'` gives `['a', ',', 'b']`).
 *
 * @param text - raw input text
 * @returns the pieces of `text`, interleaved with the captured separators
 */
export function chunkTokenizer(text: string): string[] {
  return text.split(chunk_regexp)
}

/**
 * Split a chunk into words on runs of ASCII spaces.
 *
 * @param text - chunk to split
 * @returns the words; leading/trailing spaces produce empty strings at the edges
 */
export function wordTokenizer(text: string): string[] {
  return text.split(word_regexp)
}

/**
 * Slide a window of `length` characters over `text` and return every gram found.
 *
 * A window is rejected when it contains two or more consecutive spaces (the extra spaces are dropped,
 * which leaves the gram shorter than `length`) or when it holds nothing but whitespace.
 *
 * @param text - text to tokenize
 * @param length - gram size, in UTF-16 code units
 * @param padding - when true, `length - 1` spaces are added on both sides so that grams
 *   marking the start and the end of the text are produced too
 * @returns the grams, in order of appearance (duplicates are kept)
 */
export function ngramTokenizer(text: string, length: number, padding = true): string[] {
  const ngrams: string[] = []
  const padded = padding ? ' '.repeat(length - 1) + text + ' '.repeat(length - 1) : text

  for (let start = 0; start < padded.length - (length - 1); start++) {
    const chars: string[] = []

    let spaceRun = 0
    for (let offset = 0; offset < length; offset++) {
      const char = padded[start + offset]
      spaceRun = char === ' ' ? spaceRun + 1 : 0
      // only the first space of a run is kept, so such a window ends up too short and is rejected below
      if (spaceRun > 1) continue
      chars.push(char)
    }

    const gram = chars.join('')
    if (gram.trim().length > 0 && gram.length === length) ngrams.push(gram)
  }

  return ngrams
}

/**
 * Pass 1: return the language of the first gram of `text` that is listed in `keys`.
 *
 * Gram sizes are tried in the order given by `TRAINING_UNIQUE_GRAMS`.
 *
 * @param text - chunk to analyse
 * @param profiles - language profiles; `profiles.uniques` maps a gram to a language code
 * @param keys - grams eligible for the lookup in `profiles.uniques`
 * @param options - `only` restricts the accepted languages, `verbose` logs the match
 * @returns the `toISO2` form of the matched language code, or `''` when nothing matched
 */
export function detectUniqueGrams(
  text: string,
  profiles: ILangProfiles,
  keys: Set<string>,
  options: DetectOption
): string {
  for (const rank of TRAINING_UNIQUE_GRAMS) {
    const grams = ngramTokenizer(text, rank)
    for (const gram of grams) {
      if (!keys.has(gram)) continue

      const code = profiles.uniques[gram]
      const country = toISO2(code)
      if (options.only.length > 0) {
        // Accept the filter in either form (profile code or its toISO2 form), as detectStatsGrams does
        if (!options.only.includes(country) && !options.only.includes(code)) continue
      }
      if (options.verbose) console.log(`[Pass 1] detectUniqueGrams ${rank}-grams - match '${gram}' to ${country}`)
      return country
    }
  }
  return ''
}

/**
 * Return only the best language found by `detectStatsGrams`.
 *
 * @returns the `lang` of the first result, or `''` when there is no result
 */
export function detectPotentialGrams(text: string, profiles: ILangProfiles, options: DetectOption): string {
  const res = detectStatsGrams(text, profiles, options)
  if (res.length > 0) return res[0].lang
  return ''
}

/**
 * Pass 2: score every candidate language from the grams of `text` found in `profiles.multiples`.
 *
 * Each matching gram adds `weight * gram.length / 4` to the score of the languages it lists.
 * The best score gets an accuracy of 1 and the others are expressed relative to it.
 *
 * @param text - text to analyse (a single word when called from `detectAllStats`)
 * @param profiles - language profiles; `profiles.multiples` maps a gram to per-language weights
 * @param options - `only` restricts the candidates (either code form accepted), `verbose` logs details
 * @returns at most 8 entries sorted by decreasing score; each also carries its `score`
 */
export function detectStatsGrams(
  text: string,
  profiles: ILangProfiles,
  options: DetectOption
): { lang: string; accuracy: number }[] {
  const langScores = new Map<string, number>()

  const grams = TRAINING_UNIQUE_GRAMS.map((x) => ngramTokenizer(text, x)).flat()
  if (options.verbose) console.log('[Pass 2] DetectPotentialGrams', text, grams)
  const langSet = new Set(
    [...langs.values()].filter((x) => {
      if (options.only.length > 0) return options.only.includes(x) || options.only.includes(toISO2(x))
      return true
    })
  )

  // every candidate starts at 0 so that it is reported even without any matching gram
  langSet.forEach((x) => langScores.set(x, 0))
  for (const gramValue of grams) {
    const gram = normalize(gramValue)
    const gramStat = profiles.multiples[gram]
    if (!gramStat) continue

    const gramLangs = new Set(Object.keys(gramStat))
    const debug: string[] = []
    for (const lang of langSet) {
      if (gramLangs.has(lang)) {
        langScores.set(lang, (langScores.get(lang) || 0) + (gramStat[lang] * gram.length) / 4)
        // presumably weights are stored on a 0-1024 scale - TODO confirm against the training code
        debug.push(`${lang} = ${(gramStat[lang] / 1024) * 100}%`)
      }
    }
    if (options.verbose && debug.length > 0) console.log(`Gram '${gram}'`, debug)
  }

  const entries = [...langScores.entries()]
  entries.sort((a, b) => b[1] - a[1])
  // fall back to 1 when no gram matched, to avoid dividing by zero
  const max = Math.max(...entries.map((x) => x[1])) || 1
  const result = entries.slice(0, 8).map((x) => {
    return {
      lang: toISO2(x[0]),
      accuracy: 1 - approximate((max - x[1]) / max),
      score: approximate(x[1])
    }
  })
  if (options.verbose) console.log(`Result`, text, result)
  return result
}

/**
 * Detect the languages of a whole text.
 *
 * The text is cut into chunks, each chunk is passed through `cleanString`, empty results are dropped and
 * the 7 longest chunks are kept. A chunk matched by `detectUniqueGrams` is credited to that language,
 * otherwise each of its words is scored with `detectStatsGrams`.
 *
 * @param text - raw input text
 * @param options - detection options, forwarded to both passes
 * @param profiles - language profiles, forwarded to both passes
 * @param uniqueKeys - grams eligible for the unique-gram pass
 * @returns languages with a positive total, sorted by decreasing accuracy
 */
export function detectAllStats(
  text: string,
  options: DetectOption,
  profiles: ILangProfiles,
  uniqueKeys: Set<string>
): { lang: string; accuracy: number }[] {
  let chunks = chunkTokenizer(text)
  chunks = chunks.map((x) => cleanString(x)).filter((x) => !!x)
  chunks.sort((a, b) => b.length - a.length)
  chunks = chunks.slice(0, 7)
  if (options.verbose) console.log('Analize chunks', chunks)

  let size = 0
  const results: { [lang: string]: number } = {}
  for (const chunk of chunks) {
    // pass 1 - unique character detection
    const res = detectUniqueGrams(chunk, profiles, uniqueKeys, options)
    if (res) {
      results[res] = (results[res] || 0) + chunk.length
      size += chunk.length
      continue
    }

    const words = wordTokenizer(chunk)
    for (const word of words) {
      // pass 2 - statistical gram analysis, word by word
      // NOTE(review): pass 1 adds chunk.length but this adds the bare accuracy, while both are divided by
      // the character count below - confirm that the different weighting is intended
      const res2 = detectStatsGrams(word, profiles, options)
      res2.forEach((x) => {
        results[x.lang] = (results[x.lang] || 0) + x.accuracy
      })
      size += word.length
    }
  }

  // merge result
  const entries = Object.entries(results).filter((x) => x[1] > 0)
  entries.sort((a, b) => b[1] - a[1])
  const result = entries.map((x) => {
    return { lang: x[0], accuracy: x[1] / size }
  })
  if (options.verbose) console.log('Merge Results', result)
  return result
}
"import": "./dist/tinyld.light.node.mjs", 10 | "browser": "./dist/tinyld.light.browser.js", 11 | "types": "./dist/tinyld.light.node.d.ts" 12 | }, 13 | "./heavy": { 14 | "require": "./dist/tinyld.heavy.node.js", 15 | "import": "./dist/tinyld.heavy.node.mjs", 16 | "browser": "./dist/tinyld.heavy.browser.js", 17 | "types": "./dist/tinyld.heavy.node.d.ts" 18 | }, 19 | ".": { 20 | "require": "./dist/tinyld.normal.node.js", 21 | "import": "./dist/tinyld.normal.node.mjs", 22 | "browser": "./dist/tinyld.normal.browser.js", 23 | "types": "./dist/tinyld.normal.node.d.ts" 24 | } 25 | }, 26 | "typesVersions": { 27 | "*": { 28 | "light": [ 29 | "./dist/tinyld.light.node.d.ts" 30 | ], 31 | "heavy": [ 32 | "./dist/tinyld.heavy.node.d.ts" 33 | ], 34 | "*": [ 35 | "./dist/tinyld.normal.node.d.ts" 36 | ] 37 | } 38 | }, 39 | "bin": { 40 | "tinyld": "./bin/tinyld.js", 41 | "tinyld-light": "./bin/tinyld-light.js", 42 | "tinyld-heavy": "./bin/tinyld-heavy.js" 43 | }, 44 | "keywords": [ 45 | "lang", 46 | "language", 47 | "language detection", 48 | "natural-language", 49 | "detect", 50 | "detector", 51 | "n-gram" 52 | ], 53 | "repository": { 54 | "type": "git", 55 | "url": "https://github.com/komodojp/tinyld.git" 56 | }, 57 | "author": { 58 | "name": "Kevin Destrem", 59 | "email": "kevin_destrem@komodo.jp" 60 | }, 61 | "scripts": { 62 | "train": "run-p train:*", 63 | "train:normal": "cross-env TINYLD_CONFIG=normal ts-node src/train.ts", 64 | "train:light": "cross-env TINYLD_CONFIG=light ts-node src/train.ts", 65 | "train:heavy": "cross-env TINYLD_CONFIG=heavy ts-node src/train.ts", 66 | "bench": "run-s bench:*", 67 | "bench:tinyld": "cross-env TINYLD_CONFIG=normal ts-node src/benchmark/tinyld.ts", 68 | "bench:tinyld-light": "cross-env TINYLD_CONFIG=light ts-node src/benchmark/tinyld_light.ts", 69 | "bench:tinyld-heavy": "cross-env TINYLD_CONFIG=heavy ts-node src/benchmark/tinyld_heavy.ts", 70 | "bench:cld": "ts-node src/benchmark/cld.ts", 71 | "bench:franc": "ts-node 
src/benchmark/franc.ts", 72 | "bench:franc-all": "ts-node src/benchmark/franc-all.ts", 73 | "bench:franc-min": "ts-node src/benchmark/franc-min.ts", 74 | "bench:langdetect": "ts-node src/benchmark/langdetect.ts", 75 | "bench:languagedetect": "ts-node src/benchmark/languagedetect.ts", 76 | "build": "run-p build:*", 77 | "build:normal-node": "esbuild src/index.ts --define:process.env={\\\"TINYLD_CONFIG\\\":\\\"normal\\\"} --bundle --charset=utf8 --minify --platform=node --outfile=dist/tinyld.normal.node.js", 78 | "build:normal-node-esm": "esbuild src/index.ts --define:process.env={\\\"TINYLD_CONFIG\\\":\\\"normal\\\"} --bundle --charset=utf8 --minify --platform=node --format=esm --outfile=dist/tinyld.normal.node.mjs", 79 | "build:normal-web": "esbuild src/index.ts --define:process.env={\\\"TINYLD_CONFIG\\\":\\\"normal\\\"} --bundle --charset=utf8 --platform=browser --format=esm --outfile=dist/tinyld.normal.browser.js", 80 | "build:light-node": "esbuild src/index_light.ts --define:process.env={\\\"TINYLD_CONFIG\\\":\\\"light\\\"} --bundle --charset=utf8 --minify --platform=node --outfile=dist/tinyld.light.node.js", 81 | "build:light-node-esm": "esbuild src/index_light.ts --define:process.env={\\\"TINYLD_CONFIG\\\":\\\"light\\\"} --bundle --charset=utf8 --minify --platform=node --format=esm --outfile=dist/tinyld.light.node.mjs", 82 | "build:light-web": "esbuild src/index_light.ts --define:process.env={\\\"TINYLD_CONFIG\\\":\\\"light\\\"} --bundle --charset=utf8 --minify --platform=browser --format=esm --outfile=dist/tinyld.light.browser.js", 83 | "build:heavy-node": "esbuild src/index_heavy.ts --define:process.env={\\\"TINYLD_CONFIG\\\":\\\"heavy\\\"} --bundle --charset=utf8 --minify --platform=node --outfile=dist/tinyld.heavy.node.js", 84 | "build:heavy-node-esm": "esbuild src/index_heavy.ts --define:process.env={\\\"TINYLD_CONFIG\\\":\\\"heavy\\\"} --bundle --charset=utf8 --minify --platform=node --format=esm --outfile=dist/tinyld.heavy.node.mjs", 85 | 
"build:heavy-web": "esbuild src/index_heavy.ts --define:process.env={\\\"TINYLD_CONFIG\\\":\\\"heavy\\\"} --bundle --charset=utf8 --minify --platform=browser --format=esm --outfile=dist/tinyld.heavy.browser.js", 86 | "build:type": "run-s build:type:*", 87 | "build:type:build": "tsc --emitDeclarationOnly --declaration --project tsconfig.json --outDir dist", 88 | "build:type:normal": "mv ./dist/index.d.ts ./dist/tinyld.normal.node.d.ts && cp ./dist/tinyld.normal.node.d.ts ./dist/tinyld.normal.browser.d.ts", 89 | "build:type:light": "mv ./dist/index_light.d.ts ./dist/tinyld.light.node.d.ts && cp ./dist/tinyld.light.node.d.ts ./dist/tinyld.light.browser.d.ts", 90 | "build:type:heavy": "mv ./dist/index_heavy.d.ts ./dist/tinyld.heavy.node.d.ts && cp ./dist/tinyld.heavy.node.d.ts ./dist/tinyld.heavy.browser.d.ts", 91 | "build:post": "yarn gen:svg && yarn gen:mkd && yarn lint", 92 | "gen:mkd": "node ./utils/mkdown.js", 93 | "gen:svg": "node ./utils/index.js", 94 | "lint": "eslint --ext .js,.ts --fix ./ && prettier --config .prettierrc --ignore-path .prettierignore --write \"**/*.{ts,js,md}\"", 95 | "test": "run-p test:*", 96 | "test:unit": "uvu tests", 97 | "test:dependencies": "yarn audit --level high || echo \"Run 'yarn update' to interactively update dependencies for this project\"", 98 | "test:lint": "eslint --ext .js,.ts ./ && prettier --config .prettierrc --ignore-path .prettierignore --check \"**/*.{ts,js}\"", 99 | "test:types": "tsc --noEmit" 100 | }, 101 | "devDependencies": { 102 | "@types/node": "^18.0.0", 103 | "@typescript-eslint/eslint-plugin": "^4.28.5", 104 | "@typescript-eslint/parser": "^4.28.5", 105 | "chartist-svg": "^0.2.3", 106 | "cld": "^2.7.0", 107 | "cross-env": "^7.0.3", 108 | "esbuild": "^0.14.0", 109 | "eslint": "^7.32.0", 110 | "franc": "^5.0.0", 111 | "franc-all": "^5.0.0", 112 | "franc-min": "^5.0.0", 113 | "langdetect": "^0.2.1", 114 | "languagedetect": "^2.0.0", 115 | "npm-run-all": "^4.1.5", 116 | "p-limit": "3.1.0", 117 | "prettier": 
/**
 * Read a file from disk and parse its content as JSON.
 *
 * @param {string} filepath path of the JSON file
 * @returns {*} the parsed value
 */
function getJSON(filepath) {
  const raw = fs.readFileSync(filepath)
  return JSON.parse(raw)
}

/**
 * Upper-case the first character of a string and leave the rest untouched.
 *
 * @param {string} string text to capitalize
 * @returns {string} the capitalized text
 */
function capitalizeFirstLetter(string) {
  const head = string.charAt(0).toUpperCase()
  const tail = string.slice(1)
  return `${head}${tail}`
}

/**
 * Write `./docs/langs.md`: one section per region (sorted), each listing its
 * supported languages sorted by display name together with their ISO codes.
 */
async function generateDocLangs() {
  const regions = Array.from(new Set(supportedLanguages.map((code) => langRegion(code))))
  regions.sort()

  const sections = regions.map((region) => {
    const members = supportedLanguages.filter((code) => langRegion(code) === region)
    const heading = `\n## ${capitalizeFirstLetter(region)} (${members.length})\n`

    members.sort((left, right) => langName(left).localeCompare(langName(right)))
    const bullets = members.map(
      (code) => `- **${langName(code)}** (ISO Codes: \`${toISO2(code)}\` \`${code}\`)\n`
    )
    return heading + bullets.join('')
  })

  const header =
    `# ${supportedLanguages.length} Supported Languages\n` +
    'This list is auto-generated from the code and up-to-date.\n'
  fs.writeFileSync('./docs/langs.md', `${header}${sections.join('')}`)
}
/**
 * Write `./docs/benchmark.md` from the benchmark results stored under
 * `./data/bench/*.json` (one file per benchmarked library).
 *
 * Only the "Libraries" table is data-driven: each row embeds the library's
 * `stats` entry (success / error / unidentified rates and execution time).
 * The rest of the page is static text.
 */
async function generateDocBenchmark() {
  // one result file per library, keyed by the name used in the table below
  const data = {
    tinyld: getJSON('./data/bench/tinyld.json'),
    'tinyld-light': getJSON('./data/bench/tinyld-light.json'),
    'tinyld-heavy': getJSON('./data/bench/tinyld-heavy.json'),
    langdetect: getJSON('./data/bench/langdetect.json'),
    cld: getJSON('./data/bench/cld.json'),
    franc: getJSON('./data/bench/franc.json'),
    'franc-min': getJSON('./data/bench/franc-min.json'),
    'franc-all': getJSON('./data/bench/franc-all.json'),
    languagedetect: getJSON('./data/bench/languagedetect.json')
  }

  // Renders the four statistic cells of a table row.
  // `unindentified_rate` is spelled this way in the result files, keep it as is.
  const stats = (lib) => {
    return `${data[lib].stats.success_rate}% | ${data[lib].stats.error_rate}% | ${data[lib].stats.unindentified_rate}% | ${data[lib].stats.execution_time}ms.`
  }
  fs.writeFileSync(
    './docs/benchmark.md',
    `# NodeJS Language Detection Benchmark :rocket:
- This kind of benchmark is not perfect and % can vary over time, but it gives a good idea of overall performances
- Language evaluated in this benchmark:
  - Asia: \`jpn\`, \`cmn\`, \`kor\`, \`hin\`
  - Europe: \`fra\`, \`spa\`, \`por\`, \`ita\`, \`nld\`, \`eng\`, \`deu\`, \`fin\`, \`rus\`
  - Middle east: \`tur\`, \`heb\`, \`ara\`
- This page and graphs are auto-generated from the code

---

## Libraries

Here is the list of libraries in this benchmark

| Library | Script | Language | Properly Identified | Improperly identified | Not identified | Avg Execution Time | Disk Size |
| -------------- | --------------------------- | -------- | ------------------- | --------------------- | -------------- | ------------------ | --------- |
| **TinyLD Heavy** | \`yarn bench:tinyld-heavy\` | 64 | ${stats('tinyld-heavy')} | 2.0MB |
| **TinyLD** | \`yarn bench:tinyld\` | 64 | ${stats('tinyld')} | 580KB |
| **TinyLD Light** | \`yarn bench:tinyld-light\` | 24 | ${stats('tinyld-light')} | 68KB |
| langdetect | \`yarn bench:langdetect\` | 53 | ${stats('langdetect')} | 1.8MB |
| node-cld | \`yarn bench:cld\` | 160 | ${stats('cld')} | > 10MB |
| franc | \`yarn bench:franc\` | 187 | ${stats('franc')} | 267KB |
| franc-min | \`yarn bench:franc-min\` | 82 | ${stats('franc-min')} | 119KB |
| franc-all | \`yarn bench:franc-all\` | 403 | ${stats('franc-all')} | 509KB |
| languagedetect | \`yarn bench:languagedetect\` | 52 | ${stats('languagedetect')} | 240KB |

---

## Global Accuracy
![Benchmark](./overall.svg)

We see two group of libraries
- \`tinyld\`, \`langdetect\` and \`cld\` over 90% accuracy
- \`franc\` and \`languagedetect\` under 75% accuracy

## Per Language
![Language](./language.svg)

We see big differences between languages:
* **Japanese** or **Korean** are almost at 100% for every libs (lot of unique characters)
* **Spanish** and **Portuguese** are really close and cause more false-positive and an higher error-rate

## Accuracy By Text length
Most libraries are using statistical analysis, so longer is the input text, better will be the detection.
So we can often see quotes like this in those library documentations.
> Make sure to pass it big documents to get reliable results.

Let's see if this statement is true, and how those libraries behave for different input size (from small to long)
![Size](./length.svg)

So the previous quote is right, over 512 characters all the libs become accurate enough.

But for a ~95% accuracy threshold:
* \`tinyld\` (green) reaches it around 24 characters
* \`langdetect\` (cyan) and \`cld\` (orange) reach it around 48 characters

## Execution Time
![Size](./exec_time.svg)

Here we can notice few things about performance:
* \`langdetect\` (cyan) and \`franc\` (pink) seems to slow down at a similar rate
* \`tinyld\` (green) slow down but at a really flat rate
* \`cld\` (orange) is definitely the fastest and doesn't show any apparent slow down

But we've seen previously that some of those libraries need more than 256 characters to be accurate.
It means they start to slow down at the same time they start to give decent results.

---

## **Conclusion**

### Recommended :thumbsup:

#### - By platform :computer:

- For **NodeJS**: \`TinyLD\`, \`langdetect\` or \`node-cld\` (fast and accurate)
- For **Browser**: \`TinyLD Light\` or \`franc-min\` (small, decent accuracy, franc is less accurate but support more languages)

#### - By usage :speech_balloon:

- Short text (chatbot, keywords, database, ...): \`TinyLD\` or \`langdetect\`
- Long text (documents, webpage): \`node-cld\` or \`TinyLD\`

### Not recommended :thumbsdown:

- \`franc-all\` is the worst in terms of accuracy, not a surprise because it tries to detect 400+ languages with only 3-grams. A technical demo to put big numbers but useless for real usage, even a language like english barely reaches ~45% detection rate.
- \`languagedetect\` is light but just not accurate enough

---

## Last word :raising_hand:

Thanks for reading this article, those metrics are really helpful for the development of \`tinyld\`.
It's used in the development to see the impact of every modification and features.

If you want to contribute or see another library in this benchmark, [open an issue](https://github.com/komodojp/tinyld/issues)`
  )
}
/** A language detector under test: returns the detected code, or '' when unidentified. */
type DetectMethod = (val: string) => Promise<string> | string

/** Aggregated benchmark output: per text-size bucket, overall, and per language. */
export type BenchmarkResult = {
  size: Record<number, BenchmarkSize>
  stats: {
    min: number
    max: number
    success_rate: number
    error_rate: number
    unindentified_rate: number
    execution_time: number
  }
  languages: Record<string, number>
}

type BenchmarkSize = { success_rate: number; error_rate: number; unindentified_rate: number; execution_time: number }
type CountPerSize = {
  min: number
  max: number
  buffer: string
  total: number
  success: number
  error: number
  unidentified: number
  exec: number
}

// Languages evaluated by the benchmark; each code is converted with `toISO2`
// before being compared with the detector output.
const benchLangs = new Set([
  'jpn',
  'cmn',
  'kor',
  'hin',
  'nld',
  'fra',
  'eng',
  'deu',
  'spa',
  'por',
  'ita',
  'fin',
  'rus',
  'tur',
  'heb',
  'ara'
])

/**
 * Milliseconds elapsed since `start` (a `process.hrtime()` value).
 * Both parts of the tuple are used: reading only the nanosecond part would
 * drop every whole second of a slow call.
 */
function elapsedMs(start: [number, number]): number {
  const [seconds, nanoseconds] = process.hrtime(start)
  return seconds * 1000 + nanoseconds / 1000000
}

/**
 * Run `detect` against the sentence files of every benchmark language
 * (`data/tmp/<lang>/sentences.txt`), print a report and return the metrics.
 *
 * Two passes are made over each file:
 *  1. sentence accuracy: every line of at least 16 characters, up to 10000
 *     lines per language, is detected on its own;
 *  2. accuracy by text size: sentences are joined with ". " into one buffer
 *     per size bucket, and up to 200 buffers per bucket and language are
 *     detected.
 *
 * @param detect detector under test
 * @returns per-size, overall and per-language statistics
 */
export async function benchmark(detect: DetectMethod): Promise<BenchmarkResult> {
  const total = new Map<string, number>()
  const success = new Map<string, number>()
  let detectTotal = 0
  let detectIdentified = 0
  let detectUnidentified = 0
  let detectMistake = 0
  let executionTime = 0

  const countCategories = [
    { min: 0, max: 10 },
    { min: 10, max: 16 },
    { min: 16, max: 24 },
    { min: 24, max: 36 },
    { min: 36, max: 48 },
    { min: 48, max: 64 },
    { min: 64, max: 128 },
    { min: 128, max: 256 },
    { min: 256, max: 512 },
    { min: 512, max: 1024 }
  ]
  // bucket keys (upper bound of each category), computed once
  const sizes = countCategories.map((x) => x.max)

  // Holds raw counters while languages are processed, converted to rates afterwards
  const globalCount: Record<number, BenchmarkSize> = Object.fromEntries(
    countCategories.map((x) => [x.max, { success_rate: 0, error_rate: 0, unindentified_rate: 0, execution_time: 0 }])
  )

  const errorMap = new Map<string, number>()

  for (const country of benchLangs.values()) {
    // pass 1 - one detection per sentence
    const fileStream = fs.createReadStream(`data/tmp/${country}/sentences.txt`)
    const rl = readline.createInterface({
      input: fileStream,
      crlfDelay: Infinity
    })

    let line = 0

    const langCount: Record<number, CountPerSize> = Object.fromEntries(
      countCategories.map((x) => [
        x.max,
        { min: x.min, max: x.max, buffer: '', total: 0, success: 0, error: 0, unidentified: 0, exec: 0 }
      ])
    )

    for await (const text of rl) {
      if (text.length < 16) continue
      line += 1
      if (line > 10000) break

      total.set(country, (total.get(country) || 0) + 1)
      detectTotal += 1

      const start = process.hrtime()
      const res = await detect(text)
      executionTime += elapsedMs(start)

      if (res === '') {
        detectUnidentified += 1
      } else if (res === toISO2(country)) {
        success.set(country, (success.get(country) || 0) + 1)
        detectIdentified += 1
      } else {
        detectMistake += 1
        const errorKey = `${toISO2(country)} -> ${res}`
        errorMap.set(errorKey, (errorMap.get(errorKey) || 0) + 1)
      }
    }

    fileStream.close()

    // pass 2 - detection on buffers of increasing size
    const fileStream2 = fs.createReadStream(`data/tmp/${country}/sentences.txt`)
    const rl2 = readline.createInterface({
      input: fileStream2,
      crlfDelay: Infinity
    })

    for await (const text of rl2) {
      for (const size of sizes) {
        const bucket = langCount[size]

        // still room in this bucket: keep accumulating sentences
        if (bucket.buffer.length + text.length < bucket.max) {
          if (bucket.buffer) {
            bucket.buffer += `. ${text}`
          } else {
            bucket.buffer = text
          }

          continue
        }

        if (
          bucket.buffer &&
          bucket.total < 200 &&
          bucket.buffer.length >= bucket.min &&
          bucket.buffer.length <= bucket.max
        ) {
          const start = process.hrtime()
          const res = await detect(bucket.buffer)
          bucket.exec += elapsedMs(start)
          if (res === '') {
            bucket.unidentified += 1
          } else if (res === toISO2(country)) {
            bucket.success += 1
          } else {
            bucket.error += 1
          }
          bucket.total += 1
        }

        // start over; the sentence that did not fit is not carried over
        bucket.buffer = ''
      }
    }

    fileStream2.close()

    for (const size of sizes) {
      globalCount[size].success_rate += langCount[size].success
      globalCount[size].error_rate += langCount[size].error
      globalCount[size].unindentified_rate += langCount[size].unidentified
      globalCount[size].execution_time += langCount[size].exec
    }
  }

  // turn the raw counters into percentages / average time
  for (const size of sizes) {
    const entry = globalCount[size]
    const cpt = entry.success_rate + entry.error_rate + entry.unindentified_rate
    // a bucket without any sample keeps its zeros instead of becoming NaN
    if (cpt === 0) continue

    entry.success_rate = approximate((entry.success_rate / cpt) * 100)
    entry.error_rate = approximate((entry.error_rate / cpt) * 100)
    entry.unindentified_rate = approximate((entry.unindentified_rate / cpt) * 100)
    entry.execution_time = approximate(entry.execution_time / cpt)
  }

  console.log(`--- Per language Accuracy ---`)
  const languageAccuracy: [string, number][] = []
  const acc: [number, string][] = []
  for (const lang of total.keys()) {
    // a language that was never identified counts as 0 success
    const s = success.get(lang) || 0
    const t = total.get(lang) || 1
    acc.push([s / t, ` - ${langName(lang)} (${lang}) - ${approximate((s / t) * 100)}% (coef: ${getCoef(lang)})`])
    languageAccuracy.push([lang, approximate((s / t) * 100)])
  }
  acc.sort((a, b) => b[0] - a[0])
  languageAccuracy.sort((a, b) => b[1] - a[1])
  acc.forEach((x) => console.log(x[1]))

  const errors = [...errorMap.entries()]
  errors.sort((a, b) => b[1] - a[1])
  console.log(
    `\n--- More common errors (${
      Math.round((detectMistake / detectTotal) * 100 * 100) / 100
    }% : ${detectMistake} / ${detectTotal}) ---`
  )
  console.log(
    errors
      .map((x) => ` - ${x[0]} : ${approximate((100 * x[1]) / detectMistake)}% (error: ${x[1]})`)
      .slice(0, 20)
      .join('\n')
  )

  console.log(`\n--- Summary (${langs.size} languages) ---`)
  console.log(` - Properly identified: ${approximate((detectIdentified / detectTotal) * 100)}%`)
  console.log(` - Improperly identified: ${approximate((detectMistake / detectTotal) * 100)}%`)
  console.log(` - Unidentified: ${approximate((detectUnidentified / detectTotal) * 100)}%`)
  console.log(` - Avg exec time: ${approximate(executionTime / detectTotal)}ms.`)

  return {
    size: globalCount,
    stats: {
      min: Math.min(...languageAccuracy.map((x) => x[1])),
      max: Math.max(...languageAccuracy.map((x) => x[1])),
      success_rate: approximate((detectIdentified / detectTotal) * 100),
      error_rate: approximate((detectMistake / detectTotal) * 100),
      unindentified_rate: approximate((detectUnidentified / detectTotal) * 100),
      execution_time: approximate(executionTime / detectTotal)
    },
    languages: Object.fromEntries(languageAccuracy)
  }
}
3 |
4 |
5 |
6 |

TinyLD Playground

7 |

8 | Tiny Language Detector, simply detect the language of a unicode UTF-8 text 9 |

10 |
11 |
12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 |
20 | 21 | 22 |
23 |
24 |