├── .gitignore ├── README.md ├── benchmark-testing ├── analyze.ts ├── constants.js ├── data │ └── .gitkeep ├── download-test-data.js ├── helpers.js ├── package-lock.json ├── package.json └── results │ ├── COMPARISONS.md │ ├── RESULTS.md │ ├── RESULTS_with_metadata.csv │ ├── benchmark_results_0.2.1.json │ ├── benchmark_results_0.2.1_data.json │ ├── benchmark_results_0.2.2.json │ ├── benchmark_results_0.2.2_data.json │ ├── benchmark_results_0.2.3.json │ ├── benchmark_results_0.2.3_data.json │ ├── reliability_list_0.2.2.json │ └── reliability_list_0.2.3.json ├── model └── .gitkeep ├── package-lock.json ├── package.json ├── rollup.config.js ├── src └── index.ts └── tsconfig.json /.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | node_modules 5 | model/fast-text-lid-model.bin 6 | benchmark-testing/data/* 7 | !benchmark-testing/data/.gitkeep 8 | 9 | # testing 10 | coverage 11 | 12 | # production 13 | dist 14 | build 15 | 16 | # misc 17 | .DS_Store 18 | .env.local 19 | .env.development.local 20 | .env.test.local 21 | .env.production.local 22 | 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fast-Text Language Detection 2 | 3 | In a search for the _best_ option for predicting a language from text which didn't require a large machine learning model, it appeared that fast-text, created by FaceBook, was the best option (https://towardsdatascience.com/benchmarking-language-detection-for-nlp-8250ea8b67c). 4 | 5 | ## Installation 6 | 7 | ``` 8 | npm i --save @smodin/fast-text-language-detection 9 | ``` 10 | 11 | **Note: This will install the fast-text model by facebook which is about 150MB. 
You also need Python installed. If you're running an Alpine Docker image, see how to easily do this [here](https://stackoverflow.com/questions/54428608/docker-node-alpine-image-build-fails-on-node-gyp).** 12 | 13 | ## Usage 14 | 15 | ### Prediction 16 | 17 | _Testing_ 18 | 19 | ```js 20 | ;(async () => { 21 | const LanguageDetection = require('@smodin/fast-text-language-detection') 22 | const lid = new LanguageDetection() 23 | 24 | console.log(await lid.predict('FastText-LID provides a great language identification')) 25 | console.log(await lid.predict('FastText-LID bietet eine hervorragende Sprachidentifikation')) 26 | console.log(await lid.predict('FastText-LID fornisce un ottimo linguaggio di identificazione')) 27 | console.log(await lid.predict('FastText-LID fournit une excellente identification de la langue')) 28 | console.log(await lid.predict('FastText-LID proporciona una gran identificación de idioma')) 29 | console.log(await lid.predict('FastText-LID обеспечивает отличную идентификацию языка')) 30 | console.log(await lid.predict('FastText-LID提供了很好的語言識別')) 31 | })() 32 | ``` 33 | 34 | > The second argument is the number of returned responses, i.e. 
`lid.predict(text, 10)` will return an array of 10 results 35 | 36 | _Output_ 37 | 38 | ``` 39 | [ { lang: 'en', prob: 0.6313226222991943, isReliableLanguage: true } ] 40 | [ { lang: 'de', prob: 0.9137917160987854, isReliableLanguage: true } ] 41 | [ { lang: 'it', prob: 0.974501371383667, isReliableLanguage: true } ] 42 | [ { lang: 'fr', prob: 0.7358829379081726, isReliableLanguage: true } ] 43 | [ { lang: 'es', prob: 0.9211937189102173, isReliableLanguage: true } ] 44 | [ { lang: 'ru', prob: 0.9899846911430359, isReliableLanguage: true } ] 45 | [ { lang: 'zh', prob: 0.8515647649765015, isReliableLanguage: true } ] 46 | ``` 47 | 48 | > `isReliableLanguage` is true if there were 10+ test results and accuracy was 95% or more 49 | 50 | ### Other Helpers 51 | 52 | ```js 53 | const LanguageDetection = require('@smodin/fast-text-language-detection') 54 | const lid = new LanguageDetection() 55 | const languageIsoCodes = lid.languageIsoCodes // ['af', 'als', 'am', 'an', 'ar', ...] 56 | ``` 57 | 58 | ## Similar Libraries 59 | 60 | FastText has been used and implemented in other programming languages. 61 | 62 | - [Python](https://github.com/indix/whatthelang) 63 | 64 | ## Reference Documents 65 | 66 | - FastText model 176: https://fasttext.cc/docs/en/language-identification.html 67 | 68 | ## Accuracy from Benchmark Testing 69 | 70 | ### Long Input (30 to 250 characters) 71 | 72 | Translated sentence data was obtained from tatoeba.org. Additional metadata can be found in `benchmark-testing/results/RESULTS_with_metadata.csv`. 73 | 74 | Testing the 550k sentences of 30 - 250 characters took less than 30 seconds (personal MacBook Pro). 
75 | 76 | | Language (101) | Symbol (alternates) | Count (558260) | Accuracy (30 - 250 chars) | Mislabels | False Positives | 77 | | -------------------------------- | ------------------- | -------------- | ------------------------- | ---------------- | --------------- | 78 | | English | en | 22428 | 1 | | 120 | 79 | | Greek | el | 12039 | 1 | | 0 | 80 | | Hebrew | he | 8616 | 1 | | 0 | 81 | | Japanese | ja | 2169 | 1 | | 0 | 82 | | Georgian | ka | 1973 | 1 | | 0 | 83 | | Bengali | bn | 1164 | 1 | | 131 | 84 | | Thai | th | 572 | 1 | | 0 | 85 | | Mandarin Chinese | zh | 568 | 1 | | 0 | 86 | | Malayalam | ml | 517 | 1 | | 0 | 87 | | Korean | ko | 482 | 1 | | 7 | 88 | | Burmese | my | 216 | 1 | | 0 | 89 | | Tamil | ta | 205 | 1 | | 0 | 90 | | Kannada | kn | 118 | 1 | | 1 | 91 | | Telugu | te | 102 | 1 | | 0 | 92 | | Punjabi (Eastern) | pa | 88 | 1 | | 0 | 93 | | Lao | lo | 70 | 1 | | 0 | 94 | | Gujarati | gu | 57 | 1 | | 0 | 95 | | Tibetan | bo | 20 | 1 | | 0 | 96 | | Divehi, Dhivehi, Maldivian | dv | 15 | 1 | | 0 | 97 | | Sinhala | si | 9 | 1 | | 0 | 98 | | Amharic | am | 3 | 1 | | 0 | 99 | | German | de | 22014 | 0.9998637230853094 | en | 64 | 100 | | Polish | pl | 17768 | 0.999718595227375 | en,eo,de,ro | 88 | 101 | | Russian | ru | 17329 | 0.9997114663281205 | bg,kk,uk,mk | 241 | 102 | | Hungarian | hu | 17942 | 0.9996655891204994 | tr,br,it,de,en | 43 | 103 | | Hindi | hi | 5362 | 0.999627004848937 | mr | 0 | 104 | | Vietnamese | vi | 13000 | 0.9996153846153846 | eo,hu,fr | 9 | 105 | | Turkish | tr | 19919 | 0.9995983734123199 | eo,en,it,fr,nds | 1092 | 106 | | Esperanto | eo | 17841 | 0.999551594641556 | it,es,pt,fr,ceb | 13 | 107 | | French | fr | 23076 | 0.999523314265904 | en,es,it,ru | 238 | 108 | | Marathi | mr | 10461 | 0.9995220342223496 | hi | 2 | 109 | | Uyghur | ug | 3692 | 0.9991874322860238 | ba,ru,hu | 0 | 110 | | Finnish | fi | 17406 | 0.9990807767436516 | it,et,en,hr,de | 37 | 111 | | Italian | it | 18326 | 0.9989632216522972 | es,de,fr,en,la | 
2207 | 112 | | Spanish | es | 18227 | 0.998134635430954 | pt,it,io,ca,ia | 3476 | 113 | | Armenian | hy | 518 | 0.9980694980694981 | de | 0 | 114 | | Arabic | ar | 8761 | 0.9978312977970552 | arz,fa,es,mzn,en | 0 | 115 | | Ukrainian | uk | 14285 | 0.9963598179908996 | ru,sr | 133 | 116 | | Macedonian | mk | 14465 | 0.9959903214656066 | bg,sr,ru | 93 | 117 | | Dutch | nl | 19626 | 0.9934780393355752 | en,af,de,nds,fr | 382 | 118 | | Lithuanian | lt | 13835 | 0.9933501987712324 | fi,pl,eo,pt,sr | 20 | 119 | | Portuguese | pt | 20174 | 0.9933082184990581 | es,gl,it,en,fr | 1149 | 120 | | Khmer | km | 379 | 0.9920844327176781 | az,et | 0 | 121 | | Urdu | ur | 963 | 0.9906542056074766 | pnb,fa,ro,en | 9 | 122 | | Czech | cs | 10863 | 0.9898738838258307 | sk,pl,hu,sl,en | 1 | 123 | | Swedish | sv | 12188 | 0.9886773875943551 | no,da,en,fi,id | 174 | 124 | | Romanian | ro | 13560 | 0.9886430678466077 | es,fr,it,en,pt | 133 | 125 | | Bulgarian | bg | 11144 | 0.9869885139985642 | mk,ru,uk,sr | 2 | 126 | | Ossetian | os | 59 | 0.9830508474576272 | ru | 0 | 127 | | Icelandic | is | 6364 | 0.9803582652419862 | et,no,da,hu,cs | 4 | 128 | | Kazakh | kk | 2232 | 0.9802867383512545 | ru,tr,tt,uk,ky | 4 | 129 | | Tagalog | tl | 10351 | 0.9737223456670853 | ceb,en,id,es,war | 21 | 130 | | Tatar | tt | 8178 | 0.9680851063829787 | az,tr,ru,fi,kk | 13 | 131 | | Basque | eu | 2999 | 0.9676558852950984 | it,nl,id,en,io | 14 | 132 | | Tajik | tg | 30 | 0.9666666666666667 | ru | 0 | 133 | | Belarusian | be | 6253 | 0.9625779625779626 | uk,ru,pl,bg,sr | 0 | 134 | | Latvian | lv | 1243 | 0.9597747385358005 | lt,hr,sr,fi,eo | 4 | 135 | | Chuvash | cv | 460 | 0.9543478260869566 | ru,uk,ba,sr | 0 | 136 | | Breton | br | 2451 | 0.9543043655650755 | fr,nl,eu,de,pt | 0 | 137 | | Bashkir | ba | 120 | 0.95 | tt,av | 0 | 138 | | Indonesian | id | 9372 | 0.949637217242851 | ms,it,en,eo,tr | 16 | 139 | | Danish | da | 15299 | 0.948035819334597 | no,sv,de,en,nn | 2 | 140 | | Estonian | et | 1227 | 
0.9356153219233904 | fi,en,hu,it,nl | 5 | 141 | | Latin | la | 11437 | 0.9206085511934948 | fr,it,en,es,pt | 292 | 142 | | Irish | ga | 867 | 0.9065743944636678 | en,gd,ca,kv,cs | 14 | 143 | | Scottish Gaelic | gd | 542 | 0.8966789667896679 | en,ga,de,fr,pam | 2 | 144 | | Welsh | cy | 619 | 0.8917609046849758 | es,en,la,kw,de | 8 | 145 | | Catalan | ca | 4725 | 0.8833862433862434 | es,pt,fr,it,ro | 0 | 146 | | Kyrgyz | ky | 66 | 0.8787878787878788 | ru,kk | 4 | 147 | | Cornish | kw | 426 | 0.8779342723004695 | en,cy,de,br,sq | 1 | 148 | | Assamese | as | 960 | 0.8635416666666667 | bn | 0 | 149 | | Volapük | vo | 806 | 0.8511166253101737 | id,de,fi,en,eo | 15 | 150 | | Serbian | sr | 13494 | 0.8489699125537276 | hr,sh,mk,bs,sl | 1050 | 151 | | Slovak | sk | 4370 | 0.8263157894736842 | cs,pl,sl,no,sr | 45 | 152 | | Maltese | mt | 52 | 0.8076923076923077 | es,cs,pt,sr,eo | 7 | 153 | | Norwegian Nynorsk | nn (no) | 657 | 0.7990867579908676 | da,sv,de,es,fi | 29 | 154 | | Afrikaans | af | 1632 | 0.7879901960784313 | nl,en,fr,de,nds | 0 | 155 | | Occitan | oc | 2861 | 0.7679133170220203 | ca,es,fr,pt,it | 27 | 156 | | Interlingua | ia | 18782 | 0.7500798636992866 | es,it,fr,la,pt | 82 | 157 | | Sanskrit | sa | 11 | 0.7272727272727273 | hi,ne | 0 | 158 | | Chechen | ce | 7 | 0.7142857142857143 | mn,ru | 0 | 159 | | Slovenian | sl | 372 | 0.6774193548387096 | sr,hr,bs,pl,eo | 62 | 160 | | Frisian | fy | 107 | 0.6635514018691588 | nl,en,de,af,fr | 8 | 161 | | Javanese | jv | 260 | 0.6461538461538462 | id,en,ms,ko,su | 5 | 162 | | Yoruba | yo | 5 | 0.6 | sk,rm | 1 | 163 | | Luxembourgish | lb | 217 | 0.5944700460829493 | de,nds,sv,fr,nl | 3 | 164 | | Galician | gl | 2618 | 0.5790679908326967 | pt,es,it,fr,ca | 8 | 165 | | Turkmen | tk | 3793 | 0.5710519377801213 | tr,uz,en,et,io | 0 | 166 | | Croatian | hr | 2222 | 0.5333033303330333 | sr,sh,bs,sl,pl | 45 | 167 | | Aragonese | an | 4 | 0.5 | es | 0 | 168 | | Ido | io | 2905 | 0.48055077452667816 | eo,es,it,pt,tr | 7 | 169 | 
| Interlingue | ie | 2007 | 0.4718485301444943 | es,it,fr,en,ia | 7 | 170 | | Limburgan, Limburger, Limburgish | li | 3 | 0.3333333333333333 | de | 1 | 171 | | Walloon | wa | 16 | 0.3125 | fr,pt,tl,oc,en | 1 | 172 | | Somali | so | 32 | 0.21875 | fi,eo,cy,en,az | 1 | 173 | | Corsican | co | 5 | 0.2 | it,fr | 0 | 174 | | Sundanese | su | 11 | 0.18181818181818182 | id,ms,es | 19 | 175 | | Haitian Creole | ht | 15 | 0.06666666666666667 | br,fr,su,diq,no | 3 | 176 | | Romansh | rm | 16 | 0.0625 | it,fr,en,tl,qu | 3 | 177 | | Bosnian | bs | 139 | 0.03597122302158273 | sr,hr,sh,pl,sl | 0 | 178 | | Manx | gv | 6 | 0 | cy,fr,nl,et,en | 0 | 179 | 180 | ### Short Form (10 to 40 characters) 181 | 182 | As a test of accuracy on shorter phrases, the min and max character count was changed to 10 - 40, and similar results can be seen for major languages, but less known languages suffer significantly: 183 | 184 | | Language (102) | Symbol (alternates) | Count (837539) | Accuracy (10 - 40 chars) | Mislabels | 185 | | -------------------------------- | ------------------- | -------------- | ------------------------ | ---------------- | 186 | | Thai | th | 3399 | 1 | | 187 | | Malayalam | ml | 525 | 1 | | 188 | | Burmese | my | 243 | 1 | | 189 | | Tamil | ta | 229 | 1 | | 190 | | Telugu | te | 220 | 1 | | 191 | | Punjabi (Eastern) | pa | 156 | 1 | | 192 | | Amharic | am | 154 | 1 | | 193 | | Kannada | kn | 126 | 1 | | 194 | | Gujarati | gu | 116 | 1 | | 195 | | Sinhala | si | 37 | 1 | | 196 | | Tibetan | bo | 29 | 1 | | 197 | | Divehi, Dhivehi, Maldivian | dv | 15 | 1 | | 198 | | Japanese | ja | 28060 | 0.9999643620812545 | zh | 199 | | Greek | el | 24980 | 0.9999599679743795 | en | 200 | | Hebrew | he | 26461 | 0.9999244170666264 | en,yi | 201 | | Korean | ko | 6128 | 0.9996736292428199 | tr,ja | 202 | | Armenian | hy | 1855 | 0.9994609164420485 | de | 203 | | Bengali | bn | 4132 | 0.9992739593417231 | bpy,as | 204 | | Marathi | mr | 25633 | 0.9989466703078064 | hi,gom,pt,new | 205 
| | English | en | 17094 | 0.9986544986544986 | nl,it,hu,eo,es | 206 | | Mandarin Chinese | zh | 17801 | 0.9978652884669401 | wuu,yue,ja,sr,pt | 207 | | Turkish | tr | 18879 | 0.9978282748026909 | en,eo,az,es,it | 208 | | Russian | ru | 20855 | 0.9977942939343083 | uk,bg,mk,sr,be | 209 | | German | de | 17223 | 0.9974452766649248 | en,it,fr,es,sv | 210 | | Uyghur | ug | 6135 | 0.9973920130399349 | ar,ba,tt,ca,hu | 211 | | Vietnamese | vi | 13130 | 0.9971058644325971 | it,pms,eo,pt,fr | 212 | | Esperanto | eo | 21641 | 0.9966729818400258 | it,es,tr,pt,pl | 213 | | Georgian | ka | 4550 | 0.996043956043956 | xmf,en | 214 | | Hindi | hi | 11497 | 0.9958249978255197 | mr,dty,new,bh,ne | 215 | | Italian | it | 20449 | 0.995598806787618 | es,en,fr,eo,pt | 216 | | Arabic | ar | 25531 | 0.9955348399984333 | arz,fa,en,mzn,ps | 217 | | French | fr | 16040 | 0.9953865336658354 | en,it,ia,es,pt | 218 | | Hungarian | hu | 20843 | 0.9952502039053879 | en,pt,it,nl,eo | 219 | | Lao | lo | 183 | 0.994535519125683 | el | 220 | | Polish | pl | 21386 | 0.9940147760216964 | en,it,eo,de,cs | 221 | | Khmer | km | 1252 | 0.9920127795527156 | az,ru,sr,et | 222 | | Spanish | es | 20498 | 0.9895599570689824 | pt,it,fr,ca,en | 223 | | Finnish | fi | 20731 | 0.9849500747672567 | it,en,eo,et,nl | 224 | | Portuguese | pt | 18352 | 0.9833805579773321 | es,it,gl,fr,en | 225 | | Macedonian | mk | 23602 | 0.9830099144140327 | ru,bg,sr,uk | 226 | | Ukrainian | uk | 23251 | 0.982667412154316 | ru,mk,bg,be,sr | 227 | | Urdu | ur | 1583 | 0.9797852179406191 | pnb,fa,ug,en,ro | 228 | | Dutch | nl | 19349 | 0.9720915809602564 | en,de,nds,af,fr | 229 | | Lithuanian | lt | 24184 | 0.9597667879589812 | eo,fi,sr,pt,pl | 230 | | Czech | cs | 25189 | 0.951605859700663 | sk,pl,hu,en,sl | 231 | | Chuvash | cv | 1332 | 0.9481981981981982 | ru,uk,krc,ba,sr | 232 | | Tatar | tt | 8283 | 0.9471206084751902 | ru,tr,az,kk,ky | 233 | | Swedish | sv | 24466 | 0.9464563067113545 | da,no,en,de,eo | 234 | | Icelandic | is | 
7745 | 0.9449967721110394 | da,et,cs,no,de | 235 | | Bulgarian | bg | 19328 | 0.9352235099337748 | mk,ru,uk,sr,tg | 236 | | Sanskrit | sa | 135 | 0.9259259259259259 | hi,ne,mr | 237 | | Kazakh | kk | 2373 | 0.9258322798145807 | uk,tt,tr,ru,ky | 238 | | Romanian | ro | 18367 | 0.9235041106332008 | it,es,en,fr,pt | 239 | | Tagalog | tl | 11133 | 0.9193389023623462 | ceb,en,it,id,es | 240 | | Ossetian | os | 205 | 0.9170731707317074 | ru,hy,sr,kv,mrj | 241 | | Indonesian | id | 9707 | 0.9138765839085197 | ms,en,it,eo,tr | 242 | | Danish | da | 22539 | 0.9081591907360576 | no,sv,de,en,fr | 243 | | Latin | la | 24699 | 0.8979310903275436 | it,fr,en,es,pt | 244 | | Basque | eu | 4570 | 0.8851203501094091 | it,id,hu,nl,eo | 245 | | Belarusian | be | 9005 | 0.8785119378123265 | ru,uk,bg,mk,pl | 246 | | Cornish | kw | 3757 | 0.8759648655842428 | en,de,cy,es,br | 247 | | Tajik | tg | 48 | 0.875 | ru,uk | 248 | | Latvian | lv | 2198 | 0.8735213830755232 | lt,es,sr,en,fr | 249 | | Breton | br | 5468 | 0.8579005120702268 | en,fr,pt,de,eu | 250 | | Irish | ga | 1977 | 0.840161861406171 | en,pt,es,ca,gd | 251 | | Bashkir | ba | 128 | 0.8359375 | tt,ru,sr,av,kk | 252 | | Sindhi | sd | 6 | 0.8333333333333334 | ur | 253 | | Serbian | sr | 23128 | 0.8054738844690419 | hr,mk,sh,ru,sl | 254 | | Estonian | et | 3077 | 0.8043548911277218 | fi,en,hu,tr,it | 255 | | Scottish Gaelic | gd | 753 | 0.7822045152722443 | en,ga,de,fr,pam | 256 | | Welsh | cy | 1167 | 0.7660668380462725 | en,es,kw,la,it | 257 | | Volapük | vo | 3941 | 0.7609743719868054 | id,en,eo,fi,de | 258 | | Kyrgyz | ky | 227 | 0.7533039647577092 | ru,kk,tt,mn,bg | 259 | | Catalan | ca | 5313 | 0.7504234895539243 | es,pt,it,fr,en | 260 | | Assamese | as | 2635 | 0.7127134724857686 | bn,bpy,en,tl,bh | 261 | | Yoruba | yo | 31 | 0.7096774193548387 | ga,pl,en,qu,ckb | 262 | | Occitan | oc | 4096 | 0.70751953125 | es,fr,ca,pt,it | 263 | | Interlingua | ia | 14949 | 0.7073382834972239 | it,es,fr,en,la | 264 | | Afrikaans | af | 
3299 | 0.6808123673840558 | nl,en,de,fr,nds | 265 | | Norwegian Nynorsk | nn (no) | 1287 | 0.6798756798756799 | da,sv,de,es,hu | 266 | | Maltese | mt | 165 | 0.6727272727272727 | hu,en,es,it,pl | 267 | | Slovak | sk | 13877 | 0.6105786553289616 | cs,pl,sl,no,sr | 268 | | Chechen | ce | 25 | 0.6 | bg,sr,mn,ba,uk | 269 | | Interlingue | ie | 6538 | 0.5183542367696543 | es,it,en,fr,eo | 270 | | Ido | io | 6495 | 0.4857582755966128 | eo,es,it,pt,tr | 271 | | Slovenian | sl | 908 | 0.46255506607929514 | sr,hr,cs,pl,bs | 272 | | Javanese | jv | 548 | 0.45255474452554745 | id,en,ko,ms,hu | 273 | | Turkmen | tk | 4585 | 0.45169029443838604 | tr,en,uz,et,pl | 274 | | Croatian | hr | 4186 | 0.4362159579550884 | sr,sh,bs,sl,pl | 275 | | Galician | gl | 3245 | 0.4200308166409861 | pt,es,it,en,fr | 276 | | Luxembourgish | lb | 732 | 0.3975409836065574 | de,fr,en,nds,nl | 277 | | Frisian | fy | 282 | 0.36879432624113473 | nl,en,nds,de,fr | 278 | | Walloon | wa | 37 | 0.2972972972972973 | fr,en,no,it,gn | 279 | | Corsican | co | 13 | 0.23076923076923078 | it,min,ro,ilo,id | 280 | | Sundanese | su | 18 | 0.2222222222222222 | id,es,en,it,lmo | 281 | | Somali | so | 61 | 0.14754098360655737 | en,fi,et,cy,su | 282 | | Limburgan, Limburger, Limburgish | li | 34 | 0.14705882352941177 | de,nl,en,no,is | 283 | | Haitian Creole | ht | 58 | 0.1206896551724138 | en,fr,br,la,de | 284 | | Manx | gv | 30 | 0.06666666666666667 | en,it,pt,fr,kw | 285 | | Bosnian | bs | 520 | 0.04423076923076923 | sr,hr,sh,it,pl | 286 | | Aragonese | an | 73 | 0.0136986301369863 | es,pt,it,en,fr | 287 | | Romansh | rm | 11 | 0 | it,pt,fr,en,tl | 288 | 289 | ### Additional Insights 290 | 291 | - During testing, the highest incorrect probability was often near 1, which means it's not possible to use a high possibility to suggest a correct assessment 292 | 293 | - The lowest probability for a correct assessment varried widely. 
Although these were good predictors for some of the very accurate languages (99.9%), other languages were sometimes as low as a .09 probability. This means it's not possible to use a low probability as an accurate assessment of a false positive. 294 | 295 | - To improve expectations of an incorrect result, you can use the difference in probability of result 1 and 2. It appears that the average probability difference between 1 and 2 is somewhat of an indicator of a potentially incorrect prediction. 296 | 297 | - Anything over 100 characters is strongly accurate, though there aren't enough sentences in the test data to ensure this for all the test languages. 55 out of 82 languages that had this data had a 99% or better accuracy, 63 had 90%+ accuracy, 72 had 75%+ accuracy. For 200+ characters, 42 of 47 languages had a perfect score, though most had fewer than 10 test cases. 298 | 299 | - Spanish tends to give the most false positives based on sheer quantity of percentage of false positives. 300 | 301 | - In attempting to add a second check with `franc` for a smaller difference in probabilities between language 1 and 2 (i.e. less than 0.2), only the worst performing languages showed significant benefit. There doesn't seem to be a trend for any other languages. You can see this data in `COMPARISONS.md`. 302 | 303 | ## Improving Accuracy 304 | 305 | Most incorrect suggestions are due to non-text characters (i.e. punctuation) that should be filtered out to provide better results. Please submit an issue for incorrect suggestions so we can work on improving the accuracy. 
306 | 307 | ## Comparison NPM Libaries 308 | 309 | Success benchmarking has been checked with other popular libraries (notably `franc` and `languagedetect`) and results are included in `benchmark-testing/results/COMPARISONS.md` 310 | 311 | ## Sample Dockerfile 312 | 313 | Note: You need to have python installed to make this work in alpine-node 314 | 315 | ```Dockerfile 316 | FROM mhart/alpine-node:14 317 | 318 | WORKDIR /usr/src/app 319 | 320 | COPY package*.json ./ 321 | 322 | RUN apk add --no-cache --virtual .gyp \ 323 | python \ 324 | make \ 325 | g++ \ 326 | && npm ci --only=production \ 327 | && apk del .gyp 328 | 329 | COPY . ./ 330 | 331 | CMD [ "npm", "start" ] 332 | ``` 333 | 334 | ## TODO List 335 | 336 | - Improve accuracy by replicating the test analysis from https://towardsdatascience.com/benchmarking-language-detection-for-nlp-8250ea8b67c and attempt to improve the `formatText()` function by strategically choosing punctuation / non-text characters. 337 | 338 | _This is an improved modification of https://www.npmjs.com/package/fasttext-lid_ 339 | 340 | Created with <3 for https://smodin.io 341 | -------------------------------------------------------------------------------- /benchmark-testing/analyze.ts: --------------------------------------------------------------------------------
const fs = require('fs')
const { version } = require('../package.json')
const csv = require('csvtojson')
const { asyncPoolForEach, getTSVsInDir } = require('./helpers')
const { tatoeba2Languages, fastTextLanguages } = require('./constants')
const LanguageDetection = require('../src/index.ts')
const lid = new LanguageDetection()

/** Sentences of one language selected for benchmarking, labelled with its fastText symbol. */
interface BenchmarkDataset {
  language: string
  alternativeSymbols?: string[]
  texts: string[]
}

/** Aggregated benchmark metrics for a single language. */
interface LanguageResult {
  count: number
  accuratePredictions: number
  /** Up to five most frequent wrong labels, most frequent first. */
  mislabels: { lang: string; count: number }[]
  accuracy: number
  /** How many sentences of OTHER languages were predicted as this language. */
  falsePositives: number
  lowestProbability: number
  highestFalseProbability: number
  /** Mean (top-1 minus top-2 probability) over correct predictions; undefined when there were none. */
  correctAvgConfidence?: number
  /** Mean (top-1 minus top-2 probability) over incorrect predictions; undefined when there were none. */
  incorrectAvgConfidence?: number
}

/** Benchmark results keyed by fastText language symbol. */
type BenchmarkResults = Record<string, LanguageResult>

/** Path (relative to the working directory) of the sentence TSV for an ISO 639-3 code. */
const getFileNameAndPath = (iso3Code: string) => `data/${iso3Code}_sentences.tsv`

/**
 * Reads the sentence TSV for a language and returns the usable sentences.
 *
 * Only the first `limit` rows of the file are considered (the limit is applied
 * before filtering). A row is kept when it has exactly three tab-separated
 * columns and its third column (the sentence) has a length within
 * [minSentenceLength, maxSentenceLength].
 */
const getTsvSentences = (
  iso3Code: string,
  limit: number,
  minSentenceLength: number,
  maxSentenceLength: number
): string[] => {
  const content: string = fs.readFileSync(getFileNameAndPath(iso3Code), 'utf8')
  const sentences: string[] = []

  for (const row of content.split('\n').slice(0, limit)) {
    const columns = row.split('\t')
    if (columns.length !== 3) {
      continue
    }
    const sentence = columns[2]
    if (sentence.length >= minSentenceLength && sentence.length <= maxSentenceLength) {
      sentences.push(sentence)
    }
  }

  return sentences
}

/**
 * Collects benchmark sentences for every language that has a TSV file in `data`.
 *
 * @param limit maximum number of TSV rows read per language
 * @param minSentenceLength shortest sentence (in characters) to keep
 * @param maxSentenceLength longest sentence (in characters) to keep
 * @param includeOnly when given, only these fastText symbols are included
 * @returns a tuple of the per-language datasets and the total sentence count
 */
const buildData = async (
  limit: number,
  minSentenceLength: number,
  maxSentenceLength: number,
  includeOnly?: string[]
): Promise<[BenchmarkDataset[], number]> => {
  const TSVFiles: string[] = getTSVsInDir('data')
  const iso3Langs: string[] = tatoeba2Languages.map((lang: any) => lang.iso3)
  // File names start with the three-letter ISO code.
  const iso3LangsWithData = TSVFiles.map((file: string) => file.substring(0, 3)).filter((iso3: string) =>
    iso3Langs.includes(iso3)
  )

  let sentenceCount = 0
  const DATA: BenchmarkDataset[] = []
  await asyncPoolForEach(iso3LangsWithData, async (iso3Lang: string) => {
    const tatoeba2Language = tatoeba2Languages.find((lang: any) => lang.iso3 === iso3Lang)
    const { fastTextSymbol, alternativeSymbols } = tatoeba2Language

    // Languages the model has no label for cannot be scored; without this guard
    // they would be recorded under the key "undefined" and break report generation.
    if (!fastTextSymbol) {
      return
    }
    if (includeOnly && !includeOnly.includes(fastTextSymbol)) {
      return
    }

    const sentences = getTsvSentences(iso3Lang, limit, minSentenceLength, maxSentenceLength)
    if (sentences.length === 0) {
      return
    }

    console.info(`${fastTextSymbol}: ${sentences.length} sentences`)
    sentenceCount = sentenceCount + sentences.length
    DATA.push({
      language: fastTextSymbol,
      alternativeSymbols,
      texts: sentences,
    })
  })

  console.info(`FINAL: ${DATA.length} languages & ${sentenceCount} sentences`)

  return [DATA, sentenceCount]
}

/** Returns the top predicted language symbol for `text`, or null when there is no prediction. */
const predict = async (text: string) => {
  const predictions = await lid.predict(text)

  return Array.isArray(predictions) && predictions[0] ? predictions[0].lang : null
}

/** Flattens the results map into rows sorted by accuracy (descending), ties broken by larger sample count. */
const sortResultsByAccuracy = (results: BenchmarkResults) =>
  Object.keys(results)
    .map((lang: string) => ({ fastTextSymbol: lang, ...results[lang] }))
    .sort((a, b) => (a.accuracy === b.accuracy ? b.count - a.count : b.accuracy - a.accuracy))

/**
 * Resolves the display name and the "symbol (alternates)" label for a result row.
 * Falls back to the bare symbol when the language is not listed in the constants.
 */
const describeLanguage = (fastTextSymbol: string) => {
  const tatoeba2Language = tatoeba2Languages.find((l: any) => l.fastTextSymbol === fastTextSymbol)
  const language: string = tatoeba2Language ? tatoeba2Language.language : fastTextSymbol
  const alternativeSymbols = tatoeba2Language ? tatoeba2Language.alternativeSymbols : undefined

  return {
    language,
    symbolLabel: `${fastTextSymbol}${alternativeSymbols ? ` (${alternativeSymbols})` : ''}`,
  }
}

/**
 * Writes `./results/RESULTS.md` (markdown table) and
 * `./results/RESULTS_with_metadata.csv` (same rows plus probability metadata).
 *
 * @param results benchmark results keyed by fastText symbol
 * @param languageCount number of languages benchmarked (shown in the header)
 * @param sentenceCount total number of sentences benchmarked (shown in the header)
 * @param minSentenceLength lower sentence-length bound (shown in the header)
 * @param maxSentenceLength upper sentence-length bound (shown in the header)
 */
const createResultsMDFile = (
  results: BenchmarkResults,
  languageCount: number,
  sentenceCount: number,
  minSentenceLength: number,
  maxSentenceLength: number
) => {
  // const results = require('./results/benchmark_results_0.2.1.json') // optionally create from existing file
  const sortedResults = sortResultsByAccuracy(results)

  const getResultsMDDisplayRow = (result: any) => {
    const { language, symbolLabel } = describeLanguage(result.fastTextSymbol)

    return `| ${language} | ${symbolLabel} | ${result.count} | ${result.accuracy} | ${result.mislabels
      .map((label: any) => label.lang)
      .join(',')} | ${result.falsePositives} |`
  }
  const getResultsCSVRow = (result: any) => {
    const { language, symbolLabel } = describeLanguage(result.fastTextSymbol)

    return [
      language.replace(/,/g, '|'), // commas would split the CSV cell
      symbolLabel,
      result.count,
      result.accuracy,
      result.mislabels.map((label: any) => label.lang).join('|'),
      result.falsePositives,
      result.lowestProbability,
      result.highestFalseProbability,
      result.correctAvgConfidence,
      result.incorrectAvgConfidence,
    ]
      .map((item: any) => (item === undefined ? ' ' : item))
      .join(',')
  }

  const resultsMD = [
    `| Language (${languageCount}) | Symbol (alternates) | Count (${sentenceCount})| Accuracy (${minSentenceLength} - ${maxSentenceLength} chars) | Mislabels | False Positives |`,
    '| -------- | ----------- | ------------ | -------------- | ---------- | --------- |',
    ...sortedResults.map(getResultsMDDisplayRow),
  ].join('\n')

  fs.writeFileSync(`./results/RESULTS.md`, resultsMD, 'utf-8')

  const resultsCSV = [
    [
      `Language (${languageCount})`,
      `Symbol (alternates)`,
      `Count (${sentenceCount})`,
      `Accuracy (${minSentenceLength} - ${maxSentenceLength} chars)`,
      'Mislabels',
      `False Positives`,
      `Lowest Probability of Correct`,
      `Highest Probability of Incorrect`,
      'Correct Average Probability Difference',
      'Incorrect Average Probability Difference',
    ].join(','),
    ...sortedResults.map(getResultsCSVRow),
  ].join('\n')
  fs.writeFileSync(`./results/RESULTS_with_metadata.csv`, resultsCSV, 'utf-8')
}

/**
 * Writes `./results/reliability_list_<version>.json`: the fastText symbols whose
 * accuracy is at least `minAccuracy` over at least `minTestCount` sentences,
 * ordered by accuracy (descending).
 */
const createIsReliableList = (results: BenchmarkResults, minAccuracy = 0.95, minTestCount = 10) => {
  const reliableSymbols = sortResultsByAccuracy(results)
    .filter(({ count, accuracy }) => accuracy >= minAccuracy && count >= minTestCount)
    .map((l) => l.fastTextSymbol)

  fs.writeFileSync(`./results/reliability_list_${version}.json`, JSON.stringify(reliableSymbols), 'utf-8')
}

/** Arithmetic mean, or undefined for an empty list (avoids 0/0 = NaN leaking into the reports). */
const average = (values: number[]): number | undefined =>
  values.length > 0 ? values.reduce((sum, value) => sum + value, 0) / values.length : undefined

/**
 * Runs the detector over every sentence of one language and aggregates the metrics.
 *
 * Mispredictions are also tallied into the shared `falsePositives` map (keyed by the
 * wrongly predicted symbol). The returned `falsePositives` field is a placeholder of 0:
 * the final value is only known once every language has been processed.
 */
const benchmarkLanguage = async (
  { language, alternativeSymbols, texts }: BenchmarkDataset,
  falsePositives: Record<string, number>
): Promise<LanguageResult> => {
  let count = 0
  let accuratePredictions = 0
  const incorrectPredictions: Record<string, number> = {}
  let lowestProbability = 1
  let highestFalseProbability = 0
  const correctPredictionConfidences: number[] = []
  const incorrectPredictionConfidences: number[] = []

  await asyncPoolForEach(
    texts,
    async (text: string) => {
      const predictions: any[] = await lid.predict(text, 2)
      count = count + 1

      const best = Array.isArray(predictions) ? predictions[0] : undefined
      if (!best) {
        // No prediction at all: counts as a miss, but there is no label to attribute it to.
        return
      }
      const { lang: prediction, prob: probability } = best
      // With a single candidate there is no runner-up; treat its probability as 0.
      const runnerUp = predictions[1]
      const probabilityDifference = probability - (runnerUp ? runnerUp.prob : 0)

      const isCorrect =
        prediction === language || (Array.isArray(alternativeSymbols) && alternativeSymbols.includes(prediction))

      if (isCorrect) {
        accuratePredictions = accuratePredictions + 1
        lowestProbability = Math.min(lowestProbability, probability)
        correctPredictionConfidences.push(probabilityDifference)
      } else {
        highestFalseProbability = Math.max(highestFalseProbability, probability)
        falsePositives[prediction] = (falsePositives[prediction] || 0) + 1
        incorrectPredictions[prediction] = (incorrectPredictions[prediction] || 0) + 1
        incorrectPredictionConfidences.push(probabilityDifference)
      }
    },
    10
  )

  return {
    count,
    accuratePredictions,
    mislabels: Object.keys(incorrectPredictions)
      .map((lang: string) => ({
        lang,
        count: incorrectPredictions[lang],
      }))
      .sort((a, b) => b.count - a.count)
      .slice(0, 5),
    accuracy: accuratePredictions / count,
    falsePositives: 0,
    lowestProbability,
    highestFalseProbability,
    correctAvgConfidence: average(correctPredictionConfidences),
    incorrectAvgConfidence: average(incorrectPredictionConfidences),
  }
}

/**
 * Runs the full benchmark and writes all result files under `./results`.
 *
 * @param includeOnly when given, restricts the benchmark to these fastText symbols
 * @param perLanguageSentenceLimit maximum number of TSV rows read per language
 * @param minSentenceLength shortest sentence (in characters) to test
 * @param maxSentenceLength longest sentence (in characters) to test
 */
const analyzeDatasets = async (
  includeOnly?: string[],
  perLanguageSentenceLimit = 30000,
  minSentenceLength = 30,
  maxSentenceLength = 250
) => {
  const [data, sentenceCount] = await buildData(
    perLanguageSentenceLimit,
    minSentenceLength,
    maxSentenceLength,
    includeOnly
  )
  fs.writeFileSync(`./results/benchmark_results_${version}_data.json`, JSON.stringify(data), 'utf-8')

  const results: BenchmarkResults = {}
  const falsePositives: Record<string, number> = {}
  await asyncPoolForEach(data, async (dataset: BenchmarkDataset) => {
    results[dataset.language] = await benchmarkLanguage(dataset, falsePositives)
  })

  // False positives for a language come from the sentences of every OTHER language,
  // so they can only be read once all languages have been processed.
  for (const language of Object.keys(results)) {
    results[language].falsePositives = falsePositives[language] || 0
  }

  // save results file
  fs.writeFileSync(`./results/benchmark_results_${version}.json`, JSON.stringify(results), 'utf-8')

  createResultsMDFile(results, data.length, sentenceCount, minSentenceLength, maxSentenceLength)
  createIsReliableList(results)

  console.info('Finished writing files.')
}

analyzeDatasets().catch((error: unknown) => {
  // Surface failures instead of leaving an unhandled promise rejection.
  console.error('Benchmark analysis failed:', error)
  process.exitCode = 1
})
-------------------------------------------------------------------------------- /benchmark-testing/constants.js: -------------------------------------------------------------------------------- 1 | // this is an incomplete list 2 | const tatoeba2Languages = [ 3 | { iso3: 'abk', iso1: 'ab', language: 'Abkhaz' }, 4 | { iso3: 'aar', iso1: 'aa', language: 'Afar' }, 5 | { iso3: 
'afr', iso1: 'af', language: 'Afrikaans', fastTextSymbol: 'af' }, 6 | { iso3: 'amh', iso1: 'am', language: 'Amharic', fastTextSymbol: 'am' }, 7 | { iso3: 'ara', iso1: 'ar', language: 'Arabic', fastTextSymbol: 'ar' }, 8 | { iso3: 'arg', iso1: 'an', language: 'Aragonese', fastTextSymbol: 'an' }, 9 | { iso3: 'asm', iso1: 'as', language: 'Assamese', fastTextSymbol: 'as' }, 10 | { iso3: 'ave', iso1: 'ae', language: 'Avestan' }, 11 | { iso3: 'bak', iso1: 'ba', language: 'Bashkir', fastTextSymbol: 'ba' }, 12 | { iso3: 'bam', iso1: 'bm', language: 'Bambara' }, 13 | { iso3: 'bel', iso1: 'be', language: 'Belarusian', fastTextSymbol: 'be' }, 14 | { iso3: 'ben', iso1: 'bn', language: 'Bengali', fastTextSymbol: 'bn' }, 15 | { iso3: 'bis', iso1: 'bi', language: 'Bislama' }, 16 | { iso3: 'bod', iso1: 'bo', language: 'Tibetan', fastTextSymbol: 'bo' }, 17 | { iso3: 'bos', iso1: 'bs', language: 'Bosnian', fastTextSymbol: 'bs' }, 18 | { iso3: 'bre', iso1: 'br', language: 'Breton', fastTextSymbol: 'br' }, 19 | { iso3: 'bul', iso1: 'bg', language: 'Bulgarian', fastTextSymbol: 'bg' }, 20 | { iso3: 'cat', iso1: 'ca', language: 'Catalan', fastTextSymbol: 'ca' }, 21 | { iso3: 'ces', iso1: 'cs', language: 'Czech', fastTextSymbol: 'cs' }, 22 | { iso3: 'cha', iso1: 'ch', language: 'Chamorro' }, 23 | { iso3: 'che', iso1: 'ce', language: 'Chechen', fastTextSymbol: 'ce' }, 24 | { 25 | iso3: 'chu', 26 | iso1: 'cu', 27 | language: 'Church Slavic, Old Slavonic, Church Slavonic, Old Bulgarian, Old Church Slavonic', 28 | }, 29 | { iso3: 'chv', iso1: 'cv', language: 'Chuvash', fastTextSymbol: 'cv' }, 30 | { iso3: 'cmn', iso1: 'zh', language: 'Mandarin Chinese', fastTextSymbol: 'zh' }, 31 | { iso3: 'cor', iso1: 'kw', language: 'Cornish', fastTextSymbol: 'kw' }, 32 | { iso3: 'cos', iso1: 'co', language: 'Corsican', fastTextSymbol: 'co' }, 33 | { iso3: 'cym', iso1: 'cy', language: 'Welsh', fastTextSymbol: 'cy' }, 34 | { iso3: 'dan', iso1: 'da', language: 'Danish', fastTextSymbol: 'da' }, 35 | { iso3: 
'deu', iso1: 'de', language: 'German', fastTextSymbol: 'de' }, 36 | { iso3: 'div', iso1: 'dv', language: 'Divehi, Dhivehi, Maldivian', fastTextSymbol: 'dv' }, 37 | { iso3: 'dzo', iso1: 'dz', language: 'Dzongkha' }, 38 | { iso3: 'ell', iso1: 'el', language: 'Greek', fastTextSymbol: 'el' }, 39 | { iso3: 'eng', iso1: 'en', language: 'English', fastTextSymbol: 'en' }, 40 | { iso3: 'epo', iso1: 'eo', language: 'Esperanto', fastTextSymbol: 'eo' }, 41 | { iso3: 'est', iso1: 'et', language: 'Estonian', fastTextSymbol: 'et' }, 42 | { iso3: 'eus', iso1: 'eu', language: 'Basque', fastTextSymbol: 'eu' }, 43 | { iso3: 'ewe', iso1: 'ee', language: 'Ewe' }, 44 | { iso3: 'fao', iso1: 'fo', language: 'Faroese' }, 45 | { iso3: 'fij', iso1: 'fj', language: 'Fijian' }, 46 | { iso3: 'fin', iso1: 'fi', language: 'Finnish', fastTextSymbol: 'fi' }, 47 | { iso3: 'fra', iso1: 'fr', language: 'French', fastTextSymbol: 'fr' }, 48 | { iso3: 'fry', iso1: 'fy', language: 'Frisian', fastTextSymbol: 'fy' }, 49 | { iso3: 'gla', iso1: 'gd', language: 'Scottish Gaelic', fastTextSymbol: 'gd' }, 50 | { iso3: 'gle', iso1: 'ga', language: 'Irish', fastTextSymbol: 'ga' }, 51 | { iso3: 'glg', iso1: 'gl', language: 'Galician', fastTextSymbol: 'gl' }, 52 | { iso3: 'glv', iso1: 'gv', language: 'Manx', fastTextSymbol: 'gv' }, 53 | { iso3: 'guj', iso1: 'gu', language: 'Gujarati', fastTextSymbol: 'gu' }, 54 | { iso3: 'hat', iso1: 'ht', language: 'Haitian Creole', fastTextSymbol: 'ht' }, 55 | { iso3: 'hau', iso1: 'ha', language: 'Hausa' }, 56 | { iso3: 'heb', iso1: 'he', language: 'Hebrew', fastTextSymbol: 'he' }, 57 | { iso3: 'her', iso1: 'hz', language: 'Herero' }, 58 | { iso3: 'hin', iso1: 'hi', language: 'Hindi', fastTextSymbol: 'hi' }, 59 | { iso3: 'hmo', iso1: 'ho', language: 'Hiri Motu' }, 60 | { iso3: 'hrv', iso1: 'hr', language: 'Croatian', fastTextSymbol: 'hr' }, 61 | { iso3: 'hun', iso1: 'hu', language: 'Hungarian', fastTextSymbol: 'hu' }, 62 | { iso3: 'hye', iso1: 'hy', language: 'Armenian', 
fastTextSymbol: 'hy' }, 63 | { iso3: 'ibo', iso1: 'ig', language: 'Igbo' }, 64 | { iso3: 'ido', iso1: 'io', language: 'Ido', fastTextSymbol: 'io' }, 65 | { iso3: 'ile', iso1: 'ie', language: 'Interlingue', fastTextSymbol: 'ie' }, 66 | { iso3: 'ina', iso1: 'ia', language: 'Interlingua', fastTextSymbol: 'ia' }, 67 | { iso3: 'ind', iso1: 'id', language: 'Indonesian', fastTextSymbol: 'id' }, 68 | { iso3: 'iii', iso1: 'ii', language: 'Sichuan Yi, Nuosu' }, 69 | { iso3: 'isl', iso1: 'is', language: 'Icelandic', fastTextSymbol: 'is' }, 70 | { iso3: 'ita', iso1: 'it', language: 'Italian', fastTextSymbol: 'it' }, 71 | { iso3: 'jav', iso1: 'jv', language: 'Javanese', fastTextSymbol: 'jv' }, 72 | { iso3: 'jpn', iso1: 'ja', language: 'Japanese', fastTextSymbol: 'ja' }, 73 | { iso3: 'kal', iso1: 'kl', language: 'Greenlandic' }, 74 | { iso3: 'kan', iso1: 'kn', language: 'Kannada', fastTextSymbol: 'kn' }, 75 | { iso3: 'kas', iso1: 'ks', language: 'Kashmiri' }, 76 | { iso3: 'kat', iso1: 'ka', language: 'Georgian', fastTextSymbol: 'ka' }, 77 | { iso3: 'kaz', iso1: 'kk', language: 'Kazakh', fastTextSymbol: 'kk' }, 78 | { iso3: 'khm', iso1: 'km', language: 'Khmer', fastTextSymbol: 'km' }, 79 | { iso3: 'kik', iso1: 'ki', language: 'Kikuyu, Gikuyu' }, 80 | { iso3: 'kin', iso1: 'rw', language: 'Kinyarwanda' }, 81 | { iso3: 'kir', iso1: 'ky', language: 'Kyrgyz', fastTextSymbol: 'ky' }, 82 | { iso3: 'kor', iso1: 'ko', language: 'Korean', fastTextSymbol: 'ko' }, 83 | { iso3: 'kua', iso1: 'kj', language: 'Kuanyama, Kwanyama' }, 84 | { iso3: 'lao', iso1: 'lo', language: 'Lao', fastTextSymbol: 'lo' }, 85 | { iso3: 'lat', iso1: 'la', language: 'Latin', fastTextSymbol: 'la' }, 86 | { iso3: 'lvs', iso1: 'lv', language: 'Latvian', fastTextSymbol: 'lv' }, 87 | { iso3: 'lim', iso1: 'li', language: 'Limburgan, Limburger, Limburgish', fastTextSymbol: 'li' }, 88 | { iso3: 'lin', iso1: 'ln', language: 'Lingala' }, 89 | { iso3: 'lit', iso1: 'lt', language: 'Lithuanian', fastTextSymbol: 'lt' }, 90 | { 
iso3: 'ltz', iso1: 'lb', language: 'Luxembourgish', fastTextSymbol: 'lb' }, 91 | { iso3: 'lug', iso1: 'lg', language: 'Luganda' }, 92 | { iso3: 'lub', iso1: 'lu', language: 'Luba-Katanga' }, 93 | { iso3: 'mah', iso1: 'mh', language: 'Marshallese' }, 94 | { iso3: 'mal', iso1: 'ml', language: 'Malayalam', fastTextSymbol: 'ml' }, 95 | { iso3: 'mar', iso1: 'mr', language: 'Marathi', fastTextSymbol: 'mr' }, 96 | { iso3: 'mkd', iso1: 'mk', language: 'Macedonian', fastTextSymbol: 'mk' }, 97 | { iso3: 'mlt', iso1: 'mt', language: 'Maltese', fastTextSymbol: 'mt' }, 98 | { iso3: 'mri', iso1: 'mi', language: 'Maori' }, 99 | { iso3: 'mal', iso1: 'ms', language: 'Malay', fastTextSymbol: 'ms' }, 100 | { iso3: 'mya', iso1: 'my', language: 'Burmese', fastTextSymbol: 'my' }, 101 | { iso3: 'nau', iso1: 'na', language: 'Nauruan' }, 102 | { iso3: 'nav', iso1: 'nv', language: 'Navajo' }, 103 | { iso3: 'nbl', iso1: 'nr', language: 'South Ndebele' }, 104 | { iso3: 'nde', iso1: 'nd', language: 'North Ndebele' }, 105 | { iso3: 'ndo', iso1: 'ng', language: 'Ndonga' }, 106 | { iso3: 'nld', iso1: 'nl', language: 'Dutch', fastTextSymbol: 'nl' }, 107 | { iso3: 'nob', iso1: 'nb', language: 'Norwegian Bokmål' }, 108 | { iso3: 'nno', iso1: 'nn', language: 'Norwegian Nynorsk', fastTextSymbol: 'nn', alternativeSymbols: ['no'] }, 109 | { iso3: 'nya', iso1: 'ny', language: 'Chinyanja' }, 110 | { iso3: 'oci', iso1: 'oc', language: 'Occitan', fastTextSymbol: 'oc' }, 111 | { iso3: 'oss', iso1: 'os', language: 'Ossetian', fastTextSymbol: 'os' }, 112 | { iso3: 'pan', iso1: 'pa', language: 'Punjabi (Eastern)', fastTextSymbol: 'pa' }, 113 | { iso3: 'pli', iso1: 'pi', language: 'Pali' }, 114 | { iso3: 'pol', iso1: 'pl', language: 'Polish', fastTextSymbol: 'pl' }, 115 | { iso3: 'por', iso1: 'pt', language: 'Portuguese', fastTextSymbol: 'pt' }, 116 | { iso3: 'roh', iso1: 'rm', language: 'Romansh', fastTextSymbol: 'rm' }, 117 | { iso3: 'ron', iso1: 'ro', language: 'Romanian', fastTextSymbol: 'ro' }, 118 | { 
iso3: 'run', iso1: 'rn', language: 'Rundi' }, 119 | { iso3: 'rus', iso1: 'ru', language: 'Russian', fastTextSymbol: 'ru' }, 120 | { iso3: 'sag', iso1: 'sg', language: 'Sango' }, 121 | { iso3: 'san', iso1: 'sa', language: 'Sanskrit', fastTextSymbol: 'sa' }, 122 | { iso3: 'sin', iso1: 'si', language: 'Sinhala', fastTextSymbol: 'si' }, 123 | { iso3: 'slk', iso1: 'sk', language: 'Slovak', fastTextSymbol: 'sk' }, 124 | { iso3: 'slv', iso1: 'sl', language: 'Slovenian', fastTextSymbol: 'sl' }, 125 | { iso3: 'sme', iso1: 'se', language: 'Northern Sami' }, 126 | { iso3: 'smo', iso1: 'sm', language: 'Samoan' }, 127 | { iso3: 'sna', iso1: 'sn', language: 'Shona' }, 128 | { iso3: 'snd', iso1: 'sd', language: 'Sindhi', fastTextSymbol: 'sd' }, 129 | { iso3: 'som', iso1: 'so', language: 'Somali', fastTextSymbol: 'so' }, 130 | { iso3: 'sot', iso1: 'st', language: 'Southern Sotho' }, 131 | { iso3: 'spa', iso1: 'es', language: 'Spanish', fastTextSymbol: 'es' }, 132 | { iso3: 'srp', iso1: 'sr', language: 'Serbian', fastTextSymbol: 'sr' }, 133 | { iso3: 'ssw', iso1: 'ss', language: 'Swazi' }, 134 | { iso3: 'sun', iso1: 'su', language: 'Sundanese', fastTextSymbol: 'su' }, 135 | { iso3: 'swe', iso1: 'sv', language: 'Swedish', fastTextSymbol: 'sv' }, 136 | { iso3: 'tah', iso1: 'ty', language: 'Tahitian' }, 137 | { iso3: 'tam', iso1: 'ta', language: 'Tamil', fastTextSymbol: 'ta' }, 138 | { iso3: 'tat', iso1: 'tt', language: 'Tatar', fastTextSymbol: 'tt' }, 139 | { iso3: 'tel', iso1: 'te', language: 'Telugu', fastTextSymbol: 'te' }, 140 | { iso3: 'tgk', iso1: 'tg', language: 'Tajik', fastTextSymbol: 'tg' }, 141 | { iso3: 'tgl', iso1: 'tl', language: 'Tagalog', fastTextSymbol: 'tl' }, 142 | { iso3: 'tha', iso1: 'th', language: 'Thai', fastTextSymbol: 'th' }, 143 | { iso3: 'tir', iso1: 'ti', language: 'Tigrinya' }, 144 | { iso3: 'ton', iso1: 'to', language: 'Tongan' }, 145 | { iso3: 'tsn', iso1: 'tn', language: 'Setswana' }, 146 | { iso3: 'tso', iso1: 'ts', language: 'Tsonga' }, 147 | { 
iso3: 'tuk', iso1: 'tk', language: 'Turkmen', fastTextSymbol: 'tk' }, 148 | { iso3: 'tur', iso1: 'tr', language: 'Turkish', fastTextSymbol: 'tr' }, 149 | { iso3: 'twi', iso1: 'tw', language: 'Twi' }, 150 | { iso3: 'uig', iso1: 'ug', language: 'Uyghur', fastTextSymbol: 'ug' }, 151 | { iso3: 'ukr', iso1: 'uk', language: 'Ukrainian', fastTextSymbol: 'uk' }, 152 | { iso3: 'urd', iso1: 'ur', language: 'Urdu', fastTextSymbol: 'ur' }, 153 | { iso3: 'ven', iso1: 've', language: 'Venda' }, 154 | { iso3: 'vie', iso1: 'vi', language: 'Vietnamese', fastTextSymbol: 'vi' }, 155 | { iso3: 'vol', iso1: 'vo', language: 'Volapük', fastTextSymbol: 'vo' }, 156 | { iso3: 'wln', iso1: 'wa', language: 'Walloon', fastTextSymbol: 'wa' }, 157 | { iso3: 'wol', iso1: 'wo', language: 'Wolof' }, 158 | { iso3: 'xho', iso1: 'xh', language: 'Xhosa' }, 159 | { iso3: 'yor', iso1: 'yo', language: 'Yoruba', fastTextSymbol: 'yo' }, 160 | { iso3: 'zul', iso1: 'zu', language: 'Zulu' }, 161 | ] 162 | 163 | const fastTextLanguages = [ 164 | 'af', 165 | 'als', 166 | 'am', 167 | 'an', 168 | 'ar', 169 | 'arz', 170 | 'as', 171 | 'ast', 172 | 'av', 173 | 'az', 174 | 'azb', 175 | 'ba', 176 | 'bar', 177 | 'bcl', 178 | 'be', 179 | 'bg', 180 | 'bh', 181 | 'bn', 182 | 'bo', 183 | 'bpy', 184 | 'br', 185 | 'bs', 186 | 'bxr', 187 | 'ca', 188 | 'cbk', 189 | 'ce', 190 | 'ceb', 191 | 'ckb', 192 | 'co', 193 | 'cs', 194 | 'cv', 195 | 'cy', 196 | 'da', 197 | 'de', 198 | 'diq', 199 | 'dsb', 200 | 'dty', 201 | 'dv', 202 | 'el', 203 | 'eml', 204 | 'en', 205 | 'eo', 206 | 'es', 207 | 'et', 208 | 'eu', 209 | 'fa', 210 | 'fi', 211 | 'fr', 212 | 'frr', 213 | 'fy', 214 | 'ga', 215 | 'gd', 216 | 'gl', 217 | 'gn', 218 | 'gom', 219 | 'gu', 220 | 'gv', 221 | 'he', 222 | 'hi', 223 | 'hif', 224 | 'hr', 225 | 'hsb', 226 | 'ht', 227 | 'hu', 228 | 'hy', 229 | 'ia', 230 | 'id', 231 | 'ie', 232 | 'ilo', 233 | 'io', 234 | 'is', 235 | 'it', 236 | 'ja', 237 | 'jbo', 238 | 'jv', 239 | 'ka', 240 | 'kk', 241 | 'km', 242 | 'kn', 243 | 'ko', 244 | 
'krc', 245 | 'ku', 246 | 'kv', 247 | 'kw', 248 | 'ky', 249 | 'la', 250 | 'lb', 251 | 'lez', 252 | 'li', 253 | 'lmo', 254 | 'lo', 255 | 'lrc', 256 | 'lt', 257 | 'lv', 258 | 'mai', 259 | 'mg', 260 | 'mhr', 261 | 'min', 262 | 'mk', 263 | 'ml', 264 | 'mn', 265 | 'mr', 266 | 'mrj', 267 | 'ms', 268 | 'mt', 269 | 'mwl', 270 | 'my', 271 | 'myv', 272 | 'mzn', 273 | 'nah', 274 | 'nap', 275 | 'nds', 276 | 'ne', 277 | 'new', 278 | 'nl', 279 | 'nn', 280 | 'no', 281 | 'oc', 282 | 'or', 283 | 'os', 284 | 'pa', 285 | 'pam', 286 | 'pfl', 287 | 'pl', 288 | 'pms', 289 | 'pnb', 290 | 'ps', 291 | 'pt', 292 | 'qu', 293 | 'rm', 294 | 'ro', 295 | 'ru', 296 | 'rue', 297 | 'sa', 298 | 'sah', 299 | 'sc', 300 | 'scn', 301 | 'sco', 302 | 'sd', 303 | 'sh', 304 | 'si', 305 | 'sk', 306 | 'sl', 307 | 'so', 308 | 'sq', 309 | 'sr', 310 | 'su', 311 | 'sv', 312 | 'sw', 313 | 'ta', 314 | 'te', 315 | 'tg', 316 | 'th', 317 | 'tk', 318 | 'tl', 319 | 'tr', 320 | 'tt', 321 | 'tyv', 322 | 'ug', 323 | 'uk', 324 | 'ur', 325 | 'uz', 326 | 'vec', 327 | 'vep', 328 | 'vi', 329 | 'vls', 330 | 'vo', 331 | 'wa', 332 | 'war', 333 | 'wuu', 334 | 'xal', 335 | 'xmf', 336 | 'yi', 337 | 'yo', 338 | 'yue', 339 | 'zh', 340 | ] 341 | 342 | module.exports = { 343 | tatoeba2Languages, 344 | fastTextLanguages, 345 | } 346 | -------------------------------------------------------------------------------- /benchmark-testing/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smodin-io/fast-text-language-detection/9b3c87287aa590966d6899ce87d5475ad6f27241/benchmark-testing/data/.gitkeep -------------------------------------------------------------------------------- /benchmark-testing/download-test-data.js: -------------------------------------------------------------------------------- 1 | const https = require('https') 2 | const fs = require('fs') 3 | const bz2 = require('unbzip2-stream') 4 | const { asyncPoolForEach } = require('./helpers') 5 | const { 
tatoeba2Languages } = require('./constants') 6 | 7 | const languages = tatoeba2Languages.filter((l) => !!l.fastTextSymbol) 8 | 9 | const extract_bz2_file = (file) => { 10 | return fs 11 | .createReadStream(file) 12 | .pipe(bz2()) 13 | .pipe(fs.createWriteStream(`${file.split('.bz2')[0]}`)) 14 | } 15 | 16 | const getFileNameAndPath = (iso3Code) => `data/${iso3Code}_sentences.tsv.bz2` 17 | 18 | const downloadAndSaveFile = (iso3Code) => { 19 | const url = `https://downloads.tatoeba.org/exports/per_language/${iso3Code}/${iso3Code}_sentences.tsv.bz2` 20 | 21 | const fileNameAndPath = getFileNameAndPath(iso3Code) 22 | const file = fs.createWriteStream(fileNameAndPath) 23 | return new Promise((resolve, reject) => { 24 | https.get(url, (response) => { 25 | if (response.statusCode === 404) { 26 | console.error('Missing language:', iso3Code) 27 | reject() 28 | } else { 29 | const stream = response.pipe(file) 30 | 31 | stream.on('finish', () => { 32 | resolve() 33 | }) 34 | } 35 | }) 36 | }) 37 | } 38 | 39 | const downloadExtractAndDelete = async (iso3Code) => { 40 | try { 41 | await downloadAndSaveFile(iso3Code) 42 | await extract_bz2_file(getFileNameAndPath(iso3Code)) 43 | await fs.unlinkSync(getFileNameAndPath(iso3Code)) 44 | } catch (e) { 45 | // delete, it's likely it was a 404 and missing 46 | await fs.unlinkSync(getFileNameAndPath(iso3Code)) 47 | return 48 | } 49 | } 50 | 51 | // will download and instlal only ones that fastText has and tatoeba2 dataset has 52 | const downloadAndSaveAllLanguages = async () => { 53 | console.info(`Downloading ${languages.length} languages.`) 54 | 55 | await asyncPoolForEach( 56 | languages, 57 | async (language) => { 58 | await downloadExtractAndDelete(language.iso3) 59 | }, 60 | 10 61 | ) 62 | 63 | console.info(`Finished Downloading Languages.`) 64 | } 65 | 66 | // downloadExtractAndDelete('ava') // bak 67 | 68 | downloadAndSaveAllLanguages() 69 | -------------------------------------------------------------------------------- 
/benchmark-testing/helpers.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const PromisePool = require('@supercharge/promise-pool') 3 | 4 | const asyncPoolForEach = (array, callback, concurrency = 3) => { 5 | return PromisePool.for(array).withConcurrency(concurrency).process(callback) 6 | } 7 | 8 | const getTSVsInDir = (source, extension = 'tsv') => { 9 | const files = fs.readdirSync(source) 10 | return files.filter((file) => file.match(new RegExp(`.*\.(${extension})`, 'ig'))) 11 | } 12 | 13 | module.exports = { 14 | asyncPoolForEach, 15 | getTSVsInDir, 16 | } 17 | -------------------------------------------------------------------------------- /benchmark-testing/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "requires": true, 3 | "lockfileVersion": 1, 4 | "dependencies": { 5 | "@cspotcode/source-map-consumer": { 6 | "version": "0.8.0", 7 | "resolved": "https://registry.npmjs.org/@cspotcode/source-map-consumer/-/source-map-consumer-0.8.0.tgz", 8 | "integrity": "sha512-41qniHzTU8yAGbCp04ohlmSrZf8bkf/iJsl3V0dRGsQN/5GFfx+LbCSsCpp2gqrqjTVg/K6O8ycoV35JIwAzAg==" 9 | }, 10 | "@cspotcode/source-map-support": { 11 | "version": "0.6.1", 12 | "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.6.1.tgz", 13 | "integrity": "sha512-DX3Z+T5dt1ockmPdobJS/FAsQPW4V4SrWEhD2iYQT2Cb2tQsiMnYxrcUH9By/Z3B+v0S5LMBkQtV/XOBbpLEOg==", 14 | "requires": { 15 | "@cspotcode/source-map-consumer": "0.8.0" 16 | } 17 | }, 18 | "@supercharge/promise-pool": { 19 | "version": "1.7.0", 20 | "resolved": "https://registry.npmjs.org/@supercharge/promise-pool/-/promise-pool-1.7.0.tgz", 21 | "integrity": "sha512-OpnF7oqk6asrOUMhldnDju4RKeZ/iMAfw3LIoLdcTI53RZJLiQ9vEAcGW+bcBELXkiPhT7RqtuPSXAFF2iAmbg==" 22 | }, 23 | "@tsconfig/node10": { 24 | "version": "1.0.8", 25 | "resolved": 
"https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.8.tgz", 26 | "integrity": "sha512-6XFfSQmMgq0CFLY1MslA/CPUfhIL919M1rMsa5lP2P097N2Wd1sSX0tx1u4olM16fLNhtHZpRhedZJphNJqmZg==" 27 | }, 28 | "@tsconfig/node12": { 29 | "version": "1.0.9", 30 | "resolved": "https://registry.npmjs.org/@tsconfig/node12/-/node12-1.0.9.tgz", 31 | "integrity": "sha512-/yBMcem+fbvhSREH+s14YJi18sp7J9jpuhYByADT2rypfajMZZN4WQ6zBGgBKp53NKmqI36wFYDb3yaMPurITw==" 32 | }, 33 | "@tsconfig/node14": { 34 | "version": "1.0.1", 35 | "resolved": "https://registry.npmjs.org/@tsconfig/node14/-/node14-1.0.1.tgz", 36 | "integrity": "sha512-509r2+yARFfHHE7T6Puu2jjkoycftovhXRqW328PDXTVGKihlb1P8Z9mMZH04ebyajfRY7dedfGynlrFHJUQCg==" 37 | }, 38 | "@tsconfig/node16": { 39 | "version": "1.0.2", 40 | "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.2.tgz", 41 | "integrity": "sha512-eZxlbI8GZscaGS7kkc/trHTT5xgrjH3/1n2JDwusC9iahPKWMRvRjJSAN5mCXviuTGQ/lHnhvv8Q1YTpnfz9gA==" 42 | }, 43 | "acorn": { 44 | "version": "8.5.0", 45 | "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.5.0.tgz", 46 | "integrity": "sha512-yXbYeFy+jUuYd3/CDcg2NkIYE991XYX/bje7LmjJigUciaeO1JR4XxXgCIV1/Zc/dRuFEyw1L0pbA+qynJkW5Q==" 47 | }, 48 | "acorn-walk": { 49 | "version": "8.2.0", 50 | "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.2.0.tgz", 51 | "integrity": "sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA==" 52 | }, 53 | "arg": { 54 | "version": "4.1.3", 55 | "resolved": "https://registry.npmjs.org/arg/-/arg-4.1.3.tgz", 56 | "integrity": "sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==" 57 | }, 58 | "bluebird": { 59 | "version": "3.7.2", 60 | "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.7.2.tgz", 61 | "integrity": "sha512-XpNj6GDQzdfW+r2Wnn7xiSAd7TM3jzkxGXBGTtWKuSXv1xUV+azxAm8jdWZN06QTQk+2N2XB9jRDkvbmQmcRtg==" 62 | }, 63 | "create-require": { 64 | "version": "1.1.1", 65 | "resolved": 
"https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz", 66 | "integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==" 67 | }, 68 | "csvtojson": { 69 | "version": "2.0.10", 70 | "resolved": "https://registry.npmjs.org/csvtojson/-/csvtojson-2.0.10.tgz", 71 | "integrity": "sha512-lUWFxGKyhraKCW8Qghz6Z0f2l/PqB1W3AO0HKJzGIQ5JRSlR651ekJDiGJbBT4sRNNv5ddnSGVEnsxP9XRCVpQ==", 72 | "requires": { 73 | "bluebird": "^3.5.1", 74 | "lodash": "^4.17.3", 75 | "strip-bom": "^2.0.0" 76 | } 77 | }, 78 | "diff": { 79 | "version": "4.0.2", 80 | "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.2.tgz", 81 | "integrity": "sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==" 82 | }, 83 | "is-utf8": { 84 | "version": "0.2.1", 85 | "resolved": "https://registry.npmjs.org/is-utf8/-/is-utf8-0.2.1.tgz", 86 | "integrity": "sha1-Sw2hRCEE0bM2NA6AeX6GXPOffXI=" 87 | }, 88 | "lodash": { 89 | "version": "4.17.21", 90 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", 91 | "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==" 92 | }, 93 | "make-error": { 94 | "version": "1.3.6", 95 | "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz", 96 | "integrity": "sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==" 97 | }, 98 | "strip-bom": { 99 | "version": "2.0.0", 100 | "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-2.0.0.tgz", 101 | "integrity": "sha1-YhmoVhZSBJHzV4i9vxRHqZx+aw4=", 102 | "requires": { 103 | "is-utf8": "^0.2.0" 104 | } 105 | }, 106 | "ts-node": { 107 | "version": "10.2.1", 108 | "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.2.1.tgz", 109 | "integrity": "sha512-hCnyOyuGmD5wHleOQX6NIjJtYVIO8bPP8F2acWkB4W06wdlkgyvJtubO/I9NkI88hCFECbsEgoLc0VNkYmcSfw==", 110 | "requires": { 111 | 
"@cspotcode/source-map-support": "0.6.1", 112 | "@tsconfig/node10": "^1.0.7", 113 | "@tsconfig/node12": "^1.0.7", 114 | "@tsconfig/node14": "^1.0.0", 115 | "@tsconfig/node16": "^1.0.2", 116 | "acorn": "^8.4.1", 117 | "acorn-walk": "^8.1.1", 118 | "arg": "^4.1.0", 119 | "create-require": "^1.1.0", 120 | "diff": "^4.0.1", 121 | "make-error": "^1.1.1", 122 | "yn": "3.1.1" 123 | } 124 | }, 125 | "typescript": { 126 | "version": "4.4.3", 127 | "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.4.3.tgz", 128 | "integrity": "sha512-4xfscpisVgqqDfPaJo5vkd+Qd/ItkoagnHpufr+i2QCHBsNYp+G7UAoyFl8aPtx879u38wPV65rZ8qbGZijalA==" 129 | }, 130 | "yn": { 131 | "version": "3.1.1", 132 | "resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", 133 | "integrity": "sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==" 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /benchmark-testing/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "scripts": { 3 | "analyze": "npx ts-node --transpile-only analyze.ts", 4 | "getdata": "node download-test-data.js" 5 | }, 6 | "dependencies": { 7 | "@supercharge/promise-pool": "^1.7.0", 8 | "csvtojson": "^2.0.10", 9 | "ts-node": "^10.2.1", 10 | "typescript": "^4.4.3" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /benchmark-testing/results/COMPARISONS.md: -------------------------------------------------------------------------------- 1 | # Comparing to Other NPM Language Prediction Packages 2 | 3 | **Note**: If the accuracy was 0 it was assumed not to be a supported language of the package. Each package was tested with the entire 99 languages. The expected output to compare to was the iso3 value. If this value didn't match, then it would most certainly have an accuracy score of 0. 
4 | 5 | # Franc 6 | 7 | Obtained from https://github.com/wooorm/franc/tree/main version ^5.0.0 8 | 9 | Supported Languages: 187 10 | 11 | Time to Completion: 2 Minutes 12 | 13 | | Language (68) | Symbol | Count (555790) | Accuracy (30 - 250 chars) | 14 | | ----------------- | ------ | -------------- | ------------------------- | 15 | | Georgian | ka | 1973 | 1 | 16 | | Bengali | bn | 1164 | 1 | 17 | | Thai | th | 572 | 1 | 18 | | Malayalam | ml | 517 | 1 | 19 | | Burmese | my | 216 | 1 | 20 | | Tamil | ta | 205 | 1 | 21 | | Telugu | te | 102 | 1 | 22 | | Punjabi (Eastern) | pa | 88 | 1 | 23 | | Lao | lo | 70 | 1 | 24 | | Gujarati | gu | 57 | 1 | 25 | | Tibetan | bo | 20 | 1 | 26 | | Sinhala | si | 9 | 1 | 27 | | Amharic | am | 3 | 1 | 28 | | Greek | el | 12039 | 0.999916936622643 | 29 | | Japanese | ja | 2169 | 0.9981558321807285 | 30 | | Uyghur | ug | 3692 | 0.9981040086673889 | 31 | | Armenian | hy | 518 | 0.9980694980694981 | 32 | | Korean | ko | 482 | 0.9979253112033195 | 33 | | Mandarin Chinese | zh | 568 | 0.9947183098591549 | 34 | | Hebrew | he | 8616 | 0.994661095636026 | 35 | | Kannada | kn | 118 | 0.9915254237288136 | 36 | | Khmer | km | 379 | 0.9894459102902374 | 37 | | Tajik | tg | 30 | 0.9666666666666667 | 38 | | Urdu | ur | 963 | 0.9657320872274143 | 39 | | Kazakh | kk | 2232 | 0.9363799283154122 | 40 | | Belarusian | be | 6253 | 0.8984487446025907 | 41 | | German | de | 22014 | 0.8912964477150904 | 42 | | Vietnamese | vi | 13000 | 0.8835384615384615 | 43 | | French | fr | 23076 | 0.8797018547408563 | 44 | | Finnish | fi | 17406 | 0.8782603699873607 | 45 | | Somali | so | 32 | 0.875 | 46 | | Marathi | mr | 10461 | 0.8501099321288595 | 47 | | Hungarian | hu | 17942 | 0.8125627020399063 | 48 | | Polish | pl | 17768 | 0.8096015308419631 | 49 | | Afrikaans | af | 1632 | 0.8057598039215687 | 50 | | Tatar | tt | 8178 | 0.7922474932746393 | 51 | | Kyrgyz | ky | 66 | 0.7878787878787878 | 52 | | Italian | it | 18326 | 0.7713630906908218 | 53 | | Romanian | ro 
| 13560 | 0.7629793510324484 | 54 | | Ukrainian | uk | 14285 | 0.75960798039902 | 55 | | Hindi | hi | 5362 | 0.7513987318164864 | 56 | | Portuguese | pt | 20174 | 0.7396649152374343 | 57 | | Haitian Creole | ht | 15 | 0.7333333333333333 | 58 | | Catalan | ca | 4725 | 0.7172486772486772 | 59 | | Bulgarian | bg | 11144 | 0.709978463747308 | 60 | | Dutch | nl | 19626 | 0.7059512891062876 | 61 | | Norwegian Nynorsk | nn | 657 | 0.6940639269406392 | 62 | | Esperanto | eo | 17841 | 0.6932907348242812 | 63 | | Turkmen | tk | 3793 | 0.6904824677036646 | 64 | | Lithuanian | lt | 13835 | 0.6840621611853993 | 65 | | English | en | 22428 | 0.6620296058498306 | 66 | | Macedonian | mk | 14465 | 0.6540615278257864 | 67 | | Swedish | sv | 12188 | 0.633081719724319 | 68 | | Spanish | es | 18227 | 0.61803917265595 | 69 | | Yoruba | yo | 5 | 0.6 | 70 | | Turkish | tr | 19919 | 0.5897886440082334 | 71 | | Russian | ru | 17329 | 0.5891280512435801 | 72 | | Danish | da | 15299 | 0.5861167396561867 | 73 | | Czech | cs | 10863 | 0.5285832642916322 | 74 | | Galician | gl | 2618 | 0.5248281130634072 | 75 | | Slovak | sk | 4370 | 0.49702517162471394 | 76 | | Tagalog | tl | 10351 | 0.4821756352043281 | 77 | | Sundanese | su | 11 | 0.45454545454545453 | 78 | | Slovenian | sl | 372 | 0.44623655913978494 | 79 | | Indonesian | id | 9372 | 0.42509603072983354 | 80 | | Serbian | sr | 13494 | 0.37868682377352897 | 81 | | Bosnian | bs | 139 | 0.30935251798561153 | 82 | | Croatian | hr | 2222 | 0.28622862286228623 | 83 | | Javanese | jv | 260 | 0.2653846153846154 | 84 | 85 | # LanguageDetect 86 | 87 | Obtained from https://github.com/FGRibreau/node-language-detect version ^2.0.0 88 | 89 | Supported Languages: 52 90 | 91 | Time to Completion: 1 Minute 92 | 93 | | Language (36) | Symbol | Count (555790) | Accuracy (30 - 250 chars) | 94 | | ------------- | ------ | -------------- | ------------------------- | 95 | | Bengali | bn | 1164 | 0.9991408934707904 | 96 | | Hindi | hi | 5362 | 0.9944050727340544 
| 97 | | Urdu | ur | 963 | 0.9927310488058152 | 98 | | Vietnamese | vi | 13000 | 0.9906923076923076 | 99 | | Arabic | ar | 8761 | 0.9851615112430088 | 100 | | Dutch | nl | 19626 | 0.9665749515948232 | 101 | | Indonesian | id | 9372 | 0.9637217242851046 | 102 | | Kazakh | kk | 2232 | 0.956989247311828 | 103 | | Welsh | cy | 619 | 0.9531502423263328 | 104 | | French | fr | 23076 | 0.9489512913849887 | 105 | | Icelandic | is | 6364 | 0.9483029541169076 | 106 | | Polish | pl | 17768 | 0.9440004502476362 | 107 | | German | de | 22014 | 0.9438084855092214 | 108 | | Finnish | fi | 17406 | 0.9413420659542686 | 109 | | Somali | so | 32 | 0.9375 | 110 | | Kyrgyz | ky | 66 | 0.9242424242424242 | 111 | | Italian | it | 18326 | 0.9192404234421041 | 112 | | Hungarian | hu | 17942 | 0.909263181362167 | 113 | | English | en | 22428 | 0.9029338327091136 | 114 | | Turkish | tr | 19919 | 0.8688187157989858 | 115 | | Portuguese | pt | 20174 | 0.864677307425399 | 116 | | Romanian | ro | 13560 | 0.8488200589970502 | 117 | | Lithuanian | lt | 13835 | 0.8152511745572822 | 118 | | Spanish | es | 18227 | 0.8088549953365886 | 119 | | Latin | la | 11437 | 0.7977616507825479 | 120 | | Bulgarian | bg | 11144 | 0.7865218951902369 | 121 | | Swedish | sv | 12188 | 0.7738759435510338 | 122 | | Croatian | hr | 2222 | 0.7583258325832584 | 123 | | Ukrainian | uk | 14285 | 0.7467273363668183 | 124 | | Russian | ru | 17329 | 0.6920768653701888 | 125 | | Slovak | sk | 4370 | 0.6823798627002289 | 126 | | Danish | da | 15299 | 0.6808288123406758 | 127 | | Slovenian | sl | 372 | 0.6532258064516129 | 128 | | Macedonian | mk | 14465 | 0.623574144486692 | 129 | | Czech | cs | 10863 | 0.4528215041885299 | 130 | | Tagalog | tl | 10351 | 0.35523137861076226 | 131 | | Serbian | sr | 13494 | 0.17504075885578776 | 132 | 133 | # Using Franc With Fast Text 134 | 135 | Settings that achieved the best results were setting the threshold to 0.25 (AKA the probability difference was 0.25 between the first and second result) 
and using franc to decide between the top 2 results. For some lesser languages this provided significant improvement, but for more popular languages, not so much. I experimented with threshold settings between .1 and .5 and language comparison counts between 2 and 5. 136 | 137 | | Language (101) | Symbol (alternates) | Count (558260) | Accuracy (30 - 250 chars) | addedAccuracy (Franc) | False Positives | 138 | | -------------------------------- | ------------------- | -------------- | ------------------------- | ------------------------ | --------------- | 139 | | English | en | 22428 | 1 | 0 | 120 | 140 | | Greek | el | 12039 | 1 | 0 | 0 | 141 | | Hebrew | he | 8616 | 1 | 0 | 0 | 142 | | Japanese | ja | 2169 | 1 | 0 | 0 | 143 | | Georgian | ka | 1973 | 1 | 0 | 0 | 144 | | Bengali | bn | 1164 | 1 | 0 | 131 | 145 | | Thai | th | 572 | 1 | 0 | 0 | 146 | | Mandarin Chinese | zh | 568 | 1 | 0 | 0 | 147 | | Malayalam | ml | 517 | 1 | 0 | 0 | 148 | | Korean | ko | 482 | 1 | 0 | 7 | 149 | | Burmese | my | 216 | 1 | 0 | 0 | 150 | | Tamil | ta | 205 | 1 | 0 | 0 | 151 | | Kannada | kn | 118 | 1 | 0 | 1 | 152 | | Telugu | te | 102 | 1 | 0 | 0 | 153 | | Punjabi (Eastern) | pa | 88 | 1 | 0 | 0 | 154 | | Lao | lo | 70 | 1 | 0 | 0 | 155 | | Gujarati | gu | 57 | 1 | 0 | 0 | 156 | | Tibetan | bo | 20 | 1 | 0 | 0 | 157 | | Divehi, Dhivehi, Maldivian | dv | 15 | 1 | 0 | 0 | 158 | | Sinhala | si | 9 | 1 | 0 | 0 | 159 | | Amharic | am | 3 | 1 | 0 | 0 | 160 | | German | de | 22014 | 0.9998637230853094 | 0.00009085127646046853 | 64 | 161 | | Polish | pl | 17768 | 0.999718595227375 | -0.00016884286357499168 | 88 | 162 | | Russian | ru | 17329 | 0.9997114663281205 | -0.00034624040625541586 | 241 | 163 | | Hungarian | hu | 17942 | 0.9996655891204994 | -0.000055735146583413275 | 43 | 164 | | Hindi | hi | 5362 | 0.999627004848937 | -0.00037299515106303804 | 0 | 165 | | Vietnamese | vi | 13000 | 0.9996153846153846 | 0 | 9 | 166 | | Turkish | tr | 19919 | 0.9995983734123199 | 
0.00010040664692001489 | 1092 | 167 | | Esperanto | eo | 17841 | 0.999551594641556 | -0.00022420267922196313 | 13 | 168 | | French | fr | 23076 | 0.999523314265904 | 0.00017334026694393323 | 238 | 169 | | Marathi | mr | 10461 | 0.9995220342223496 | -0.00019118631106007644 | 2 | 170 | | Uyghur | ug | 3692 | 0.9991874322860238 | 0 | 0 | 171 | | Finnish | fi | 17406 | 0.9990807767436516 | 0.00045961162817420576 | 37 | 172 | | Italian | it | 18326 | 0.9989632216522972 | 0.00016370184437419777 | 2207 | 173 | | Spanish | es | 18227 | 0.998134635430954 | -0.0003840456465682429 | 3476 | 174 | | Armenian | hy | 518 | 0.9980694980694981 | 0 | 0 | 175 | | Arabic | ar | 8761 | 0.9978312977970552 | -0.00034242666362294116 | 0 | 176 | | Ukrainian | uk | 14285 | 0.9963598179908996 | 0.0002800140006999907 | 133 | 177 | | Macedonian | mk | 14465 | 0.9959903214656066 | -0.0012443829934324357 | 93 | 178 | | Dutch | nl | 19626 | 0.9934780393355752 | 0.0008661979007439369 | 382 | 179 | | Lithuanian | lt | 13835 | 0.9933501987712324 | 0.0017347307553305935 | 20 | 180 | | Portuguese | pt | 20174 | 0.9933082184990581 | 0.0015366313076237148 | 1149 | 181 | | Khmer | km | 379 | 0.9920844327176781 | -0.0026385224274406704 | 0 | 182 | | Urdu | ur | 963 | 0.9906542056074766 | 0.0031152647975077885 | 9 | 183 | | Czech | cs | 10863 | 0.9898738838258307 | 0.0006443892110835625 | 1 | 184 | | Swedish | sv | 12188 | 0.9886773875943551 | 0.001640958319658714 | 174 | 185 | | Romanian | ro | 13560 | 0.9886430678466077 | 0.0018436578171090456 | 133 | 186 | | Bulgarian | bg | 11144 | 0.9869885139985642 | 0.0008973438621679986 | 2 | 187 | | Ossetian | os | 59 | 0.9830508474576272 | 0 | 0 | 188 | | Icelandic | is | 6364 | 0.9803582652419862 | -0.013513513513513598 | 4 | 189 | | Kazakh | kk | 2232 | 0.9802867383512545 | 0.0035842293906810374 | 4 | 190 | | Tagalog | tl | 10351 | 0.9737223456670853 | 0.01062699256110522 | 21 | 191 | | Tatar | tt | 8178 | 0.9680851063829787 | -0.00024455857177796414 | 13 | 192 
| | Basque | eu | 2999 | 0.9676558852950984 | -0.03867955985328442 | 14 | 193 | | Tajik | tg | 30 | 0.9666666666666667 | 0 | 0 | 194 | | Belarusian | be | 6253 | 0.9625779625779626 | 0.0075163921317767945 | 0 | 195 | | Latvian | lv | 1243 | 0.9597747385358005 | -0.024135156878519748 | 4 | 196 | | Chuvash | cv | 460 | 0.9543478260869566 | -0.006521739130434856 | 0 | 197 | | Breton | br | 2451 | 0.9543043655650755 | -0.027335781313749474 | 0 | 198 | | Bashkir | ba | 120 | 0.95 | -0.033333333333333326 | 0 | 199 | | Indonesian | id | 9372 | 0.949637217242851 | 0.02443448570209139 | 16 | 200 | | Danish | da | 15299 | 0.948035819334597 | 0.022877312242630232 | 2 | 201 | | Estonian | et | 1227 | 0.9356153219233904 | 0.01792991035044822 | 5 | 202 | | Latin | la | 11437 | 0.9206085511934948 | -0.10894465331817782 | 292 | 203 | | Irish | ga | 867 | 0.9065743944636678 | -0.05882352941176472 | 14 | 204 | | Scottish Gaelic | gd | 542 | 0.8966789667896679 | -0.08302583025830257 | 2 | 205 | | Welsh | cy | 619 | 0.8917609046849758 | -0.043618739903069526 | 8 | 206 | | Catalan | ca | 4725 | 0.8833862433862434 | 0.02031746031746029 | 0 | 207 | | Kyrgyz | ky | 66 | 0.8787878787878788 | 0 | 4 | 208 | | Cornish | kw | 426 | 0.8779342723004695 | -0.03521126760563387 | 1 | 209 | | Assamese | as | 960 | 0.8635416666666667 | -0.03437499999999993 | 0 | 210 | | Volapük | vo | 806 | 0.8511166253101737 | -0.12034739454094301 | 15 | 211 | | Serbian | sr | 13494 | 0.8489699125537276 | 0.011190158589002519 | 1050 | 212 | | Slovak | sk | 4370 | 0.8263157894736842 | 0.024256292906178478 | 45 | 213 | | Maltese | mt | 52 | 0.8076923076923077 | -0.17307692307692313 | 7 | 214 | | Norwegian Nynorsk | nn (no) | 657 | 0.7990867579908676 | -0.0243531202435312 | 29 | 215 | | Afrikaans | af | 1632 | 0.7879901960784313 | 0.040441176470588314 | 0 | 216 | | Occitan | oc | 2861 | 0.7679133170220203 | -0.08843061866480251 | 27 | 217 | | Interlingua | ia | 18782 | 0.7500798636992866 | -0.16393355340219362 | 82 | 
218 | | Sanskrit | sa | 11 | 0.7272727272727273 | 0 | 0 | 219 | | Chechen | ce | 7 | 0.7142857142857143 | 0 | 0 | 220 | | Slovenian | sl | 372 | 0.6774193548387096 | 0.008064516129032251 | 62 | 221 | | Frisian | fy | 107 | 0.6635514018691588 | -0.07476635514018692 | 8 | 222 | | Javanese | jv | 260 | 0.6461538461538462 | 0.0461538461538461 | 5 | 223 | | Yoruba | yo | 5 | 0.6 | 0 | 1 | 224 | | Luxembourgish | lb | 217 | 0.5944700460829493 | -0.07373271889400923 | 3 | 225 | | Galician | gl | 2618 | 0.5790679908326967 | 0.037815126050420256 | 8 | 226 | | Turkmen | tk | 3793 | 0.5710519377801213 | 0.05404692855259685 | 0 | 227 | | Croatian | hr | 2222 | 0.5333033303330333 | 0.023402340234023322 | 45 | 228 | | Aragonese | an | 4 | 0.5 | 0 | 0 | 229 | | Ido | io | 2905 | 0.48055077452667816 | -0.11703958691910499 | 7 | 230 | | Interlingue | ie | 2007 | 0.4718485301444943 | -0.13702042850024915 | 7 | 231 | | Limburgan, Limburger, Limburgish | li | 3 | 0.3333333333333333 | 0 | 1 | 232 | | Walloon | wa | 16 | 0.3125 | 0 | 1 | 233 | | Somali | so | 32 | 0.21875 | 0.25 | 1 | 234 | | Corsican | co | 5 | 0.2 | 0 | 0 | 235 | | Sundanese | su | 11 | 0.18181818181818182 | 0.09090909090909088 | 19 | 236 | | Haitian Creole | ht | 15 | 0.06666666666666667 | 0.06666666666666667 | 3 | 237 | | Romansh | rm | 16 | 0.0625 | 0 | 3 | 238 | | Bosnian | bs | 139 | 0.03597122302158273 | 0.02877697841726619 | 0 | 239 | | Manx | gv | 6 | 0 | 0 | 0 | 240 | -------------------------------------------------------------------------------- /benchmark-testing/results/RESULTS.md: -------------------------------------------------------------------------------- 1 | | Language (101) | Symbol (alternates) | Count (558260) | Accuracy (30 - 250 chars) | Mislabels | False Positives | 2 | | -------------------------------- | ------------------- | -------------- | ------------------------- | ---------------- | --------------- | 3 | | English | en | 22428 | 1 | | 120 | 4 | | Greek | el | 12039 | 1 | | 0 | 5 | | 
Hebrew | he | 8616 | 1 | | 0 | 6 | | Japanese | ja | 2169 | 1 | | 0 | 7 | | Georgian | ka | 1973 | 1 | | 0 | 8 | | Bengali | bn | 1164 | 1 | | 131 | 9 | | Thai | th | 572 | 1 | | 0 | 10 | | Mandarin Chinese | zh | 568 | 1 | | 0 | 11 | | Malayalam | ml | 517 | 1 | | 0 | 12 | | Korean | ko | 482 | 1 | | 7 | 13 | | Burmese | my | 216 | 1 | | 0 | 14 | | Tamil | ta | 205 | 1 | | 0 | 15 | | Kannada | kn | 118 | 1 | | 1 | 16 | | Telugu | te | 102 | 1 | | 0 | 17 | | Punjabi (Eastern) | pa | 88 | 1 | | 0 | 18 | | Lao | lo | 70 | 1 | | 0 | 19 | | Gujarati | gu | 57 | 1 | | 0 | 20 | | Tibetan | bo | 20 | 1 | | 0 | 21 | | Divehi, Dhivehi, Maldivian | dv | 15 | 1 | | 0 | 22 | | Sinhala | si | 9 | 1 | | 0 | 23 | | Amharic | am | 3 | 1 | | 0 | 24 | | German | de | 22014 | 0.9998637230853094 | en | 64 | 25 | | Polish | pl | 17768 | 0.999718595227375 | en,eo,de,ro | 88 | 26 | | Russian | ru | 17329 | 0.9997114663281205 | bg,kk,uk,mk | 241 | 27 | | Hungarian | hu | 17942 | 0.9996655891204994 | tr,br,it,de,en | 43 | 28 | | Hindi | hi | 5362 | 0.999627004848937 | mr | 0 | 29 | | Vietnamese | vi | 13000 | 0.9996153846153846 | eo,hu,fr | 9 | 30 | | Turkish | tr | 19919 | 0.9995983734123199 | eo,en,it,fr,nds | 1092 | 31 | | Esperanto | eo | 17841 | 0.999551594641556 | it,es,pt,fr,ceb | 13 | 32 | | French | fr | 23076 | 0.999523314265904 | en,es,it,ru | 238 | 33 | | Marathi | mr | 10461 | 0.9995220342223496 | hi | 2 | 34 | | Uyghur | ug | 3692 | 0.9991874322860238 | ba,ru,hu | 0 | 35 | | Finnish | fi | 17406 | 0.9990807767436516 | it,et,en,hr,de | 37 | 36 | | Italian | it | 18326 | 0.9989632216522972 | es,de,fr,en,la | 2207 | 37 | | Spanish | es | 18227 | 0.998134635430954 | pt,it,io,ca,ia | 3476 | 38 | | Armenian | hy | 518 | 0.9980694980694981 | de | 0 | 39 | | Arabic | ar | 8761 | 0.9978312977970552 | arz,fa,es,mzn,en | 0 | 40 | | Ukrainian | uk | 14285 | 0.9963598179908996 | ru,sr | 133 | 41 | | Macedonian | mk | 14465 | 0.9959903214656066 | bg,sr,ru | 93 | 42 | | Dutch | nl | 19626 | 
0.9934780393355752 | en,af,de,nds,fr | 382 | 43 | | Lithuanian | lt | 13835 | 0.9933501987712324 | fi,pl,eo,pt,sr | 20 | 44 | | Portuguese | pt | 20174 | 0.9933082184990581 | es,gl,it,en,fr | 1149 | 45 | | Khmer | km | 379 | 0.9920844327176781 | az,et | 0 | 46 | | Urdu | ur | 963 | 0.9906542056074766 | pnb,fa,ro,en | 9 | 47 | | Czech | cs | 10863 | 0.9898738838258307 | sk,pl,hu,sl,en | 1 | 48 | | Swedish | sv | 12188 | 0.9886773875943551 | no,da,en,fi,id | 174 | 49 | | Romanian | ro | 13560 | 0.9886430678466077 | es,fr,it,en,pt | 133 | 50 | | Bulgarian | bg | 11144 | 0.9869885139985642 | mk,ru,uk,sr | 2 | 51 | | Ossetian | os | 59 | 0.9830508474576272 | ru | 0 | 52 | | Icelandic | is | 6364 | 0.9803582652419862 | et,no,da,hu,cs | 4 | 53 | | Kazakh | kk | 2232 | 0.9802867383512545 | ru,tr,tt,uk,ky | 4 | 54 | | Tagalog | tl | 10351 | 0.9737223456670853 | ceb,en,id,es,war | 21 | 55 | | Tatar | tt | 8178 | 0.9680851063829787 | az,tr,ru,fi,kk | 13 | 56 | | Basque | eu | 2999 | 0.9676558852950984 | it,nl,id,en,io | 14 | 57 | | Tajik | tg | 30 | 0.9666666666666667 | ru | 0 | 58 | | Belarusian | be | 6253 | 0.9625779625779626 | uk,ru,pl,bg,sr | 0 | 59 | | Latvian | lv | 1243 | 0.9597747385358005 | lt,hr,sr,fi,eo | 4 | 60 | | Chuvash | cv | 460 | 0.9543478260869566 | ru,uk,ba,sr | 0 | 61 | | Breton | br | 2451 | 0.9543043655650755 | fr,nl,eu,de,pt | 0 | 62 | | Bashkir | ba | 120 | 0.95 | tt,av | 0 | 63 | | Indonesian | id | 9372 | 0.949637217242851 | ms,it,en,eo,tr | 16 | 64 | | Danish | da | 15299 | 0.948035819334597 | no,sv,de,en,nn | 2 | 65 | | Estonian | et | 1227 | 0.9356153219233904 | fi,en,hu,it,nl | 5 | 66 | | Latin | la | 11437 | 0.9206085511934948 | fr,it,en,es,pt | 292 | 67 | | Irish | ga | 867 | 0.9065743944636678 | en,gd,ca,kv,cs | 14 | 68 | | Scottish Gaelic | gd | 542 | 0.8966789667896679 | en,ga,de,fr,pam | 2 | 69 | | Welsh | cy | 619 | 0.8917609046849758 | es,en,la,kw,de | 8 | 70 | | Catalan | ca | 4725 | 0.8833862433862434 | es,pt,fr,it,ro | 0 | 71 | | 
Kyrgyz | ky | 66 | 0.8787878787878788 | ru,kk | 4 | 72 | | Cornish | kw | 426 | 0.8779342723004695 | en,cy,de,br,sq | 1 | 73 | | Assamese | as | 960 | 0.8635416666666667 | bn | 0 | 74 | | Volapük | vo | 806 | 0.8511166253101737 | id,de,fi,en,eo | 15 | 75 | | Serbian | sr | 13494 | 0.8489699125537276 | hr,sh,mk,bs,sl | 1050 | 76 | | Slovak | sk | 4370 | 0.8263157894736842 | cs,pl,sl,no,sr | 45 | 77 | | Maltese | mt | 52 | 0.8076923076923077 | es,cs,pt,sr,eo | 7 | 78 | | Norwegian Nynorsk | nn (no) | 657 | 0.7990867579908676 | da,sv,de,es,fi | 29 | 79 | | Afrikaans | af | 1632 | 0.7879901960784313 | nl,en,fr,de,nds | 0 | 80 | | Occitan | oc | 2861 | 0.7679133170220203 | ca,es,fr,pt,it | 27 | 81 | | Interlingua | ia | 18782 | 0.7500798636992866 | es,it,fr,la,pt | 82 | 82 | | Sanskrit | sa | 11 | 0.7272727272727273 | hi,ne | 0 | 83 | | Chechen | ce | 7 | 0.7142857142857143 | mn,ru | 0 | 84 | | Slovenian | sl | 372 | 0.6774193548387096 | sr,hr,bs,pl,eo | 62 | 85 | | Frisian | fy | 107 | 0.6635514018691588 | nl,en,de,af,fr | 8 | 86 | | Javanese | jv | 260 | 0.6461538461538462 | id,en,ms,ko,su | 5 | 87 | | Yoruba | yo | 5 | 0.6 | sk,rm | 1 | 88 | | Luxembourgish | lb | 217 | 0.5944700460829493 | de,nds,sv,fr,nl | 3 | 89 | | Galician | gl | 2618 | 0.5790679908326967 | pt,es,it,fr,ca | 8 | 90 | | Turkmen | tk | 3793 | 0.5710519377801213 | tr,uz,en,et,io | 0 | 91 | | Croatian | hr | 2222 | 0.5333033303330333 | sr,sh,bs,sl,pl | 45 | 92 | | Aragonese | an | 4 | 0.5 | es | 0 | 93 | | Ido | io | 2905 | 0.48055077452667816 | eo,es,it,pt,tr | 7 | 94 | | Interlingue | ie | 2007 | 0.4718485301444943 | es,it,fr,en,ia | 7 | 95 | | Limburgan, Limburger, Limburgish | li | 3 | 0.3333333333333333 | de | 1 | 96 | | Walloon | wa | 16 | 0.3125 | fr,pt,tl,oc,en | 1 | 97 | | Somali | so | 32 | 0.21875 | fi,eo,cy,en,az | 1 | 98 | | Corsican | co | 5 | 0.2 | it,fr | 0 | 99 | | Sundanese | su | 11 | 0.18181818181818182 | id,ms,es | 19 | 100 | | Haitian Creole | ht | 15 | 0.06666666666666667 | 
br,fr,su,diq,no | 3 | 101 | | Romansh | rm | 16 | 0.0625 | it,fr,en,tl,qu | 3 | 102 | | Bosnian | bs | 139 | 0.03597122302158273 | sr,hr,sh,pl,sl | 0 | 103 | | Manx | gv | 6 | 0 | cy,fr,nl,et,en | 0 | 104 | -------------------------------------------------------------------------------- /benchmark-testing/results/RESULTS_with_metadata.csv: -------------------------------------------------------------------------------- 1 | Language (101),Symbol (alternates),Count (558260),Accuracy (30 - 250 chars),Mislabels,False Positives,Lowest Probability,Highest Probability,Correct Average Probability Difference,Incorrect Average Probability Difference 2 | English,en,22428,1,,120,0.31277090311050415,0,0.9614576326805134,NaN 3 | Greek,el,12039,1,,0,0.5245928764343262,0,0.9983111462375585,NaN 4 | Hebrew,he,8616,1,,0,0.8786194324493408,0,0.999817035583582,NaN 5 | Japanese,ja,2169,1,,0,0.9178405404090881,0,0.9993141423071018,NaN 6 | Georgian,ka,1973,1,,0,0.7867827415466309,0,0.9956593665799054,NaN 7 | Bengali,bn,1164,1,,131,0.5253420472145081,0,0.9968120863004913,NaN 8 | Thai,th,572,1,,0,0.9927242994308472,0,0.9997663607907868,NaN 9 | Mandarin Chinese,zh,568,1,,0,0.4597342610359192,0,0.9716624540774863,NaN 10 | Malayalam,ml,517,1,,0,0.9986559152603149,0,0.9999117751173325,NaN 11 | Korean,ko,482,1,,7,0.8198571801185608,0,0.9989503776111682,NaN 12 | Burmese,my,216,1,,0,0.9940844178199768,0,0.9998873020938769,NaN 13 | Tamil,ta,205,1,,0,0.9989544749259949,0,1.0000112452849863,NaN 14 | Kannada,kn,118,1,,1,0.9560707807540894,0,0.9986984921360537,NaN 15 | Telugu,te,102,1,,0,0.9983260035514832,0,0.9999092835657447,NaN 16 | Punjabi (Eastern),pa,88,1,,0,0.9979016780853271,0,0.9997773331272645,NaN 17 | Lao,lo,70,1,,0,0.3941526412963867,0,0.7733188960435134,NaN 18 | Gujarati,gu,57,1,,0,0.9978494048118591,0,0.9997111081808742,NaN 19 | Tibetan,bo,20,1,,0,0.9992840886116028,0,0.9999102103707628,NaN 20 | Divehi| Dhivehi| Maldivian,dv,15,1,,0,0.9750259518623352,0,0.9856150934055525,NaN 21 | 
Sinhala,si,9,1,,0,0.9964900016784668,0,0.9990718097635383,NaN 22 | Amharic,am,3,1,,0,0.9874904751777649,0,0.9915543248450073,NaN 23 | German,de,22014,0.9998637230853094,en,64,0.2944102883338928,0.5308237671852112,0.9873204391080626,0.09905306498209636 24 | Polish,pl,17768,0.999718595227375,en|eo|de|ro,88,0.21345049142837524,0.6913073062896729,0.9890330630600913,0.15380255579948426 25 | Russian,ru,17329,0.9997114663281205,bg|kk|uk|mk,241,0.5000648498535156,0.7860463261604309,0.9907578862055559,0.16911868155002593 26 | Hungarian,hu,17942,0.9996655891204994,tr|br|it|de|en,43,0.21547481417655945,0.9958308935165405,0.9872564970510354,0.2987443840441604 27 | Hindi,hi,5362,0.999627004848937,mr,0,0.5321338772773743,0.8880062103271484,0.9939448039387319,0.726732112467289 28 | Vietnamese,vi,13000,0.9996153846153846,eo|hu|fr,9,0.31661665439605713,0.9998860359191895,0.9981391458102108,0.6576308641175274 29 | Turkish,tr,19919,0.9995983734123199,eo|en|it|fr|nds,1092,0.22616560757160187,0.6153099536895752,0.9927376141332793,0.2094208262860775 30 | Esperanto,eo,17841,0.999551594641556,it|es|pt|fr|ceb,13,0.23145444691181183,0.6255041360855103,0.9771947755108922,0.15247727558016777 31 | French,fr,23076,0.999523314265904,en|es|it|ru,238,0.2356633096933365,0.6747938990592957,0.9778449336716961,0.17877410623160275 32 | Marathi,mr,10461,0.9995220342223496,hi,2,0.4904531240463257,1.00002920627594,0.9920979104482953,0.8198977816457045 33 | Uyghur,ug,3692,0.9991874322860238,ba|ru|hu,0,0.5213584303855896,0.5531967878341675,0.9962514697004626,0.24187888701756796 34 | Finnish,fi,17406,0.9990807767436516,it|et|en|hr|de,37,0.16255062818527222,0.6433226466178894,0.9710191056221561,0.18440376338548958 35 | Italian,it,18326,0.9989632216522972,es|de|fr|en|la,2207,0.1876032054424286,0.9551967978477478,0.9713309805001584,0.2656080092205421 36 | Spanish,es,18227,0.998134635430954,pt|it|io|ca|ia,3476,0.26517876982688904,0.840143084526062,0.9406717692031589,0.22759082803831382 37 | 
Armenian,hy,518,0.9980694980694981,de,0,0.9992043375968933,0.15621936321258545,1.0000453856843372,0.035118408501148224 38 | Arabic,ar,8761,0.9978312977970552,arz|fa|es|mzn|en,0,0.1952907145023346,0.9997252821922302,0.983906333174537,0.6618713119096356 39 | Ukrainian,uk,14285,0.9963598179908996,ru|sr,133,0.3561241626739502,0.9522985816001892,0.9853175570592393,0.38070394650388223 40 | Macedonian,mk,14465,0.9959903214656066,bg|sr|ru,93,0.3797056972980499,0.8783867359161377,0.9659177295866043,0.2429016182391808 41 | Dutch,nl,19626,0.9934780393355752,en|af|de|nds|fr,382,0.09129912406206131,0.9536797404289246,0.9579142892651017,0.23610969319270225 42 | Lithuanian,lt,13835,0.9933501987712324,fi|pl|eo|pt|sr,20,0.12790657579898834,0.9843482971191406,0.9371611838942122,0.19300622200471876 43 | Portuguese,pt,20174,0.9933082184990581,es|gl|it|en|fr,1149,0.21880872547626495,0.9230474233627319,0.9484523955782658,0.24518370324814762 44 | Khmer,km,379,0.9920844327176781,az|et,0,0.1995551884174347,0.5972086191177368,0.9853872296326177,0.18748741348584494 45 | Urdu,ur,963,0.9906542056074766,pnb|fa|ro|en,9,0.5764018893241882,0.9979611039161682,0.976251529633783,0.4885160554226281 46 | Czech,cs,10863,0.9898738838258307,sk|pl|hu|sl|en,1,0.14324608445167542,0.9298855066299438,0.9387299941674171,0.223398203690621 47 | Swedish,sv,12188,0.9886773875943551,no|da|en|fi|id,174,0.1215486153960228,0.9114173054695129,0.9320943643945397,0.2572491969238373 48 | Romanian,ro,13560,0.9886430678466077,es|fr|it|en|pt,133,0.15412086248397827,0.8736457824707031,0.9464068983322231,0.16759528258404174 49 | Bulgarian,bg,11144,0.9869885139985642,mk|ru|uk|sr,2,0.3436012268066406,0.9950229525566101,0.9387939350125626,0.3202424690127373 50 | Ossetian,os,59,0.9830508474576272,ru,0,0.20986901223659515,0.7272360324859619,0.8207800509568689,0.5724851340055466 51 | Icelandic,is,6364,0.9803582652419862,et|no|da|hu|cs,4,0.14347882568836212,0.9222594499588013,0.9331574130853139,0.20815998595952986 52 | 
Kazakh,kk,2232,0.9802867383512545,ru|tr|tt|uk|ky,4,0.3560357093811035,0.9922317266464233,0.9706690864912705,0.3102114117408002 53 | Tagalog,tl,10351,0.9737223456670853,ceb|en|id|es|war,21,0.09424613416194916,0.9999354481697083,0.7973100713083318,0.2603196013111462 54 | Tatar,tt,8178,0.9680851063829787,az|tr|ru|fi|kk,13,0.14898742735385895,1.000041127204895,0.9237682296735699,0.3369664904009205 55 | Basque,eu,2999,0.9676558852950984,it|nl|id|en|io,14,0.09961723536252975,0.9024008512496948,0.7555618922205802,0.12721208734533837 56 | Tajik,tg,30,0.9666666666666667,ru,0,0.5033898949623108,0.4936623275279999,0.8163995144142094,0.32341302931308746 57 | Belarusian,be,6253,0.9625779625779626,uk|ru|pl|bg|sr,0,0.2943115234375,0.9963804483413696,0.9391700768558114,0.46370226852934104 58 | Latvian,lv,1243,0.9597747385358005,lt|hr|sr|fi|eo,4,0.14727765321731567,0.9997793436050415,0.8689372204577105,0.19104408176950527 59 | Chuvash,cv,460,0.9543478260869566,ru|uk|ba|sr,0,0.4113158583641052,0.9127076268196106,0.9045468202402555,0.2456898053309747 60 | Breton,br,2451,0.9543043655650755,fr|nl|eu|de|pt,0,0.11872711777687073,0.99974125623703,0.8608614166133552,0.15981427728456765 61 | Bashkir,ba,120,0.95,tt|av,0,0.47795307636260986,0.987030029296875,0.853874352934853,0.5168701093643904 62 | Indonesian,id,9372,0.949637217242851,ms|it|en|eo|tr,16,0.15933756530284882,0.9737618565559387,0.7402594791836996,0.3008475312963128 63 | Danish,da,15299,0.948035819334597,no|sv|de|en|nn,2,0.13372956216335297,1.0000048875808716,0.8043522860550157,0.24264461969795084 64 | Estonian,et,1227,0.9356153219233904,fi|en|hu|it|nl,5,0.14920711517333984,0.9715332388877869,0.7930725701567162,0.24454671802305722 65 | Latin,la,11437,0.9206085511934948,fr|it|en|es|pt,292,0.07717550545930862,1.0000468492507935,0.6563664864164119,0.1617820091332059 66 | Irish,ga,867,0.9065743944636678,en|gd|ca|kv|cs,14,0.11624463647603989,0.9368592500686646,0.6668249724112758,0.2168640845719679 67 | Scottish 
Gaelic,gd,542,0.8966789667896679,en|ga|de|fr|pam,2,0.11731892079114914,0.5798916220664978,0.5895466482948987,0.13791651890746184 68 | Welsh,cy,619,0.8917609046849758,es|en|la|kw|de,8,0.16016614437103271,0.9361439943313599,0.7355150005527274,0.1879545453129642 69 | Catalan,ca,4725,0.8833862433862434,es|pt|fr|it|ro,0,0.10038846731185913,0.9994297623634338,0.7601376923044137,0.2816235821618286 70 | Kyrgyz,ky,66,0.8787878787878788,ru|kk,4,0.39130115509033203,0.7497158050537109,0.7278588729135789,0.3713524490594864 71 | Cornish,kw,426,0.8779342723004695,en|cy|de|br|sq,1,0.12850528955459595,0.998835027217865,0.7223656110285457,0.1483097232199865 72 | Assamese,as,960,0.8635416666666667,bn,0,0.4754824936389923,0.9992721676826477,0.8744139322772514,0.5647418260090462 73 | Volapük,vo,806,0.8511166253101737,id|de|fi|en|eo,15,0.12574473023414612,0.9909709095954895,0.5384087601954725,0.1557349536800757 74 | Serbian,sr,13494,0.8489699125537276,hr|sh|mk|bs|sl,1050,0.12105872482061386,0.9961422681808472,0.6466786750228418,0.21803273776773696 75 | Slovak,sk,4370,0.8263157894736842,cs|pl|sl|no|sr,45,0.1748054176568985,1.0000356435775757,0.757084173876699,0.34975553321144875 76 | Maltese,mt,52,0.8076923076923077,es|cs|pt|sr|eo,7,0.17778615653514862,0.44417107105255127,0.562098236668611,0.0677537739276886 77 | Norwegian Nynorsk,nn (no),657,0.7990867579908676,da|sv|de|es|fi,29,0.20111340284347534,0.9768945574760437,0.5750402996634165,0.3076396750102779 78 | Afrikaans,af,1632,0.7879901960784313,nl|en|fr|de|nds,0,0.2565021514892578,0.9996443390846252,0.7469852737733197,0.480035386885371 79 | Occitan,oc,2861,0.7679133170220203,ca|es|fr|pt|it,27,0.14034049212932587,0.9993643760681152,0.693478145898965,0.27567044537355256 80 | Interlingua,ia,18782,0.7500798636992866,es|it|fr|la|pt,82,0.08685839176177979,0.9834362268447876,0.5305242322956538,0.2132283623167998 81 | Sanskrit,sa,11,0.7272727272727273,hi|ne,0,0.8556936979293823,0.8185657262802124,0.9226900283247232,0.4112839549779892 82 | 
Chechen,ce,7,0.7142857142857143,mn|ru,0,0.36961764097213745,0.6981340050697327,0.4036620110273361,0.3522151783108711 83 | Slovenian,sl,372,0.6774193548387096,sr|hr|bs|pl|eo,62,0.17472949624061584,0.9461504220962524,0.5286300588437977,0.22120561103026073 84 | Frisian,fy,107,0.6635514018691588,nl|en|de|af|fr,8,0.18109886348247528,0.9549309015274048,0.687566593225685,0.259281580724443 85 | Javanese,jv,260,0.6461538461538462,id|en|ms|ko|su,5,0.11977090686559677,0.941997766494751,0.48727030853929354,0.18967366327896065 86 | Yoruba,yo,5,0.6,sk|rm,1,0.4282495677471161,0.39966583251953125,0.3706127094725768,0.08796234056353569 87 | Luxembourgish,lb,217,0.5944700460829493,de|nds|sv|fr|nl,3,0.13198049366474152,0.9986009001731873,0.6048251575422149,0.36636357403437037 88 | Galician,gl,2618,0.5790679908326967,pt|es|it|fr|ca,8,0.1384539157152176,0.9998673796653748,0.6503780012143217,0.45353942153052706 89 | Turkmen,tk,3793,0.5710519377801213,tr|uz|en|et|io,0,0.09297733008861542,0.9993958473205566,0.5266673936840375,0.3539530090760075 90 | Croatian,hr,2222,0.5333033303330333,sr|sh|bs|sl|pl,45,0.11031389981508255,0.9780457615852356,0.4462057993209917,0.3404354233619835 91 | Aragonese,an,4,0.5,es,0,0.660906970500946,0.6260275840759277,0.5730769485235214,0.38740820437669754 92 | Ido,io,2905,0.48055077452667816,eo|es|it|pt|tr,7,0.1342163234949112,0.9914343953132629,0.5664547488444712,0.3550978258184717 93 | Interlingue,ie,2007,0.4718485301444943,es|it|fr|en|ia,7,0.0876827985048294,0.9982254505157471,0.37375914048941444,0.20109766928417572 94 | Limburgan| Limburger| Limburgish,li,3,0.3333333333333333,de,1,0.6388689279556274,0.9507495760917664,0.5632038861513138,0.4825317715294659 95 | Walloon,wa,16,0.3125,fr|pt|tl|oc|en,1,0.6337143182754517,0.9523198008537292,0.683312127366662,0.30093699278817937 96 | Somali,so,32,0.21875,fi|eo|cy|en|az,1,0.09245844185352325,0.956329882144928,0.09194009538207736,0.15861030662432313 97 | 
Corsican,co,5,0.2,it|fr,0,0.8006647825241089,0.8259631395339966,0.7590430341660976,0.4008978884667158 98 | Sundanese,su,11,0.18181818181818182,id|ms|es,19,0.20011375844478607,0.7112906575202942,0.08744853362441063,0.22517896278036964 99 | Haitian Creole,ht,15,0.06666666666666667,br|fr|su|diq|no,3,0.1751844733953476,0.5749273300170898,0.06675867736339569,0.15369660247649466 100 | Romansh,rm,16,0.0625,it|fr|en|tl|qu,3,0.40953710675239563,0.7947274446487427,0.3415144458413124,0.22089687262972196 101 | Bosnian,bs,139,0.03597122302158273,sr|hr|sh|pl|sl,0,0.39864686131477356,0.9914036989212036,0.32569590508937835,0.42717353254035395 102 | Manx,gv,6,0,cy|fr|nl|et|en,0,1,0.7409127354621887,NaN,0.32388975595434505 -------------------------------------------------------------------------------- /benchmark-testing/results/benchmark_results_0.2.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "am": { "count": 3, "accuratePredictions": 3, "accuracy": 1 }, 3 | "an": { "count": 4, "accuratePredictions": 2, "accuracy": 0.5 }, 4 | "as": { "count": 960, "accuratePredictions": 829, "accuracy": 0.8635416666666667 }, 5 | "ba": { "count": 120, "accuratePredictions": 114, "accuracy": 0.95 }, 6 | "af": { "count": 1632, "accuratePredictions": 1286, "accuracy": 0.7879901960784313 }, 7 | "bn": { "count": 1164, "accuratePredictions": 1164, "accuracy": 1 }, 8 | "bo": { "count": 20, "accuratePredictions": 20, "accuracy": 1 }, 9 | "bs": { "count": 139, "accuratePredictions": 5, "accuracy": 0.03597122302158273 }, 10 | "br": { "count": 2451, "accuratePredictions": 2339, "accuracy": 0.9543043655650755 }, 11 | "be": { "count": 6253, "accuratePredictions": 6019, "accuracy": 0.9625779625779626 }, 12 | "ar": { "count": 8761, "accuratePredictions": 8742, "accuracy": 0.9978312977970552 }, 13 | "ca": { "count": 4725, "accuratePredictions": 4174, "accuracy": 0.8833862433862434 }, 14 | "ce": { "count": 7, "accuratePredictions": 5, "accuracy": 
0.7142857142857143 }, 15 | "cv": { "count": 460, "accuratePredictions": 439, "accuracy": 0.9543478260869566 }, 16 | "zh": { "count": 568, "accuratePredictions": 568, "accuracy": 1 }, 17 | "kw": { "count": 426, "accuratePredictions": 374, "accuracy": 0.8779342723004695 }, 18 | "co": { "count": 5, "accuratePredictions": 1, "accuracy": 0.2 }, 19 | "cy": { "count": 619, "accuratePredictions": 552, "accuracy": 0.8917609046849758 }, 20 | "bg": { "count": 11144, "accuratePredictions": 10999, "accuracy": 0.9869885139985642 }, 21 | "cs": { "count": 10863, "accuratePredictions": 10753, "accuracy": 0.9898738838258307 }, 22 | "dv": { "count": 15, "accuratePredictions": 15, "accuracy": 1 }, 23 | "da": { "count": 15299, "accuratePredictions": 14504, "accuracy": 0.948035819334597 }, 24 | "el": { "count": 12039, "accuratePredictions": 12039, "accuracy": 1 }, 25 | "de": { "count": 22014, "accuratePredictions": 22011, "accuracy": 0.9998637230853094 }, 26 | "eu": { "count": 2999, "accuratePredictions": 2902, "accuracy": 0.9676558852950984 }, 27 | "eo": { "count": 17841, "accuratePredictions": 17833, "accuracy": 0.999551594641556 }, 28 | "en": { "count": 22428, "accuratePredictions": 22428, "accuracy": 1 }, 29 | "fy": { "count": 107, "accuratePredictions": 71, "accuracy": 0.6635514018691588 }, 30 | "gd": { "count": 542, "accuratePredictions": 486, "accuracy": 0.8966789667896679 }, 31 | "ga": { "count": 867, "accuratePredictions": 786, "accuracy": 0.9065743944636678 }, 32 | "gl": { "count": 2618, "accuratePredictions": 1516, "accuracy": 0.5790679908326967 }, 33 | "gv": { "count": 6, "accuratePredictions": 0, "accuracy": 0 }, 34 | "gu": { "count": 57, "accuratePredictions": 57, "accuracy": 1 }, 35 | "ht": { "count": 15, "accuratePredictions": 1, "accuracy": 0.06666666666666667 }, 36 | "fi": { "count": 17406, "accuratePredictions": 17390, "accuracy": 0.9990807767436516 }, 37 | "hi": { "count": 5362, "accuratePredictions": 5360, "accuracy": 0.999627004848937 }, 38 | "he": { "count": 8616, 
"accuratePredictions": 8616, "accuracy": 1 }, 39 | "hr": { "count": 2222, "accuratePredictions": 1185, "accuracy": 0.5333033303330333 }, 40 | "hy": { "count": 518, "accuratePredictions": 517, "accuracy": 0.9980694980694981 }, 41 | "io": { "count": 2905, "accuratePredictions": 1396, "accuracy": 0.48055077452667816 }, 42 | "ie": { "count": 2007, "accuratePredictions": 947, "accuracy": 0.4718485301444943 }, 43 | "fr": { "count": 23076, "accuratePredictions": 23065, "accuracy": 0.999523314265904 }, 44 | "id": { "count": 9372, "accuratePredictions": 8900, "accuracy": 0.949637217242851 }, 45 | "hu": { "count": 17942, "accuratePredictions": 17936, "accuracy": 0.9996655891204994 }, 46 | "is": { "count": 6364, "accuratePredictions": 6239, "accuracy": 0.9803582652419862 }, 47 | "jv": { "count": 260, "accuratePredictions": 168, "accuracy": 0.6461538461538462 }, 48 | "ja": { "count": 2169, "accuratePredictions": 2169, "accuracy": 1 }, 49 | "ia": { "count": 18782, "accuratePredictions": 14088, "accuracy": 0.7500798636992866 }, 50 | "kn": { "count": 118, "accuratePredictions": 118, "accuracy": 1 }, 51 | "ka": { "count": 1973, "accuratePredictions": 1973, "accuracy": 1 }, 52 | "kk": { "count": 2232, "accuratePredictions": 2188, "accuracy": 0.9802867383512545 }, 53 | "km": { "count": 379, "accuratePredictions": 376, "accuracy": 0.9920844327176781 }, 54 | "ky": { "count": 66, "accuratePredictions": 58, "accuracy": 0.8787878787878788 }, 55 | "lo": { "count": 70, "accuratePredictions": 70, "accuracy": 1 }, 56 | "ko": { "count": 482, "accuratePredictions": 482, "accuracy": 1 }, 57 | "li": { "count": 3, "accuratePredictions": 1, "accuracy": 0.3333333333333333 }, 58 | "it": { "count": 18326, "accuratePredictions": 18307, "accuracy": 0.9989632216522972 }, 59 | "lb": { "count": 217, "accuratePredictions": 129, "accuracy": 0.5944700460829493 }, 60 | "ml": { "count": 517, "accuratePredictions": 517, "accuracy": 1 }, 61 | "la": { "count": 11437, "accuratePredictions": 10529, "accuracy": 
0.9206085511934948 }, 62 | "lt": { "count": 13835, "accuratePredictions": 13743, "accuracy": 0.9933501987712324 }, 63 | "mt": { "count": 52, "accuratePredictions": 42, "accuracy": 0.8076923076923077 }, 64 | "my": { "count": 216, "accuratePredictions": 216, "accuracy": 1 }, 65 | "mr": { "count": 10461, "accuratePredictions": 10456, "accuracy": 0.9995220342223496 }, 66 | "nn": { "count": 657, "accuratePredictions": 343, "accuracy": 0.5220700152207002 }, 67 | "oc": { "count": 2861, "accuratePredictions": 2197, "accuracy": 0.7679133170220203 }, 68 | "os": { "count": 59, "accuratePredictions": 58, "accuracy": 0.9830508474576272 }, 69 | "pa": { "count": 88, "accuratePredictions": 88, "accuracy": 1 }, 70 | "mk": { "count": 14465, "accuratePredictions": 14407, "accuracy": 0.9959903214656066 }, 71 | "nl": { "count": 19626, "accuratePredictions": 19498, "accuracy": 0.9934780393355752 }, 72 | "rm": { "count": 16, "accuratePredictions": 1, "accuracy": 0.0625 }, 73 | "pl": { "count": 17768, "accuratePredictions": 17763, "accuracy": 0.999718595227375 }, 74 | "pt": { "count": 20174, "accuratePredictions": 20039, "accuracy": 0.9933082184990581 }, 75 | "sa": { "count": 11, "accuratePredictions": 8, "accuracy": 0.7272727272727273 }, 76 | "si": { "count": 9, "accuratePredictions": 9, "accuracy": 1 }, 77 | "ro": { "count": 13560, "accuratePredictions": 13406, "accuracy": 0.9886430678466077 }, 78 | "sl": { "count": 372, "accuratePredictions": 252, "accuracy": 0.6774193548387096 }, 79 | "so": { "count": 32, "accuratePredictions": 7, "accuracy": 0.21875 }, 80 | "sk": { "count": 4370, "accuratePredictions": 3611, "accuracy": 0.8263157894736842 }, 81 | "ru": { "count": 17329, "accuratePredictions": 17324, "accuracy": 0.9997114663281205 }, 82 | "su": { "count": 11, "accuratePredictions": 2, "accuracy": 0.18181818181818182 }, 83 | "sr": { "count": 13494, "accuratePredictions": 11456, "accuracy": 0.8489699125537276 }, 84 | "ta": { "count": 205, "accuratePredictions": 205, "accuracy": 1 }, 85 
| "es": { "count": 18227, "accuratePredictions": 18193, "accuracy": 0.998134635430954 }, 86 | "te": { "count": 102, "accuratePredictions": 102, "accuracy": 1 }, 87 | "tg": { "count": 30, "accuratePredictions": 29, "accuracy": 0.9666666666666667 }, 88 | "sv": { "count": 12188, "accuratePredictions": 12050, "accuracy": 0.9886773875943551 }, 89 | "th": { "count": 572, "accuratePredictions": 572, "accuracy": 1 }, 90 | "tt": { "count": 8178, "accuratePredictions": 7917, "accuracy": 0.9680851063829787 }, 91 | "tk": { "count": 3793, "accuratePredictions": 2166, "accuracy": 0.5710519377801213 }, 92 | "tl": { "count": 10351, "accuratePredictions": 10079, "accuracy": 0.9737223456670853 }, 93 | "ug": { "count": 3692, "accuratePredictions": 3689, "accuracy": 0.9991874322860238 }, 94 | "ur": { "count": 963, "accuratePredictions": 954, "accuracy": 0.9906542056074766 }, 95 | "uk": { "count": 14285, "accuratePredictions": 14233, "accuracy": 0.9963598179908996 }, 96 | "vo": { "count": 806, "accuratePredictions": 686, "accuracy": 0.8511166253101737 }, 97 | "wa": { "count": 16, "accuratePredictions": 5, "accuracy": 0.3125 }, 98 | "yo": { "count": 5, "accuratePredictions": 3, "accuracy": 0.6 }, 99 | "vi": { "count": 13000, "accuratePredictions": 12995, "accuracy": 0.9996153846153846 }, 100 | "tr": { "count": 19919, "accuratePredictions": 19911, "accuracy": 0.9995983734123199 } 101 | } 102 | -------------------------------------------------------------------------------- /benchmark-testing/results/benchmark_results_0.2.2.json: -------------------------------------------------------------------------------- 1 | 
{"am":{"count":154,"accuratePredictions":154,"mislabels":[],"accuracy":1},"an":{"count":73,"accuratePredictions":1,"mislabels":[{"lang":"es","count":29},{"lang":"pt","count":17},{"lang":"it","count":6},{"lang":"en","count":3},{"lang":"fr","count":3}],"accuracy":0.0136986301369863},"as":{"count":2635,"accuratePredictions":1878,"mislabels":[{"lang":"bn","count":730},{"lang":"bpy","count":21},{"lang":"en","count":2},{"lang":"tl","count":1},{"lang":"bh","count":1}],"accuracy":0.7127134724857686},"ba":{"count":128,"accuratePredictions":107,"mislabels":[{"lang":"tt","count":15},{"lang":"ru","count":3},{"lang":"sr","count":1},{"lang":"av","count":1},{"lang":"kk","count":1}],"accuracy":0.8359375},"af":{"count":3299,"accuratePredictions":2246,"mislabels":[{"lang":"nl","count":626},{"lang":"en","count":251},{"lang":"de","count":38},{"lang":"fr","count":34},{"lang":"nds","count":24}],"accuracy":0.6808123673840558},"bn":{"count":4132,"accuratePredictions":4129,"mislabels":[{"lang":"bpy","count":2},{"lang":"as","count":1}],"accuracy":0.9992739593417231},"bo":{"count":29,"accuratePredictions":29,"mislabels":[],"accuracy":1},"bs":{"count":520,"accuratePredictions":23,"mislabels":[{"lang":"sr","count":267},{"lang":"hr","count":143},{"lang":"sh","count":22},{"lang":"it","count":10},{"lang":"pl","count":9}],"accuracy":0.04423076923076923},"be":{"count":9005,"accuratePredictions":7911,"mislabels":[{"lang":"ru","count":584},{"lang":"uk","count":434},{"lang":"bg","count":19},{"lang":"mk","count":17},{"lang":"pl","count":9}],"accuracy":0.8785119378123265},"br":{"count":5468,"accuratePredictions":4691,"mislabels":[{"lang":"en","count":133},{"lang":"fr","count":110},{"lang":"pt","count":79},{"lang":"de","count":59},{"lang":"eu","count":56}],"accuracy":0.8579005120702268},"ca":{"count":5313,"accuratePredictions":3987,"mislabels":[{"lang":"es","count":457},{"lang":"pt","count":257},{"lang":"it","count":146},{"lang":"fr","count":131},{"lang":"en","count":79}],"accuracy":0.7504234895539243},"a
r":{"count":25531,"accuratePredictions":25417,"mislabels":[{"lang":"arz","count":51},{"lang":"fa","count":39},{"lang":"en","count":5},{"lang":"mzn","count":3},{"lang":"ps","count":3}],"accuracy":0.9955348399984333},"ce":{"count":25,"accuratePredictions":15,"mislabels":[{"lang":"bg","count":4},{"lang":"sr","count":2},{"lang":"mn","count":1},{"lang":"ba","count":1},{"lang":"uk","count":1}],"accuracy":0.6},"cv":{"count":1332,"accuratePredictions":1263,"mislabels":[{"lang":"ru","count":52},{"lang":"uk","count":6},{"lang":"krc","count":2},{"lang":"ba","count":2},{"lang":"sr","count":2}],"accuracy":0.9481981981981982},"bg":{"count":19328,"accuratePredictions":18076,"mislabels":[{"lang":"mk","count":836},{"lang":"ru","count":344},{"lang":"uk","count":34},{"lang":"sr","count":30},{"lang":"tg","count":4}],"accuracy":0.9352235099337748},"kw":{"count":3757,"accuratePredictions":3291,"mislabels":[{"lang":"en","count":64},{"lang":"de","count":53},{"lang":"cy","count":47},{"lang":"es","count":27},{"lang":"br","count":22}],"accuracy":0.8759648655842428},"co":{"count":13,"accuratePredictions":3,"mislabels":[{"lang":"it","count":4},{"lang":"min","count":1},{"lang":"ro","count":1},{"lang":"ilo","count":1},{"lang":"id","count":1}],"accuracy":0.23076923076923078},"cy":{"count":1167,"accuratePredictions":894,"mislabels":[{"lang":"en","count":46},{"lang":"es","count":42},{"lang":"kw","count":20},{"lang":"la","count":17},{"lang":"it","count":15}],"accuracy":0.7660668380462725},"cs":{"count":25189,"accuratePredictions":23970,"mislabels":[{"lang":"sk","count":220},{"lang":"pl","count":177},{"lang":"hu","count":98},{"lang":"en","count":87},{"lang":"sl","count":86}],"accuracy":0.951605859700663},"zh":{"count":17801,"accuratePredictions":17763,"mislabels":[{"lang":"wuu","count":28},{"lang":"yue","count":5},{"lang":"ja","count":2},{"lang":"sr","count":1},{"lang":"pt","count":1}],"accuracy":0.9978652884669401},"dv":{"count":15,"accuratePredictions":15,"mislabels":[],"accuracy":1},"da":{"count":2
2539,"accuratePredictions":20469,"mislabels":[{"lang":"no","count":931},{"lang":"sv","count":463},{"lang":"de","count":149},{"lang":"en","count":138},{"lang":"fr","count":81}],"accuracy":0.9081591907360576},"de":{"count":17223,"accuratePredictions":17179,"mislabels":[{"lang":"en","count":23},{"lang":"it","count":6},{"lang":"fr","count":3},{"lang":"es","count":2},{"lang":"sv","count":2}],"accuracy":0.9974452766649248},"el":{"count":24980,"accuratePredictions":24979,"mislabels":[{"lang":"en","count":1}],"accuracy":0.9999599679743795},"et":{"count":3077,"accuratePredictions":2475,"mislabels":[{"lang":"fi","count":180},{"lang":"en","count":115},{"lang":"hu","count":38},{"lang":"tr","count":34},{"lang":"it","count":24}],"accuracy":0.8043548911277218},"en":{"count":17094,"accuratePredictions":17071,"mislabels":[{"lang":"nl","count":5},{"lang":"it","count":4},{"lang":"hu","count":2},{"lang":"eo","count":2},{"lang":"es","count":2}],"accuracy":0.9986544986544986},"eu":{"count":4570,"accuratePredictions":4045,"mislabels":[{"lang":"it","count":101},{"lang":"id","count":50},{"lang":"hu","count":46},{"lang":"nl","count":41},{"lang":"eo","count":31}],"accuracy":0.8851203501094091},"eo":{"count":21641,"accuratePredictions":21569,"mislabels":[{"lang":"it","count":22},{"lang":"es","count":16},{"lang":"tr","count":6},{"lang":"pt","count":5},{"lang":"pl","count":2}],"accuracy":0.9966729818400258},"fy":{"count":282,"accuratePredictions":104,"mislabels":[{"lang":"nl","count":43},{"lang":"en","count":41},{"lang":"nds","count":21},{"lang":"de","count":15},{"lang":"fr","count":12}],"accuracy":0.36879432624113473},"gd":{"count":753,"accuratePredictions":589,"mislabels":[{"lang":"en","count":46},{"lang":"ga","count":25},{"lang":"de","count":15},{"lang":"fr","count":13},{"lang":"pam","count":8}],"accuracy":0.7822045152722443},"ga":{"count":1977,"accuratePredictions":1661,"mislabels":[{"lang":"en","count":68},{"lang":"pt","count":44},{"lang":"es","count":32},{"lang":"ca","count":23},{"lang":"g
d","count":16}],"accuracy":0.840161861406171},"gl":{"count":3245,"accuratePredictions":1363,"mislabels":[{"lang":"pt","count":1047},{"lang":"es","count":612},{"lang":"it","count":94},{"lang":"en","count":30},{"lang":"fr","count":24}],"accuracy":0.4200308166409861},"gv":{"count":30,"accuratePredictions":2,"mislabels":[{"lang":"en","count":5},{"lang":"it","count":2},{"lang":"pt","count":2},{"lang":"fr","count":2},{"lang":"kw","count":2}],"accuracy":0.06666666666666667},"gu":{"count":116,"accuratePredictions":116,"mislabels":[],"accuracy":1},"ht":{"count":58,"accuratePredictions":7,"mislabels":[{"lang":"en","count":9},{"lang":"fr","count":6},{"lang":"br","count":4},{"lang":"la","count":3},{"lang":"de","count":3}],"accuracy":0.1206896551724138},"fr":{"count":16040,"accuratePredictions":15966,"mislabels":[{"lang":"en","count":26},{"lang":"it","count":16},{"lang":"ia","count":9},{"lang":"es","count":8},{"lang":"pt","count":4}],"accuracy":0.9953865336658354},"fi":{"count":20731,"accuratePredictions":20419,"mislabels":[{"lang":"it","count":74},{"lang":"en","count":39},{"lang":"eo","count":23},{"lang":"et","count":21},{"lang":"nl","count":15}],"accuracy":0.9849500747672567},"hr":{"count":4186,"accuratePredictions":1826,"mislabels":[{"lang":"sr","count":1794},{"lang":"sh","count":110},{"lang":"bs","count":87},{"lang":"sl","count":79},{"lang":"pl","count":68}],"accuracy":0.4362159579550884},"hi":{"count":11497,"accuratePredictions":11449,"mislabels":[{"lang":"mr","count":35},{"lang":"dty","count":2},{"lang":"new","count":2},{"lang":"bh","count":2},{"lang":"ne","count":2}],"accuracy":0.9958249978255197},"hy":{"count":1855,"accuratePredictions":1854,"mislabels":[{"lang":"de","count":1}],"accuracy":0.9994609164420485},"io":{"count":6495,"accuratePredictions":3155,"mislabels":[{"lang":"eo","count":2185},{"lang":"es","count":258},{"lang":"it","count":225},{"lang":"pt","count":169},{"lang":"tr","count":102}],"accuracy":0.4857582755966128},"he":{"count":26461,"accuratePredictions":26
459,"mislabels":[{"lang":"en","count":1},{"lang":"yi","count":1}],"accuracy":0.9999244170666264},"ie":{"count":6538,"accuratePredictions":3389,"mislabels":[{"lang":"es","count":1139},{"lang":"it","count":416},{"lang":"en","count":333},{"lang":"fr","count":254},{"lang":"eo","count":230}],"accuracy":0.5183542367696543},"hu":{"count":20843,"accuratePredictions":20744,"mislabels":[{"lang":"en","count":20},{"lang":"pt","count":10},{"lang":"it","count":6},{"lang":"nl","count":6},{"lang":"eo","count":6}],"accuracy":0.9952502039053879},"is":{"count":7745,"accuratePredictions":7319,"mislabels":[{"lang":"da","count":64},{"lang":"et","count":53},{"lang":"cs","count":43},{"lang":"no","count":41},{"lang":"de","count":39}],"accuracy":0.9449967721110394},"id":{"count":9707,"accuratePredictions":8871,"mislabels":[{"lang":"ms","count":510},{"lang":"en","count":74},{"lang":"it","count":70},{"lang":"eo","count":31},{"lang":"tr","count":26}],"accuracy":0.9138765839085197},"jv":{"count":548,"accuratePredictions":248,"mislabels":[{"lang":"id","count":95},{"lang":"en","count":66},{"lang":"ko","count":19},{"lang":"ms","count":15},{"lang":"hu","count":14}],"accuracy":0.45255474452554745},"ia":{"count":14949,"accuratePredictions":10574,"mislabels":[{"lang":"it","count":1678},{"lang":"es","count":1301},{"lang":"fr","count":447},{"lang":"en","count":217},{"lang":"la","count":208}],"accuracy":0.7073382834972239},"kn":{"count":126,"accuratePredictions":126,"mislabels":[],"accuracy":1},"ka":{"count":4550,"accuratePredictions":4532,"mislabels":[{"lang":"xmf","count":17},{"lang":"en","count":1}],"accuracy":0.996043956043956},"kk":{"count":2373,"accuratePredictions":2197,"mislabels":[{"lang":"uk","count":28},{"lang":"tt","count":26},{"lang":"tr","count":26},{"lang":"ru","count":23},{"lang":"ky","count":15}],"accuracy":0.9258322798145807},"km":{"count":1252,"accuratePredictions":1242,"mislabels":[{"lang":"az","count":5},{"lang":"ru","count":3},{"lang":"sr","count":1},{"lang":"et","count":1}],"accurac
y":0.9920127795527156},"ky":{"count":227,"accuratePredictions":171,"mislabels":[{"lang":"ru","count":29},{"lang":"kk","count":11},{"lang":"tt","count":8},{"lang":"mn","count":3},{"lang":"bg","count":1}],"accuracy":0.7533039647577092},"ko":{"count":6128,"accuratePredictions":6126,"mislabels":[{"lang":"tr","count":1},{"lang":"ja","count":1}],"accuracy":0.9996736292428199},"lo":{"count":183,"accuratePredictions":182,"mislabels":[{"lang":"el","count":1}],"accuracy":0.994535519125683},"it":{"count":20449,"accuratePredictions":20359,"mislabels":[{"lang":"es","count":38},{"lang":"en","count":14},{"lang":"fr","count":10},{"lang":"eo","count":5},{"lang":"pt","count":3}],"accuracy":0.995598806787618},"li":{"count":34,"accuratePredictions":5,"mislabels":[{"lang":"de","count":12},{"lang":"nl","count":4},{"lang":"en","count":4},{"lang":"no","count":2},{"lang":"is","count":1}],"accuracy":0.14705882352941177},"ja":{"count":28060,"accuratePredictions":28059,"mislabels":[{"lang":"zh","count":1}],"accuracy":0.9999643620812545},"lb":{"count":732,"accuratePredictions":291,"mislabels":[{"lang":"de","count":170},{"lang":"fr","count":34},{"lang":"en","count":31},{"lang":"nds","count":26},{"lang":"nl","count":22}],"accuracy":0.3975409836065574},"lv":{"count":2198,"accuratePredictions":1920,"mislabels":[{"lang":"lt","count":77},{"lang":"es","count":23},{"lang":"sr","count":16},{"lang":"en","count":15},{"lang":"fr","count":15}],"accuracy":0.8735213830755232},"ml":{"count":525,"accuratePredictions":525,"mislabels":[],"accuracy":1},"la":{"count":24699,"accuratePredictions":22178,"mislabels":[{"lang":"it","count":694},{"lang":"fr","count":463},{"lang":"en","count":383},{"lang":"es","count":278},{"lang":"pt","count":99}],"accuracy":0.8979310903275436},"lt":{"count":24184,"accuratePredictions":23211,"mislabels":[{"lang":"eo","count":138},{"lang":"fi","count":89},{"lang":"sr","count":80},{"lang":"pt","count":79},{"lang":"pl","count":71}],"accuracy":0.9597667879589812},"mt":{"count":165,"accuratePr
edictions":111,"mislabels":[{"lang":"hu","count":9},{"lang":"en","count":8},{"lang":"es","count":8},{"lang":"it","count":4},{"lang":"pl","count":3}],"accuracy":0.6727272727272727},"my":{"count":243,"accuratePredictions":243,"mislabels":[],"accuracy":1},"mr":{"count":25633,"accuratePredictions":25606,"mislabels":[{"lang":"hi","count":20},{"lang":"gom","count":4},{"lang":"pt","count":2},{"lang":"new","count":1}],"accuracy":0.9989466703078064},"nn":{"count":1287,"accuratePredictions":875,"mislabels":[{"lang":"da","count":192},{"lang":"sv","count":96},{"lang":"de","count":33},{"lang":"es","count":13},{"lang":"hu","count":11}],"accuracy":0.6798756798756799},"oc":{"count":4096,"accuratePredictions":2898,"mislabels":[{"lang":"es","count":329},{"lang":"fr","count":246},{"lang":"ca","count":243},{"lang":"pt","count":104},{"lang":"it","count":63}],"accuracy":0.70751953125},"os":{"count":205,"accuratePredictions":188,"mislabels":[{"lang":"ru","count":6},{"lang":"hy","count":3},{"lang":"sr","count":1},{"lang":"kv","count":1},{"lang":"mrj","count":1}],"accuracy":0.9170731707317074},"pa":{"count":156,"accuratePredictions":156,"mislabels":[],"accuracy":1},"nl":{"count":19349,"accuratePredictions":18809,"mislabels":[{"lang":"en","count":230},{"lang":"de","count":71},{"lang":"nds","count":55},{"lang":"af","count":43},{"lang":"fr","count":43}],"accuracy":0.9720915809602564},"mk":{"count":23602,"accuratePredictions":23201,"mislabels":[{"lang":"ru","count":167},{"lang":"bg","count":155},{"lang":"sr","count":62},{"lang":"uk","count":17}],"accuracy":0.9830099144140327},"rm":{"count":11,"accuratePredictions":0,"mislabels":[{"lang":"it","count":2},{"lang":"pt","count":2},{"lang":"fr","count":2},{"lang":"en","count":2},{"lang":"tl","count":1}],"accuracy":0},"pt":{"count":18352,"accuratePredictions":18047,"mislabels":[{"lang":"es","count":206},{"lang":"it","count":22},{"lang":"gl","count":21},{"lang":"fr","count":18},{"lang":"en","count":14}],"accuracy":0.9833805579773321},"ro":{"count":1836
7,"accuratePredictions":16962,"mislabels":[{"lang":"it","count":237},{"lang":"es","count":226},{"lang":"en","count":184},{"lang":"fr","count":139},{"lang":"pt","count":127}],"accuracy":0.9235041106332008},"sa":{"count":135,"accuratePredictions":125,"mislabels":[{"lang":"hi","count":7},{"lang":"ne","count":2},{"lang":"mr","count":1}],"accuracy":0.9259259259259259},"si":{"count":37,"accuratePredictions":37,"mislabels":[],"accuracy":1},"pl":{"count":21386,"accuratePredictions":21258,"mislabels":[{"lang":"en","count":28},{"lang":"it","count":18},{"lang":"eo","count":8},{"lang":"de","count":8},{"lang":"cs","count":8}],"accuracy":0.9940147760216964},"sl":{"count":908,"accuratePredictions":420,"mislabels":[{"lang":"sr","count":152},{"lang":"hr","count":75},{"lang":"cs","count":40},{"lang":"pl","count":32},{"lang":"bs","count":31}],"accuracy":0.46255506607929514},"sd":{"count":6,"accuratePredictions":5,"mislabels":[{"lang":"ur","count":1}],"accuracy":0.8333333333333334},"so":{"count":61,"accuratePredictions":9,"mislabels":[{"lang":"en","count":14},{"lang":"fi","count":11},{"lang":"et","count":6},{"lang":"cy","count":3},{"lang":"su","count":3}],"accuracy":0.14754098360655737},"sk":{"count":13877,"accuratePredictions":8473,"mislabels":[{"lang":"cs","count":2912},{"lang":"pl","count":358},{"lang":"sl","count":209},{"lang":"no","count":200},{"lang":"sr","count":173}],"accuracy":0.6105786553289616},"ru":{"count":20855,"accuratePredictions":20809,"mislabels":[{"lang":"uk","count":21},{"lang":"bg","count":12},{"lang":"mk","count":7},{"lang":"sr","count":3},{"lang":"be","count":2}],"accuracy":0.9977942939343083},"su":{"count":18,"accuratePredictions":4,"mislabels":[{"lang":"id","count":3},{"lang":"es","count":3},{"lang":"en","count":2},{"lang":"it","count":2},{"lang":"lmo","count":1}],"accuracy":0.2222222222222222},"es":{"count":20498,"accuratePredictions":20284,"mislabels":[{"lang":"pt","count":102},{"lang":"it","count":44},{"lang":"fr","count":11},{"lang":"ca","count":9},{"lang":
"en","count":9}],"accuracy":0.9895599570689824},"ta":{"count":229,"accuratePredictions":229,"mislabels":[],"accuracy":1},"tt":{"count":8283,"accuratePredictions":7845,"mislabels":[{"lang":"ru","count":125},{"lang":"tr","count":41},{"lang":"az","count":39},{"lang":"kk","count":35},{"lang":"ky","count":25}],"accuracy":0.9471206084751902},"te":{"count":220,"accuratePredictions":220,"mislabels":[],"accuracy":1},"tg":{"count":48,"accuratePredictions":42,"mislabels":[{"lang":"ru","count":5},{"lang":"uk","count":1}],"accuracy":0.875},"sr":{"count":23128,"accuratePredictions":18629,"mislabels":[{"lang":"hr","count":1435},{"lang":"mk","count":656},{"lang":"sh","count":439},{"lang":"ru","count":347},{"lang":"sl","count":236}],"accuracy":0.8054738844690419},"th":{"count":3399,"accuratePredictions":3399,"mislabels":[],"accuracy":1},"tl":{"count":11133,"accuratePredictions":10235,"mislabels":[{"lang":"ceb","count":230},{"lang":"en","count":183},{"lang":"it","count":66},{"lang":"id","count":64},{"lang":"es","count":50}],"accuracy":0.9193389023623462},"tk":{"count":4585,"accuratePredictions":2071,"mislabels":[{"lang":"tr","count":1179},{"lang":"en","count":284},{"lang":"uz","count":166},{"lang":"et","count":119},{"lang":"pl","count":77}],"accuracy":0.45169029443838604},"sv":{"count":24466,"accuratePredictions":23156,"mislabels":[{"lang":"da","count":368},{"lang":"no","count":236},{"lang":"en","count":124},{"lang":"de","count":82},{"lang":"eo","count":49}],"accuracy":0.9464563067113545},"ug":{"count":6135,"accuratePredictions":6119,"mislabels":[{"lang":"ar","count":8},{"lang":"ba","count":3},{"lang":"tt","count":1},{"lang":"ca","count":1},{"lang":"hu","count":1}],"accuracy":0.9973920130399349},"ur":{"count":1583,"accuratePredictions":1551,"mislabels":[{"lang":"pnb","count":18},{"lang":"fa","count":7},{"lang":"ug","count":4},{"lang":"en","count":2},{"lang":"ro","count":1}],"accuracy":0.9797852179406191},"tr":{"count":18879,"accuratePredictions":18838,"mislabels":[{"lang":"en","count
":9},{"lang":"eo","count":5},{"lang":"az","count":5},{"lang":"es","count":4},{"lang":"it","count":3}],"accuracy":0.9978282748026909},"vi":{"count":13130,"accuratePredictions":13092,"mislabels":[{"lang":"it","count":6},{"lang":"pms","count":6},{"lang":"eo","count":4},{"lang":"pt","count":4},{"lang":"fr","count":3}],"accuracy":0.9971058644325971},"wa":{"count":37,"accuratePredictions":11,"mislabels":[{"lang":"fr","count":7},{"lang":"en","count":4},{"lang":"no","count":3},{"lang":"it","count":2},{"lang":"gn","count":1}],"accuracy":0.2972972972972973},"yo":{"count":31,"accuratePredictions":22,"mislabels":[{"lang":"ga","count":2},{"lang":"pl","count":2},{"lang":"en","count":2},{"lang":"qu","count":1},{"lang":"ckb","count":1}],"accuracy":0.7096774193548387},"vo":{"count":3941,"accuratePredictions":2999,"mislabels":[{"lang":"id","count":109},{"lang":"en","count":98},{"lang":"eo","count":97},{"lang":"fi","count":69},{"lang":"de","count":67}],"accuracy":0.7609743719868054},"uk":{"count":23251,"accuratePredictions":22848,"mislabels":[{"lang":"ru","count":354},{"lang":"mk","count":18},{"lang":"bg","count":13},{"lang":"be","count":10},{"lang":"sr","count":5}],"accuracy":0.982667412154316}} -------------------------------------------------------------------------------- /benchmark-testing/results/benchmark_results_0.2.3.json: -------------------------------------------------------------------------------- 1 | 
{"am":{"count":3,"accuratePredictions":3,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.9874904751777649,"highestFalseProbability":0,"correctAvgConfidence":0.9915543248450073,"incorrectAvgConfidence":null},"an":{"count":4,"accuratePredictions":2,"mislabels":[{"lang":"es","count":2}],"accuracy":0.5,"falsePositives":0,"lowestProbability":0.660906970500946,"highestFalseProbability":0.6260275840759277,"correctAvgConfidence":0.5730769485235214,"incorrectAvgConfidence":0.38740820437669754},"as":{"count":960,"accuratePredictions":829,"mislabels":[{"lang":"bn","count":131}],"accuracy":0.8635416666666667,"falsePositives":0,"lowestProbability":0.4754824936389923,"highestFalseProbability":0.9992721676826477,"correctAvgConfidence":0.8744139322772514,"incorrectAvgConfidence":0.5647418260090462},"ba":{"count":120,"accuratePredictions":114,"mislabels":[{"lang":"tt","count":5},{"lang":"av","count":1}],"accuracy":0.95,"falsePositives":0,"lowestProbability":0.47795307636260986,"highestFalseProbability":0.987030029296875,"correctAvgConfidence":0.853874352934853,"incorrectAvgConfidence":0.5168701093643904},"af":{"count":1632,"accuratePredictions":1286,"mislabels":[{"lang":"nl","count":294},{"lang":"en","count":27},{"lang":"fr","count":8},{"lang":"de","count":7},{"lang":"nds","count":6}],"accuracy":0.7879901960784313,"falsePositives":0,"lowestProbability":0.2565021514892578,"highestFalseProbability":0.9996443390846252,"correctAvgConfidence":0.7469852737733197,"incorrectAvgConfidence":0.480035386885371},"bn":{"count":1164,"accuratePredictions":1164,"mislabels":[],"accuracy":1,"falsePositives":131,"lowestProbability":0.5253420472145081,"highestFalseProbability":0,"correctAvgConfidence":0.9968120863004913,"incorrectAvgConfidence":null},"bo":{"count":20,"accuratePredictions":20,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.9992840886116028,"highestFalseProbability":0,"correctAvgConfidence":0.9999102103707628,"incorrectAvgConfidence":null},"bs":{"
count":139,"accuratePredictions":5,"mislabels":[{"lang":"sr","count":82},{"lang":"hr","count":41},{"lang":"sh","count":9},{"lang":"pl","count":1},{"lang":"sl","count":1}],"accuracy":0.03597122302158273,"falsePositives":0,"lowestProbability":0.39864686131477356,"highestFalseProbability":0.9914036989212036,"correctAvgConfidence":0.32569590508937835,"incorrectAvgConfidence":0.42717353254035395},"br":{"count":2451,"accuratePredictions":2339,"mislabels":[{"lang":"fr","count":17},{"lang":"nl","count":12},{"lang":"eu","count":12},{"lang":"de","count":11},{"lang":"pt","count":11}],"accuracy":0.9543043655650755,"falsePositives":0,"lowestProbability":0.11872711777687073,"highestFalseProbability":0.99974125623703,"correctAvgConfidence":0.8608614166133552,"incorrectAvgConfidence":0.15981427728456765},"be":{"count":6253,"accuratePredictions":6019,"mislabels":[{"lang":"uk","count":115},{"lang":"ru","count":96},{"lang":"pl","count":16},{"lang":"bg","count":2},{"lang":"sr","count":1}],"accuracy":0.9625779625779626,"falsePositives":0,"lowestProbability":0.2943115234375,"highestFalseProbability":0.9963804483413696,"correctAvgConfidence":0.9391700768558114,"incorrectAvgConfidence":0.46370226852934104},"ar":{"count":8761,"accuratePredictions":8742,"mislabels":[{"lang":"arz","count":9},{"lang":"fa","count":5},{"lang":"es","count":3},{"lang":"mzn","count":1},{"lang":"en","count":1}],"accuracy":0.9978312977970552,"falsePositives":0,"lowestProbability":0.1952907145023346,"highestFalseProbability":0.9997252821922302,"correctAvgConfidence":0.983906333174537,"incorrectAvgConfidence":0.6618713119096356},"ca":{"count":4725,"accuratePredictions":4174,"mislabels":[{"lang":"es","count":245},{"lang":"pt","count":118},{"lang":"fr","count":60},{"lang":"it","count":51},{"lang":"ro","count":16}],"accuracy":0.8833862433862434,"falsePositives":0,"lowestProbability":0.10038846731185913,"highestFalseProbability":0.9994297623634338,"correctAvgConfidence":0.7601376923044137,"incorrectAvgConfidence":0.2816235
821618286},"ce":{"count":7,"accuratePredictions":5,"mislabels":[{"lang":"mn","count":1},{"lang":"ru","count":1}],"accuracy":0.7142857142857143,"falsePositives":0,"lowestProbability":0.36961764097213745,"highestFalseProbability":0.6981340050697327,"correctAvgConfidence":0.4036620110273361,"incorrectAvgConfidence":0.3522151783108711},"cv":{"count":460,"accuratePredictions":439,"mislabels":[{"lang":"ru","count":18},{"lang":"uk","count":1},{"lang":"ba","count":1},{"lang":"sr","count":1}],"accuracy":0.9543478260869566,"falsePositives":0,"lowestProbability":0.4113158583641052,"highestFalseProbability":0.9127076268196106,"correctAvgConfidence":0.9045468202402555,"incorrectAvgConfidence":0.2456898053309747},"zh":{"count":568,"accuratePredictions":568,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.4597342610359192,"highestFalseProbability":0,"correctAvgConfidence":0.9716624540774863,"incorrectAvgConfidence":null},"kw":{"count":426,"accuratePredictions":374,"mislabels":[{"lang":"en","count":12},{"lang":"cy","count":8},{"lang":"de","count":7},{"lang":"br","count":5},{"lang":"sq","count":3}],"accuracy":0.8779342723004695,"falsePositives":1,"lowestProbability":0.12850528955459595,"highestFalseProbability":0.998835027217865,"correctAvgConfidence":0.7223656110285457,"incorrectAvgConfidence":0.1483097232199865},"co":{"count":5,"accuratePredictions":1,"mislabels":[{"lang":"it","count":3},{"lang":"fr","count":1}],"accuracy":0.2,"falsePositives":0,"lowestProbability":0.8006647825241089,"highestFalseProbability":0.8259631395339966,"correctAvgConfidence":0.7590430341660976,"incorrectAvgConfidence":0.4008978884667158},"cy":{"count":619,"accuratePredictions":552,"mislabels":[{"lang":"es","count":14},{"lang":"en","count":9},{"lang":"la","count":8},{"lang":"kw","count":7},{"lang":"de","count":4}],"accuracy":0.8917609046849758,"falsePositives":8,"lowestProbability":0.16016614437103271,"highestFalseProbability":0.9361439943313599,"correctAvgConfidence":0.7355150005527274
,"incorrectAvgConfidence":0.1879545453129642},"bg":{"count":11144,"accuratePredictions":10999,"mislabels":[{"lang":"mk","count":92},{"lang":"ru","count":47},{"lang":"uk","count":5},{"lang":"sr","count":1}],"accuracy":0.9869885139985642,"falsePositives":2,"lowestProbability":0.3436012268066406,"highestFalseProbability":0.9950229525566101,"correctAvgConfidence":0.9387939350125626,"incorrectAvgConfidence":0.3202424690127373},"cs":{"count":10863,"accuratePredictions":10753,"mislabels":[{"lang":"sk","count":40},{"lang":"pl","count":22},{"lang":"hu","count":11},{"lang":"sl","count":9},{"lang":"en","count":5}],"accuracy":0.9898738838258307,"falsePositives":1,"lowestProbability":0.14324608445167542,"highestFalseProbability":0.9298855066299438,"correctAvgConfidence":0.9387299941674171,"incorrectAvgConfidence":0.223398203690621},"dv":{"count":15,"accuratePredictions":15,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.9750259518623352,"highestFalseProbability":0,"correctAvgConfidence":0.9856150934055525,"incorrectAvgConfidence":null},"da":{"count":15299,"accuratePredictions":14504,"mislabels":[{"lang":"no","count":551},{"lang":"sv","count":121},{"lang":"de","count":33},{"lang":"en","count":27},{"lang":"nn","count":22}],"accuracy":0.948035819334597,"falsePositives":2,"lowestProbability":0.13372956216335297,"highestFalseProbability":1.0000048875808716,"correctAvgConfidence":0.8043522860550157,"incorrectAvgConfidence":0.24264461969795084},"el":{"count":12039,"accuratePredictions":12039,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.5245928764343262,"highestFalseProbability":0,"correctAvgConfidence":0.9983111462375585,"incorrectAvgConfidence":null},"de":{"count":22014,"accuratePredictions":22011,"mislabels":[{"lang":"en","count":3}],"accuracy":0.9998637230853094,"falsePositives":64,"lowestProbability":0.2944102883338928,"highestFalseProbability":0.5308237671852112,"correctAvgConfidence":0.9873204391080626,"incorrectAvgConfidence":0.099053
06498209636},"et":{"count":1227,"accuratePredictions":1148,"mislabels":[{"lang":"fi","count":34},{"lang":"en","count":10},{"lang":"hu","count":10},{"lang":"it","count":4},{"lang":"nl","count":3}],"accuracy":0.9356153219233904,"falsePositives":5,"lowestProbability":0.14920711517333984,"highestFalseProbability":0.9715332388877869,"correctAvgConfidence":0.7930725701567162,"incorrectAvgConfidence":0.24454671802305722},"eu":{"count":2999,"accuratePredictions":2902,"mislabels":[{"lang":"it","count":21},{"lang":"nl","count":18},{"lang":"id","count":11},{"lang":"en","count":8},{"lang":"io","count":5}],"accuracy":0.9676558852950984,"falsePositives":14,"lowestProbability":0.09961723536252975,"highestFalseProbability":0.9024008512496948,"correctAvgConfidence":0.7555618922205802,"incorrectAvgConfidence":0.12721208734533837},"eo":{"count":17841,"accuratePredictions":17833,"mislabels":[{"lang":"it","count":3},{"lang":"es","count":2},{"lang":"pt","count":1},{"lang":"fr","count":1},{"lang":"ceb","count":1}],"accuracy":0.999551594641556,"falsePositives":13,"lowestProbability":0.23145444691181183,"highestFalseProbability":0.6255041360855103,"correctAvgConfidence":0.9771947755108922,"incorrectAvgConfidence":0.15247727558016777},"en":{"count":22428,"accuratePredictions":22428,"mislabels":[],"accuracy":1,"falsePositives":120,"lowestProbability":0.31277090311050415,"highestFalseProbability":0,"correctAvgConfidence":0.9614576326805134,"incorrectAvgConfidence":null},"fy":{"count":107,"accuratePredictions":71,"mislabels":[{"lang":"nl","count":15},{"lang":"en","count":7},{"lang":"de","count":4},{"lang":"af","count":3},{"lang":"fr","count":3}],"accuracy":0.6635514018691588,"falsePositives":8,"lowestProbability":0.18109886348247528,"highestFalseProbability":0.9549309015274048,"correctAvgConfidence":0.687566593225685,"incorrectAvgConfidence":0.259281580724443},"gd":{"count":542,"accuratePredictions":486,"mislabels":[{"lang":"en","count":20},{"lang":"ga","count":13},{"lang":"de","count":4},{"lan
g":"fr","count":3},{"lang":"pam","count":3}],"accuracy":0.8966789667896679,"falsePositives":2,"lowestProbability":0.11731892079114914,"highestFalseProbability":0.5798916220664978,"correctAvgConfidence":0.5895466482948987,"incorrectAvgConfidence":0.13791651890746184},"ga":{"count":867,"accuratePredictions":786,"mislabels":[{"lang":"en","count":18},{"lang":"gd","count":11},{"lang":"ca","count":11},{"lang":"kv","count":6},{"lang":"cs","count":5}],"accuracy":0.9065743944636678,"falsePositives":14,"lowestProbability":0.11624463647603989,"highestFalseProbability":0.9368592500686646,"correctAvgConfidence":0.6668249724112758,"incorrectAvgConfidence":0.2168640845719679},"gl":{"count":2618,"accuratePredictions":1516,"mislabels":[{"lang":"pt","count":689},{"lang":"es","count":378},{"lang":"it","count":15},{"lang":"fr","count":5},{"lang":"ca","count":2}],"accuracy":0.5790679908326967,"falsePositives":8,"lowestProbability":0.1384539157152176,"highestFalseProbability":0.9998673796653748,"correctAvgConfidence":0.6503780012143217,"incorrectAvgConfidence":0.45353942153052706},"gv":{"count":6,"accuratePredictions":0,"mislabels":[{"lang":"cy","count":2},{"lang":"fr","count":1},{"lang":"nl","count":1},{"lang":"et","count":1},{"lang":"en","count":1}],"accuracy":0,"falsePositives":0,"lowestProbability":1,"highestFalseProbability":0.7409127354621887,"correctAvgConfidence":null,"incorrectAvgConfidence":0.32388975595434505},"gu":{"count":57,"accuratePredictions":57,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.9978494048118591,"highestFalseProbability":0,"correctAvgConfidence":0.9997111081808742,"incorrectAvgConfidence":null},"ht":{"count":15,"accuratePredictions":1,"mislabels":[{"lang":"br","count":3},{"lang":"fr","count":3},{"lang":"su","count":2},{"lang":"diq","count":1},{"lang":"no","count":1}],"accuracy":0.06666666666666667,"falsePositives":3,"lowestProbability":0.1751844733953476,"highestFalseProbability":0.5749273300170898,"correctAvgConfidence":0.0667586773633
9569,"incorrectAvgConfidence":0.15369660247649466},"fi":{"count":17406,"accuratePredictions":17390,"mislabels":[{"lang":"it","count":6},{"lang":"et","count":3},{"lang":"en","count":2},{"lang":"hr","count":1},{"lang":"de","count":1}],"accuracy":0.9990807767436516,"falsePositives":37,"lowestProbability":0.16255062818527222,"highestFalseProbability":0.6433226466178894,"correctAvgConfidence":0.9710191056221561,"incorrectAvgConfidence":0.18440376338548958},"he":{"count":8616,"accuratePredictions":8616,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.8786194324493408,"highestFalseProbability":0,"correctAvgConfidence":0.999817035583582,"incorrectAvgConfidence":null},"hi":{"count":5362,"accuratePredictions":5360,"mislabels":[{"lang":"mr","count":2}],"accuracy":0.999627004848937,"falsePositives":0,"lowestProbability":0.5321338772773743,"highestFalseProbability":0.8880062103271484,"correctAvgConfidence":0.9939448039387319,"incorrectAvgConfidence":0.726732112467289},"hr":{"count":2222,"accuratePredictions":1185,"mislabels":[{"lang":"sr","count":852},{"lang":"sh","count":66},{"lang":"bs","count":51},{"lang":"sl","count":38},{"lang":"pl","count":12}],"accuracy":0.5333033303330333,"falsePositives":45,"lowestProbability":0.11031389981508255,"highestFalseProbability":0.9780457615852356,"correctAvgConfidence":0.4462057993209917,"incorrectAvgConfidence":0.3404354233619835},"hy":{"count":518,"accuratePredictions":517,"mislabels":[{"lang":"de","count":1}],"accuracy":0.9980694980694981,"falsePositives":0,"lowestProbability":0.9992043375968933,"highestFalseProbability":0.15621936321258545,"correctAvgConfidence":1.0000453856843372,"incorrectAvgConfidence":0.035118408501148224},"io":{"count":2905,"accuratePredictions":1396,"mislabels":[{"lang":"eo","count":1097},{"lang":"es","count":111},{"lang":"it","count":110},{"lang":"pt","count":54},{"lang":"tr","count":40}],"accuracy":0.48055077452667816,"falsePositives":7,"lowestProbability":0.1342163234949112,"highestFalseProbab
ility":0.9914343953132629,"correctAvgConfidence":0.5664547488444712,"incorrectAvgConfidence":0.3550978258184717},"ie":{"count":2007,"accuratePredictions":947,"mislabels":[{"lang":"es","count":467},{"lang":"it","count":132},{"lang":"fr","count":102},{"lang":"en","count":77},{"lang":"ia","count":74}],"accuracy":0.4718485301444943,"falsePositives":7,"lowestProbability":0.0876827985048294,"highestFalseProbability":0.9982254505157471,"correctAvgConfidence":0.37375914048941444,"incorrectAvgConfidence":0.20109766928417572},"fr":{"count":23076,"accuratePredictions":23065,"mislabels":[{"lang":"en","count":7},{"lang":"es","count":2},{"lang":"it","count":1},{"lang":"ru","count":1}],"accuracy":0.999523314265904,"falsePositives":238,"lowestProbability":0.2356633096933365,"highestFalseProbability":0.6747938990592957,"correctAvgConfidence":0.9778449336716961,"incorrectAvgConfidence":0.17877410623160275},"id":{"count":9372,"accuratePredictions":8900,"mislabels":[{"lang":"ms","count":427},{"lang":"it","count":16},{"lang":"en","count":11},{"lang":"eo","count":4},{"lang":"tr","count":4}],"accuracy":0.949637217242851,"falsePositives":16,"lowestProbability":0.15933756530284882,"highestFalseProbability":0.9737618565559387,"correctAvgConfidence":0.7402594791836996,"incorrectAvgConfidence":0.3008475312963128},"hu":{"count":17942,"accuratePredictions":17936,"mislabels":[{"lang":"tr","count":1},{"lang":"br","count":1},{"lang":"it","count":1},{"lang":"de","count":1},{"lang":"en","count":1}],"accuracy":0.9996655891204994,"falsePositives":43,"lowestProbability":0.21547481417655945,"highestFalseProbability":0.9958308935165405,"correctAvgConfidence":0.9872564970510354,"incorrectAvgConfidence":0.2987443840441604},"is":{"count":6364,"accuratePredictions":6239,"mislabels":[{"lang":"et","count":29},{"lang":"no","count":19},{"lang":"da","count":17},{"lang":"hu","count":12},{"lang":"cs","count":9}],"accuracy":0.9803582652419862,"falsePositives":4,"lowestProbability":0.14347882568836212,"highestFalsePro
bability":0.9222594499588013,"correctAvgConfidence":0.9331574130853139,"incorrectAvgConfidence":0.20815998595952986},"jv":{"count":260,"accuratePredictions":168,"mislabels":[{"lang":"id","count":39},{"lang":"en","count":17},{"lang":"ms","count":8},{"lang":"ko","count":5},{"lang":"su","count":3}],"accuracy":0.6461538461538462,"falsePositives":5,"lowestProbability":0.11977090686559677,"highestFalseProbability":0.941997766494751,"correctAvgConfidence":0.48727030853929354,"incorrectAvgConfidence":0.18967366327896065},"ja":{"count":2169,"accuratePredictions":2169,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.9178405404090881,"highestFalseProbability":0,"correctAvgConfidence":0.9993141423071018,"incorrectAvgConfidence":null},"kn":{"count":118,"accuratePredictions":118,"mislabels":[],"accuracy":1,"falsePositives":1,"lowestProbability":0.9560707807540894,"highestFalseProbability":0,"correctAvgConfidence":0.9986984921360537,"incorrectAvgConfidence":null},"ia":{"count":18782,"accuratePredictions":14088,"mislabels":[{"lang":"es","count":1802},{"lang":"it","count":1711},{"lang":"fr","count":508},{"lang":"la","count":229},{"lang":"pt","count":178}],"accuracy":0.7500798636992866,"falsePositives":82,"lowestProbability":0.08685839176177979,"highestFalseProbability":0.9834362268447876,"correctAvgConfidence":0.5305242322956538,"incorrectAvgConfidence":0.2132283623167998},"ka":{"count":1973,"accuratePredictions":1973,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.7867827415466309,"highestFalseProbability":0,"correctAvgConfidence":0.9956593665799054,"incorrectAvgConfidence":null},"km":{"count":379,"accuratePredictions":376,"mislabels":[{"lang":"az","count":2},{"lang":"et","count":1}],"accuracy":0.9920844327176781,"falsePositives":0,"lowestProbability":0.1995551884174347,"highestFalseProbability":0.5972086191177368,"correctAvgConfidence":0.9853872296326177,"incorrectAvgConfidence":0.18748741348584494},"ky":{"count":66,"accuratePredictions":58
,"mislabels":[{"lang":"ru","count":4},{"lang":"kk","count":4}],"accuracy":0.8787878787878788,"falsePositives":4,"lowestProbability":0.39130115509033203,"highestFalseProbability":0.7497158050537109,"correctAvgConfidence":0.7278588729135789,"incorrectAvgConfidence":0.3713524490594864},"kk":{"count":2232,"accuratePredictions":2188,"mislabels":[{"lang":"ru","count":10},{"lang":"tr","count":9},{"lang":"tt","count":8},{"lang":"uk","count":6},{"lang":"ky","count":4}],"accuracy":0.9802867383512545,"falsePositives":4,"lowestProbability":0.3560357093811035,"highestFalseProbability":0.9922317266464233,"correctAvgConfidence":0.9706690864912705,"incorrectAvgConfidence":0.3102114117408002},"lo":{"count":70,"accuratePredictions":70,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.3941526412963867,"highestFalseProbability":0,"correctAvgConfidence":0.7733188960435134,"incorrectAvgConfidence":null},"ko":{"count":482,"accuratePredictions":482,"mislabels":[],"accuracy":1,"falsePositives":7,"lowestProbability":0.8198571801185608,"highestFalseProbability":0,"correctAvgConfidence":0.9989503776111682,"incorrectAvgConfidence":null},"li":{"count":3,"accuratePredictions":1,"mislabels":[{"lang":"de","count":2}],"accuracy":0.3333333333333333,"falsePositives":1,"lowestProbability":0.6388689279556274,"highestFalseProbability":0.9507495760917664,"correctAvgConfidence":0.5632038861513138,"incorrectAvgConfidence":0.4825317715294659},"it":{"count":18326,"accuratePredictions":18307,"mislabels":[{"lang":"es","count":11},{"lang":"de","count":3},{"lang":"fr","count":2},{"lang":"en","count":1},{"lang":"la","count":1}],"accuracy":0.9989632216522972,"falsePositives":2207,"lowestProbability":0.1876032054424286,"highestFalseProbability":0.9551967978477478,"correctAvgConfidence":0.9713309805001584,"incorrectAvgConfidence":0.2656080092205421},"lb":{"count":217,"accuratePredictions":129,"mislabels":[{"lang":"de","count":50},{"lang":"nds","count":8},{"lang":"sv","count":6},{"lang":"fr","count"
:6},{"lang":"nl","count":4}],"accuracy":0.5944700460829493,"falsePositives":3,"lowestProbability":0.13198049366474152,"highestFalseProbability":0.9986009001731873,"correctAvgConfidence":0.6048251575422149,"incorrectAvgConfidence":0.36636357403437037},"lv":{"count":1243,"accuratePredictions":1193,"mislabels":[{"lang":"lt","count":12},{"lang":"hr","count":4},{"lang":"sr","count":4},{"lang":"fi","count":3},{"lang":"eo","count":3}],"accuracy":0.9597747385358005,"falsePositives":4,"lowestProbability":0.14727765321731567,"highestFalseProbability":0.9997793436050415,"correctAvgConfidence":0.8689372204577105,"incorrectAvgConfidence":0.19104408176950527},"ml":{"count":517,"accuratePredictions":517,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.9986559152603149,"highestFalseProbability":0,"correctAvgConfidence":0.9999117751173325,"incorrectAvgConfidence":null},"la":{"count":11437,"accuratePredictions":10529,"mislabels":[{"lang":"fr","count":259},{"lang":"it","count":218},{"lang":"en","count":170},{"lang":"es","count":99},{"lang":"pt","count":21}],"accuracy":0.9206085511934948,"falsePositives":292,"lowestProbability":0.07717550545930862,"highestFalseProbability":1.0000468492507935,"correctAvgConfidence":0.6563664864164119,"incorrectAvgConfidence":0.1617820091332059},"lt":{"count":13835,"accuratePredictions":13743,"mislabels":[{"lang":"fi","count":16},{"lang":"pl","count":13},{"lang":"eo","count":13},{"lang":"pt","count":8},{"lang":"sr","count":7}],"accuracy":0.9933501987712324,"falsePositives":20,"lowestProbability":0.12790657579898834,"highestFalseProbability":0.9843482971191406,"correctAvgConfidence":0.9371611838942122,"incorrectAvgConfidence":0.19300622200471876},"mt":{"count":52,"accuratePredictions":42,"mislabels":[{"lang":"es","count":3},{"lang":"cs","count":3},{"lang":"pt","count":1},{"lang":"sr","count":1},{"lang":"eo","count":1}],"accuracy":0.8076923076923077,"falsePositives":7,"lowestProbability":0.17778615653514862,"highestFalseProbability":0.4
4417107105255127,"correctAvgConfidence":0.562098236668611,"incorrectAvgConfidence":0.0677537739276886},"my":{"count":216,"accuratePredictions":216,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.9940844178199768,"highestFalseProbability":0,"correctAvgConfidence":0.9998873020938769,"incorrectAvgConfidence":null},"mr":{"count":10461,"accuratePredictions":10456,"mislabels":[{"lang":"hi","count":5}],"accuracy":0.9995220342223496,"falsePositives":2,"lowestProbability":0.4904531240463257,"highestFalseProbability":1.00002920627594,"correctAvgConfidence":0.9920979104482953,"incorrectAvgConfidence":0.8198977816457045},"nn":{"count":657,"accuratePredictions":525,"mislabels":[{"lang":"da","count":77},{"lang":"sv","count":32},{"lang":"de","count":9},{"lang":"es","count":5},{"lang":"fi","count":2}],"accuracy":0.7990867579908676,"falsePositives":29,"lowestProbability":0.20111340284347534,"highestFalseProbability":0.9768945574760437,"correctAvgConfidence":0.5750402996634165,"incorrectAvgConfidence":0.3076396750102779},"oc":{"count":2861,"accuratePredictions":2197,"mislabels":[{"lang":"ca","count":242},{"lang":"es","count":173},{"lang":"fr","count":130},{"lang":"pt","count":33},{"lang":"it","count":21}],"accuracy":0.7679133170220203,"falsePositives":27,"lowestProbability":0.14034049212932587,"highestFalseProbability":0.9993643760681152,"correctAvgConfidence":0.693478145898965,"incorrectAvgConfidence":0.27567044537355256},"os":{"count":59,"accuratePredictions":58,"mislabels":[{"lang":"ru","count":1}],"accuracy":0.9830508474576272,"falsePositives":0,"lowestProbability":0.20986901223659515,"highestFalseProbability":0.7272360324859619,"correctAvgConfidence":0.8207800509568689,"incorrectAvgConfidence":0.5724851340055466},"pa":{"count":88,"accuratePredictions":88,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.9979016780853271,"highestFalseProbability":0,"correctAvgConfidence":0.9997773331272645,"incorrectAvgConfidence":null},"mk":{"count":14465,
"accuratePredictions":14407,"mislabels":[{"lang":"bg","count":35},{"lang":"sr","count":14},{"lang":"ru","count":9}],"accuracy":0.9959903214656066,"falsePositives":93,"lowestProbability":0.3797056972980499,"highestFalseProbability":0.8783867359161377,"correctAvgConfidence":0.9659177295866043,"incorrectAvgConfidence":0.2429016182391808},"nl":{"count":19626,"accuratePredictions":19498,"mislabels":[{"lang":"en","count":57},{"lang":"af","count":21},{"lang":"de","count":18},{"lang":"nds","count":9},{"lang":"fr","count":7}],"accuracy":0.9934780393355752,"falsePositives":382,"lowestProbability":0.09129912406206131,"highestFalseProbability":0.9536797404289246,"correctAvgConfidence":0.9579142892651017,"incorrectAvgConfidence":0.23610969319270225},"rm":{"count":16,"accuratePredictions":1,"mislabels":[{"lang":"it","count":5},{"lang":"fr","count":2},{"lang":"en","count":2},{"lang":"tl","count":1},{"lang":"qu","count":1}],"accuracy":0.0625,"falsePositives":3,"lowestProbability":0.40953710675239563,"highestFalseProbability":0.7947274446487427,"correctAvgConfidence":0.3415144458413124,"incorrectAvgConfidence":0.22089687262972196},"pl":{"count":17768,"accuratePredictions":17763,"mislabels":[{"lang":"en","count":2},{"lang":"eo","count":1},{"lang":"de","count":1},{"lang":"ro","count":1}],"accuracy":0.999718595227375,"falsePositives":88,"lowestProbability":0.21345049142837524,"highestFalseProbability":0.6913073062896729,"correctAvgConfidence":0.9890330630600913,"incorrectAvgConfidence":0.15380255579948426},"pt":{"count":20174,"accuratePredictions":20039,"mislabels":[{"lang":"es","count":95},{"lang":"gl","count":15},{"lang":"it","count":7},{"lang":"en","count":5},{"lang":"fr","count":4}],"accuracy":0.9933082184990581,"falsePositives":1149,"lowestProbability":0.21880872547626495,"highestFalseProbability":0.9230474233627319,"correctAvgConfidence":0.9484523955782658,"incorrectAvgConfidence":0.24518370324814762},"sa":{"count":11,"accuratePredictions":8,"mislabels":[{"lang":"hi","count":2},{
"lang":"ne","count":1}],"accuracy":0.7272727272727273,"falsePositives":0,"lowestProbability":0.8556936979293823,"highestFalseProbability":0.8185657262802124,"correctAvgConfidence":0.9226900283247232,"incorrectAvgConfidence":0.4112839549779892},"si":{"count":9,"accuratePredictions":9,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.9964900016784668,"highestFalseProbability":0,"correctAvgConfidence":0.9990718097635383,"incorrectAvgConfidence":null},"ro":{"count":13560,"accuratePredictions":13406,"mislabels":[{"lang":"es","count":31},{"lang":"fr","count":28},{"lang":"it","count":22},{"lang":"en","count":17},{"lang":"pt","count":14}],"accuracy":0.9886430678466077,"falsePositives":133,"lowestProbability":0.15412086248397827,"highestFalseProbability":0.8736457824707031,"correctAvgConfidence":0.9464068983322231,"incorrectAvgConfidence":0.16759528258404174},"sl":{"count":372,"accuratePredictions":252,"mislabels":[{"lang":"sr","count":59},{"lang":"hr","count":19},{"lang":"bs","count":13},{"lang":"pl","count":8},{"lang":"eo","count":5}],"accuracy":0.6774193548387096,"falsePositives":62,"lowestProbability":0.17472949624061584,"highestFalseProbability":0.9461504220962524,"correctAvgConfidence":0.5286300588437977,"incorrectAvgConfidence":0.22120561103026073},"so":{"count":32,"accuratePredictions":7,"mislabels":[{"lang":"fi","count":9},{"lang":"eo","count":2},{"lang":"cy","count":2},{"lang":"en","count":2},{"lang":"az","count":2}],"accuracy":0.21875,"falsePositives":1,"lowestProbability":0.09245844185352325,"highestFalseProbability":0.956329882144928,"correctAvgConfidence":0.09194009538207736,"incorrectAvgConfidence":0.15861030662432313},"sk":{"count":4370,"accuratePredictions":3611,"mislabels":[{"lang":"cs","count":583},{"lang":"pl","count":33},{"lang":"sl","count":28},{"lang":"no","count":21},{"lang":"sr","count":17}],"accuracy":0.8263157894736842,"falsePositives":45,"lowestProbability":0.1748054176568985,"highestFalseProbability":1.0000356435775757,"correct
AvgConfidence":0.757084173876699,"incorrectAvgConfidence":0.34975553321144875},"ru":{"count":17329,"accuratePredictions":17324,"mislabels":[{"lang":"bg","count":2},{"lang":"kk","count":1},{"lang":"uk","count":1},{"lang":"mk","count":1}],"accuracy":0.9997114663281205,"falsePositives":241,"lowestProbability":0.5000648498535156,"highestFalseProbability":0.7860463261604309,"correctAvgConfidence":0.9907578862055559,"incorrectAvgConfidence":0.16911868155002593},"su":{"count":11,"accuratePredictions":2,"mislabels":[{"lang":"id","count":6},{"lang":"ms","count":2},{"lang":"es","count":1}],"accuracy":0.18181818181818182,"falsePositives":19,"lowestProbability":0.20011375844478607,"highestFalseProbability":0.7112906575202942,"correctAvgConfidence":0.08744853362441063,"incorrectAvgConfidence":0.22517896278036964},"sr":{"count":13494,"accuratePredictions":11456,"mislabels":[{"lang":"hr","count":963},{"lang":"sh","count":441},{"lang":"mk","count":154},{"lang":"bs","count":126},{"lang":"sl","count":107}],"accuracy":0.8489699125537276,"falsePositives":1050,"lowestProbability":0.12105872482061386,"highestFalseProbability":0.9961422681808472,"correctAvgConfidence":0.6466786750228418,"incorrectAvgConfidence":0.21803273776773696},"ta":{"count":205,"accuratePredictions":205,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.9989544749259949,"highestFalseProbability":0,"correctAvgConfidence":1.0000112452849863,"incorrectAvgConfidence":null},"es":{"count":18227,"accuratePredictions":18193,"mislabels":[{"lang":"pt","count":21},{"lang":"it","count":7},{"lang":"io","count":2},{"lang":"ca","count":1},{"lang":"ia","count":1}],"accuracy":0.998134635430954,"falsePositives":3476,"lowestProbability":0.26517876982688904,"highestFalseProbability":0.840143084526062,"correctAvgConfidence":0.9406717692031589,"incorrectAvgConfidence":0.22759082803831382},"te":{"count":102,"accuratePredictions":102,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.9983260035514832,"hig
hestFalseProbability":0,"correctAvgConfidence":0.9999092835657447,"incorrectAvgConfidence":null},"tg":{"count":30,"accuratePredictions":29,"mislabels":[{"lang":"ru","count":1}],"accuracy":0.9666666666666667,"falsePositives":0,"lowestProbability":0.5033898949623108,"highestFalseProbability":0.4936623275279999,"correctAvgConfidence":0.8163995144142094,"incorrectAvgConfidence":0.32341302931308746},"sv":{"count":12188,"accuratePredictions":12050,"mislabels":[{"lang":"no","count":59},{"lang":"da","count":55},{"lang":"en","count":6},{"lang":"fi","count":4},{"lang":"id","count":3}],"accuracy":0.9886773875943551,"falsePositives":174,"lowestProbability":0.1215486153960228,"highestFalseProbability":0.9114173054695129,"correctAvgConfidence":0.9320943643945397,"incorrectAvgConfidence":0.2572491969238373},"th":{"count":572,"accuratePredictions":572,"mislabels":[],"accuracy":1,"falsePositives":0,"lowestProbability":0.9927242994308472,"highestFalseProbability":0,"correctAvgConfidence":0.9997663607907868,"incorrectAvgConfidence":null},"tt":{"count":8178,"accuratePredictions":7917,"mislabels":[{"lang":"az","count":70},{"lang":"tr","count":51},{"lang":"ru","count":37},{"lang":"fi","count":19},{"lang":"kk","count":13}],"accuracy":0.9680851063829787,"falsePositives":13,"lowestProbability":0.14898742735385895,"highestFalseProbability":1.000041127204895,"correctAvgConfidence":0.9237682296735699,"incorrectAvgConfidence":0.3369664904009205},"tk":{"count":3793,"accuratePredictions":2166,"mislabels":[{"lang":"tr","count":942},{"lang":"uz","count":146},{"lang":"en","count":116},{"lang":"et","count":79},{"lang":"io","count":41}],"accuracy":0.5710519377801213,"falsePositives":0,"lowestProbability":0.09297733008861542,"highestFalseProbability":0.9993958473205566,"correctAvgConfidence":0.5266673936840375,"incorrectAvgConfidence":0.3539530090760075},"tl":{"count":10351,"accuratePredictions":10079,"mislabels":[{"lang":"ceb","count":108},{"lang":"en","count":57},{"lang":"id","count":24},{"lang":"es"
,"count":15},{"lang":"war","count":11}],"accuracy":0.9737223456670853,"falsePositives":21,"lowestProbability":0.09424613416194916,"highestFalseProbability":0.9999354481697083,"correctAvgConfidence":0.7973100713083318,"incorrectAvgConfidence":0.2603196013111462},"ug":{"count":3692,"accuratePredictions":3689,"mislabels":[{"lang":"ba","count":1},{"lang":"ru","count":1},{"lang":"hu","count":1}],"accuracy":0.9991874322860238,"falsePositives":0,"lowestProbability":0.5213584303855896,"highestFalseProbability":0.5531967878341675,"correctAvgConfidence":0.9962514697004626,"incorrectAvgConfidence":0.24187888701756796},"ur":{"count":963,"accuratePredictions":954,"mislabels":[{"lang":"pnb","count":6},{"lang":"fa","count":1},{"lang":"ro","count":1},{"lang":"en","count":1}],"accuracy":0.9906542056074766,"falsePositives":9,"lowestProbability":0.5764018893241882,"highestFalseProbability":0.9979611039161682,"correctAvgConfidence":0.976251529633783,"incorrectAvgConfidence":0.4885160554226281},"uk":{"count":14285,"accuratePredictions":14233,"mislabels":[{"lang":"ru","count":51},{"lang":"sr","count":1}],"accuracy":0.9963598179908996,"falsePositives":133,"lowestProbability":0.3561241626739502,"highestFalseProbability":0.9522985816001892,"correctAvgConfidence":0.9853175570592393,"incorrectAvgConfidence":0.38070394650388223},"vo":{"count":806,"accuratePredictions":686,"mislabels":[{"lang":"id","count":16},{"lang":"de","count":16},{"lang":"fi","count":13},{"lang":"en","count":12},{"lang":"eo","count":9}],"accuracy":0.8511166253101737,"falsePositives":15,"lowestProbability":0.12574473023414612,"highestFalseProbability":0.9909709095954895,"correctAvgConfidence":0.5384087601954725,"incorrectAvgConfidence":0.1557349536800757},"wa":{"count":16,"accuratePredictions":5,"mislabels":[{"lang":"fr","count":7},{"lang":"pt","count":1},{"lang":"tl","count":1},{"lang":"oc","count":1},{"lang":"en","count":1}],"accuracy":0.3125,"falsePositives":1,"lowestProbability":0.6337143182754517,"highestFalseProbabili
ty":0.9523198008537292,"correctAvgConfidence":0.683312127366662,"incorrectAvgConfidence":0.30093699278817937},"yo":{"count":5,"accuratePredictions":3,"mislabels":[{"lang":"sk","count":1},{"lang":"rm","count":1}],"accuracy":0.6,"falsePositives":1,"lowestProbability":0.4282495677471161,"highestFalseProbability":0.39966583251953125,"correctAvgConfidence":0.3706127094725768,"incorrectAvgConfidence":0.08796234056353569},"tr":{"count":19919,"accuratePredictions":19911,"mislabels":[{"lang":"eo","count":2},{"lang":"en","count":1},{"lang":"it","count":1},{"lang":"fr","count":1},{"lang":"nds","count":1}],"accuracy":0.9995983734123199,"falsePositives":1092,"lowestProbability":0.22616560757160187,"highestFalseProbability":0.6153099536895752,"correctAvgConfidence":0.9927376141332793,"incorrectAvgConfidence":0.2094208262860775},"vi":{"count":13000,"accuratePredictions":12995,"mislabels":[{"lang":"eo","count":3},{"lang":"hu","count":1},{"lang":"fr","count":1}],"accuracy":0.9996153846153846,"falsePositives":9,"lowestProbability":0.31661665439605713,"highestFalseProbability":0.9998860359191895,"correctAvgConfidence":0.9981391458102108,"incorrectAvgConfidence":0.6576308641175274}} -------------------------------------------------------------------------------- /benchmark-testing/results/reliability_list_0.2.2.json: -------------------------------------------------------------------------------- 1 | [ 2 | "th", 3 | "ml", 4 | "my", 5 | "ta", 6 | "te", 7 | "pa", 8 | "am", 9 | "kn", 10 | "gu", 11 | "si", 12 | "bo", 13 | "dv", 14 | "ja", 15 | "el", 16 | "he", 17 | "ko", 18 | "hy", 19 | "bn", 20 | "mr", 21 | "en", 22 | "zh", 23 | "tr", 24 | "ru", 25 | "de", 26 | "ug", 27 | "vi", 28 | "eo", 29 | "ka", 30 | "hi", 31 | "it", 32 | "ar", 33 | "fr", 34 | "hu", 35 | "lo", 36 | "pl", 37 | "km", 38 | "es", 39 | "fi", 40 | "pt", 41 | "mk", 42 | "uk", 43 | "ur", 44 | "nl", 45 | "lt", 46 | "cs" 47 | ] 48 | -------------------------------------------------------------------------------- 
/benchmark-testing/results/reliability_list_0.2.3.json: -------------------------------------------------------------------------------- 1 | ["en","el","he","ja","ka","bn","th","zh","ml","ko","my","ta","kn","te","pa","lo","gu","bo","dv","de","pl","ru","hu","hi","vi","tr","eo","fr","mr","ug","fi","it","es","hy","ar","uk","mk","nl","lt","pt","km","ur","cs","sv","ro","bg","os","is","kk","tl","tt","eu","tg","be","lv","cv","br","ba"] -------------------------------------------------------------------------------- /model/.gitkeep: -------------------------------------------------------------------------------- 1 | Placeholder for the location of the fast-text model -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@smodin/fast-text-language-detection", 3 | "version": "0.2.4", 4 | "description": "Language detection with facebook fast-text model", 5 | "homepage": "https://github.com/smodin-io/fast-text-language-detection", 6 | "keywords": [ 7 | "language", 8 | "detection", 9 | "fast-text" 10 | ], 11 | "main": "dist/my-lib.cjs.js", 12 | "module": "dist/my-lib.esm.js", 13 | "browser": "dist/my-lib.umd.js", 14 | "files": [ 15 | "dist" 16 | ], 17 | "scripts": { 18 | "install": "npm-install-fetch", 19 | "dev": "rollup -c -w", 20 | "build": "rollup -c", 21 | "test": "echo \"Error: no test specified\" && exit 1" 22 | }, 23 | "npm-install-fetch": { 24 | "name": "(@smodin/fast-text-language-detection): FastText LID-176 model", 25 | "input": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", 26 | "output": "model/fast-text-lid-model.bin" 27 | }, 28 | "devDependencies": { 29 | "@rollup/plugin-commonjs": "^19.0.2", 30 | "@rollup/plugin-node-resolve": "^13.0.4", 31 | "@rollup/plugin-typescript": "^11.1.0", 32 | "eslint": "^7.26.0", 33 | "rollup": "^2.56.0", 34 | "tslib": "^2.5.0", 35 | "ts-node": "^10.9.1", 36 | "typescript":
"^4.2.4" 37 | }, 38 | "dependencies": { 39 | "fast-text": "^1.0.3", 40 | "npm-install-fetch": "^1.3.8" 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /rollup.config.js: -------------------------------------------------------------------------------- 1 | import resolve from '@rollup/plugin-node-resolve' 2 | import commonjs from '@rollup/plugin-commonjs' 3 | import typescript from '@rollup/plugin-typescript' 4 | import pkg from './package.json' 5 | 6 | export default [ 7 | // browser-friendly UMD build 8 | { 9 | input: 'src/index.ts', 10 | output: { 11 | name: 'myLib', 12 | file: pkg.browser, 13 | format: 'umd', 14 | }, 15 | plugins: [resolve(), commonjs(), typescript({ tsconfig: './tsconfig.json' })], 16 | }, 17 | 18 | // CommonJS (for Node) and ES module (for bundlers) build. 19 | // (We could have three entries in the configuration array 20 | // instead of two, but it's quicker to generate multiple 21 | // builds from a single configuration where possible, using 22 | // an array for the `output` option, where we can specify 23 | // `file` and `format` for each target) 24 | { 25 | input: 'src/index.ts', 26 | output: [ 27 | { file: pkg.main, format: 'cjs' }, 28 | { file: pkg.module, format: 'es' }, 29 | ], 30 | plugins: [typescript({ tsconfig: './tsconfig.json' })], 31 | }, 32 | ] 33 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | /* internal requirements */ 2 | const path = require('path') 3 | 4 | /* external requirements */ 5 | const { Classifier } = require('fast-text') 6 | 7 | // from benchmark-testing/results/reliability_list_.json 8 | const reliabilityList = [ 9 | 'th', 10 | 'ml', 11 | 'my', 12 | 'ta', 13 | 'te', 14 | 'pa', 15 | 'am', 16 | 'kn', 17 | 'gu', 18 | 'si', 19 | 'bo', 20 | 'dv', 21 | 'ja', 22 | 'el', 23 | 'he', 24 | 'ko', 25 | 'hy', 26 | 'bn', 27 | 'mr', 28 | 'en', 
29 | 'zh', 30 | 'tr', 31 | 'ru', 32 | 'de', 33 | 'ug', 34 | 'vi', 35 | 'eo', 36 | 'ka', 37 | 'hi', 38 | 'it', 39 | 'ar', 40 | 'fr', 41 | 'hu', 42 | 'lo', 43 | 'pl', 44 | 'km', 45 | 'es', 46 | 'fi', 47 | 'pt', 48 | 'mk', 49 | 'uk', 50 | 'ur', 51 | 'nl', 52 | 'lt', 53 | 'cs', 54 | ] 55 | 56 | const languageIsoCodes = [ 57 | 'af', 58 | 'als', 59 | 'am', 60 | 'an', 61 | 'ar', 62 | 'arz', 63 | 'as', 64 | 'ast', 65 | 'av', 66 | 'az', 67 | 'azb', 68 | 'ba', 69 | 'bar', 70 | 'bcl', 71 | 'be', 72 | 'bg', 73 | 'bh', 74 | 'bn', 75 | 'bo', 76 | 'bpy', 77 | 'br', 78 | 'bs', 79 | 'bxr', 80 | 'ca', 81 | 'cbk', 82 | 'ce', 83 | 'ceb', 84 | 'ckb', 85 | 'co', 86 | 'cs', 87 | 'cv', 88 | 'cy', 89 | 'da', 90 | 'de', 91 | 'diq', 92 | 'dsb', 93 | 'dty', 94 | 'dv', 95 | 'el', 96 | 'eml', 97 | 'en', 98 | 'eo', 99 | 'es', 100 | 'et', 101 | 'eu', 102 | 'fa', 103 | 'fi', 104 | 'fr', 105 | 'frr', 106 | 'fy', 107 | 'ga', 108 | 'gd', 109 | 'gl', 110 | 'gn', 111 | 'gom', 112 | 'gu', 113 | 'gv', 114 | 'he', 115 | 'hi', 116 | 'hif', 117 | 'hr', 118 | 'hsb', 119 | 'ht', 120 | 'hu', 121 | 'hy', 122 | 'ia', 123 | 'id', 124 | 'ie', 125 | 'ilo', 126 | 'io', 127 | 'is', 128 | 'it', 129 | 'ja', 130 | 'jbo', 131 | 'jv', 132 | 'ka', 133 | 'kk', 134 | 'km', 135 | 'kn', 136 | 'ko', 137 | 'krc', 138 | 'ku', 139 | 'kv', 140 | 'kw', 141 | 'ky', 142 | 'la', 143 | 'lb', 144 | 'lez', 145 | 'li', 146 | 'lmo', 147 | 'lo', 148 | 'lrc', 149 | 'lt', 150 | 'lv', 151 | 'mai', 152 | 'mg', 153 | 'mhr', 154 | 'min', 155 | 'mk', 156 | 'ml', 157 | 'mn', 158 | 'mr', 159 | 'mrj', 160 | 'ms', 161 | 'mt', 162 | 'mwl', 163 | 'my', 164 | 'myv', 165 | 'mzn', 166 | 'nah', 167 | 'nap', 168 | 'nds', 169 | 'ne', 170 | 'new', 171 | 'nl', 172 | 'nn', 173 | 'no', 174 | 'oc', 175 | 'or', 176 | 'os', 177 | 'pa', 178 | 'pam', 179 | 'pfl', 180 | 'pl', 181 | 'pms', 182 | 'pnb', 183 | 'ps', 184 | 'pt', 185 | 'qu', 186 | 'rm', 187 | 'ro', 188 | 'ru', 189 | 'rue', 190 | 'sa', 191 | 'sah', 192 | 'sc', 193 | 'scn', 194 | 'sco', 195 | 'sd', 196 | 'sh', 
197 | 'si', 198 | 'sk', 199 | 'sl', 200 | 'so', 201 | 'sq', 202 | 'sr', 203 | 'su', 204 | 'sv', 205 | 'sw', 206 | 'ta', 207 | 'te', 208 | 'tg', 209 | 'th', 210 | 'tk', 211 | 'tl', 212 | 'tr', 213 | 'tt', 214 | 'tyv', 215 | 'ug', 216 | 'uk', 217 | 'ur', 218 | 'uz', 219 | 'vec', 220 | 'vep', 221 | 'vi', 222 | 'vls', 223 | 'vo', 224 | 'wa', 225 | 'war', 226 | 'wuu', 227 | 'xal', 228 | 'xmf', 229 | 'yi', 230 | 'yo', 231 | 'yue', 232 | 'zh', 233 | ] 234 | // Some characters can distort the prediction results 235 | // Removing \n and : because of this issue: https://github.com/indix/whatthelang/issues/12 236 | function formatInput(text: string) { 237 | return text.replace(/[\n:]/g, '') 238 | } 239 | 240 | /* the API class */ 241 | class LanguageDetection { 242 | constructor(options = {}) { 243 | this.options = Object.assign( 244 | {}, 245 | { 246 | model: path.join(__dirname, '..', 'model', 'fast-text-lid-model.bin'), 247 | }, 248 | options 249 | ) 250 | this.classifier = new Classifier(this.options.model) 251 | this.languageIsoCodes = languageIsoCodes 252 | } 253 | predict(text: string, k = 1) { 254 | return new Promise((resolve, reject) => { 255 | this.classifier.predict(formatInput(text), k, (err, res) => { 256 | if (err) reject(err) 257 | else { 258 | res = res.map((item) => { 259 | const lang = item.label.replace(/^__label__/, '') 260 | return { 261 | lang, 262 | prob: item.value, 263 | isReliableLanguage: reliabilityList.includes(lang), 264 | } 265 | }) 266 | resolve(res) 267 | } 268 | }) 269 | }) 270 | } 271 | } 272 | 273 | module.exports = LanguageDetection 274 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es5", 4 | "module": "esnext", 5 | "moduleResolution": "node", 6 | "strict": true, 7 | "esModuleInterop": true 8 | } 9 | }
--------------------------------------------------------------------------------