├── .gitignore ├── LICENSE ├── README.md ├── bin └── cli.js ├── package-lock.json ├── package.json ├── src ├── components │ ├── checkByteOrderMark.js │ ├── checkUTF.js │ ├── processContent.js │ └── processing-content │ │ ├── calculateConfidenceScore.js │ │ └── countAllMatches.js ├── config │ ├── byteOrderMarkObject.js │ └── languageObject.js ├── index-browser.js ├── index-node.js └── index.d.ts └── tests ├── browser ├── browser-test │ ├── README.md │ ├── package-lock.json │ ├── package.json │ ├── public │ │ ├── index.html │ │ ├── manifest.json │ │ └── robots.txt │ ├── src │ │ ├── App.tsx │ │ ├── index.tsx │ │ └── react-app-env.d.ts │ └── tsconfig.json ├── html-test │ ├── app.js │ └── index.html └── live-demo │ ├── LICENSE │ ├── README.md │ ├── package-lock.json │ ├── package.json │ ├── public │ ├── index.html │ ├── manifest.json │ └── robots.txt │ └── src │ ├── App.js │ ├── defaultFileInfo.js │ ├── index.css │ └── index.js └── node ├── node-ts-test ├── index.ts ├── package-lock.json ├── package.json └── tsconfig.json └── node.test.js /.gitignore: -------------------------------------------------------------------------------- 1 | umd/ 2 | node_modules 3 | build -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 gignu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Detect-File-Encoding-And-Language 2 | 3 | ![npm](https://img.shields.io/npm/dm/detect-file-encoding-and-language) 4 | ![npm](https://img.shields.io/npm/v/detect-file-encoding-and-language) 5 | ![npm bundle size](https://img.shields.io/bundlephobia/min/detect-file-encoding-and-language) 6 | 7 | [![NPM stats](https://nodei.co/npm/detect-file-encoding-and-language.svg?downloadRank=true&downloads=true)](https://www.npmjs.org/package/detect-file-encoding-and-language) 8 | 9 | ## Functionality 10 | 11 | Determine the encoding and language of text files! 12 | 13 | - Detects 40 languages as well as the appropriate encoding 14 | - Available as CLI, in Node.js and in the browser 15 | - Supports .txt, .srt, .sub, .html, .csv, .tsv 16 | - Works best with large inputs 17 | - Completely free, no API key required 18 | 19 | For reliable encoding and language detection, use files containing at least 500 words of coherent text. Smaller inputs can work as well but the results might be less accurate and in some cases incorrect. 20 | 21 | ## Live Demo 22 | 23 | Feel free to test the functionality of this NPM package [here](https://detect-file-encoding-and-language-live-demo.netlify.app/). Upload your own files and see if the encoding and language are detected correctly! 
24 | 25 | ## Installation 26 | 27 | ``` 28 | npm install detect-file-encoding-and-language 29 | ``` 30 | 31 | ## Usage 32 | 33 | ### Via CDN 34 | 35 | ```js 36 | // index.html 37 | 38 | <body> 39 | <input type="file" id="my-input-field" /> 40 | <script src="https://unpkg.com/detect-file-encoding-and-language/umd/language-encoding.min.js"></script> 41 | <script src="app.js"></script> 42 | </body> 43 | // app.js 44 | document.getElementById("my-input-field").addEventListener("change", (e) => { 45 | const file = e.target.files[0]; 46 | languageEncoding(file).then((fileInfo) => console.log(fileInfo)); 47 | // Possible result: { language: english, encoding: UTF-8, confidence: { encoding: 1, language: 1 } } 48 | }); 49 | ``` 50 | 51 | If you don't want to use a CDN, feel free to [download the source code](https://github.com/gignupg/Detect-File-Encoding-and-Language/wiki/Downloading-the-Source-Code)! 52 | 53 | ### In React 54 | 55 | ```js 56 | // App.js 57 | import languageEncoding from "detect-file-encoding-and-language"; 58 | export default function App() { 59 | function inputHandler(e) { 60 | const file = e.target.files[0]; 61 | languageEncoding(file).then((fileInfo) => console.log(fileInfo)); 62 | // Possible result: { language: english, encoding: UTF-8, confidence: { encoding: 1, language: 1 } } 63 | } 64 | return <input type="file" onChange={inputHandler} />; 65 | } 66 | ``` 67 | 68 | ### In Node 69 | #### File 70 | 71 | ```js 72 | // server.js 73 | const languageEncoding = require("detect-file-encoding-and-language"); 74 | const pathToFile = "/home/username/documents/my-text-file.txt"; 75 | languageEncoding(pathToFile).then((fileInfo) => console.log(fileInfo)); 76 | // Possible result: { language: japanese, encoding: Shift-JIS, confidence: { encoding: 0.94, language: 0.94 } } 77 | ``` 78 | 79 | #### Buffer 80 | 81 | ```js 82 | // server.js 83 | const languageEncoding = require("detect-file-encoding-and-language"); 84 | const content = Buffer.from("file content"); 85 | languageEncoding(content).then((fileInfo) => console.log(fileInfo)); 86 | // Possible result: { language: japanese, encoding: Shift-JIS, confidence: { encoding: 0.94, language: 0.94 } } 87 | ``` 88 | 89 | ### Via CLI 90 | 91 | ```bash 
92 | # Installation 93 | npm install -g detect-file-encoding-and-language 94 | 95 | # Usage 96 | dfeal "/home/username/Documents/subtitle file.srt" 97 | # Possible result: { language: french, encoding: CP1252, confidence: { encoding: 0.99, language: 0.99 } } 98 | ``` 99 | 100 | ### Using a buffer (browser) 101 | Check out [this issue page](https://github.com/gignupg/Detect-File-Encoding-And-Language/issues/3#issuecomment-1476074963)! @davuses posted a very simple code snippet there that converts your buffer into a blob which you can then pass into the function instead of a file! 102 | 103 | ## Supported Languages 104 | 105 | - Polish 106 | - Czech 107 | - Hungarian 108 | - Romanian 109 | - Slovak 110 | - Slovenian 111 | - Albanian 112 | - Russian 113 | - Ukrainian 114 | - Bulgarian 115 | - English 116 | - French 117 | - Portuguese 118 | - Spanish 119 | - German 120 | - Italian 121 | - Danish 122 | - Norwegian 123 | - Swedish 124 | - Dutch 125 | - Finnish 126 | - Serbo-Croatian 127 | - Estonian 128 | - Icelandic 129 | - Malay-Indonesian 130 | - Greek 131 | - Turkish 132 | - Hebrew 133 | - Arabic 134 | - Farsi-Persian 135 | - Lithuanian 136 | - Chinese-Simplified 137 | - Chinese-Traditional 138 | - Japanese 139 | - Korean 140 | - Thai 141 | - Bengali 142 | - Hindi 143 | - Urdu 144 | - Vietnamese 145 | 146 | ## Used Encodings 147 | 148 | - UTF-8 149 | - UTF-16LE 150 | - UTF-16BE 151 | - UTF-32LE 152 | - UTF-32BE 153 | - UTF-7 154 | - UTF-1 155 | - UTF-EBCDIC 156 | - SCSU 157 | - BOCU-1 158 | - CP1250 159 | - CP1251 160 | - CP1252 161 | - CP1253 162 | - CP1254 163 | - CP1255 164 | - CP1256 165 | - CP1257 166 | - GB18030 167 | - BIG5 168 | - Shift-JIS 169 | - EUC-KR 170 | - TIS-620 171 | 172 | ## Confidence Score 173 | 174 | The confidence score ranges from 0 to 1. It's an object that contains two different confidence scores. The language confidence score and the encoding confidence score. Both confidence scores will be the same if the detected encoding is Unicode. 
Otherwise the confidence score for the language and the encoding is calculated separately. It is based on the number of matches that were found for a particular language and the frequency of those matches. If you want to learn more about how it all works, check out the [Wiki entry](https://github.com/gignupg/Detect-File-Encoding-and-Language/wiki)! 175 | 176 | ## License 177 | 178 | This project is licensed under the MIT License 179 | 180 | ![License](https://img.shields.io/badge/License-MIT-yellowgreen) 181 | -------------------------------------------------------------------------------- /bin/cli.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | const languageEncoding = require("../src/index-node.js"); 3 | 4 | const path = process.argv[2]; 5 | const notEnoughArguments = process.argv.length < 3; 6 | const tooManyArguments = process.argv[3]; 7 | 8 | if (notEnoughArguments) console.error('Error: No argument passed in. Please pass in the file path as an argument! If the path contains spaces, surround it with quotes or use backslashes to escape spaces.'); 9 | if (tooManyArguments) console.warn('Warning: Too many arguments passed in. Ignoring all extra arguments. Only one argument (the file path) can be passed in! 
If the path contains spaces, surround it with quotes or use backslashes to escape spaces.'); 10 | 11 | languageEncoding(path) 12 | .then((fileInfo) => console.info(JSON.stringify(fileInfo, null, 4))) 13 | .catch((error) => console.error(error)) -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "detect-file-encoding-and-language", 3 | "version": "2.4.0", 4 | "lockfileVersion": 2, 5 | "requires": true, 6 | "packages": { 7 | "": { 8 | "name": "detect-file-encoding-and-language", 9 | "version": "2.4.0", 10 | "license": "MIT", 11 | "bin": { 12 | "dfeal": "bin/cli.js" 13 | } 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "detect-file-encoding-and-language", 3 | "version": "2.4.0", 4 | "description": "Charset Detector - Detect the encoding and language of text files - Use it in the browser, with Node.js, or via CLI", 5 | "main": "src/index-node.js", 6 | "browser": "src/index-browser.js", 7 | "types": "src/index.d.ts", 8 | "scripts": { 9 | "test": "node ./tests/node/node.test.js", 10 | "testSingleFile": "node ./bin/cli.js", 11 | "build": "browserify ./src/index-browser.js --standalone languageEncoding > ./umd/language-encoding.min.js", 12 | "minify": "uglifyjs ./umd/language-encoding.min.js --compress --output ./umd/language-encoding.min.js", 13 | "prepublishOnly": "npm test" 14 | }, 15 | "bin": { 16 | "dfeal": "./bin/cli.js" 17 | }, 18 | "files": [ 19 | "umd", 20 | "src" 21 | ], 22 | "repository": { 23 | "type": "git", 24 | "url": "git+https://github.com/gignupg/Detect-File-Encoding-And-Language.git" 25 | }, 26 | "keywords": [ 27 | "detect", 28 | "encoding", 29 | "charset", 30 | "detection", 31 | "language", 32 | "detector", 33 | "file", 34 | "detector", 
35 | "files", 36 | "tool", 37 | "character", 38 | "set", 39 | "node", 40 | "browser", 41 | "javascript", 42 | "server", 43 | "client", 44 | "cli", 45 | "srt", 46 | "txt", 47 | "sub", 48 | "html", 49 | "csv", 50 | "tsv", 51 | "utf8", 52 | "utf-8", 53 | "utf-16le", 54 | "utf-16be", 55 | "utf-32le", 56 | "utf-32be", 57 | "utf-7", 58 | "utf-1", 59 | "UTF-EBCDIC", 60 | "SCSU", 61 | "BOCU-1", 62 | "cp-1250", 63 | "cp-1251", 64 | "cp-1252", 65 | "cp-1253", 66 | "cp-1254", 67 | "cp-1255", 68 | "cp-1256", 69 | "cp1250", 70 | "cp1251", 71 | "cp1252", 72 | "cp1253", 73 | "cp1254", 74 | "cp1255", 75 | "cp1256", 76 | "windows-1250", 77 | "windows-1251", 78 | "windows-1252", 79 | "windows-1253", 80 | "windows-1254", 81 | "windows-1255", 82 | "windows-1256", 83 | "gb18030", 84 | "big5", 85 | "big-5", 86 | "shift-jis", 87 | "shift_jis", 88 | "shift jis", 89 | "EUC-KR", 90 | "TIS-620" 91 | ], 92 | "author": "Linus Komnick", 93 | "license": "MIT", 94 | "bugs": { 95 | "url": "https://github.com/gignupg/Detect-File-Encoding-and-Language/issues" 96 | }, 97 | "homepage": "https://github.com/gignupg/Detect-File-Encoding-and-Language#readme" 98 | } 99 | -------------------------------------------------------------------------------- /src/components/checkByteOrderMark.js: -------------------------------------------------------------------------------- 1 | const byteOrderMarks = require("../config/byteOrderMarkObject.js"); 2 | 3 | module.exports = (uInt8Start) => { 4 | for (const element of byteOrderMarks) { 5 | if (element.regex.test(uInt8Start)) return element.encoding; 6 | } 7 | 8 | return null; 9 | }; 10 | -------------------------------------------------------------------------------- /src/components/checkUTF.js: -------------------------------------------------------------------------------- 1 | module.exports = (content) => { 2 | for (let b = 0; b < content.length; b++) { 3 | // If ? is encountered it's definitely not utf8! 
4 | if (content[b] === "�") { 5 | return false; 6 | } 7 | } 8 | return true; 9 | } -------------------------------------------------------------------------------- /src/components/processContent.js: -------------------------------------------------------------------------------- 1 | const countAllMatches = require("./processing-content/countAllMatches.js"); 2 | const calculateConfidenceScore = require("./processing-content/calculateConfidenceScore.js"); 3 | const byteOrderMarkObject = require("../config/byteOrderMarkObject.js"); 4 | 5 | module.exports = (data, fileInfo) => { 6 | data.languageArr = countAllMatches(data, fileInfo.encoding); 7 | 8 | fileInfo.language = data.languageArr.reduce((acc, val) => 9 | acc.count > val.count ? acc : val 10 | ).name; 11 | 12 | // "pos" gives us the position in the language array that has the most matches 13 | data.pos = data.languageArr.findIndex( 14 | (elem) => elem.name === fileInfo.language 15 | ); 16 | 17 | // Determine the encoding 18 | if (!fileInfo.encoding) { 19 | fileInfo.encoding = data.languageArr[data.pos].encoding; 20 | } 21 | 22 | const calculations = calculateConfidenceScore(data, fileInfo); 23 | 24 | if (fileInfo.confidence.encoding) { 25 | fileInfo.confidence.language = calculations; 26 | } else { 27 | fileInfo.confidence.encoding = calculations; 28 | fileInfo.confidence.language = calculations; 29 | } 30 | 31 | // Edge case, when no matches were found 32 | if (!data.languageArr[data.pos].count) { 33 | fileInfo.language = null; 34 | fileInfo.confidence.language = null; 35 | 36 | if (!byteOrderMarkObject.some(obj => obj.encoding === fileInfo.encoding)) { 37 | fileInfo.encoding = null; 38 | fileInfo.confidence.encoding = null; 39 | } 40 | } 41 | 42 | return fileInfo; 43 | }; 44 | -------------------------------------------------------------------------------- /src/components/processing-content/calculateConfidenceScore.js: -------------------------------------------------------------------------------- 1 | 
module.exports = (data, fileInfo) => { 2 | const charRegex = new RegExp( 3 | /\d|\n|\s|\-|\.|\,|\:|\;|\?|\!|\<|\>|\[|\]|\{|\}|\&|\=|\|/, 4 | "g" 5 | ); 6 | const totalCharacters = data.content.replace(charRegex, "").length; 7 | const langArr = data.languageArr; 8 | const pos = data.pos; 9 | 10 | const secondLanguage = langArr.reduce((acc, val) => { 11 | if (acc.name === fileInfo.language) return val; 12 | if (val.name === fileInfo.language) return acc; 13 | 14 | return acc.count >= val.count ? acc : val; 15 | }); 16 | 17 | const languageRatio = 18 | langArr[pos].count / (secondLanguage.count + langArr[pos].count); 19 | const characterWordRatio = langArr[pos].count / totalCharacters; 20 | 21 | let lowerLimit = null; 22 | let upperLimit = null; 23 | const multiplier = 0.8; 24 | 25 | if (fileInfo.encoding === "UTF-8" || fileInfo.encoding === "UTF-16LE") { 26 | lowerLimit = langArr[pos].utfFrequency 27 | ? langArr[pos].utfFrequency.low * multiplier 28 | : null; 29 | upperLimit = langArr[pos].utfFrequency 30 | ? (langArr[pos].utfFrequency.low + langArr[pos].utfFrequency.high) / 2 31 | : null; 32 | } else { 33 | lowerLimit = langArr[pos].isoFrequency 34 | ? langArr[pos].isoFrequency.low * multiplier 35 | : null; 36 | upperLimit = langArr[pos].isoFrequency 37 | ? 
(langArr[pos].isoFrequency.low + langArr[pos].isoFrequency.high) / 2 38 | : null; 39 | } 40 | 41 | let confidenceScore; 42 | 43 | if (!lowerLimit || !upperLimit) { 44 | confidenceScore = null; 45 | } else if (characterWordRatio >= upperLimit) { 46 | confidenceScore = 1; 47 | } else if (characterWordRatio > lowerLimit) { 48 | const range = upperLimit - lowerLimit; 49 | const surplus = characterWordRatio - lowerLimit; 50 | const confidenceRaisePercentage = surplus / range; 51 | const confidenceRaise = (1 - languageRatio) * confidenceRaisePercentage; 52 | confidenceScore = Number((languageRatio + confidenceRaise).toFixed(2)); 53 | } else { 54 | confidenceScore = Number( 55 | (languageRatio * (characterWordRatio / lowerLimit)).toFixed(2) 56 | ); 57 | } 58 | 59 | return confidenceScore; 60 | }; 61 | -------------------------------------------------------------------------------- /src/components/processing-content/countAllMatches.js: -------------------------------------------------------------------------------- 1 | const languageArr = require("../../config/languageObject.js"); 2 | 3 | module.exports = (data, encoding) => { 4 | const newLanguageArr = []; 5 | 6 | // Cloning the language array and making sure that "count" has no reference to "languageArr"! 7 | languageArr.forEach((obj) => { 8 | const updatedLangObj = {}; 9 | Object.keys(obj).forEach((key) => { 10 | if (key !== "count") { 11 | updatedLangObj[key] = obj[key]; 12 | } else { 13 | updatedLangObj.count = 0; 14 | } 15 | }); 16 | newLanguageArr.push(updatedLangObj); 17 | }); 18 | 19 | const regex = encoding ? 
"utfRegex" : "isoRegex"; 20 | 21 | // Populating the count property of the language array 22 | newLanguageArr.forEach((lang) => { 23 | if (lang[regex]) { 24 | const matches = data.content.match(lang[regex]); 25 | 26 | if (matches) lang.count = matches.length; 27 | } 28 | }); 29 | 30 | return newLanguageArr; 31 | }; 32 | -------------------------------------------------------------------------------- /src/config/byteOrderMarkObject.js: -------------------------------------------------------------------------------- 1 | module.exports = [ 2 | { 3 | encoding: "UTF-EBCDIC", 4 | regex: new RegExp("221 115 102 115"), 5 | }, 6 | { 7 | encoding: "GB-18030", 8 | regex: new RegExp("132 49 149 51"), 9 | }, 10 | { 11 | encoding: "UTF-32LE", 12 | regex: new RegExp("255 254 0 0"), 13 | }, 14 | { 15 | encoding: "UTF-32BE", 16 | regex: new RegExp("0 0 254 255"), 17 | }, 18 | { 19 | encoding: "UTF-8", 20 | regex: new RegExp("239 187 191"), 21 | }, 22 | { 23 | encoding: "UTF-7", 24 | regex: new RegExp("43 47 118"), 25 | }, 26 | { 27 | encoding: "UTF-1", 28 | regex: new RegExp("247 100 76"), 29 | }, 30 | { 31 | encoding: "SCSU", 32 | regex: new RegExp("14 254 255"), 33 | }, 34 | { 35 | encoding: "BOCU-1", 36 | regex: new RegExp("251 238 40"), 37 | }, 38 | { 39 | encoding: "UTF-16BE", 40 | regex: new RegExp("254 255"), 41 | }, 42 | { 43 | encoding: "UTF-16LE", 44 | regex: new RegExp("255 254"), 45 | }, 46 | ]; 47 | -------------------------------------------------------------------------------- /src/config/languageObject.js: -------------------------------------------------------------------------------- 1 | const flag = "gi"; 2 | 3 | const sharedRegex = { 4 | czech: new RegExp(/jsem|jsi/, flag), 5 | hungarian: new RegExp(/\snem\s/, flag), 6 | slovak: new RegExp(/poriadku|myslím|\ssme\s/, flag), 7 | slovenian: new RegExp(/\skaj\s|lahko|zdaj/, flag), 8 | albanian: new RegExp(/nuk/, flag), 9 | english: new RegExp(/ the /, flag), 10 | french: new RegExp(/c'est/, flag), 11 | portuguese: 
new RegExp(/ não /, flag), 12 | spanish: new RegExp(/estaba|\smuy\s|siempre|ahora/, flag), 13 | german: new RegExp(/\sdas\s/, flag), 14 | italian: new RegExp(/\sche\s/, flag), 15 | danish: new RegExp(/hvad|noget/, flag), 16 | norwegian: new RegExp(/deg/, flag), 17 | swedish: new RegExp(/ jag /, flag), 18 | dutch: new RegExp(/ het /, flag), 19 | finnish: new RegExp(/hän/, flag), 20 | "serbo-croatian": new RegExp(/ sam | kako /, flag), 21 | estonian: new RegExp(/\sseda\s|\spole\s|midagi/, flag), 22 | icelandic: new RegExp(/Það/, flag), 23 | "malay-indonesian": new RegExp(/tidak/, flag), 24 | turkish: new RegExp(/ bir /, flag), 25 | lithuanian: new RegExp(/taip|\stai\s/, flag), 26 | bengali: new RegExp(/এটা/, flag), 27 | hindi: new RegExp(/हैं/, flag), 28 | urdu: new RegExp(/ایک/, flag), 29 | vietnamese: new RegExp(/ không /, flag) 30 | }; 31 | 32 | const sharedFrequency = { 33 | polish: { low: 0.004355, high: 0.005102 }, 34 | czech: { low: 0.004433, high: 0.007324 }, 35 | hungarian: { low: 0.004994, high: 0.005183 }, 36 | romanian: { low: 0.003319, high: 0.004190 }, 37 | slovak: { low: 0.001736, high: 0.002557 }, 38 | slovenian: { low: 0.004111, high: 0.004959 }, 39 | albanian: { low: 0.003773, high: 0.007313 }, 40 | ukrainian: { low: 0.002933, high: 0.005389 }, 41 | english: { low: 0.004679, high: 0.007580 }, 42 | french: { low: 0.003016, high: 0.004825 }, 43 | portuguese: { low: 0.003406, high: 0.005032 }, 44 | spanish: { low: 0.002348, high: 0.002881 }, 45 | german: { low: 0.004044, high: 0.004391 }, 46 | italian: { low: 0.003889, high: 0.005175 }, 47 | danish: { low: 0.003630, high: 0.004189 }, 48 | norwegian: { low: 0.002410, high: 0.003918 }, 49 | swedish: { low: 0.004916, high: 0.007221 }, 50 | dutch: { low: 0.003501, high: 0.004150 }, 51 | finnish: { low: 0.003308, high: 0.005135 }, 52 | "serbo-croatian": { low: 0.002568, high: 0.005182 }, 53 | estonian: { low: 0.002892, high: 0.003963 }, 54 | icelandic: { low: 0.004366, high: 0.004366 }, 55 | 
"malay-indonesian": { low: 0.002825, high: 0.003932 }, 56 | greek: { low: 0.003440, high: 0.004862 }, 57 | turkish: { low: 0.002915, high: 0.004588 }, 58 | hebrew: { low: 0.003663, high: 0.004666 }, 59 | lithuanian: { low: 0.003277, high: 0.003768 }, 60 | bengali: { low: 0.003155, high: 0.005236 }, 61 | hindi: { low: 0.004159, high: 0.006478 }, 62 | urdu: { low: 0.004118, high: 0.005851 }, 63 | vietnamese: { low: 0.003387, high: 0.005191 } 64 | }; 65 | 66 | module.exports = [ 67 | { 68 | name: "polish", 69 | count: 0, 70 | utfRegex: new RegExp(/się/, flag), 71 | isoRegex: new RegExp(/siê/, flag), 72 | encoding: "CP1250", 73 | utfFrequency: sharedFrequency.polish, 74 | isoFrequency: sharedFrequency.polish 75 | }, 76 | { 77 | name: "czech", 78 | count: 0, 79 | utfRegex: sharedRegex.czech, 80 | isoRegex: sharedRegex.czech, 81 | encoding: "CP1250", 82 | utfFrequency: sharedFrequency.czech, 83 | isoFrequency: sharedFrequency.czech 84 | }, 85 | { 86 | name: "hungarian", 87 | count: 0, 88 | utfRegex: sharedRegex.hungarian, 89 | isoRegex: sharedRegex.hungarian, 90 | encoding: "CP1250", 91 | utfFrequency: sharedFrequency.hungarian, 92 | isoFrequency: sharedFrequency.hungarian 93 | }, 94 | { 95 | name: "romanian", 96 | count: 0, 97 | utfRegex: new RegExp(/sunt|eşti/, flag), 98 | isoRegex: new RegExp(/sunt|eºti/, flag), 99 | encoding: "CP1250", 100 | utfFrequency: sharedFrequency.romanian, 101 | isoFrequency: sharedFrequency.romanian 102 | }, 103 | { 104 | name: "slovak", 105 | count: 0, 106 | utfRegex: sharedRegex.slovak, 107 | isoRegex: sharedRegex.slovak, 108 | encoding: "CP1250", 109 | utfFrequency: sharedFrequency.slovak, 110 | isoFrequency: sharedFrequency.slovak 111 | }, 112 | { 113 | name: "slovenian", 114 | count: 0, 115 | utfRegex: sharedRegex.slovenian, 116 | isoRegex: sharedRegex.slovenian, 117 | encoding: "CP1250", 118 | utfFrequency: sharedFrequency.slovenian, 119 | isoFrequency: sharedFrequency.slovenian 120 | }, 121 | { 122 | name: "albanian", 123 | count: 0, 
124 | utfRegex: sharedRegex.albanian, 125 | isoRegex: sharedRegex.albanian, 126 | encoding: "CP1250", 127 | utfFrequency: sharedFrequency.albanian, 128 | isoFrequency: sharedFrequency.albanian 129 | }, 130 | { 131 | name: "russian", 132 | count: 0, 133 | utfRegex: new RegExp(/что/, flag), 134 | isoRegex: new RegExp(/÷òî/, flag), 135 | encoding: "CP1251", 136 | utfFrequency: { low: 0.004965, high: 0.005341 }, 137 | isoFrequency: { low: 0.003884, high: 0.003986 } 138 | }, 139 | { 140 | name: "ukrainian", 141 | count: 0, 142 | utfRegex: new RegExp(/він|але/, flag), 143 | isoRegex: new RegExp(/â³í|àëå/, flag), 144 | encoding: "CP1251", 145 | utfFrequency: sharedFrequency.ukrainian, 146 | isoFrequency: sharedFrequency.ukrainian 147 | }, 148 | { 149 | name: "bulgarian", 150 | count: 0, 151 | utfRegex: new RegExp(/това|какво/, flag), 152 | isoRegex: new RegExp(/òîâà|äîáðå|êaêâo/, flag), 153 | encoding: "CP1251", 154 | utfFrequency: { low: 0.005225, high: 0.005628 }, 155 | isoFrequency: { low: 0.002767, high: 0.004951 } 156 | }, 157 | { 158 | name: "english", 159 | count: 0, 160 | utfRegex: sharedRegex.english, 161 | isoRegex: sharedRegex.english, 162 | encoding: "CP1252", 163 | utfFrequency: sharedFrequency.english, 164 | isoFrequency: sharedFrequency.english 165 | }, 166 | { 167 | name: "french", 168 | count: 0, 169 | utfRegex: sharedRegex.french, 170 | isoRegex: sharedRegex.french, 171 | encoding: "CP1252", 172 | utfFrequency: sharedFrequency.french, 173 | isoFrequency: sharedFrequency.french 174 | }, 175 | { 176 | name: "portuguese", 177 | count: 0, 178 | utfRegex: sharedRegex.portuguese, 179 | isoRegex: sharedRegex.portuguese, 180 | encoding: "CP1252", 181 | utfFrequency: sharedFrequency.portuguese, 182 | isoFrequency: sharedFrequency.portuguese 183 | }, 184 | { 185 | name: "spanish", 186 | count: 0, 187 | utfRegex: sharedRegex.spanish, 188 | isoRegex: sharedRegex.spanish, 189 | encoding: "CP1252", 190 | utfFrequency: sharedFrequency.spanish, 191 | isoFrequency: 
sharedFrequency.spanish 192 | }, 193 | { 194 | name: "german", 195 | count: 0, 196 | utfRegex: sharedRegex.german, 197 | isoRegex: sharedRegex.german, 198 | encoding: "CP1252", 199 | utfFrequency: sharedFrequency.german, 200 | isoFrequency: sharedFrequency.german 201 | }, 202 | { 203 | name: "italian", 204 | count: 0, 205 | utfRegex: sharedRegex.italian, 206 | isoRegex: sharedRegex.italian, 207 | encoding: "CP1252", 208 | utfFrequency: sharedFrequency.italian, 209 | isoFrequency: sharedFrequency.italian 210 | }, 211 | { 212 | name: "danish", 213 | count: 0, 214 | utfRegex: sharedRegex.danish, 215 | isoRegex: sharedRegex.danish, 216 | encoding: "CP1252", 217 | utfFrequency: sharedFrequency.danish, 218 | isoFrequency: sharedFrequency.danish 219 | }, 220 | { 221 | name: "norwegian", 222 | count: 0, 223 | utfRegex: sharedRegex.norwegian, 224 | isoRegex: sharedRegex.norwegian, 225 | encoding: "CP1252", 226 | utfFrequency: sharedFrequency.norwegian, 227 | isoFrequency: sharedFrequency.norwegian 228 | }, 229 | { 230 | name: "swedish", 231 | count: 0, 232 | utfRegex: sharedRegex.swedish, 233 | isoRegex: sharedRegex.swedish, 234 | encoding: "CP1252", 235 | utfFrequency: sharedFrequency.swedish, 236 | isoFrequency: sharedFrequency.swedish 237 | }, 238 | { 239 | name: "dutch", 240 | count: 0, 241 | utfRegex: sharedRegex.dutch, 242 | isoRegex: sharedRegex.dutch, 243 | encoding: "CP1252", 244 | utfFrequency: sharedFrequency.dutch, 245 | isoFrequency: sharedFrequency.dutch 246 | }, 247 | { 248 | name: "finnish", 249 | count: 0, 250 | utfRegex: sharedRegex.finnish, 251 | isoRegex: sharedRegex.finnish, 252 | encoding: "CP1252", 253 | utfFrequency: sharedFrequency.finnish, 254 | isoFrequency: sharedFrequency.finnish 255 | }, 256 | { 257 | name: "serbo-croatian", 258 | count: 0, 259 | utfRegex: sharedRegex["serbo-croatian"], 260 | isoRegex: sharedRegex["serbo-croatian"], 261 | encoding: "CP1252", 262 | utfFrequency: sharedFrequency["serbo-croatian"], 263 | isoFrequency: 
sharedFrequency["serbo-croatian"] 264 | }, 265 | { 266 | name: "estonian", 267 | count: 0, 268 | utfRegex: sharedRegex.estonian, 269 | isoRegex: sharedRegex.estonian, 270 | encoding: "CP1252", 271 | utfFrequency: sharedFrequency.estonian, 272 | isoFrequency: sharedFrequency.estonian 273 | }, 274 | { 275 | name: "icelandic", 276 | count: 0, 277 | utfRegex: sharedRegex.icelandic, 278 | isoRegex: sharedRegex.icelandic, 279 | encoding: "CP1252", 280 | utfFrequency: sharedFrequency.icelandic, 281 | isoFrequency: sharedFrequency.icelandic 282 | }, 283 | { 284 | name: "malay-indonesian", 285 | count: 0, 286 | utfRegex: sharedRegex["malay-indonesian"], 287 | isoRegex: sharedRegex["malay-indonesian"], 288 | encoding: "CP1252", 289 | utfFrequency: sharedFrequency["malay-indonesian"], 290 | isoFrequency: sharedFrequency["malay-indonesian"] 291 | }, 292 | { 293 | name: "greek", 294 | count: 0, 295 | utfRegex: new RegExp(/είναι/, flag), 296 | isoRegex: new RegExp(/åßíáé/, flag), 297 | encoding: "CP1253", 298 | utfFrequency: sharedFrequency.greek, 299 | isoFrequency: sharedFrequency.greek 300 | }, 301 | { 302 | name: "turkish", 303 | count: 0, 304 | utfRegex: sharedRegex.turkish, 305 | isoRegex: sharedRegex.turkish, 306 | encoding: "CP1254", 307 | utfFrequency: sharedFrequency.turkish, 308 | isoFrequency: sharedFrequency.turkish 309 | }, 310 | { 311 | name: "hebrew", 312 | count: 0, 313 | utfRegex: new RegExp(/אתה/, flag), 314 | isoRegex: new RegExp(/àúä/, flag), 315 | encoding: "CP1255", 316 | utfFrequency: sharedFrequency.hebrew, 317 | isoFrequency: sharedFrequency.hebrew 318 | }, 319 | { 320 | name: "arabic", 321 | count: 0, 322 | utfRegex: new RegExp(/هذا/, flag), 323 | isoRegex: new RegExp(/åðç/, flag), 324 | encoding: "CP1256", 325 | utfFrequency: { low: 0.003522, high: 0.004348 }, 326 | isoFrequency: { low: 0.003773, high: 0.005559 } 327 | }, 328 | { 329 | name: "farsi-persian", 330 | count: 0, 331 | utfRegex: new RegExp(/اون/, flag), 332 | isoRegex: new RegExp(/çíä/, 
flag), 333 | encoding: "CP1256", 334 | utfFrequency: { low: 0.002761, high: 0.004856 }, 335 | isoFrequency: { low: 0.003010, high: 0.006646 } 336 | }, 337 | { 338 | name: "lithuanian", 339 | count: 0, 340 | utfRegex: sharedRegex.lithuanian, 341 | isoRegex: sharedRegex.lithuanian, 342 | encoding: "CP1257", 343 | utfFrequency: sharedFrequency.lithuanian, 344 | isoFrequency: sharedFrequency.lithuanian 345 | }, 346 | { 347 | name: "chinese-simplified", 348 | count: 0, 349 | utfRegex: new RegExp(/么/, flag), 350 | isoRegex: new RegExp(/´ó|¶¯|Å®/, flag), 351 | encoding: "GB18030", 352 | utfFrequency: { low: 0.009567, high: 0.011502 }, 353 | isoFrequency: { low: 0.003137, high: 0.005009 } 354 | }, 355 | { 356 | name: "chinese-traditional", 357 | count: 0, 358 | utfRegex: new RegExp(/們/, flag), 359 | isoRegex: new RegExp(/¦b/, flag), 360 | encoding: "BIG5", 361 | utfFrequency: { low: 0.012484, high: 0.014964 }, 362 | isoFrequency: { low: 0.005063, high: 0.005822 } 363 | }, 364 | { 365 | name: "japanese", 366 | count: 0, 367 | utfRegex: new RegExp(/ど/, flag), 368 | isoRegex: new RegExp(/‚»|‚Á‚Ä/, flag), 369 | encoding: "Shift-JIS", 370 | utfFrequency: { low: 0.004257, high: 0.006585 }, 371 | isoFrequency: { low: 0.004286, high: 0.004653 } 372 | }, 373 | { 374 | name: "korean", 375 | count: 0, 376 | utfRegex: new RegExp(/도/, flag), 377 | isoRegex: new RegExp(/àö¾î|å¾ß|¡¼­/, flag), 378 | encoding: "EUC-KR", 379 | utfFrequency: { low: 0.010910, high: 0.013670 }, 380 | isoFrequency: { low: 0.004118, high: 0.004961 } 381 | }, 382 | { 383 | name: "thai", 384 | count: 0, 385 | utfRegex: new RegExp(/แฮร์รี่|พอตเตอร์/, flag), 386 | isoRegex: new RegExp(/áîãìãõè|¾íµàµíãì­/, flag), 387 | encoding: "TIS-620", 388 | utfFrequency: { low: 0.003194, high: 0.003468 }, 389 | isoFrequency: { low: 0.002091, high: 0.002303 } 390 | }, 391 | // The following languages don't seem to have their own encoding 392 | // Subtitle files in these languages seem to almost exclusively use UTF encoding. 
393 | { 394 | name: "bengali", 395 | count: 0, 396 | utfRegex: sharedRegex.bengali, 397 | isoRegex: sharedRegex.bengali, 398 | utfFrequency: sharedFrequency.bengali, 399 | isoFrequency: sharedFrequency.bengali 400 | }, 401 | { 402 | name: "hindi", 403 | count: 0, 404 | utfRegex: sharedRegex.hindi, 405 | isoRegex: sharedRegex.hindi, 406 | utfFrequency: sharedFrequency.hindi, 407 | isoFrequency: sharedFrequency.hindi 408 | }, 409 | { 410 | name: "urdu", 411 | count: 0, 412 | utfRegex: sharedRegex.urdu, 413 | isoRegex: sharedRegex.urdu, 414 | utfFrequency: sharedFrequency.urdu, 415 | isoFrequency: sharedFrequency.urdu 416 | }, 417 | { 418 | name: "vietnamese", 419 | count: 0, 420 | utfRegex: sharedRegex.vietnamese, 421 | isoRegex: sharedRegex.vietnamese, 422 | utfFrequency: sharedFrequency.vietnamese, 423 | isoFrequency: sharedFrequency.vietnamese 424 | }, 425 | ]; -------------------------------------------------------------------------------- /src/index-browser.js: -------------------------------------------------------------------------------- 1 | const checkUTF = require("./components/checkUTF.js"); 2 | const processContent = require("./components/processContent.js"); 3 | const checkByteOrderMark = require("./components/checkByteOrderMark.js"); 4 | 5 | module.exports = (file) => { 6 | return new Promise((resolve, reject) => { 7 | const fileInfo = { 8 | encoding: null, 9 | language: null, 10 | confidence: { 11 | encoding: null, 12 | language: null, 13 | }, 14 | }; 15 | const data = {}; 16 | 17 | // Check the byte order mark! 
18 | const byteOrderMarkBuffer = new FileReader(); 19 | 20 | byteOrderMarkBuffer.onload = () => { 21 | const uInt8String = new Uint8Array(byteOrderMarkBuffer.result).slice(0, 4).join(" "); 22 | const byteOrderMark = checkByteOrderMark(uInt8String); 23 | 24 | if (byteOrderMark) { 25 | fileInfo.encoding = byteOrderMark; 26 | fileInfo.confidence.encoding = 1; 27 | 28 | const byteOrderMarkReader = new FileReader(); 29 | 30 | byteOrderMarkReader.onload = () => { 31 | data.content = byteOrderMarkReader.result; 32 | resolve(processContent(data, fileInfo)); 33 | }; 34 | 35 | byteOrderMarkReader.onerror = (err) => { 36 | reject(err); 37 | }; 38 | 39 | byteOrderMarkReader.readAsText(file, fileInfo.encoding); 40 | } else { 41 | // Read with UTF-8 first, then with ISO-8859-1 42 | const utfReader = new FileReader(); 43 | 44 | utfReader.onload = () => { 45 | const utfContent = utfReader.result; 46 | 47 | const utf8 = checkUTF(utfContent); 48 | 49 | if (utf8) { 50 | fileInfo.encoding = "UTF-8"; 51 | fileInfo.confidence.encoding = 1; 52 | } 53 | 54 | if (utf8) { 55 | data.content = utfContent; 56 | resolve(processContent(data, fileInfo)); 57 | } else { 58 | const isoReader = new FileReader(); 59 | 60 | isoReader.onload = () => { 61 | data.content = isoReader.result; 62 | resolve(processContent(data, fileInfo)); 63 | }; 64 | 65 | isoReader.readAsText(file, "ISO-8859-1"); 66 | } 67 | }; 68 | 69 | utfReader.onerror = (err) => { 70 | reject(err); 71 | }; 72 | 73 | utfReader.readAsText(file, "UTF-8"); 74 | } 75 | }; 76 | 77 | byteOrderMarkBuffer.onerror = (err) => { 78 | reject(err); 79 | }; 80 | 81 | byteOrderMarkBuffer.readAsArrayBuffer(file); 82 | }); 83 | }; 84 | -------------------------------------------------------------------------------- /src/index-node.js: -------------------------------------------------------------------------------- 1 | const fs = require("fs"); 2 | const stream = require("stream"); 3 | const checkUTF = require("./components/checkUTF.js"); 4 | const 
processContent = require("./components/processContent.js"); 5 | const checkByteOrderMark = require("./components/checkByteOrderMark.js"); 6 | 7 | function getStream(filePath, start, end) { 8 | if (filePath instanceof Buffer) { 9 | return stream.Readable.from(filePath.subarray(start, end)) 10 | } 11 | 12 | return fs.createReadStream(filePath, { start, end }) 13 | } 14 | 15 | function getContent(filePath, encoding, callback) { 16 | if (filePath instanceof Buffer) { 17 | return callback(null, filePath.toString(encoding)) 18 | } 19 | 20 | return fs.readFile(filePath, encoding, callback) 21 | } 22 | 23 | module.exports = (filePath) => { 24 | return new Promise((resolve, reject) => { 25 | let isEmpty = true; 26 | const fileInfo = { 27 | encoding: null, 28 | language: null, 29 | confidence: { 30 | encoding: null, 31 | language: null, 32 | }, 33 | }; 34 | const data = {}; 35 | 36 | // Reading the first four bytes and checking if they coincide with one of the predefined byte order marks. 37 | const readStream = getStream(filePath, 0, 3); 38 | 39 | readStream.on("data", function (buffer) { 40 | isEmpty = false; 41 | const uInt8Array = new Uint8Array(buffer); 42 | const uInt8String = uInt8Array.join(" "); 43 | const byteOrderMark = checkByteOrderMark(uInt8String); 44 | 45 | if (byteOrderMark) { 46 | fileInfo.encoding = byteOrderMark; 47 | fileInfo.confidence.encoding = 1; 48 | 49 | // Node.js only supports UTF-8 and UTF-16LE. 
If one of them has been detected, we know how to read the content 50 | if (fileInfo.encoding === "UTF-8" || fileInfo.encoding === "UTF-16LE") { 51 | getContent(filePath, fileInfo.encoding, (err, utfContent) => { 52 | if (err) reject(err); 53 | data.content = utfContent; 54 | resolve(processContent(data, fileInfo)); 55 | }); 56 | 57 | // If the encoding in the byteOrderMarkObject is not UTF-8 or UTF-16LE we return the encoding without the language 58 | } else { 59 | if (fileInfo.encoding === "GB-18030") { 60 | fileInfo.language = "chinese-simplified"; 61 | fileInfo.confidence.language = 1; 62 | } 63 | resolve(fileInfo); 64 | } 65 | } else { 66 | getContent(filePath, "UTF-8", (err, utfContent) => { 67 | if (err) reject(err); 68 | 69 | const utf8 = checkUTF(utfContent); 70 | 71 | if (utf8) { 72 | fileInfo.encoding = "UTF-8"; 73 | fileInfo.confidence.encoding = 1; 74 | } 75 | 76 | if (utf8) { 77 | data.content = utfContent; 78 | resolve(processContent(data, fileInfo)); 79 | } else { 80 | getContent(filePath, "latin1", (err, isoContent) => { 81 | if (err) reject(err); 82 | 83 | data.content = isoContent; 84 | resolve(processContent(data, fileInfo)); 85 | }); 86 | } 87 | }); 88 | } 89 | }); 90 | readStream.on("end", function () { 91 | if(isEmpty) resolve(fileInfo) 92 | }) 93 | // This catches any errors that happen while creating the readable stream (usually invalid names) 94 | readStream.on("error", function (err) { 95 | reject(err); 96 | }); 97 | }); 98 | }; 99 | -------------------------------------------------------------------------------- /src/index.d.ts: -------------------------------------------------------------------------------- 1 | export interface FileInfo { 2 | encoding: null | 'UTF-EBCDIC' | 'GB18030' | 'UTF-32LE' | 'UTF-32BE' | 'UTF-8' | 'UTF-7' | 'UTF-1' | 'SCSU' | 'BOCU-1' | 'UTF-16BE' | 'UTF-16LE' | 'latin1' | 'ISO-8859-1' | 'CP1250' | 'CP1251' | 'CP1252' | 'CP1253' | 'CP1254' | 'CP1255' | 'CP1256' | 'CP1257' | 'BIG5' | 'Shift-JIS' | 'EUC-KR' | 
'TIS-620'; 3 | language: null | 'polish' | 'czech' | 'hungarian' | 'romanian' | 'slovak' | 'slovenian' | 'albanian' | 'russian' | 'ukrainian' | 'bulgarian' | 'english' | 'french' | 'portuguese' | 'spanish' | 'german' | 'italian' | 'danish' | 'norwegian' | 'swedish' | 'dutch' | 'finnish' | 'serbo-croatian' | 'estonian' | 'icelandic' | 'malay-indonesian' | 'greek' | 'turkish' | 'hebrew' | 'arabic' | 'farsi-persian' | 'lithuanian' | 'chinese-simplified' | 'chinese-traditional' | 'japanese' | 'korean' | 'thai' | 'bengali' | 'hindi' | 'urdu' | 'vietnamese'; 4 | confidence: { 5 | encoding: null | number; 6 | language: null | number; 7 | }, 8 | } 9 | 10 | declare function DetectFileEncodingAndLanguage(clientSideFileOrServerSidePath: File | Blob | string | Buffer | URL): Promise; 11 | export default DetectFileEncodingAndLanguage; 12 | -------------------------------------------------------------------------------- /tests/browser/browser-test/README.md: -------------------------------------------------------------------------------- 1 | The reason there is no live version for this repo is because we want to make sure to always run `npm update` before testing, so we're testing with the latest version of `detect-file-encoding-and-language` (the one on GitHub, not NPM!). 
2 | 3 | ![Screenshot from 2022-06-14 04-04-30](https://user-images.githubusercontent.com/52784332/173478057-d3197be7-6e4f-485f-bb80-0d1a6450f0fe.png) 4 | -------------------------------------------------------------------------------- /tests/browser/browser-test/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "browser-test", 3 | "version": "0.1.0", 4 | "private": true, 5 | "dependencies": { 6 | "@testing-library/jest-dom": "^5.16.4", 7 | "@testing-library/react": "^13.2.0", 8 | "@testing-library/user-event": "^13.5.0", 9 | "@types/jest": "^27.5.1", 10 | "@types/node": "^16.11.36", 11 | "@types/react": "^18.0.9", 12 | "@types/react-dom": "^18.0.4", 13 | "browser-fs-access": "^0.29.5", 14 | "detect-file-encoding-and-language": "git+https://github.com/gignupg/Detect-File-Encoding-and-Language.git", 15 | "materialize-css": "^1.0.0-rc.2", 16 | "react": "^18.1.0", 17 | "react-dom": "^18.1.0", 18 | "react-scripts": "5.0.1", 19 | "typescript": "^4.6.4", 20 | "web-vitals": "^2.1.4" 21 | }, 22 | "scripts": { 23 | "start": "react-scripts start", 24 | "build": "react-scripts build", 25 | "test": "react-scripts test", 26 | "eject": "react-scripts eject" 27 | }, 28 | "eslintConfig": { 29 | "extends": [ 30 | "react-app", 31 | "react-app/jest" 32 | ] 33 | }, 34 | "browserslist": { 35 | "production": [ 36 | ">0.2%", 37 | "not dead", 38 | "not op_mini all" 39 | ], 40 | "development": [ 41 | "last 1 chrome version", 42 | "last 1 firefox version", 43 | "last 1 safari version" 44 | ] 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tests/browser/browser-test/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 12 | 13 | 17 | 18 | 27 | 28 | 29 | React App 30 | 31 | 32 | 33 |
34 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /tests/browser/browser-test/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "logo192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "logo512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } 26 | -------------------------------------------------------------------------------- /tests/browser/browser-test/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /tests/browser/browser-test/src/App.tsx: -------------------------------------------------------------------------------- 1 | import { useState } from 'react'; 2 | import languageEncoding from "detect-file-encoding-and-language"; 3 | import { directoryOpen } from 'browser-fs-access'; 4 | 5 | type Status = '' | 'error' | 'success' | 'loading'; 6 | 7 | const minConfidence = 0.95; 8 | let error = false; 9 | 10 | export default function App() { 11 | const [status, setStatus] = useState(''); 12 | const [progress, setProgress] = useState('0%') 13 | 14 | function setError(file: any, fileInfo: any) { 15 | setStatus('error'); 16 | error = true; 17 | console.info('file:', file); 18 | console.info('fileInfo:', fileInfo); 19 | } 20 | 21 | async function inputHandler() { 22 | error = false; 23 | const files = await directoryOpen({recursive: true}); 24 | const filesToCheck = 
files.length 25 | let filesChecked = 0 26 | setStatus('loading') 27 | 28 | for (const file of files) { 29 | const folderNameArr = file.directoryHandle?.name.split('_'); 30 | const expectedLanguage = folderNameArr ? folderNameArr[0] : null; 31 | const expectedEncoding = folderNameArr ? folderNameArr[1] : null; 32 | 33 | const fileInfo = await languageEncoding(file) 34 | if (!expectedLanguage) { 35 | console.error("Expected language not found in folder name", file.directoryHandle?.name); 36 | setError(file, fileInfo); 37 | 38 | } else if (!expectedEncoding) { 39 | console.error("Expected encoding not found in folder name", file.directoryHandle?.name); 40 | setError(file, fileInfo); 41 | 42 | } else if (!fileInfo.confidence.encoding || fileInfo.confidence.encoding < minConfidence) { 43 | console.error("Encoding Confidence too low!", fileInfo.confidence.encoding); 44 | setError(file, fileInfo); 45 | 46 | } else if (!fileInfo.confidence.language || fileInfo.confidence.language < minConfidence) { 47 | console.error("Language Confidence too low!", fileInfo.confidence.language); 48 | setError(file, fileInfo); 49 | 50 | } else if (fileInfo.language !== expectedLanguage) { 51 | console.error(`Language mismatch! Expected ${expectedLanguage} but got ${fileInfo.language}`); 52 | setError(file, fileInfo); 53 | 54 | } else if (fileInfo.encoding !== expectedEncoding) { 55 | console.error(`Encoding mismatch! Expected ${expectedEncoding} but got ${fileInfo.encoding}`); 56 | setError(file, fileInfo); 57 | } 58 | filesChecked++ 59 | setProgress(`${filesChecked / filesToCheck * 100}%`) 60 | } 61 | 62 | if (!error) { 63 | console.info("All tests passed!"); 64 | setStatus('success'); 65 | } 66 | 67 | setProgress('0%') 68 | } 69 | 70 | return ( 71 | <> 72 | 75 |
76 |
77 |
78 | 79 | Select a folder that contains subtitle files or subdirectories with subtitle files. 80 | Then open the browser console to see whether tests are passing or failing. 81 | Make sure you're running the latest version of detect-file-encoding-and-language 82 | by taking a closer look at the package in the node modules folder or by downloading 83 | a fresh clone of this repo! 84 | 85 |
86 | { 87 | status === 'loading' && ( 88 |
89 |
90 |
91 | ) 92 | } 93 | { 94 | status === 'error' && ( 95 |
96 | 97 | Test failed! For more details open the console to see the error logs! 98 | 99 |
100 | ) 101 | } 102 | { 103 | status === 'success' && ( 104 |
105 | 106 | All tests passed! 107 | 108 |
109 | ) 110 | } 111 |
112 |
113 | 114 | 115 | ) 116 | } -------------------------------------------------------------------------------- /tests/browser/browser-test/src/index.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom/client'; 3 | import App from './App'; 4 | 5 | const root = ReactDOM.createRoot( 6 | document.getElementById('root') as HTMLElement 7 | ); 8 | root.render( 9 | 10 | 11 | 12 | ); -------------------------------------------------------------------------------- /tests/browser/browser-test/src/react-app-env.d.ts: -------------------------------------------------------------------------------- 1 | /// 2 | -------------------------------------------------------------------------------- /tests/browser/browser-test/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es5", 4 | "lib": [ 5 | "dom", 6 | "dom.iterable", 7 | "esnext" 8 | ], 9 | "allowJs": true, 10 | "skipLibCheck": true, 11 | "esModuleInterop": true, 12 | "allowSyntheticDefaultImports": true, 13 | "strict": true, 14 | "forceConsistentCasingInFileNames": true, 15 | "noFallthroughCasesInSwitch": true, 16 | "module": "esnext", 17 | "moduleResolution": "node", 18 | "resolveJsonModule": true, 19 | "isolatedModules": true, 20 | "noEmit": true, 21 | "jsx": "react-jsx" 22 | }, 23 | "include": [ 24 | "src" 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /tests/browser/html-test/app.js: -------------------------------------------------------------------------------- 1 | document.getElementById("my-input-field").addEventListener("change", (e) => { 2 | const file = e.target.files[0]; 3 | languageEncoding(file).then((fileInfo) => console.log(fileInfo)); 4 | // Possible result: { language: english, encoding: UTF-8, confidence: { encoding: 1, language: 1 } } 5 | }); 
-------------------------------------------------------------------------------- /tests/browser/html-test/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/browser/live-demo/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 gignu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/browser/live-demo/README.md: -------------------------------------------------------------------------------- 1 | # Encoding-and-Language-Detector 2 | 3 | Online tool to detect the file encoding and language of text files. 
4 | 5 | Try it out [here](https://detect-file-encoding-and-language-live-demo.netlify.app/)! 6 | -------------------------------------------------------------------------------- /tests/browser/live-demo/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "live-demo", 3 | "version": "0.1.0", 4 | "private": true, 5 | "dependencies": { 6 | "@testing-library/jest-dom": "^4.2.4", 7 | "@testing-library/react": "^9.3.2", 8 | "@testing-library/user-event": "^7.1.2", 9 | "detect-file-encoding-and-language": "^2.1.0", 10 | "i": "^0.3.6", 11 | "npm": "^7.6.3", 12 | "react": "^16.13.1", 13 | "react-dom": "^16.13.1", 14 | "react-scripts": "3.4.1" 15 | }, 16 | "scripts": { 17 | "start": "react-scripts start", 18 | "build": "react-scripts build", 19 | "test": "react-scripts test", 20 | "eject": "react-scripts eject" 21 | }, 22 | "eslintConfig": { 23 | "extends": "react-app" 24 | }, 25 | "browserslist": { 26 | "production": [ 27 | ">0.2%", 28 | "not dead", 29 | "not op_mini all" 30 | ], 31 | "development": [ 32 | "last 1 chrome version", 33 | "last 1 firefox version", 34 | "last 1 safari version" 35 | ] 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tests/browser/live-demo/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 11 | 15 | 16 | 17 | 21 | 22 | 23 | 24 | Live Demo 25 | 26 | 27 | 28 |
29 | 30 | 31 | -------------------------------------------------------------------------------- /tests/browser/live-demo/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "logo192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "logo512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } 26 | -------------------------------------------------------------------------------- /tests/browser/live-demo/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /tests/browser/live-demo/src/App.js: -------------------------------------------------------------------------------- 1 | import React, { useState, useEffect } from "react"; 2 | import defaultFileInfo from "./defaultFileInfo.js"; 3 | import languageEncoding from "detect-file-encoding-and-language"; 4 | 5 | function App() { 6 | const [file, setFile] = useState(null); 7 | const [fileInfo, setFileInfo] = useState(defaultFileInfo); 8 | const [textContent, setTextContent] = useState(""); 9 | 10 | function fileUpload(e) { 11 | if (e.target.files.length) { 12 | setFile(e.target.files[0]); 13 | 14 | languageEncoding(e.target.files[0]).then((data) => { 15 | setFileInfo(data); 16 | }); 17 | } 18 | } 19 | 20 | useEffect(() => { 21 | if (fileInfo.encoding) { 22 | const reader = new FileReader(); 23 | 24 | reader.onload = () => { 25 | setTextContent(reader.result); 26 | }; 27 | 28 | 
reader.readAsText(file, fileInfo.encoding); 29 | } 30 | }, [file, fileInfo]); 31 | 32 | return ( 33 |
34 |
35 |
36 |

Encoding and Language Detector

37 |
38 |
39 |
40 | File Upload 41 | 42 |
43 |
44 | 45 |
46 |
47 |
48 | {textContent && ( 49 |
50 |

51 | Language: {fileInfo.language} 52 |

53 |

54 | Encoding: {fileInfo.encoding} 55 |

56 |

57 | Confidence: {fileInfo.confidence.language} 58 |

59 |

60 | Confidence: {fileInfo.confidence.encoding} 61 |

62 |
)} 63 |
64 | {textContent ?
Content
:
Functionality
} 65 | {textContent ? ( 66 |

{textContent}

67 | ) : ( 68 |
69 |

Determine the encoding and language of text files!

70 |
    71 |
  • 72 | Detects 40 languages as well as the appropriate encoding 73 |
  • 74 |
  • Available as CLI, in Node.js and in the browser
  • 75 |
  • Supports .txt, .srt, and .sub
  • 76 |
  • Works best with large inputs
  • 77 |
  • Completely free, no API key required
  • 78 |
79 |

80 | For reliable encoding and language detection, use files 81 | containing 500 words or more. Smaller inputs can work as well 82 | but the results might be less accurate and in some cases 83 | incorrect. 84 |

85 |

86 | Feel free to upload your own files and see if the encoding and 87 | language are detected correctly! 88 |

89 |
90 | )} 91 |
92 |
93 |
94 |
95 | ); 96 | } 97 | 98 | export default App; 99 | -------------------------------------------------------------------------------- /tests/browser/live-demo/src/defaultFileInfo.js: -------------------------------------------------------------------------------- 1 | export default { 2 | language: "", 3 | encoding: "", 4 | confidence: "" 5 | }; 6 | -------------------------------------------------------------------------------- /tests/browser/live-demo/src/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | background-color: lightgray; 3 | } 4 | 5 | h4 { 6 | margin-bottom: 60px; 7 | text-align: center; 8 | } 9 | 10 | .card-panel { 11 | overflow: hidden; 12 | min-height: 95vh; 13 | } -------------------------------------------------------------------------------- /tests/browser/live-demo/src/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import './index.css'; 4 | import App from './App'; 5 | 6 | ReactDOM.render(, document.getElementById('root')); 7 | -------------------------------------------------------------------------------- /tests/node/node-ts-test/index.ts: -------------------------------------------------------------------------------- 1 | // Looks at the "scripts" section in package.json to run this code! 
2 | import languageEncoding from "detect-file-encoding-and-language"; 3 | const pathToFile = "/home/gignu/Documents/Subtitle Database/Samples for each Format/polish-cp-1250-sample-subtitles.srt"; 4 | languageEncoding(pathToFile).then((fileInfo) => console.log(fileInfo)); 5 | -------------------------------------------------------------------------------- /tests/node/node-ts-test/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "version": "1.0.0", 4 | "lockfileVersion": 2, 5 | "requires": true, 6 | "packages": { 7 | "": { 8 | "name": "test", 9 | "version": "1.0.0", 10 | "license": "ISC", 11 | "dependencies": { 12 | "detect-file-encoding-and-language": "git+https://github.com/gignupg/Detect-File-Encoding-and-Language.git" 13 | }, 14 | "devDependencies": { 15 | "ts-node": "^10.9.1", 16 | "typescript": "^4.8.2" 17 | } 18 | }, 19 | "node_modules/@cspotcode/source-map-support": { 20 | "version": "0.8.1", 21 | "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", 22 | "integrity": "sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==", 23 | "dev": true, 24 | "dependencies": { 25 | "@jridgewell/trace-mapping": "0.3.9" 26 | }, 27 | "engines": { 28 | "node": ">=12" 29 | } 30 | }, 31 | "node_modules/@jridgewell/resolve-uri": { 32 | "version": "3.1.0", 33 | "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.0.tgz", 34 | "integrity": "sha512-F2msla3tad+Mfht5cJq7LSXcdudKTWCVYUgw6pLFOOHSTtZlj6SWNYAp+AhuqLmWdBO2X5hPrLcu8cVP8fy28w==", 35 | "dev": true, 36 | "engines": { 37 | "node": ">=6.0.0" 38 | } 39 | }, 40 | "node_modules/@jridgewell/sourcemap-codec": { 41 | "version": "1.4.14", 42 | "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.14.tgz", 43 | "integrity": 
"sha512-XPSJHWmi394fuUuzDnGz1wiKqWfo1yXecHQMRf2l6hztTO+nPru658AyDngaBe7isIxEkRsPR3FZh+s7iVa4Uw==", 44 | "dev": true 45 | }, 46 | "node_modules/@jridgewell/trace-mapping": { 47 | "version": "0.3.9", 48 | "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.9.tgz", 49 | "integrity": "sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==", 50 | "dev": true, 51 | "dependencies": { 52 | "@jridgewell/resolve-uri": "^3.0.3", 53 | "@jridgewell/sourcemap-codec": "^1.4.10" 54 | } 55 | }, 56 | "node_modules/@tsconfig/node10": { 57 | "version": "1.0.9", 58 | "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.9.tgz", 59 | "integrity": "sha512-jNsYVVxU8v5g43Erja32laIDHXeoNvFEpX33OK4d6hljo3jDhCBDhx5dhCCTMWUojscpAagGiRkBKxpdl9fxqA==", 60 | "dev": true 61 | }, 62 | "node_modules/@tsconfig/node12": { 63 | "version": "1.0.11", 64 | "resolved": "https://registry.npmjs.org/@tsconfig/node12/-/node12-1.0.11.tgz", 65 | "integrity": "sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==", 66 | "dev": true 67 | }, 68 | "node_modules/@tsconfig/node14": { 69 | "version": "1.0.3", 70 | "resolved": "https://registry.npmjs.org/@tsconfig/node14/-/node14-1.0.3.tgz", 71 | "integrity": "sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==", 72 | "dev": true 73 | }, 74 | "node_modules/@tsconfig/node16": { 75 | "version": "1.0.3", 76 | "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.3.tgz", 77 | "integrity": "sha512-yOlFc+7UtL/89t2ZhjPvvB/DeAr3r+Dq58IgzsFkOAvVC6NMJXmCGjbptdXdR9qsX7pKcTL+s87FtYREi2dEEQ==", 78 | "dev": true 79 | }, 80 | "node_modules/@types/node": { 81 | "version": "18.7.14", 82 | "resolved": "https://registry.npmjs.org/@types/node/-/node-18.7.14.tgz", 83 | "integrity": "sha512-6bbDaETVi8oyIARulOE9qF1/Qdi/23z6emrUh0fNJRUmjznqrixD4MpGDdgOFk5Xb0m2H6Xu42JGdvAxaJR/wA==", 84 | "dev": true, 85 | 
"peer": true 86 | }, 87 | "node_modules/acorn": { 88 | "version": "8.8.0", 89 | "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.8.0.tgz", 90 | "integrity": "sha512-QOxyigPVrpZ2GXT+PFyZTl6TtOFc5egxHIP9IlQ+RbupQuX4RkT/Bee4/kQuC02Xkzg84JcT7oLYtDIQxp+v7w==", 91 | "dev": true, 92 | "bin": { 93 | "acorn": "bin/acorn" 94 | }, 95 | "engines": { 96 | "node": ">=0.4.0" 97 | } 98 | }, 99 | "node_modules/acorn-walk": { 100 | "version": "8.2.0", 101 | "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.2.0.tgz", 102 | "integrity": "sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA==", 103 | "dev": true, 104 | "engines": { 105 | "node": ">=0.4.0" 106 | } 107 | }, 108 | "node_modules/arg": { 109 | "version": "4.1.3", 110 | "resolved": "https://registry.npmjs.org/arg/-/arg-4.1.3.tgz", 111 | "integrity": "sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==", 112 | "dev": true 113 | }, 114 | "node_modules/create-require": { 115 | "version": "1.1.1", 116 | "resolved": "https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz", 117 | "integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==", 118 | "dev": true 119 | }, 120 | "node_modules/detect-file-encoding-and-language": { 121 | "version": "2.3.0", 122 | "resolved": "git+ssh://git@github.com/gignupg/Detect-File-Encoding-and-Language.git#f94553de99d25bfa7bcc2b9d6ebec3bfaea774ee", 123 | "license": "MIT", 124 | "bin": { 125 | "dfeal": "bin/cli.js" 126 | } 127 | }, 128 | "node_modules/diff": { 129 | "version": "4.0.2", 130 | "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.2.tgz", 131 | "integrity": "sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==", 132 | "dev": true, 133 | "engines": { 134 | "node": ">=0.3.1" 135 | } 136 | }, 137 | "node_modules/make-error": { 138 | "version": "1.3.6", 139 | "resolved": 
"https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz", 140 | "integrity": "sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==", 141 | "dev": true 142 | }, 143 | "node_modules/ts-node": { 144 | "version": "10.9.1", 145 | "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.1.tgz", 146 | "integrity": "sha512-NtVysVPkxxrwFGUUxGYhfux8k78pQB3JqYBXlLRZgdGUqTO5wU/UyHop5p70iEbGhB7q5KmiZiU0Y3KlJrScEw==", 147 | "dev": true, 148 | "dependencies": { 149 | "@cspotcode/source-map-support": "^0.8.0", 150 | "@tsconfig/node10": "^1.0.7", 151 | "@tsconfig/node12": "^1.0.7", 152 | "@tsconfig/node14": "^1.0.0", 153 | "@tsconfig/node16": "^1.0.2", 154 | "acorn": "^8.4.1", 155 | "acorn-walk": "^8.1.1", 156 | "arg": "^4.1.0", 157 | "create-require": "^1.1.0", 158 | "diff": "^4.0.1", 159 | "make-error": "^1.1.1", 160 | "v8-compile-cache-lib": "^3.0.1", 161 | "yn": "3.1.1" 162 | }, 163 | "bin": { 164 | "ts-node": "dist/bin.js", 165 | "ts-node-cwd": "dist/bin-cwd.js", 166 | "ts-node-esm": "dist/bin-esm.js", 167 | "ts-node-script": "dist/bin-script.js", 168 | "ts-node-transpile-only": "dist/bin-transpile.js", 169 | "ts-script": "dist/bin-script-deprecated.js" 170 | }, 171 | "peerDependencies": { 172 | "@swc/core": ">=1.2.50", 173 | "@swc/wasm": ">=1.2.50", 174 | "@types/node": "*", 175 | "typescript": ">=2.7" 176 | }, 177 | "peerDependenciesMeta": { 178 | "@swc/core": { 179 | "optional": true 180 | }, 181 | "@swc/wasm": { 182 | "optional": true 183 | } 184 | } 185 | }, 186 | "node_modules/typescript": { 187 | "version": "4.8.2", 188 | "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.8.2.tgz", 189 | "integrity": "sha512-C0I1UsrrDHo2fYI5oaCGbSejwX4ch+9Y5jTQELvovfmFkK3HHSZJB8MSJcWLmCUBzQBchCrZ9rMRV6GuNrvGtw==", 190 | "dev": true, 191 | "bin": { 192 | "tsc": "bin/tsc", 193 | "tsserver": "bin/tsserver" 194 | }, 195 | "engines": { 196 | "node": ">=4.2.0" 197 | } 198 | }, 199 | "node_modules/v8-compile-cache-lib": { 
200 | "version": "3.0.1", 201 | "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", 202 | "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==", 203 | "dev": true 204 | }, 205 | "node_modules/yn": { 206 | "version": "3.1.1", 207 | "resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", 208 | "integrity": "sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==", 209 | "dev": true, 210 | "engines": { 211 | "node": ">=6" 212 | } 213 | } 214 | }, 215 | "dependencies": { 216 | "@cspotcode/source-map-support": { 217 | "version": "0.8.1", 218 | "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", 219 | "integrity": "sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==", 220 | "dev": true, 221 | "requires": { 222 | "@jridgewell/trace-mapping": "0.3.9" 223 | } 224 | }, 225 | "@jridgewell/resolve-uri": { 226 | "version": "3.1.0", 227 | "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.0.tgz", 228 | "integrity": "sha512-F2msla3tad+Mfht5cJq7LSXcdudKTWCVYUgw6pLFOOHSTtZlj6SWNYAp+AhuqLmWdBO2X5hPrLcu8cVP8fy28w==", 229 | "dev": true 230 | }, 231 | "@jridgewell/sourcemap-codec": { 232 | "version": "1.4.14", 233 | "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.14.tgz", 234 | "integrity": "sha512-XPSJHWmi394fuUuzDnGz1wiKqWfo1yXecHQMRf2l6hztTO+nPru658AyDngaBe7isIxEkRsPR3FZh+s7iVa4Uw==", 235 | "dev": true 236 | }, 237 | "@jridgewell/trace-mapping": { 238 | "version": "0.3.9", 239 | "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.9.tgz", 240 | "integrity": "sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==", 241 | "dev": true, 242 | "requires": { 243 | "@jridgewell/resolve-uri": "^3.0.3", 244 | 
"@jridgewell/sourcemap-codec": "^1.4.10" 245 | } 246 | }, 247 | "@tsconfig/node10": { 248 | "version": "1.0.9", 249 | "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.9.tgz", 250 | "integrity": "sha512-jNsYVVxU8v5g43Erja32laIDHXeoNvFEpX33OK4d6hljo3jDhCBDhx5dhCCTMWUojscpAagGiRkBKxpdl9fxqA==", 251 | "dev": true 252 | }, 253 | "@tsconfig/node12": { 254 | "version": "1.0.11", 255 | "resolved": "https://registry.npmjs.org/@tsconfig/node12/-/node12-1.0.11.tgz", 256 | "integrity": "sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==", 257 | "dev": true 258 | }, 259 | "@tsconfig/node14": { 260 | "version": "1.0.3", 261 | "resolved": "https://registry.npmjs.org/@tsconfig/node14/-/node14-1.0.3.tgz", 262 | "integrity": "sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==", 263 | "dev": true 264 | }, 265 | "@tsconfig/node16": { 266 | "version": "1.0.3", 267 | "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.3.tgz", 268 | "integrity": "sha512-yOlFc+7UtL/89t2ZhjPvvB/DeAr3r+Dq58IgzsFkOAvVC6NMJXmCGjbptdXdR9qsX7pKcTL+s87FtYREi2dEEQ==", 269 | "dev": true 270 | }, 271 | "@types/node": { 272 | "version": "18.7.14", 273 | "resolved": "https://registry.npmjs.org/@types/node/-/node-18.7.14.tgz", 274 | "integrity": "sha512-6bbDaETVi8oyIARulOE9qF1/Qdi/23z6emrUh0fNJRUmjznqrixD4MpGDdgOFk5Xb0m2H6Xu42JGdvAxaJR/wA==", 275 | "dev": true, 276 | "peer": true 277 | }, 278 | "acorn": { 279 | "version": "8.8.0", 280 | "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.8.0.tgz", 281 | "integrity": "sha512-QOxyigPVrpZ2GXT+PFyZTl6TtOFc5egxHIP9IlQ+RbupQuX4RkT/Bee4/kQuC02Xkzg84JcT7oLYtDIQxp+v7w==", 282 | "dev": true 283 | }, 284 | "acorn-walk": { 285 | "version": "8.2.0", 286 | "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.2.0.tgz", 287 | "integrity": "sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA==", 288 
| "dev": true 289 | }, 290 | "arg": { 291 | "version": "4.1.3", 292 | "resolved": "https://registry.npmjs.org/arg/-/arg-4.1.3.tgz", 293 | "integrity": "sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==", 294 | "dev": true 295 | }, 296 | "create-require": { 297 | "version": "1.1.1", 298 | "resolved": "https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz", 299 | "integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==", 300 | "dev": true 301 | }, 302 | "detect-file-encoding-and-language": { 303 | "version": "git+ssh://git@github.com/gignupg/Detect-File-Encoding-and-Language.git#f94553de99d25bfa7bcc2b9d6ebec3bfaea774ee", 304 | "from": "detect-file-encoding-and-language@git+https://github.com/gignupg/Detect-File-Encoding-and-Language.git" 305 | }, 306 | "diff": { 307 | "version": "4.0.2", 308 | "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.2.tgz", 309 | "integrity": "sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==", 310 | "dev": true 311 | }, 312 | "make-error": { 313 | "version": "1.3.6", 314 | "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz", 315 | "integrity": "sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==", 316 | "dev": true 317 | }, 318 | "ts-node": { 319 | "version": "10.9.1", 320 | "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.1.tgz", 321 | "integrity": "sha512-NtVysVPkxxrwFGUUxGYhfux8k78pQB3JqYBXlLRZgdGUqTO5wU/UyHop5p70iEbGhB7q5KmiZiU0Y3KlJrScEw==", 322 | "dev": true, 323 | "requires": { 324 | "@cspotcode/source-map-support": "^0.8.0", 325 | "@tsconfig/node10": "^1.0.7", 326 | "@tsconfig/node12": "^1.0.7", 327 | "@tsconfig/node14": "^1.0.0", 328 | "@tsconfig/node16": "^1.0.2", 329 | "acorn": "^8.4.1", 330 | "acorn-walk": "^8.1.1", 331 | "arg": "^4.1.0", 332 | "create-require": "^1.1.0", 333 | "diff": 
"^4.0.1", 334 | "make-error": "^1.1.1", 335 | "v8-compile-cache-lib": "^3.0.1", 336 | "yn": "3.1.1" 337 | } 338 | }, 339 | "typescript": { 340 | "version": "4.8.2", 341 | "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.8.2.tgz", 342 | "integrity": "sha512-C0I1UsrrDHo2fYI5oaCGbSejwX4ch+9Y5jTQELvovfmFkK3HHSZJB8MSJcWLmCUBzQBchCrZ9rMRV6GuNrvGtw==", 343 | "dev": true 344 | }, 345 | "v8-compile-cache-lib": { 346 | "version": "3.0.1", 347 | "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", 348 | "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==", 349 | "dev": true 350 | }, 351 | "yn": { 352 | "version": "3.1.1", 353 | "resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", 354 | "integrity": "sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==", 355 | "dev": true 356 | } 357 | } 358 | } 359 | -------------------------------------------------------------------------------- /tests/node/node-ts-test/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "type": "module", 7 | "scripts": { 8 | "test": "ts-node-esm index.ts", 9 | "install-ts-node": "sudo npm install -g ts-node" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "devDependencies": { 15 | "ts-node": "^10.9.1", 16 | "typescript": "^4.8.2" 17 | }, 18 | "dependencies": { 19 | "detect-file-encoding-and-language": "git+https://github.com/gignupg/Detect-File-Encoding-and-Language.git" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tests/node/node-ts-test/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Visit https://aka.ms/tsconfig to read more about 
this file */ 4 | 5 | /* Projects */ 6 | // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */ 7 | // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ 8 | // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */ 9 | // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */ 10 | // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ 11 | // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ 12 | 13 | /* Language and Environment */ 14 | "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ 15 | // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ 16 | // "jsx": "preserve", /* Specify what JSX code is generated. */ 17 | // "experimentalDecorators": true, /* Enable experimental support for TC39 stage 2 draft decorators. */ 18 | // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */ 19 | // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */ 20 | // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ 21 | // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */ 22 | // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. 
*/ 23 | // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ 24 | // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ 25 | // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ 26 | 27 | /* Modules */ 28 | "module": "ES2020", /* Specify what module code is generated. */ 29 | // "rootDir": "./", /* Specify the root folder within your source files. */ 30 | "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */ 31 | // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ 32 | // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ 33 | // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ 34 | // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */ 35 | // "types": [], /* Specify type package names to be included without being referenced in a source file. */ 36 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 37 | // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */ 38 | // "resolveJsonModule": true, /* Enable importing .json files. */ 39 | // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */ 40 | 41 | /* JavaScript Support */ 42 | // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ 43 | // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */ 44 | // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'.
*/ 45 | 46 | /* Emit */ 47 | // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ 48 | // "declarationMap": true, /* Create sourcemaps for d.ts files. */ 49 | // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ 50 | // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ 51 | // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */ 52 | // "outDir": "./", /* Specify an output folder for all emitted files. */ 53 | // "removeComments": true, /* Disable emitting comments. */ 54 | // "noEmit": true, /* Disable emitting files from a compilation. */ 55 | // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ 56 | // "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */ 57 | // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ 58 | // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ 59 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 60 | // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ 61 | // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ 62 | // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ 63 | // "newLine": "crlf", /* Set the newline character for emitting files. */ 64 | // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */ 65 | // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. 
*/ 66 | // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ 67 | // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ 68 | // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ 69 | // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */ 70 | 71 | /* Interop Constraints */ 72 | // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ 73 | // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */ 74 | "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ 75 | // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ 76 | "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ 77 | 78 | /* Type Checking */ 79 | "strict": true, /* Enable all strict type-checking options. */ 80 | // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ 81 | // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */ 82 | // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ 83 | // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */ 84 | // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */ 85 | // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. 
*/ 86 | // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */ 87 | // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ 88 | // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ 89 | // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ 90 | // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ 91 | // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ 92 | // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ 93 | // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ 94 | // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ 95 | // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */ 96 | // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ 97 | // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ 98 | 99 | /* Completeness */ 100 | // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ 101 | "skipLibCheck": true /* Skip type checking all .d.ts files. 
*/ 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /tests/node/node.test.js: -------------------------------------------------------------------------------- 1 | const languageEncoding = require("../../src/index-node.js"); 2 | const fs = require("fs"); 3 | 4 | // Making sure all important files are there 5 | // Checking for CLI, Node.js and Browser/UNPKG 6 | checkLocation("bin", "cli.js"); 7 | checkLocation("src", "index-node.js"); 8 | checkLocation("umd", "language-encoding.min.js"); 9 | 10 | // Test all files in the 'language folders' dataset 11 | const folderPath = "/home/gignu/Documents/Subtitle Database/Language Folders/"; 12 | const testFiles = getFiles(folderPath); 13 | const minConfidence = 0.95; 14 | 15 | testFiles.forEach((file) => { 16 | languageEncoding(file) 17 | .then((fileInfo) => { 18 | const testFileArray = file.split("/"); 19 | const folderNameArr = testFileArray[testFileArray.length - 2].split('_'); 20 | const expectedLanguage = folderNameArr ? folderNameArr[0] : null; 21 | const expectedEncoding = folderNameArr ? folderNameArr[1] : null; 22 | 23 | if (!expectedLanguage) { 24 | console.error("Expected language not found in folder name:", file.directoryHandle?.name); 25 | setError(file, fileInfo); 26 | 27 | } else if (!expectedEncoding) { 28 | console.error("Expected encoding not found in folder name:", file.directoryHandle?.name); 29 | setError(file, fileInfo); 30 | 31 | } else if (!fileInfo.confidence.encoding || fileInfo.confidence.encoding < minConfidence) { 32 | console.error("Encoding Confidence too low:", fileInfo.confidence.encoding); 33 | setError(file, fileInfo); 34 | 35 | } else if (!fileInfo.confidence.language || fileInfo.confidence.language < minConfidence) { 36 | console.error("Language Confidence too low:", fileInfo.confidence.language); 37 | setError(file, fileInfo); 38 | 39 | } else if (fileInfo.language !== expectedLanguage) { 40 | console.error(`Language mismatch! 
Expected ${expectedLanguage} but got ${fileInfo.language}`); 41 | setError(file, fileInfo); 42 | 43 | } else if (fileInfo.encoding !== expectedEncoding) { 44 | console.error(`Encoding mismatch! Expected ${expectedEncoding} but got ${fileInfo.encoding}`); 45 | setError(file, fileInfo); 46 | } 47 | }) 48 | .catch((error) => { 49 | console.error(error); 50 | }); 51 | }); 52 | 53 | // Test buffer usage 54 | const buffer = Buffer.from("Content of a file"); 55 | languageEncoding(buffer).then((bufferFileInfo) => { 56 | if (bufferFileInfo.encoding !== "UTF-8") { 57 | setError("buffer", bufferFileInfo); 58 | } 59 | }); 60 | 61 | // Recursively find all files in a folder and all it's subdirectories 62 | function getFiles(dir, files_) { 63 | files_ = files_ || []; 64 | var files = fs.readdirSync(dir); 65 | for (var i in files) { 66 | var name = dir + "/" + files[i]; 67 | if (fs.statSync(name).isDirectory()) { 68 | getFiles(name, files_); 69 | } else { 70 | files_.push(name); 71 | } 72 | } 73 | return files_; 74 | } 75 | 76 | function checkLocation(folder, file) { 77 | const dir = fs.readdirSync("/home/gignu/GitHub/Detect-File-Encoding-And-Language/" + folder); 78 | const fileFound = dir.some((fileName) => fileName === file); 79 | if (!fileFound) { 80 | console.error(`Error: Expected ${file} to be located here: /home/gignu/GitHub/Detect-File-Encoding-and-Language/${folder}`); 81 | process.exit(1); 82 | } 83 | } 84 | 85 | function setError(file, fileInfo) { 86 | console.info('fileInfo:', fileInfo); 87 | console.info('file:', file); 88 | process.exit(1); 89 | } --------------------------------------------------------------------------------