├── .editorconfig ├── .gitattributes ├── .github ├── dependabot.yml ├── release-drafter.yml └── workflows │ ├── codeql-analysis.yml │ └── main.yml ├── .gitignore ├── README.md ├── SECURITY.md ├── UTF-unknown.sln ├── UTF-unknown.sln.DotSettings ├── appveyor.yml ├── example ├── ConsoleExample.csproj ├── DetectFile.cs └── app.config ├── license ├── MPL-1.1.txt ├── gpl-2.0.txt └── lgpl-2.1.txt ├── logo.png ├── src ├── CharsetDetector.cs ├── Core │ ├── Analyzers │ │ ├── CharDistributionAnalyser.cs │ │ └── MultiByte │ │ │ ├── Chinese │ │ │ ├── BIG5DistributionAnalyser.cs │ │ │ ├── EUCTWDistributionAnalyser.cs │ │ │ └── GB18030DistributionAnalyser.cs │ │ │ ├── Japanese │ │ │ ├── EUCJPContextAnalyser.cs │ │ │ ├── EUCJPDistributionAnalyser.cs │ │ │ ├── JapaneseContextAnalyser.cs │ │ │ ├── SJISContextAnalyser.cs │ │ │ └── SJISDistributionAnalyser.cs │ │ │ └── Korean │ │ │ └── EUCKRDistributionAnalyser.cs │ ├── BitPackage.cs │ ├── CodepageName.cs │ ├── InputState.cs │ ├── Models │ │ ├── MultiByte │ │ │ ├── Chinese │ │ │ │ ├── BIG5SMModel.cs │ │ │ │ ├── EUCTWSMModel.cs │ │ │ │ ├── GB18030_SMModel.cs │ │ │ │ ├── HZ_GB_2312_SMModel.cs │ │ │ │ └── Iso_2022_CN_SMModel.cs │ │ │ ├── Japanese │ │ │ │ ├── EUCJPSMModel.cs │ │ │ │ ├── Iso_2022_JP_SMModel.cs │ │ │ │ └── SJIS_SMModel.cs │ │ │ ├── Korean │ │ │ │ ├── CP949SMModel.cs │ │ │ │ ├── EUCKRSMModel.cs │ │ │ │ └── Iso_2022_KR_SMModel.cs │ │ │ ├── UCS2BE_SMModel.cs │ │ │ ├── UCS2LE_SMModel.cs │ │ │ └── UTF8_SMModel.cs │ │ ├── SequenceModel.cs │ │ ├── SingleByte │ │ │ ├── Arabic │ │ │ │ ├── ArabicModel.cs │ │ │ │ ├── Iso_8859_6_ArabicModel.cs │ │ │ │ └── Windows_1256_ArabicModel.cs │ │ │ ├── Bulgarian │ │ │ │ ├── BulgarianModel.cs │ │ │ │ ├── Iso_8859_5_BulgarianModel.cs │ │ │ │ └── Windows_1251_BulgarianModel.cs │ │ │ ├── Croatian │ │ │ │ ├── CroatianModel.cs │ │ │ │ ├── Ibm852_CroatianModel.cs │ │ │ │ ├── Iso_8859_13_CroatianModel.cs │ │ │ │ ├── Iso_8859_16_CroatianModel.cs │ │ │ │ ├── Iso_8859_2_CroatianModel.cs │ │ │ │ ├── 
Mac_Centraleurope_CroatianModel.cs │ │ │ │ └── Windows_1250_CroatianModel.cs │ │ │ ├── Czech │ │ │ │ ├── CzechModel.cs │ │ │ │ ├── Ibm852_CzechModel.cs │ │ │ │ ├── Iso_8859_2_CzechModel.cs │ │ │ │ ├── Mac_Centraleurope_CzechModel.cs │ │ │ │ └── Windows_1250_CzechModel.cs │ │ │ ├── Danish │ │ │ │ ├── DanishModel.cs │ │ │ │ ├── Iso_8859_15_DanishModel.cs │ │ │ │ ├── Iso_8859_1_DanishModel.cs │ │ │ │ └── Windows_1252_DanishModel.cs │ │ │ ├── Esperanto │ │ │ │ ├── EsperantoModel.cs │ │ │ │ └── Iso_8859_3_EsperantoModel.cs │ │ │ ├── Estonian │ │ │ │ ├── EstonianModel.cs │ │ │ │ ├── Iso_8859_13_EstonianModel.cs │ │ │ │ ├── Iso_8859_15_EstonianModel.cs │ │ │ │ ├── Iso_8859_4_EstonianModel.cs │ │ │ │ ├── Windows_1252_EstonianModel.cs │ │ │ │ └── Windows_1257_EstonianModel.cs │ │ │ ├── Finnish │ │ │ │ ├── FinnishModel.cs │ │ │ │ ├── Iso_8859_13_FinnishModel.cs │ │ │ │ ├── Iso_8859_15_FinnishModel.cs │ │ │ │ ├── Iso_8859_1_FinnishModel.cs │ │ │ │ ├── Iso_8859_4_FinnishModel.cs │ │ │ │ ├── Iso_8859_9_FinnishModel.cs │ │ │ │ └── Windows_1252_FinnishModel.cs │ │ │ ├── French │ │ │ │ ├── FrenchModel.cs │ │ │ │ ├── Iso_8859_15_FrenchModel.cs │ │ │ │ ├── Iso_8859_1_FrenchModel.cs │ │ │ │ └── Windows_1252_FrenchModel.cs │ │ │ ├── German │ │ │ │ ├── GermanModel.cs │ │ │ │ ├── Iso_8859_1_GermanModel.cs │ │ │ │ └── Windows_1252_GermanModel.cs │ │ │ ├── Greek │ │ │ │ ├── GreekModel.cs │ │ │ │ ├── Iso_8859_7_GreekModel.cs │ │ │ │ └── Windows_1253_GreekModel.cs │ │ │ ├── Hebrew │ │ │ │ ├── HebrewModel.cs │ │ │ │ └── Windows_1255_HebrewModel.cs │ │ │ ├── Hungarian │ │ │ │ ├── HungarianModel.cs │ │ │ │ ├── Iso_8859_2_HungarianModel.cs │ │ │ │ └── Windows_1250_HungarianModel.cs │ │ │ ├── Irish │ │ │ │ ├── IrishModel.cs │ │ │ │ ├── Iso_8859_15_IrishModel.cs │ │ │ │ ├── Iso_8859_1_IrishModel.cs │ │ │ │ ├── Iso_8859_9_IrishModel.cs │ │ │ │ └── Windows_1252_IrishModel.cs │ │ │ ├── Italian │ │ │ │ ├── Iso_8859_15_ItalianModel.cs │ │ │ │ ├── Iso_8859_1_ItalianModel.cs │ │ │ │ ├── 
Iso_8859_3_ItalianModel.cs │ │ │ │ ├── Iso_8859_9_ItalianModel.cs │ │ │ │ ├── ItalianModel.cs │ │ │ │ └── Windows_1252_ItalianModel.cs │ │ │ ├── Latvian │ │ │ │ ├── Iso_8859_10_LatvianModel.cs │ │ │ │ ├── Iso_8859_13_LatvianModel.cs │ │ │ │ ├── Iso_8859_4_LatvianModel.cs │ │ │ │ └── LatvianModel.cs │ │ │ ├── Lithuanian │ │ │ │ ├── Iso_8859_10_LithuanianModel.cs │ │ │ │ ├── Iso_8859_13_LithuanianModel.cs │ │ │ │ ├── Iso_8859_4_LithuanianModel.cs │ │ │ │ └── LithuanianModel.cs │ │ │ ├── Maltese │ │ │ │ ├── Iso_8859_3_MalteseModel.cs │ │ │ │ └── MalteseModel.cs │ │ │ ├── Polish │ │ │ │ ├── Ibm852_PolishModel.cs │ │ │ │ ├── Iso_8859_13_PolishModel.cs │ │ │ │ ├── Iso_8859_16_PolishModel.cs │ │ │ │ ├── Iso_8859_2_PolishModel.cs │ │ │ │ ├── Mac_Centraleurope_PolishModel.cs │ │ │ │ ├── PolishModel.cs │ │ │ │ └── Windows_1250_PolishModel.cs │ │ │ ├── Portuguese │ │ │ │ ├── Iso_8859_15_PortugueseModel.cs │ │ │ │ ├── Iso_8859_1_PortugueseModel.cs │ │ │ │ ├── Iso_8859_9_PortugueseModel.cs │ │ │ │ ├── PortugueseModel.cs │ │ │ │ └── Windows_1252_PortugueseModel.cs │ │ │ ├── Romanian │ │ │ │ ├── Ibm852_RomanianModel.cs │ │ │ │ ├── Iso_8859_16_RomanianModel.cs │ │ │ │ ├── Iso_8859_2_RomanianModel.cs │ │ │ │ ├── RomanianModel.cs │ │ │ │ └── Windows_1250_RomanianModel.cs │ │ │ ├── Russian │ │ │ │ ├── Ibm855_RussianModel.cs │ │ │ │ ├── Ibm866_RussianModel.cs │ │ │ │ ├── Iso_8859_5_RussianModel.cs │ │ │ │ ├── Koi8r_Model.cs │ │ │ │ ├── RussianModel.cs │ │ │ │ ├── Windows_1251_RussianModel.cs │ │ │ │ └── X_Mac_Cyrillic_RussianModel.cs │ │ │ ├── Slovak │ │ │ │ ├── Ibm852_SlovakModel.cs │ │ │ │ ├── Iso_8859_2_SlovakModel.cs │ │ │ │ ├── Mac_Centraleurope_SlovakModel.cs │ │ │ │ ├── SlovakModel.cs │ │ │ │ └── Windows_1250_SlovakModel.cs │ │ │ ├── Slovene │ │ │ │ ├── Ibm852_SloveneModel.cs │ │ │ │ ├── Iso_8859_16_SloveneModel.cs │ │ │ │ ├── Iso_8859_2_SloveneModel.cs │ │ │ │ ├── Mac_Centraleurope_SloveneModel.cs │ │ │ │ ├── SloveneModel.cs │ │ │ │ └── Windows_1250_SloveneModel.cs │ │ │ ├── 
Spanish │ │ │ │ ├── Iso_8859_15_SpanishModel.cs │ │ │ │ ├── Iso_8859_1_SpanishModel.cs │ │ │ │ ├── SpanishModel.cs │ │ │ │ └── Windows_1252_SpanishModel.cs │ │ │ ├── Swedish │ │ │ │ ├── Iso_8859_15_SwedishModel.cs │ │ │ │ ├── Iso_8859_1_SwedishModel.cs │ │ │ │ ├── Iso_8859_4_SwedishModel.cs │ │ │ │ ├── Iso_8859_9_SwedishModel.cs │ │ │ │ ├── SwedishModel.cs │ │ │ │ └── Windows_1252_SwedishModel.cs │ │ │ ├── Thai │ │ │ │ ├── Iso_8859_11_ThaiModel.cs │ │ │ │ ├── ThaiModel.cs │ │ │ │ └── Tis_620_ThaiModel.cs │ │ │ ├── Turkish │ │ │ │ ├── Iso_8859_3_TurkishModel.cs │ │ │ │ ├── Iso_8859_9_TurkishModel.cs │ │ │ │ └── TurkishModel.cs │ │ │ └── Vietnamese │ │ │ │ ├── VietnameseModel.cs │ │ │ │ ├── Viscii_VietnameseModel.cs │ │ │ │ └── Windows_1258_VietnameseModel.cs │ │ └── StateMachineModel.cs │ └── Probers │ │ ├── CharsetProber.cs │ │ ├── CodingStateMachine.cs │ │ ├── EscCharsetProber.cs │ │ ├── HebrewProber.cs │ │ ├── Latin1Prober.cs │ │ ├── MBCSGroupProber.cs │ │ ├── MultiByte │ │ ├── Chinese │ │ │ ├── Big5Prober.cs │ │ │ ├── EUCTWProber.cs │ │ │ └── GB18030Prober.cs │ │ ├── Japanese │ │ │ ├── EUCJPProber.cs │ │ │ └── SJISProber.cs │ │ ├── Korean │ │ │ ├── CP949Prober.cs │ │ │ └── EUCKRProber.cs │ │ └── UTF8Prober.cs │ │ ├── ProbingState.cs │ │ ├── SBCSGroupProber.cs │ │ └── SingleByteCharSetProber.cs ├── DetectionDetail.cs ├── DetectionResult.cs ├── UTF-unknown.csproj └── UtfUnknown.snk └── tests ├── BitPackageTest.cs ├── CharsetDetectorTest.cs ├── CharsetDetectorTestBatch.cs ├── Data ├── README.md ├── big5 │ └── 1.txt ├── cp949 │ ├── cp949_1.txt │ └── cp949_2.txt ├── euc-jp │ └── 1.txt ├── euc-kr │ ├── euc1.txt │ └── euc2.txt ├── gb18030 │ └── 1.txt ├── ibm852 │ └── lang_ce_ibm852.txt ├── ibm855 │ ├── 1.txt │ └── 2.txt ├── ibm866 │ └── 1.txt ├── iso-2022-jp │ └── 1.txt ├── iso-2022-kr │ ├── iso1.txt │ └── iso2.txt ├── iso-8859-1 │ ├── 1.txt │ ├── 3.txt │ └── 4.txt ├── iso-8859-11 │ └── lang_th_iso-8859-11.txt ├── iso-8859-13 │ └── lang_et_iso-8859-13.txt ├── 
iso-8859-15 │ └── lang_da_iso-8859-15.txt ├── iso-8859-2 │ └── lang_ce_iso-8859-2.txt ├── iso-8859-3 │ └── lang_eo_iso-8859-3.txt ├── iso-8859-4 │ └── lang_et_iso-8859-4.txt ├── iso-8859-5 │ └── lang_ru_iso-8859-5.txt ├── iso-8859-6 │ └── lang_ar_iso-8859-6.txt ├── iso-8859-7 │ ├── greek.txt │ └── lang_le_iso-8859-7.txt ├── iso-8859-9 │ └── lang_tr_iso-8859-9.txt ├── koi8-r │ ├── 1.txt │ └── 2.txt ├── shift-jis │ ├── 1.txt │ ├── 2.txt │ ├── 3.txt │ └── 4.txt ├── tis-620 │ └── lang_th_tis-620.txt ├── utf-16be │ └── lang_fr_utf-16.be ├── utf-16le │ └── lang_ko_utf-16.le ├── utf-32le │ └── lang_fr_utf-32.le ├── utf-8 │ ├── 1.txt │ ├── 2.txt │ ├── 3.txt │ ├── 4.txt │ ├── 5.txt │ ├── emoji.html.txt │ ├── greek.txt │ ├── he1.txt │ ├── he2.txt │ ├── he3.txt │ └── russian.txt ├── windows-1250 │ └── lang_ce_windows-1250.txt ├── windows-1251 │ └── 1.txt ├── windows-1252 (latin1) │ └── 2.txt ├── windows-1253 │ └── lang_le_windows-1253.txt ├── windows-1255 │ ├── he1.txt │ ├── he2.txt │ └── he3.txt ├── windows-1256 │ └── lang_ar_windows-1256.txt ├── windows-1257 │ └── lang_et_windows-1257.txt ├── windows-1258 │ └── lang_vi_windows-1258.txt ├── x-mac-ce │ └── lang_cs_mac-centraleurope.txt └── x-mac-cyrillic │ └── 1.txt ├── DataUnsupported ├── euc-tw │ └── euc-tw1.txt ├── iso-8859-10 │ └── lang_lv_iso-8859-10.txt ├── iso-8859-16 │ └── lang_sl_iso-8859-16.txt └── viscii │ └── lang_vi_viscii.txt ├── DetectionDetailTests.cs ├── EncodingJsonConverter.cs └── UTF-unknown.Tests.csproj /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 
8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "nuget" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name-template: 'Version $NEXT_MINOR_VERSION' 2 | tag-template: 'v$NEXT_MINOR_VERSION' 3 | version-template: '$MAJOR.$MINOR' 4 | categories: 5 | - title: '🚀 Features' 6 | labels: 7 | - 'feature' 8 | - title: '👍 Enhancements' 9 | labels: 10 | - 'enhancement' 11 | - 'documentation' 12 | - title: '🐛 Bug Fixes' 13 | labels: 14 | - 'fix' 15 | - 'bugfix' 16 | - 'bug' 17 | - title: '🔧 Maintenance' 18 | labels: 19 | - 'refactor' 20 | - 'test' 21 | - 'tests' 22 | exclude-labels: 23 | - 'skip-changelog' 24 | - 'build' 25 | change-template: '- $TITLE @$AUTHOR (#$NUMBER)' 26 | template: | 27 | $CHANGES 28 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 
6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '26 18 * * 6' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'csharp' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 37 | # Learn more: 38 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 39 | 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@v2 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v1 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@v1 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 
60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v1 72 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Release Drafter 2 | 3 | on: 4 | push: 5 | # branches to consider in the event; optional, defaults to all 6 | branches: 7 | - master 8 | 9 | jobs: 10 | update_release_draft: 11 | runs-on: ubuntu-latest 12 | steps: 13 | # Drafts your next Release notes as Pull Requests are merged into "master" 14 | - uses: release-drafter/release-drafter@v5 15 | env: 16 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.sublime-* 2 | *.vshost.* 3 | log.txt 4 | NLogMerged.api.xml 5 | packages/ 6 | StyleCop.Cache 7 | test-results/ 8 | 9 | # User-specific files 10 | *.rsuser 11 | *.suo 12 | *.user 13 | *.userosscache 14 | *.sln.docstates 15 | 16 | # User-specific files (MonoDevelop/Xamarin Studio) 17 | *.userprefs 18 | 19 | # User-specific files (IntelliJ IDEA) 20 | .idea/ 21 | 22 | # Build results 23 | [Dd]ebug/ 24 | [Dd]ebugPublic/ 25 | [Rr]elease/ 26 | [Rr]eleases/ 27 | x64/ 28 | x86/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Ll]og/ 33 | [Ll]ogs/ 34 | [Bb]uild/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | 
TestResult.xml 49 | nunit-*.xml 50 | 51 | # StyleCop 52 | StyleCopReport.xml 53 | 54 | # Files built by Visual Studio 55 | *_i.c 56 | *_p.c 57 | *_h.h 58 | *.ilk 59 | *.meta 60 | *.obj 61 | *.iobj 62 | *.pch 63 | *.pdb 64 | *.ipdb 65 | *.pgc 66 | *.pgd 67 | *.rsp 68 | *.sbr 69 | *.tlb 70 | *.tli 71 | *.tlh 72 | *.tmp 73 | *.tmp_proj 74 | *_wpftmp.csproj 75 | *.log 76 | *.vspscc 77 | *.vssscc 78 | .builds 79 | *.pidb 80 | *.svclog 81 | *.scc 82 | 83 | # Visual Studio profiler 84 | *.psess 85 | *.vsp 86 | *.vspx 87 | *.sap 88 | 89 | # Visual Studio Trace Files 90 | *.e2e 91 | 92 | # ReSharper is a .NET coding add-in 93 | _ReSharper*/ 94 | *.[Rr]e[Ss]harper 95 | *.DotSettings.user 96 | 97 | # JustCode is a .NET coding add-in 98 | .JustCode 99 | 100 | # DotCover is a Code Coverage Tool 101 | *.dotCover 102 | 103 | # AxoCover is a Code Coverage Tool 104 | .axoCover/* 105 | !.axoCover/settings.json 106 | 107 | # Visual Studio code coverage results 108 | *.coverage 109 | *.coveragexml 110 | 111 | # NuGet Packages 112 | *.nupkg 113 | # NuGet Symbol Packages 114 | *.snupkg 115 | # The packages folder can be ignored because of Package Restore 116 | **/[Pp]ackages/* 117 | # except build/, which is used as an MSBuild target. 118 | !**/[Pp]ackages/build/ 119 | # Uncomment if necessary however generally it will be regenerated when needed 120 | #!**/[Pp]ackages/repositories.config 121 | # NuGet v3's project.json files produces more ignorable files 122 | *.nuget.props 123 | *.nuget.targets 124 | 125 | # Visual Studio cache files 126 | # files ending in .cache can be ignored 127 | *.[Cc]ache 128 | # but keep track of directories ending in .cache 129 | !?*.[Cc]ache/ 130 | 131 | # Others 132 | ClientBin/ 133 | ~$* 134 | *~ 135 | *.dbmdl 136 | *.dbproj.schemaview 137 | *.jfm 138 | *.pfx 139 | *.publishsettings 140 | orleans.codegen.cs 141 | 142 | # Backup & report files from converting an old project file 143 | # to a newer Visual Studio version. 
Backup files are not needed, 144 | # because we have git ;-) 145 | _UpgradeReport_Files/ 146 | Backup*/ 147 | UpgradeLog*.XML 148 | UpgradeLog*.htm 149 | ServiceFabricBackup/ 150 | *.rptproj.bak 151 | 152 | # MSBuild Binary and Structured Log 153 | *.binlog 154 | 155 | # MFractors (Xamarin productivity tool) working folder 156 | .mfractor/ 157 | 158 | # Local History for Visual Studio 159 | .localhistory/ 160 | 161 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 162 | MigrationBackup/ 163 | 164 | # Ionide (cross platform F# VS Code tools) working folder 165 | .ionide/ -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Currently supported with security updates: 6 | 7 | | Version | Supported | 8 | | ------- | ------------------ | 9 | | 2.x.x | :white_check_mark: | 10 | | 1.x.x | :x: | 11 | 12 | ## Reporting a Vulnerability 13 | 14 | Please open a GitHub issue, but do not include any details of the vulnerability; just leave your email address. I will contact you ASAP. Thanks! 
15 | -------------------------------------------------------------------------------- /UTF-unknown.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.2.32616.157 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UTF-unknown", "src\UTF-unknown.csproj", "{64CA7BA7-EFD9-4475-BB66-40B187622A73}" 7 | EndProject 8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ConsoleExample", "example\ConsoleExample.csproj", "{386C6ABF-44EA-4418-B90E-E8D21E4C2475}" 9 | EndProject 10 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UTF-unknown.Tests", "Tests\UTF-unknown.Tests.csproj", "{1922DCC9-A45F-4627-9087-CD492BBF7F38}" 11 | EndProject 12 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{052846B2-CA56-482F-B477-6E33523C091E}" 13 | ProjectSection(SolutionItems) = preProject 14 | .editorconfig = .editorconfig 15 | EndProjectSection 16 | EndProject 17 | Global 18 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 19 | Debug|Any CPU = Debug|Any CPU 20 | Release|Any CPU = Release|Any CPU 21 | EndGlobalSection 22 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 23 | {64CA7BA7-EFD9-4475-BB66-40B187622A73}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 24 | {64CA7BA7-EFD9-4475-BB66-40B187622A73}.Debug|Any CPU.Build.0 = Debug|Any CPU 25 | {64CA7BA7-EFD9-4475-BB66-40B187622A73}.Release|Any CPU.ActiveCfg = Release|Any CPU 26 | {64CA7BA7-EFD9-4475-BB66-40B187622A73}.Release|Any CPU.Build.0 = Release|Any CPU 27 | {386C6ABF-44EA-4418-B90E-E8D21E4C2475}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 28 | {386C6ABF-44EA-4418-B90E-E8D21E4C2475}.Debug|Any CPU.Build.0 = Debug|Any CPU 29 | {386C6ABF-44EA-4418-B90E-E8D21E4C2475}.Release|Any CPU.ActiveCfg = Release|Any CPU 30 | 
{386C6ABF-44EA-4418-B90E-E8D21E4C2475}.Release|Any CPU.Build.0 = Release|Any CPU 31 | {1922DCC9-A45F-4627-9087-CD492BBF7F38}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 32 | {1922DCC9-A45F-4627-9087-CD492BBF7F38}.Debug|Any CPU.Build.0 = Debug|Any CPU 33 | {1922DCC9-A45F-4627-9087-CD492BBF7F38}.Release|Any CPU.ActiveCfg = Release|Any CPU 34 | {1922DCC9-A45F-4627-9087-CD492BBF7F38}.Release|Any CPU.Build.0 = Release|Any CPU 35 | EndGlobalSection 36 | GlobalSection(SolutionProperties) = preSolution 37 | HideSolutionNode = FALSE 38 | EndGlobalSection 39 | GlobalSection(ExtensibilityGlobals) = postSolution 40 | SolutionGuid = {0C7AF656-EF20-4880-8EB9-9BF101340A03} 41 | EndGlobalSection 42 | EndGlobal 43 | -------------------------------------------------------------------------------- /UTF-unknown.sln.DotSettings: -------------------------------------------------------------------------------- 1 |  2 | True 3 | True 4 | True 5 | True -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | info_version: 2.5.1 3 | 4 | version: 2.0.{build} 5 | 6 | clone_folder: c:\utfUnknown 7 | image: Visual Studio 2019 8 | configuration: Release 9 | platform: Any CPU 10 | nuget: 11 | project_feed: true 12 | init: 13 | - git config --global core.autocrlf true 14 | build_script: 15 | - ps: dotnet build -c Release 16 | test_script: 17 | - ps: cd .\tests\ 18 | - ps: dotnet test 19 | - ps: cd .. 
20 | after_build: 21 | - ps: msbuild -t:Pack .\src\ -p:Configuration=Release -p:IncludeSymbols=true -p:SymbolPackageFormat=snupkg -p:ContinuousIntegrationBuild=true -p:EmbedUntrackedSources=true -verbosity:minimal 22 | - ps: Get-ChildItem '**\test-diag.log' | % { Push-AppveyorArtifact $_.FullName -FileName $_.Name } 23 | artifacts: 24 | - path: '**\*.nupkg' 25 | - path: '**\*.snupkg' 26 | - path: '**\test-diag.log' 27 | dotnet_csproj: 28 | patch: true 29 | file: '**\*.csproj' 30 | version: $(info_version) 31 | package_version: $(info_version) 32 | assembly_version: 2.0.0.0 33 | file_version: '{version}' 34 | informational_version: $(info_version) 35 | deploy: 36 | - provider: NuGet 37 | api_key: 38 | secure: iB1GljKDgO1ynQjVNyXRQY1Ib3nOuCvV+UkPGK44U/5tIhKvZm7ZSgUG5CeQj2/z 39 | on: 40 | branch: master 41 | -------------------------------------------------------------------------------- /example/ConsoleExample.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Exe 5 | netcoreapp3.0 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /example/DetectFile.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using UtfUnknown; 4 | 5 | namespace ConsoleExample 6 | { 7 | public class DetectFile 8 | { 9 | /// 10 | /// Command line example: detects the encoding of the given file. 11 | /// 12 | /// a filename 13 | public static void Main(string[] args) 14 | { 15 | if (args.Length == 0) 16 | { 17 | Console.WriteLine("Usage: ConsoleExample "); 18 | return; 19 | } 20 | 21 | var filename = args[0]; 22 | if (!File.Exists(filename)) 23 | { 24 | Console.WriteLine($"File not found: {filename}"); 25 | return; 26 | } 27 | 28 | var result = CharsetDetector.DetectFromFile(filename); 29 | var message = result.Detected != null 30 | ? 
namespace UtfUnknown.Core.Analyzers.Japanese
{
    /// <summary>
    /// Context analyser for EUC-JP input. Reports the byte length of the
    /// character at a given position and, when that character is hiragana,
    /// its zero-based order within the hiragana row.
    /// </summary>
    public class EUCJPContextAnalyser : JapaneseContextAnalyser
    {
        /// <summary>Lead byte of the EUC-JP row that contains hiragana.</summary>
        private const byte HIRAGANA_FIRST_BYTE = 0xA4;

        /// <summary>
        /// Determines the byte length of the character starting at
        /// <paramref name="offset"/> and returns its hiragana order.
        /// </summary>
        /// <param name="buf">Buffer holding EUC-JP encoded bytes.</param>
        /// <param name="offset">Index of the character's first byte.</param>
        /// <param name="charLen">Receives the character length in bytes (1, 2 or 3).</param>
        /// <returns>
        /// The order (0-based) of the character if it is hiragana; otherwise -1.
        /// </returns>
        protected override int GetOrder(byte[] buf, int offset, out int charLen)
        {
            byte high = buf[offset];

            // Find out the current character's byte length.
            if (high == 0x8E || (high >= 0xA1 && high <= 0xFE))
            {
                charLen = 2;
            }
            else if (high == 0x8F)
            {
                // 0x8F introduces a three-byte sequence in EUC-JP. The previous
                // test against 0xBF could never succeed because 0xBF is already
                // covered by the 0xA1..0xFE two-byte range above.
                charLen = 3;
            }
            else
            {
                charLen = 1;
            }

            return HiraganaOrder(buf, offset);
        }

        /// <summary>
        /// Returns the hiragana order of the character starting at
        /// <paramref name="offset"/>, without computing its length.
        /// </summary>
        /// <param name="buf">Buffer holding EUC-JP encoded bytes.</param>
        /// <param name="offset">Index of the character's first byte.</param>
        /// <returns>
        /// The order (0-based) of the character if it is hiragana; otherwise -1.
        /// </returns>
        protected override int GetOrder(byte[] buf, int offset)
        {
            // We are only interested in hiragana.
            return HiraganaOrder(buf, offset);
        }

        /// <summary>
        /// Shared lookup used by both GetOrder overloads: maps a hiragana
        /// trail byte in 0xA1..0xF3 to 0..0x52, anything else to -1.
        /// </summary>
        private static int HiraganaOrder(byte[] buf, int offset)
        {
            if (buf[offset] != HIRAGANA_FIRST_BYTE)
            {
                return -1;
            }

            // NOTE(review): assumes the caller guarantees offset + 1 is inside
            // buf when the lead byte is 0xA4 - confirm against the base class.
            byte low = buf[offset + 1];
            if (low >= 0xA1 && low <= 0xF3)
            {
                return low - 0xA1;
            }

            return -1;
        }
    }
}
namespace UtfUnknown.Core.Analyzers.Japanese
{
    /// <summary>
    /// Character distribution analyser for EUC-JP encoded text.
    /// </summary>
    public class EUCJPDistributionAnalyser : SJISDistributionAnalyser
    {
        /// <summary>
        /// Computes the distribution order of the two-byte character at
        /// <paramref name="offset"/>.
        /// First byte range: 0xa0 -- 0xfe, second byte range: 0xa1 -- 0xfe.
        /// No validation is performed here; the state machine has done that.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded bytes.</param>
        /// <param name="offset">Index of the character's first byte.</param>
        /// <returns>
        /// 94 * (first - 0xA1) + (second - 0xA1) when the first byte is at
        /// least 0xA0; otherwise -1.
        /// </returns>
        public override int GetOrder(byte[] buf, int offset)
        {
            byte lead = buf[offset];
            if (lead < 0xA0)
            {
                return -1;
            }

            int row = lead - 0xA1;
            int cell = buf[offset + 1] - 0xA1;
            return 94 * row + cell;
        }
    }

    /// <summary>
    /// Context analyser for Shift_JIS input. Reports the byte length of the
    /// character at a given position and, when it is hiragana, its order.
    /// </summary>
    public class SJISContextAnalyser : JapaneseContextAnalyser
    {
        private const byte HIRAGANA_FIRST_BYTE = 0x82;

        /// <summary>
        /// Determines the byte length of the character starting at
        /// <paramref name="offset"/> and returns its hiragana order.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded bytes.</param>
        /// <param name="offset">Index of the character's first byte.</param>
        /// <param name="charLen">Receives 2 for a double-byte lead, otherwise 1.</param>
        /// <returns>The hiragana order (0-based), or -1 if not hiragana.</returns>
        protected override int GetOrder(byte[] buf, int offset, out int charLen)
        {
            byte lead = buf[offset];

            // Lead bytes 0x81..0x9F and 0xE0..0xFC start a two-byte character.
            bool startsDoubleByte = (lead >= 0x81 && lead <= 0x9F)
                                    || (lead >= 0xE0 && lead <= 0xFC);
            charLen = startsDoubleByte ? 2 : 1;

            return HiraganaOrder(buf, offset);
        }

        /// <summary>
        /// Returns the hiragana order of the character starting at
        /// <paramref name="offset"/>, without computing its length.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded bytes.</param>
        /// <param name="offset">Index of the character's first byte.</param>
        /// <returns>The hiragana order (0-based), or -1 if not hiragana.</returns>
        protected override int GetOrder(byte[] buf, int offset)
        {
            return HiraganaOrder(buf, offset);
        }

        /// <summary>
        /// Maps a trail byte in 0x9F..0xF1 following the hiragana lead byte
        /// to 0..0x52; returns -1 for anything else.
        /// </summary>
        private static int HiraganaOrder(byte[] buf, int offset)
        {
            if (buf[offset] != HIRAGANA_FIRST_BYTE)
            {
                return -1;
            }

            byte trail = buf[offset + 1];
            return (trail >= 0x9F && trail <= 0xF1) ? trail - 0x9F : -1;
        }
    }
}
-------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * Kohei TAKETA (Java port) 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 
36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 |
namespace UtfUnknown.Core
{
    /// <summary>
    /// Read-only view over an <c>int[]</c> in which several small unsigned
    /// values ("units" of 4, 8 or 16 bits) are packed into each 32-bit element.
    /// Unit 0 occupies the least significant bits of element 0.
    /// </summary>
    /// <remarks>
    /// For a unit index <c>i</c>, <see cref="Unpack"/> evaluates
    /// <c>(data[i &gt;&gt; indexShift] &gt;&gt; ((i &amp; shiftMask) &lt;&lt; bitShift)) &amp; unitMask</c>.
    /// The <c>*_4BITS</c>, <c>*_8BITS</c> and <c>*_16BITS</c> constants are the
    /// matching parameter sets for 8, 4 and 2 units per element respectively.
    /// </remarks>
    public class BitPackage
    {
        // The parameter sets are read-only: they are shared by every table in
        // the process, so a write to any of them would silently corrupt all lookups.

        // unit index -> array index: shift right by log2(units per int)
        public static readonly int INDEX_SHIFT_4BITS = 3;
        public static readonly int INDEX_SHIFT_8BITS = 2;
        public static readonly int INDEX_SHIFT_16BITS = 1;

        // unit index -> position inside the int: (units per int) - 1
        public static readonly int SHIFT_MASK_4BITS = 7;
        public static readonly int SHIFT_MASK_8BITS = 3;
        public static readonly int SHIFT_MASK_16BITS = 1;

        // position inside the int -> bit offset: shift left by log2(bits per unit)
        public static readonly int BIT_SHIFT_4BITS = 2;
        public static readonly int BIT_SHIFT_8BITS = 3;
        public static readonly int BIT_SHIFT_16BITS = 4;

        // mask that keeps exactly one unit
        public static readonly int UNIT_MASK_4BITS = 0x0000000F;
        public static readonly int UNIT_MASK_8BITS = 0x000000FF;
        public static readonly int UNIT_MASK_16BITS = 0x0000FFFF;

        private readonly int indexShift;
        private readonly int shiftMask;
        private readonly int bitShift;
        private readonly int unitMask;
        private readonly int[] data;

        /// <summary>
        /// Creates a view over <paramref name="data"/> using the given unit layout.
        /// </summary>
        /// <param name="indexShift">Right shift that turns a unit index into an array index.</param>
        /// <param name="shiftMask">Mask that extracts the unit's position inside one element.</param>
        /// <param name="bitShift">Left shift that turns that position into a bit offset.</param>
        /// <param name="unitMask">Mask that keeps the bits of a single unit.</param>
        /// <param name="data">Packed values; the array is referenced, not copied.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="data"/> is null.</exception>
        public BitPackage(int indexShift, int shiftMask,
                          int bitShift, int unitMask, int[] data)
        {
            // Fail at construction rather than with a NullReferenceException on
            // the first Unpack call.
            if (data == null)
            {
                throw new System.ArgumentNullException("data");
            }

            this.indexShift = indexShift;
            this.shiftMask = shiftMask;
            this.bitShift = bitShift;
            this.unitMask = unitMask;
            this.data = data;
        }

        /// <summary>
        /// Packs two 16-bit values into one int; <paramref name="a"/> goes into the low half.
        /// The arguments are not masked, so callers must pass values that fit in 16 bits.
        /// </summary>
        public static int Pack16bits(int a, int b)
        {
            return (b << 16) | a;
        }

        /// <summary>
        /// Packs four 8-bit values into one int; <paramref name="a"/> is the lowest byte
        /// and <paramref name="d"/> the highest. The arguments are not masked.
        /// </summary>
        public static int Pack8bits(int a, int b, int c, int d)
        {
            int low = (b << 8) | a;
            int high = (d << 8) | c;
            return Pack16bits(low, high);
        }

        /// <summary>
        /// Packs eight 4-bit values into one int; <paramref name="a"/> is the lowest nibble
        /// and <paramref name="h"/> the highest. The arguments are not masked.
        /// </summary>
        public static int Pack4bits(int a, int b, int c, int d,
                                    int e, int f, int g, int h)
        {
            return Pack8bits((b << 4) | a, (d << 4) | c,
                             (f << 4) | e, (h << 4) | g);
        }

        /// <summary>
        /// Returns the unit stored at unit index <paramref name="i"/>.
        /// </summary>
        /// <param name="i">Zero-based unit index (not an array index).</param>
        /// <returns>The unit value, in the range 0..unitMask.</returns>
        public int Unpack(int i)
        {
            int element = data[i >> indexShift];
            int bitOffset = (i & shiftMask) << bitShift;

            // '>>' on int is an arithmetic shift; the final mask removes any
            // sign-extension bits when the top unit of a negative element is read.
            return (element >> bitOffset) & unitMask;
        }
    }
}

--------------------------------------------------------------------------------
/src/Core/InputState.cs:
--------------------------------------------------------------------------------
1 | namespace UtfUnknown.Core 2 | { 3 | enum InputState 4 | { 5 | PureASCII=0, 6 | 7 | /// 8 | /// Found escape
character or HZ "~{" 9 | /// 10 | EscASCII = 1, 11 | 12 | /// 13 | /// non-ascii byte (high-byte) 14 | /// 15 | Highbyte = 2 16 | }; 17 | } -------------------------------------------------------------------------------- /src/Core/Models/MultiByte/Chinese/BIG5SMModel.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Models.MultiByte.Chinese 2 | { 3 | public class BIG5SMModel : StateMachineModel 4 | { 5 | private readonly static int[] BIG5_cls = { 6 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07 7 | BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f 8 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17 9 | BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f 10 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27 11 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f 12 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37 13 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f 14 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47 15 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f 16 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57 17 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f 18 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67 19 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f 20 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77 21 | BitPackage.Pack4bits(2,2,2,2,2,2,2,1), // 78 - 7f 22 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 80 - 87 23 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 88 - 8f 24 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 90 - 97 25 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 98 - 9f 26 | BitPackage.Pack4bits(4,3,3,3,3,3,3,3), // a0 - a7 27 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // a8 - af 28 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // b0 - b7 29 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // b8 - bf 30 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c0 - c7 31 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c8 - cf 32 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d0 - d7 33 
| BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d8 - df 34 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7 35 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e8 - ef 36 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // f0 - f7 37 | BitPackage.Pack4bits(3,3,3,3,3,3,3,0) // f8 - ff 38 | }; 39 | 40 | private readonly static int[] BIG5_st = { 41 | BitPackage.Pack4bits(ERROR,START,START, 3,ERROR,ERROR,ERROR,ERROR),//00-07 42 | BitPackage.Pack4bits(ERROR,ERROR,ITSME,ITSME,ITSME,ITSME,ITSME,ERROR),//08-0f 43 | BitPackage.Pack4bits(ERROR,START,START,START,START,START,START,START) //10-17 44 | }; 45 | 46 | private readonly static int[] BIG5CharLenTable = {0, 1, 1, 2, 0}; 47 | 48 | public BIG5SMModel() : base( 49 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 50 | BitPackage.SHIFT_MASK_4BITS, 51 | BitPackage.BIT_SHIFT_4BITS, 52 | BitPackage.UNIT_MASK_4BITS, BIG5_cls), 53 | 5, 54 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 55 | BitPackage.SHIFT_MASK_4BITS, 56 | BitPackage.BIT_SHIFT_4BITS, 57 | BitPackage.UNIT_MASK_4BITS, BIG5_st), 58 | BIG5CharLenTable, CodepageName.BIG5) 59 | { 60 | } 61 | } 62 | } -------------------------------------------------------------------------------- /src/Core/Models/MultiByte/Chinese/EUCTWSMModel.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Models.MultiByte.Chinese 2 | { 3 | public class EUCTWSMModel : StateMachineModel 4 | { 5 | private readonly static int[] EUCTW_cls = { 6 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 00 - 07 7 | BitPackage.Pack4bits(2,2,2,2,2,2,0,0), // 08 - 0f 8 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 10 - 17 9 | BitPackage.Pack4bits(2,2,2,0,2,2,2,2), // 18 - 1f 10 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 20 - 27 11 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 28 - 2f 12 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 30 - 37 13 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 38 - 3f 14 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47 15 | 
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f 16 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57 17 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f 18 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67 19 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f 20 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77 21 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 78 - 7f 22 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87 23 | BitPackage.Pack4bits(0,0,0,0,0,0,6,0), // 88 - 8f 24 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97 25 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f 26 | BitPackage.Pack4bits(0,3,4,4,4,4,4,4), // a0 - a7 27 | BitPackage.Pack4bits(5,5,1,1,1,1,1,1), // a8 - af 28 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b0 - b7 29 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b8 - bf 30 | BitPackage.Pack4bits(1,1,3,1,3,3,3,3), // c0 - c7 31 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c8 - cf 32 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d0 - d7 33 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d8 - df 34 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7 35 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e8 - ef 36 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // f0 - f7 37 | BitPackage.Pack4bits(3,3,3,3,3,3,3,0) // f8 - ff 38 | }; 39 | 40 | private readonly static int[] EUCTW_st = { 41 | BitPackage.Pack4bits(ERROR,ERROR,START, 3, 3, 3, 4,ERROR),//00-07 42 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//08-0f 43 | BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ERROR,START,ERROR),//10-17 44 | BitPackage.Pack4bits(START,START,START,ERROR,ERROR,ERROR,ERROR,ERROR),//18-1f 45 | BitPackage.Pack4bits( 5,ERROR,ERROR,ERROR,START,ERROR,START,START),//20-27 46 | BitPackage.Pack4bits(START,ERROR,START,START,START,START,START,START) //28-2f 47 | }; 48 | 49 | private readonly static int[] EUCTWCharLenTable = { 0, 0, 1, 2, 2, 2, 3 }; 50 | 51 | public EUCTWSMModel() : base( 52 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 53 | 
BitPackage.SHIFT_MASK_4BITS, 54 | BitPackage.BIT_SHIFT_4BITS, 55 | BitPackage.UNIT_MASK_4BITS, EUCTW_cls), 56 | 7, 57 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 58 | BitPackage.SHIFT_MASK_4BITS, 59 | BitPackage.BIT_SHIFT_4BITS, 60 | BitPackage.UNIT_MASK_4BITS, EUCTW_st), 61 | EUCTWCharLenTable, CodepageName.EUC_TW) 62 | { 63 | } 64 | } 65 | } -------------------------------------------------------------------------------- /src/Core/Models/MultiByte/Chinese/GB18030_SMModel.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Models.MultiByte.Chinese 2 | { 3 | public class GB18030_SMModel : StateMachineModel 4 | { 5 | private readonly static int[] GB18030_cls = { 6 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07 7 | BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f 8 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17 9 | BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f 10 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27 11 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f 12 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 30 - 37 13 | BitPackage.Pack4bits(3,3,1,1,1,1,1,1), // 38 - 3f 14 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47 15 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f 16 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57 17 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f 18 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67 19 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f 20 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77 21 | BitPackage.Pack4bits(2,2,2,2,2,2,2,4), // 78 - 7f 22 | BitPackage.Pack4bits(5,6,6,6,6,6,6,6), // 80 - 87 23 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 88 - 8f 24 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 90 - 97 25 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 98 - 9f 26 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // a0 - a7 27 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // a8 - af 28 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // b0 - b7 29 
| BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // b8 - bf 30 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // c0 - c7 31 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // c8 - cf 32 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d0 - d7 33 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d8 - df 34 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // e0 - e7 35 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // e8 - ef 36 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // f0 - f7 37 | BitPackage.Pack4bits(6,6,6,6,6,6,6,0) // f8 - ff 38 | }; 39 | 40 | private readonly static int[] GB18030_st = { 41 | BitPackage.Pack4bits(ERROR,START,START,START,START,START, 3,ERROR),//00-07 42 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//08-0f 43 | BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ERROR,ERROR,START),//10-17 44 | BitPackage.Pack4bits( 4,ERROR,START,START,ERROR,ERROR,ERROR,ERROR),//18-1f 45 | BitPackage.Pack4bits(ERROR,ERROR, 5,ERROR,ERROR,ERROR,ITSME,ERROR),//20-27 46 | BitPackage.Pack4bits(ERROR,ERROR,START,START,START,START,START,START) //28-2f 47 | }; 48 | 49 | // To be accurate, the length of class 6 can be either 2 or 4. 50 | // But it is not necessary to discriminate between the two since 51 | // it is used for frequency analysis only, and we are validating 52 | // each code range there as well. So it is safe to set it to be 53 | // 2 here. 
54 | private readonly static int[] GB18030CharLenTable = {0, 1, 1, 1, 1, 1, 2}; 55 | 56 | public GB18030_SMModel() : base( 57 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 58 | BitPackage.SHIFT_MASK_4BITS, 59 | BitPackage.BIT_SHIFT_4BITS, 60 | BitPackage.UNIT_MASK_4BITS, GB18030_cls), 61 | 7, 62 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 63 | BitPackage.SHIFT_MASK_4BITS, 64 | BitPackage.BIT_SHIFT_4BITS, 65 | BitPackage.UNIT_MASK_4BITS, GB18030_st), 66 | GB18030CharLenTable, CodepageName.GB18030) 67 | { 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /src/Core/Models/MultiByte/Chinese/Iso_2022_CN_SMModel.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Models.MultiByte.Chinese 2 | { 3 | public class Iso_2022_CN_SMModel : StateMachineModel 4 | { 5 | private readonly static int[] ISO2022CN_cls = { 6 | BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07 7 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f 8 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17 9 | BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f 10 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27 11 | BitPackage.Pack4bits(0,3,0,0,0,0,0,0), // 28 - 2f 12 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37 13 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f 14 | BitPackage.Pack4bits(0,0,0,4,0,0,0,0), // 40 - 47 15 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f 16 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57 17 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f 18 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67 19 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f 20 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77 21 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f 22 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87 23 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f 24 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97 25 | 
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f 26 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7 27 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af 28 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7 29 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf 30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7 31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf 32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7 33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df 34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7 35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef 36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7 37 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff 38 | }; 39 | 40 | private readonly static int[] ISO2022CN_st = { 41 | BitPackage.Pack4bits(START, 3,ERROR,START,START,START,START,START), //00-07 42 | BitPackage.Pack4bits(START,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR), //08-0f 43 | BitPackage.Pack4bits(ERROR,ERROR,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME), //10-17 44 | BitPackage.Pack4bits(ITSME,ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR), //18-1f 45 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR), //20-27 46 | BitPackage.Pack4bits( 5, 6,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR), //28-2f 47 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR), //30-37 48 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START) //38-3f 49 | }; 50 | 51 | private readonly static int[] ISO2022CNCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 52 | 53 | public Iso_2022_CN_SMModel() : base( 54 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 55 | BitPackage.SHIFT_MASK_4BITS, 56 | BitPackage.BIT_SHIFT_4BITS, 57 | BitPackage.UNIT_MASK_4BITS, ISO2022CN_cls), 58 | 9, 59 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 60 | BitPackage.SHIFT_MASK_4BITS, 61 | BitPackage.BIT_SHIFT_4BITS, 62 | BitPackage.UNIT_MASK_4BITS, ISO2022CN_st), 63 | ISO2022CNCharLenTable, CodepageName.ISO_2022_CN) 64 | { 65 | } 66 | } 67 | 
} -------------------------------------------------------------------------------- /src/Core/Models/MultiByte/Japanese/EUCJPSMModel.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Models.MultiByte.Japanese 2 | { 3 | public class EUCJPSMModel : StateMachineModel 4 | { 5 | private readonly static int[] EUCJP_cls = { 6 | //BitPacket.Pack4bits(5,4,4,4,4,4,4,4), // 00 - 07 7 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 00 - 07 8 | BitPackage.Pack4bits(4,4,4,4,4,4,5,5), // 08 - 0f 9 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 10 - 17 10 | BitPackage.Pack4bits(4,4,4,5,4,4,4,4), // 18 - 1f 11 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 20 - 27 12 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 28 - 2f 13 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 30 - 37 14 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 38 - 3f 15 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 40 - 47 16 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 48 - 4f 17 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 50 - 57 18 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 58 - 5f 19 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 60 - 67 20 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 68 - 6f 21 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 70 - 77 22 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 78 - 7f 23 | BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 80 - 87 24 | BitPackage.Pack4bits(5,5,5,5,5,5,1,3), // 88 - 8f 25 | BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 90 - 97 26 | BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 98 - 9f 27 | BitPackage.Pack4bits(5,2,2,2,2,2,2,2), // a0 - a7 28 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af 29 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7 30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf 31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7 32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf 33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7 34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df 35 | 
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7 36 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef 37 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7 38 | BitPackage.Pack4bits(0,0,0,0,0,0,0,5) // f8 - ff 39 | }; 40 | 41 | private readonly static int[] EUCJP_st = { 42 | BitPackage.Pack4bits( 3, 4, 3, 5,START,ERROR,ERROR,ERROR),//00-07 43 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f 44 | BitPackage.Pack4bits(ITSME,ITSME,START,ERROR,START,ERROR,ERROR,ERROR),//10-17 45 | BitPackage.Pack4bits(ERROR,ERROR,START,ERROR,ERROR,ERROR, 3,ERROR),//18-1f 46 | BitPackage.Pack4bits( 3,ERROR,ERROR,ERROR,START,START,START,START) //20-27 47 | }; 48 | 49 | private readonly static int[] EUCJPCharLenTable = { 2, 2, 2, 3, 1, 0 }; 50 | 51 | public EUCJPSMModel() : base( 52 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 53 | BitPackage.SHIFT_MASK_4BITS, 54 | BitPackage.BIT_SHIFT_4BITS, 55 | BitPackage.UNIT_MASK_4BITS, EUCJP_cls), 56 | 6, 57 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 58 | BitPackage.SHIFT_MASK_4BITS, 59 | BitPackage.BIT_SHIFT_4BITS, 60 | BitPackage.UNIT_MASK_4BITS, EUCJP_st), 61 | EUCJPCharLenTable, CodepageName.EUC_JP) 62 | { 63 | 64 | } 65 | } 66 | } -------------------------------------------------------------------------------- /src/Core/Models/MultiByte/Japanese/Iso_2022_JP_SMModel.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Models.MultiByte.Japanese 2 | { 3 | public class Iso_2022_JP_SMModel : StateMachineModel 4 | { 5 | private readonly static int[] ISO2022JP_cls = { 6 | BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07 7 | BitPackage.Pack4bits(0,0,0,0,0,0,2,2), // 08 - 0f 8 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17 9 | BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f 10 | BitPackage.Pack4bits(0,0,0,0,7,0,0,0), // 20 - 27 11 | BitPackage.Pack4bits(3,0,0,0,0,0,0,0), // 28 - 2f 12 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37 13 | 
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f 14 | BitPackage.Pack4bits(6,0,4,0,8,0,0,0), // 40 - 47 15 | BitPackage.Pack4bits(0,9,5,0,0,0,0,0), // 48 - 4f 16 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57 17 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f 18 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67 19 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f 20 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77 21 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f 22 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87 23 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f 24 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97 25 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f 26 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7 27 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af 28 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7 29 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf 30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7 31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf 32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7 33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df 34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7 35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef 36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7 37 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff 38 | }; 39 | 40 | private readonly static int[] ISO2022JP_st = { 41 | BitPackage.Pack4bits(START, 3, ERROR,START,START,START,START,START), //00-07 42 | BitPackage.Pack4bits(START, START, ERROR,ERROR,ERROR,ERROR,ERROR,ERROR), //08-0f 43 | BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ITSME,ITSME,ITSME), //10-17 44 | BitPackage.Pack4bits(ITSME, ITSME, ITSME,ITSME,ITSME,ITSME,ERROR,ERROR), //18-1f 45 | BitPackage.Pack4bits(ERROR, 5, ERROR,ERROR,ERROR, 4,ERROR,ERROR), //20-27 46 | BitPackage.Pack4bits(ERROR, ERROR, ERROR, 6,ITSME,ERROR,ITSME,ERROR), //28-2f 47 | BitPackage.Pack4bits(ERROR, ERROR, 
ERROR,ERROR,ERROR,ERROR,ITSME,ITSME), //30-37 48 | BitPackage.Pack4bits(ERROR, ERROR, ERROR,ITSME,ERROR,ERROR,ERROR,ERROR), //38-3f 49 | BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ERROR,START,START) //40-47 50 | }; 51 | 52 | private readonly static int[] ISO2022JPCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 53 | 54 | public Iso_2022_JP_SMModel() : base( 55 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 56 | BitPackage.SHIFT_MASK_4BITS, 57 | BitPackage.BIT_SHIFT_4BITS, 58 | BitPackage.UNIT_MASK_4BITS, ISO2022JP_cls), 59 | 10, 60 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 61 | BitPackage.SHIFT_MASK_4BITS, 62 | BitPackage.BIT_SHIFT_4BITS, 63 | BitPackage.UNIT_MASK_4BITS, ISO2022JP_st), 64 | ISO2022JPCharLenTable, CodepageName.ISO_2022_JP) 65 | { 66 | 67 | } 68 | 69 | } 70 | } -------------------------------------------------------------------------------- /src/Core/Models/MultiByte/Japanese/SJIS_SMModel.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Models.MultiByte.Japanese 2 | { 3 | public class SJIS_SMModel : StateMachineModel 4 | { 5 | private readonly static int[] SJIS_cls = { 6 | //BitPacket.Pack4bits(0,1,1,1,1,1,1,1), // 00 - 07 7 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07 8 | BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f 9 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17 10 | BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f 11 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27 12 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f 13 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37 14 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f 15 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47 16 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f 17 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57 18 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f 19 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67 20 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 
6f 21 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77 22 | BitPackage.Pack4bits(2,2,2,2,2,2,2,1), // 78 - 7f 23 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 80 - 87 24 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 88 - 8f 25 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 90 - 97 26 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 98 - 9f 27 | //0xa0 is illegal in sjis encoding, but some pages does 28 | //contain such byte. We need to be more error forgiven. 29 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7 30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af 31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7 32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf 33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7 34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf 35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7 36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df 37 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7 38 | BitPackage.Pack4bits(3,3,3,3,3,4,4,4), // e8 - ef 39 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // f0 - f7 40 | BitPackage.Pack4bits(4,4,4,4,4,0,0,0) // f8 - ff 41 | }; 42 | 43 | private readonly static int[] SJIS_st = { 44 | BitPackage.Pack4bits(ERROR,START,START, 3,ERROR,ERROR,ERROR,ERROR),//00-07 45 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f 46 | BitPackage.Pack4bits(ITSME,ITSME,ERROR,ERROR,START,START,START,START) //10-17 47 | }; 48 | 49 | private readonly static int[] SJISCharLenTable = { 0, 1, 1, 2, 0, 0 }; 50 | 51 | public SJIS_SMModel() : base( 52 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 53 | BitPackage.SHIFT_MASK_4BITS, 54 | BitPackage.BIT_SHIFT_4BITS, 55 | BitPackage.UNIT_MASK_4BITS, SJIS_cls), 56 | 6, 57 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 58 | BitPackage.SHIFT_MASK_4BITS, 59 | BitPackage.BIT_SHIFT_4BITS, 60 | BitPackage.UNIT_MASK_4BITS, SJIS_st), 61 | SJISCharLenTable, CodepageName.SHIFT_JIS) 62 | { 63 | 64 | } 65 | } 66 | } 
-------------------------------------------------------------------------------- /src/Core/Models/MultiByte/Korean/EUCKRSMModel.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Models.MultiByte.Korean 2 | { 3 | public class EUCKRSMModel : StateMachineModel 4 | { 5 | private readonly static int[] EUCKR_cls = { 6 | //BitPacket.Pack4bits(0,1,1,1,1,1,1,1), // 00 - 07 7 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07 8 | BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f 9 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17 10 | BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f 11 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27 12 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f 13 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37 14 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f 15 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 40 - 47 16 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 48 - 4f 17 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 50 - 57 18 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 58 - 5f 19 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 60 - 67 20 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 68 - 6f 21 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 70 - 77 22 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 78 - 7f 23 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87 24 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f 25 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97 26 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f 27 | BitPackage.Pack4bits(0,2,2,2,2,2,2,2), // a0 - a7 28 | BitPackage.Pack4bits(2,2,2,2,2,3,3,3), // a8 - af 29 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7 30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf 31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7 32 | BitPackage.Pack4bits(2,3,2,2,2,2,2,2), // c8 - cf 33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7 34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df 35 | 
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7 36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef 37 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7 38 | BitPackage.Pack4bits(2,2,2,2,2,2,2,0) // f8 - ff 39 | }; 40 | 41 | private readonly static int[] EUCKR_st = { 42 | BitPackage.Pack4bits(ERROR,START, 3,ERROR,ERROR,ERROR,ERROR,ERROR),//00-07 43 | BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ERROR,ERROR,START,START) //08-0f 44 | }; 45 | 46 | private readonly static int[] EUCKRCharLenTable = { 0, 1, 2, 0 }; 47 | 48 | public EUCKRSMModel() : base( 49 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 50 | BitPackage.SHIFT_MASK_4BITS, 51 | BitPackage.BIT_SHIFT_4BITS, 52 | BitPackage.UNIT_MASK_4BITS, EUCKR_cls), 53 | 4, 54 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 55 | BitPackage.SHIFT_MASK_4BITS, 56 | BitPackage.BIT_SHIFT_4BITS, 57 | BitPackage.UNIT_MASK_4BITS, EUCKR_st), 58 | EUCKRCharLenTable, CodepageName.EUC_KR) 59 | { 60 | 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /src/Core/Models/MultiByte/Korean/Iso_2022_KR_SMModel.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Models.MultiByte.Korean 2 | { 3 | public class Iso_2022_KR_SMModel : StateMachineModel 4 | { 5 | private readonly static int[] ISO2022KR_cls = { 6 | BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07 7 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f 8 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17 9 | BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f 10 | BitPackage.Pack4bits(0,0,0,0,3,0,0,0), // 20 - 27 11 | BitPackage.Pack4bits(0,4,0,0,0,0,0,0), // 28 - 2f 12 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37 13 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f 14 | BitPackage.Pack4bits(0,0,0,5,0,0,0,0), // 40 - 47 15 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f 16 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57 17 | 
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f 18 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67 19 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f 20 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77 21 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f 22 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87 23 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f 24 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97 25 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f 26 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7 27 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af 28 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7 29 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf 30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7 31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf 32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7 33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df 34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7 35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef 36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7 37 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff 38 | }; 39 | 40 | private readonly static int[] ISO2022KR_st = { 41 | BitPackage.Pack4bits(START, 3,ERROR,START,START,START,ERROR,ERROR), //00-07 42 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME), //08-0f 43 | BitPackage.Pack4bits(ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR,ERROR), //10-17 44 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR, 5,ERROR,ERROR,ERROR), //18-1f 45 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,START,START,START,START) //20-27 46 | }; 47 | 48 | private readonly static int[] ISO2022KRCharLenTable = {0, 0, 0, 0, 0, 0}; 49 | 50 | public Iso_2022_KR_SMModel() : base( 51 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 52 | BitPackage.SHIFT_MASK_4BITS, 53 | BitPackage.BIT_SHIFT_4BITS, 54 | BitPackage.UNIT_MASK_4BITS, ISO2022KR_cls), 55 | 6, 56 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 57 | 
BitPackage.SHIFT_MASK_4BITS, 58 | BitPackage.BIT_SHIFT_4BITS, 59 | BitPackage.UNIT_MASK_4BITS, ISO2022KR_st), 60 | ISO2022KRCharLenTable, CodepageName.ISO_2022_KR) 61 | { 62 | 63 | } 64 | } 65 | } -------------------------------------------------------------------------------- /src/Core/Models/MultiByte/UCS2BE_SMModel.cs: --------------------------------------------------------------------------------
namespace UtfUnknown.Core.Models.MultiByte
{
    /// <summary>
    /// Table data for the UTF-16 big-endian state machine: a byte-class table, a packed
    /// state table and a per-class length table, handed to the
    /// <see cref="StateMachineModel"/> base constructor together with
    /// <see cref="CodepageName.UTF16_BE"/>.
    /// </summary>
    public class UCS2BE_SMModel : StateMachineModel
    {
        // Byte-class table: each Pack4bits call packs the classes of 8 consecutive byte
        // values (the byte range is given in the trailing comment of every row).
        // Non-zero classes according to those row comments: 0x0A -> 1, 0x0D -> 2,
        // 0x1B and 0x29-0x2D -> 3, 0xFE -> 4, 0xFF -> 5; every other byte is class 0.
        // NOTE(review): this reading assumes Pack4bits keeps its arguments in order - confirm.
        private readonly static int[] UCS2BE_cls = {
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 00 - 07
            BitPackage.Pack4bits(0,0,1,0,0,2,0,0),  // 08 - 0f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 10 - 17
            BitPackage.Pack4bits(0,0,0,3,0,0,0,0),  // 18 - 1f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 20 - 27
            BitPackage.Pack4bits(0,3,3,3,3,3,0,0),  // 28 - 2f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 30 - 37
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 38 - 3f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 40 - 47
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 48 - 4f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 50 - 57
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 58 - 5f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 60 - 67
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 68 - 6f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 70 - 77
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 78 - 7f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 80 - 87
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 88 - 8f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 90 - 97
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 98 - 9f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // a0 - a7
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // a8 - af
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // b0 - b7
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // b8 - bf
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // c0 - c7
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // c8 - cf
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // d0 - d7
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // d8 - df
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // e0 - e7
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // e8 - ef
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // f0 - f7
            BitPackage.Pack4bits(0,0,0,0,0,0,4,5)   // f8 - ff
        };

        // Packed state table, 8 entries per row (entry index range in the trailing comment).
        // ERROR, ITSME and START are named states that are not declared in this class.
        // NOTE(review): how an entry index is derived from (state, byte class) is decided by
        // StateMachineModel, which is not visible here - confirm before editing any entry.
        private readonly static int[] UCS2BE_st = {
            BitPackage.Pack4bits(    5,    7,    7,ERROR,    4,    3,ERROR,ERROR),//00-07
            BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
            BitPackage.Pack4bits(ITSME,ITSME,    6,    6,    6,    6,ERROR,ERROR),//10-17
            BitPackage.Pack4bits(    6,    6,    6,    6,    6,ITSME,    6,    6),//18-1f
            BitPackage.Pack4bits(    6,    6,    6,    6,    5,    7,    7,ERROR),//20-27
            BitPackage.Pack4bits(    5,    8,    6,    6,ERROR,    6,    6,    6),//28-2f
            BitPackage.Pack4bits(    6,    6,    6,    6,ERROR,ERROR,START,START) //30-37
        };

        // Six entries, one per byte class 0-5.
        // NOTE(review): presumably the length in bytes of a character that starts with a byte
        // of that class; the 0 at index 3 is kept exactly as found - confirm its intent.
        private readonly static int[] UCS2BECharLenTable = { 2, 2, 2, 0, 2, 2 };

        /// <summary>
        /// Wraps the class table and the state table in 4-bit <see cref="BitPackage"/>
        /// instances and passes them, the literal 6, the length table and
        /// <see cref="CodepageName.UTF16_BE"/> to the base constructor.
        /// </summary>
        public UCS2BE_SMModel() : base(
            new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS, UCS2BE_cls),
            // presumably the number of byte classes (classes 0-5 occur above) - TODO confirm
            6,
            new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS, UCS2BE_st),
            UCS2BECharLenTable, CodepageName.UTF16_BE)
        {

        }
    }
}
-------------------------------------------------------------------------------- /src/Core/Models/MultiByte/UCS2LE_SMModel.cs: --------------------------------------------------------------------------------
namespace UtfUnknown.Core.Models.MultiByte
{
    /// <summary>
    /// Table data for the UTF-16 little-endian state machine: a byte-class table, a packed
    /// state table and a per-class length table, handed to the
    /// <see cref="StateMachineModel"/> base constructor together with
    /// <see cref="CodepageName.UTF16_LE"/>.
    /// </summary>
    public class UCS2LE_SMModel : StateMachineModel
    {
        // Byte-class table: each Pack4bits call packs the classes of 8 consecutive byte
        // values (the byte range is given in the trailing comment of every row).
        // Non-zero classes according to those row comments: 0x0A -> 1, 0x0D -> 2,
        // 0x1B and 0x29-0x2D -> 3, 0xFE -> 4, 0xFF -> 5; every other byte is class 0.
        // NOTE(review): this reading assumes Pack4bits keeps its arguments in order - confirm.
        private readonly static int[] UCS2LE_cls = {
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 00 - 07
            BitPackage.Pack4bits(0,0,1,0,0,2,0,0),  // 08 - 0f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 10 - 17
            BitPackage.Pack4bits(0,0,0,3,0,0,0,0),  // 18 - 1f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 20 - 27
            BitPackage.Pack4bits(0,3,3,3,3,3,0,0),  // 28 - 2f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 30 - 37
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 38 - 3f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 40 - 47
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 48 - 4f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 50 - 57
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 58 - 5f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 60 - 67
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 68 - 6f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 70 - 77
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 78 - 7f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 80 - 87
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 88 - 8f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 90 - 97
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 98 - 9f
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // a0 - a7
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // a8 - af
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // b0 - b7
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // b8 - bf
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // c0 - c7
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // c8 - cf
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // d0 - d7
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // d8 - df
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // e0 - e7
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // e8 - ef
            BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // f0 - f7
            BitPackage.Pack4bits(0,0,0,0,0,0,4,5)   // f8 - ff
        };

        // Packed state table, 8 entries per row (entry index range in the trailing comment).
        // ERROR, ITSME and START are named states that are not declared in this class.
        // NOTE(review): how an entry index is derived from (state, byte class) is decided by
        // StateMachineModel, which is not visible here - confirm before editing any entry.
        private readonly static int[] UCS2LE_st = {
            BitPackage.Pack4bits(    6,    6,    7,    6,    4,    3,ERROR,ERROR),//00-07
            BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
            BitPackage.Pack4bits(ITSME,ITSME,    5,    5,    5,ERROR,ITSME,ERROR),//10-17
            BitPackage.Pack4bits(    5,    5,    5,ERROR,    5,ERROR,    6,    6),//18-1f
            BitPackage.Pack4bits(    7,    6,    8,    8,    5,    5,    5,ERROR),//20-27
            BitPackage.Pack4bits(    5,    5,    5,ERROR,ERROR,ERROR,    5,    5),//28-2f
            BitPackage.Pack4bits(    5,    5,    5,ERROR,    5,ERROR,START,START) //30-37
        };

        // Six entries, one per byte class 0-5.
        // NOTE(review): presumably the length in bytes of a character that starts with a byte
        // of that class - confirm against StateMachineModel.
        private readonly static int[] UCS2LECharLenTable = { 2, 2, 2, 2, 2, 2 };

        /// <summary>
        /// Wraps the class table and the state table in 4-bit <see cref="BitPackage"/>
        /// instances and passes them, the literal 6, the length table and
        /// <see cref="CodepageName.UTF16_LE"/> to the base constructor.
        /// </summary>
        public UCS2LE_SMModel() : base(
            new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS, UCS2LE_cls),
            // presumably the number of byte classes (classes 0-5 occur above) - TODO confirm
            6,
            new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS, UCS2LE_st),
            UCS2LECharLenTable, CodepageName.UTF16_LE)
        {

        }
    }
}
-------------------------------------------------------------------------------- /src/Core/Models/SequenceModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved.
20 | * 21 | * Contributor(s): 22 | * Shy Shalom 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */

using System;

namespace UtfUnknown.Core.Models
{
    /// <summary>
    /// Holds the data of one language model: a byte-to-order map, a sequence precedence
    /// matrix and the parameters supplied with them (frequent-character count, typical
    /// positive ratio, keep-English-letter flag and charset name).
    /// </summary>
    public abstract class SequenceModel
    {
        // Special order values stored in a char-to-order map instead of a real order.

        /// <summary>Illegal codepoints.</summary>
        public const byte ILL = 255;

        /// <summary>Control character.</summary>
        public const byte CTR = 254;

        /// <summary>Symbols and punctuation that do not belong to words.</summary>
        public const byte SYM = 253;

        /// <summary>Return / line feeds.</summary>
        public const byte RET = 252;

        /// <summary>Numbers 0-9.</summary>
        public const byte NUM = 251;

        /// <summary>[256] table used to find a char's order.</summary>
        protected byte[] charToOrderMap;

        /// <summary>
        /// freqCharCount x freqCharCount table used to find a 2-char sequence's frequency.
        /// </summary>
        protected byte[] precedenceMatrix;

        /// <summary>The count of frequent characters.</summary>
        protected int freqCharCount;

        /// <summary>freqSeqs / totalSeqs.</summary>
        protected float typicalPositiveRatio;

        /// <summary>
        /// TODO not used?
        /// </summary>
        protected bool keepEnglishLetter;

        /// <summary>Charset name supplied to the constructor.</summary>
        protected string charsetName;

        /// <summary>
        /// Stores every argument in the matching field; nothing is copied or validated.
        /// </summary>
        /// <param name="charToOrderMap">Table indexed by byte value that yields the byte's order.</param>
        /// <param name="precedenceMatrix">Flattened table of 2-char sequence frequencies.</param>
        /// <param name="freqCharCount">The count of frequent characters.</param>
        /// <param name="typicalPositiveRatio">freqSeqs / totalSeqs.</param>
        /// <param name="keepEnglishLetter">Stored as-is (marked "not used?" by the original author).</param>
        /// <param name="charsetName">Name reported through <see cref="CharsetName"/>.</param>
        public SequenceModel(
            byte[] charToOrderMap,
            byte[] precedenceMatrix,
            int freqCharCount,
            float typicalPositiveRatio,
            bool keepEnglishLetter,
            string charsetName)
        {
            this.charsetName = charsetName;
            this.keepEnglishLetter = keepEnglishLetter;
            this.typicalPositiveRatio = typicalPositiveRatio;
            this.freqCharCount = freqCharCount;
            this.precedenceMatrix = precedenceMatrix;
            this.charToOrderMap = charToOrderMap;
        }

        /// <summary>The count of frequent characters.</summary>
        public int FreqCharCount
        {
            get
            {
                return freqCharCount;
            }
        }

        /// <summary>freqSeqs / totalSeqs.</summary>
        public float TypicalPositiveRatio
        {
            get
            {
                return typicalPositiveRatio;
            }
        }

        /// <summary>
        /// TODO not used?
        /// </summary>
        public bool KeepEnglishLetter
        {
            get
            {
                return keepEnglishLetter;
            }
        }

        /// <summary>Charset name supplied to the constructor.</summary>
        public string CharsetName
        {
            get
            {
                return charsetName;
            }
        }

        /// <summary>Returns the char-to-order map entry for byte <paramref name="b"/>.</summary>
        public byte GetOrder(byte b)
        {
            byte order = charToOrderMap[b];
            return order;
        }

        /// <summary>Returns the precedence matrix entry at index <paramref name="pos"/>.</summary>
        public byte GetPrecedence(int pos)
        {
            byte precedence = precedenceMatrix[pos];
            return precedence;
        }
    }
}
-------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Bulgarian/Iso_8859_5_BulgarianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code.
15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangBulgarianModel.cpp 40 | * and adjusted to language specific support. 
41 | */

namespace UtfUnknown.Core.Models.SingleByte.Bulgarian
{
    /// <summary>
    /// Bulgarian language model for ISO-8859-5: supplies the byte-to-order table below and
    /// <see cref="CodepageName.ISO_8859_5"/> to the <see cref="BulgarianModel"/> base constructor.
    /// </summary>
    public class Iso_8859_5_BulgarianModel : BulgarianModel
    {
        // CTR: control characters that usually do not exist in any text
        // RET: carriage return / line feed
        // SYM: symbol (punctuation) that does not belong to a word
        // NUM: 0 - 9
        //
        // Character mapping table:
        // this table is modified based on win1251BulgarianCharToOrderMap, so
        // only numbers < 64 are sure to be valid.

        // One entry per byte value; the row comment gives the high nibble and the ruler under
        // the table the low nibble. readonly: the reference is never reassigned (the array
        // elements themselves remain writable, as with any C# array).
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, /* 4X */
            110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, /* 6X */
            116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, /* 7X */
            194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209, /* 8X */
            210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225, /* 9X */
             81,226,227,228,229,230,105,231,232,233,234,235,236, 45,237,238, /* AX */
             31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, /* BX */
             39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56, /* CX */
              1, 18,  9, 20, 11,  3, 23, 15,  2, 26, 12, 10, 14,  6,  4, 13, /* DX */
              7,  8,  5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16, /* EX */
             62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,NUM,SYM, /* FX */
        };
        /*X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

        /// <summary>
        /// Passes the ISO-8859-5 mapping table and codepage name to the Bulgarian base model.
        /// </summary>
        public Iso_8859_5_BulgarianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_5)
        {
        }
    }
}
-------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Bulgarian/Windows_1251_BulgarianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. 
If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangBulgarianModel.cpp 40 | * and adjusted to language specific support. 41 | */

namespace UtfUnknown.Core.Models.SingleByte.Bulgarian
{
    /// <summary>
    /// Bulgarian language model for Windows-1251: supplies the byte-to-order table below and
    /// <see cref="CodepageName.WINDOWS_1251"/> to the <see cref="BulgarianModel"/> base constructor.
    /// </summary>
    public class Windows_1251_BulgarianModel : BulgarianModel
    {
        // CTR: control characters that usually do not exist in any text
        // RET: carriage return / line feed
        // SYM: symbol (punctuation) that does not belong to a word
        // NUM: 0 - 9
        //
        // Character mapping table:
        // the upstream note carried with this table says it is "modified based on
        // win1251BulgarianCharToOrderMap, so only numbers < 64 are sure to be valid".
        // NOTE(review): that wording looks copied from a sibling charset table - confirm
        // whether it is meant to apply to this table as well.

        // One entry per byte value; the row comment gives the high nibble and the ruler under
        // the table the low nibble. readonly: the reference is never reassigned (the array
        // elements themselves remain writable, as with any C# array).
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, /* 4X */
            110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, /* 6X */
            116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, /* 7X */
            206,207,208,209,210,211,212,213,120,214,215,216,217,218,219,220, /* 8X */
            221, 78, 64, 83,121, 98,117,105,ILL,223,224,225,226,227,228,229, /* 9X */
             88,230,231,232,233,122, 89,106,234,235,236,237,238, 45,239,240, /* AX */
             73, 80,118,114,241,242,243,244,245, 62, 58,246,247,248,249,250, /* BX */
             31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, /* CX */
             39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,NUM, 60, 56, /* DX */
              1, 18,  9, 20, 11,  3, 23, 15,  2, 26, 12, 10, 14,  6,  4, 13, /* EX */
              7,  8,  5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,SYM, 42, 16, /* FX */
        };
        /*X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

        /// <summary>
        /// Passes the Windows-1251 mapping table and codepage name to the Bulgarian base model.
        /// </summary>
        public Windows_1251_BulgarianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1251)
        {
        }
    }
}
-------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Croatian/Ibm852_CroatianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved.
20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCroatianModel.cpp 40 | * and adjusted to language specific support. 41 | */

namespace UtfUnknown.Core.Models.SingleByte.Croatian
{
    /// <summary>
    /// Croatian language model for IBM852: supplies the byte-to-order table below and
    /// <see cref="CodepageName.IBM852"/> to the <see cref="CroatianModel"/> base constructor.
    /// </summary>
    public class Ibm852_CroatianModel : CroatianModel
    {
        // Generated by BuildLangModel.py
        // On: 2016-09-25 23:50:27.590137

        // Character Mapping Table:
        // ILL: illegal character.
        // CTR: control character specific to the charset.
        // RET: carriage/return.
        // SYM: symbol (punctuation) that does not belong to word.
        // NUM: 0 - 9.

        // Other characters are ordered by probabilities
        // (0 is the most common character in the language).

        // Orders are generic to a language. So the codepoint with order X in
        // CHARSET1 maps to the same character as the codepoint with the same
        // order X in CHARSET2 for the same language.
        // As such, it is possible to get missing order. For instance the
        // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
        // even though they are both used for French. Same for the euro sign.

        // One entry per byte value; the row comment gives the high nibble and the ruler under
        // the table the low nibble. readonly: the reference is never reassigned (the array
        // elements themselves remain writable, as with any C# array).
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM,  0, 19, 20, 15,  2, 22, 17, 21,  1,  7,  9, 10, 12,  4,  3, /* 4X */
             14, 30,  6,  8,  5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM,  0, 19, 20, 15,  2, 22, 17, 21,  1,  7,  9, 10, 12,  4,  3, /* 6X */
             14, 30,  6,  8,  5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */
             39, 33, 31, 43, 36,249, 25, 39, 40, 47,249,249,249,249, 36, 25, /* 8X */
             31,249,249,249, 32,249,249,249,249, 32, 33,249,249, 40,SYM, 18, /* 9X */
             41,249, 44, 48,249,249, 24, 24,249,249,SYM,249, 18,249,SYM,SYM, /* AX */
            SYM,SYM,SYM,SYM,SYM, 41, 43,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* BX */
            SYM,SYM,SYM,SYM,SYM,SYM,249,249,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
             26, 26,249, 47,249,249,249,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* DX */
             44,249,249,249,249,249, 23, 23,249, 48,249,249,249,249,249,SYM, /* EX */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,249,249,249,SYM,SYM, /* FX */
        };
        /*X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

        /// <summary>
        /// Passes the IBM852 mapping table and codepage name to the Croatian base model.
        /// </summary>
        public Ibm852_CroatianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
        {
        }
    }
}
92 | -------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Czech/Ibm852_CzechModel.cs:
-------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 
34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCzechModel.cpp 40 | * and adjusted to language specific support. 41 | */

namespace UtfUnknown.Core.Models.SingleByte.Czech
{
    /// <summary>
    /// Czech language model for IBM852: supplies the byte-to-order table below and
    /// <see cref="CodepageName.IBM852"/> to the <see cref="CzechModel"/> base constructor.
    /// </summary>
    public class Ibm852_CzechModel : CzechModel
    {
        // Generated by BuildLangModel.py
        // On: 2016-09-21 03:28:11.733089

        // Character Mapping Table:
        // ILL: illegal character.
        // CTR: control character specific to the charset.
        // RET: carriage/return.
        // SYM: symbol (punctuation) that does not belong to word.
        // NUM: 0 - 9.

        // Other characters are ordered by probabilities
        // (0 is the most common character in the language).

        // Orders are generic to a language. So the codepoint with order X in
        // CHARSET1 maps to the same character as the codepoint with the same
        // order X in CHARSET2 for the same language.
        // As such, it is possible to get missing order. For instance the
        // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
        // even though they are both used for French. Same for the euro sign.

        // One entry per byte value; the row comment gives the high nibble and the ruler under
        // the table the low nibble. readonly: the reference is never reassigned (the array
        // elements themselves remain writable, as with any C# array).
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM,  3, 22, 14, 15,  1, 30, 32, 17,  4, 21, 12, 10, 16,  2,  0, /* 4X */
              8, 40,  9,  6,  5, 13,  7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM,  3, 22, 14, 15,  1, 30, 32, 17,  4, 21, 12, 10, 16,  2,  0, /* 6X */
              8, 40,  9,  6,  5, 13,  7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */
            139, 43, 24,140, 42, 31,141,142,143,144,145,146,147,148, 42,149, /* 8X */
             24,150,151,152, 41, 45, 45, 46, 46, 41, 43, 38, 38,153,SYM, 25, /* 9X */
             18, 11, 37, 33,154,155, 26, 26,156,157,SYM,158, 25,159,SYM,SYM, /* AX */
            SYM,SYM,SYM,SYM,SYM, 18,160, 23,161,SYM,SYM,SYM,SYM,162,163,SYM, /* BX */
            SYM,SYM,SYM,SYM,SYM,SYM,164,165,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
            166,167, 39,168, 39, 35, 11,169, 23,SYM,SYM,SYM,SYM,170, 31,SYM, /* DX */
             37,171,172,173,174, 35, 29, 29,175, 33,176,177, 28, 28,178,SYM, /* EX */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,179, 27, 27,SYM,SYM, /* FX */
        };
        /*X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

        /// <summary>
        /// Passes the IBM852 mapping table and codepage name to the Czech base model.
        /// </summary>
        public Ibm852_CzechModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
        {
        }
    }
}
92 | -------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Czech/Iso_8859_2_CzechModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License.
You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCzechModel.cpp 40 | * and adjusted to language specific support. 
41 | */

namespace UtfUnknown.Core.Models.SingleByte.Czech
{
    /// <summary>
    /// Czech language model for ISO-8859-2: supplies the byte-to-order table below and
    /// <see cref="CodepageName.ISO_8859_2"/> to the <see cref="CzechModel"/> base constructor.
    /// </summary>
    public class Iso_8859_2_CzechModel : CzechModel
    {
        // Generated by BuildLangModel.py
        // On: 2016-09-21 03:28:11.733089

        // Character Mapping Table:
        // ILL: illegal character.
        // CTR: control character specific to the charset.
        // RET: carriage/return.
        // SYM: symbol (punctuation) that does not belong to word.
        // NUM: 0 - 9.

        // Other characters are ordered by probabilities
        // (0 is the most common character in the language).

        // Orders are generic to a language. So the codepoint with order X in
        // CHARSET1 maps to the same character as the codepoint with the same
        // order X in CHARSET2 for the same language.
        // As such, it is possible to get missing order. For instance the
        // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
        // even though they are both used for French. Same for the euro sign.

        // One entry per byte value; the row comment gives the high nibble and the ruler under
        // the table the low nibble. readonly: the reference is never reassigned (the array
        // elements themselves remain writable, as with any C# array).
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM,  3, 22, 14, 15,  1, 30, 32, 17,  4, 21, 12, 10, 16,  2,  0, /* 4X */
              8, 40,  9,  6,  5, 13,  7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM,  3, 22, 14, 15,  1, 30, 32, 17,  4, 21, 12, 10, 16,  2,  0, /* 6X */
              8, 40,  9,  6,  5, 13,  7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
            SYM,180,SYM,181,SYM, 45, 46,SYM,SYM, 29,182, 38,183,SYM, 26,184, /* AX */
            SYM,185,SYM,186,SYM, 45, 46,SYM,SYM, 29,187, 38,188,SYM, 26,189, /* BX */
            190, 18,191,192, 42,193,194,195, 25, 24,196,197, 23, 11,198, 39, /* CX */
            199,200, 35, 37,201,202, 41,SYM, 27, 31, 33,203, 43, 28,204,205, /* DX */
            206, 18,207,208, 42,209,210,211, 25, 24,212,213, 23, 11,214, 39, /* EX */
            215,216, 35, 37,217,218, 41,SYM, 27, 31, 33,219, 43, 28,220,SYM, /* FX */
        };
        /*X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

        /// <summary>
        /// Passes the ISO-8859-2 mapping table and codepage name to the Czech base model.
        /// </summary>
        public Iso_8859_2_CzechModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_2)
        {
        }
    }
}
92 | -------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Hebrew/Windows_1255_HebrewModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved.
20 | * 21 | * Contributor(s): 22 | * Shy Shalom 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | /* 40 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 41 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangHebrewModel.cpp 42 | * and adjusted to language specific support. 
43 | */ 44 | 45 | namespace UtfUnknown.Core.Models.SingleByte.Hebrew 46 | { 47 | public class Windows_1255_HebrewModel : HebrewModel 48 | { 49 | // 255: Control characters that usually do not exist in any text 50 | // 254: Carriage return / line feed 51 | // 253: symbol (punctuation) that does not belong to a word 52 | // 252: 0 - 9 53 | // NOTE(review): the table below uses the named markers CTR/RET/SYM/NUM/ILL, which are not defined in this class - confirm the numeric codes above still match their values. 54 | // Windows-1255 language model 55 | // Character Mapping Table: 56 | private static readonly byte[] CHAR_TO_ORDER_MAP = { 57 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 58 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 59 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 60 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 61 | SYM, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85, /* 4X */ 62 | 78,121, 86, 71, 67,102,107, 84,114,103,115,SYM,SYM,SYM,SYM,SYM, /* 5X */ 63 | SYM, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49, /* 6X */ 64 | 66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,SYM,SYM,SYM,SYM,SYM, /* 7X */ 65 | 124,ILL,203,204,205, 40, 58,206,207,208,ILL,210,ILL,ILL,ILL,ILL, /* 8X */ 66 | ILL, 83, 52, 47, 46, 72, 32, 94,216,113,ILL,109,ILL,ILL,ILL,ILL, /* 9X */ 67 | 34,116,222,118,100,223,224,117,119,104,125,225,226, 87, 99,227, /* AX */ 68 | 106,122,123,228, 55,229,230,101,231,232,120,233, 48, 39, 57,234, /* BX */ 69 | 30, 59, 41, 88, 33, 37, 36, 31, 29, 35,235, 62, 28,236,126,237, /* CX */ 70 | 238, 38, 45,239,240,241,242,243,127,ILL,ILL,ILL,ILL,ILL,ILL,ILL, /* DX */ 71 | 9, 8, 20, 16, 3, 2, 24, 14, 22, 1, 25, 15, 4, 11, 6, 23, /* EX */ 72 | 12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,ILL,ILL,128, 96,ILL, /* FX */ 73 | }; 74 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 75 | // Hands the table and CodepageName.WINDOWS_1255 to the HebrewModel base constructor. 76 | public Windows_1255_HebrewModel() : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1255) 77 | { 78 | } 79 | } 80 | } --------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Irish/Iso_8859_15_IrishModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 
34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangIrishModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Irish 44 | { 45 | public class Iso_8859_15_IrishModel : IrishModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-27 00:33:40.158624 49 | // Byte-to-order table for Irish text in ISO-8859-15: one entry per byte value, 16 per row (row = high nibble, column = low nibble). 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage return / line feed (0x0D and 0x0A in the table below). 54 | // SYM: symbol (punctuation) that does not belong to a word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing orders. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 
66 | // readonly: the field is assigned only here; nothing in this class reassigns it. 67 | private static readonly byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ 73 | 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ 75 | 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ 77 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ 78 | SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 34,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,112,113,SYM,SYM,114,SYM,SYM,SYM,115,116,117,SYM, /* BX */ 80 | 118, 14,119,120, 33,121,122, 39, 35, 18, 42, 37,123, 17,124, 40, /* CX */ 81 | 125, 32, 43, 22,126,127, 38,SYM, 36,128, 20,129, 31,130,131,132, /* DX */ 82 | 133, 14,134,135, 33,136,137, 39, 35, 18, 42, 37,138, 17,139, 40, /* EX */ 83 | 140, 32, 43, 22,141,142, 38,SYM, 36,143, 20,144, 31,145,146,147, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | // Hands the table and CodepageName.ISO_8859_15 to the IrishModel base constructor. 87 | public Iso_8859_15_IrishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_15) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Irish/Iso_8859_1_IrishModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. 
You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangIrishModel.cpp 40 | * and adjusted to language specific support. 
41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Irish 44 | { 45 | public class Iso_8859_1_IrishModel : IrishModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-27 00:33:40.158624 49 | // Byte-to-order table for Irish text in ISO-8859-1: one entry per byte value, 16 per row (row = high nibble, column = low nibble). 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage return / line feed (0x0D and 0x0A in the table below). 54 | // SYM: symbol (punctuation) that does not belong to a word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing orders. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | // readonly: the field is assigned only here; nothing in this class reassigns it. 67 | private static readonly byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ 73 | 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ 75 | 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ 77 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ 78 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ 80 | 45, 14, 46, 47, 
33, 48, 49, 39, 35, 18, 42, 37, 50, 17, 51, 40, /* CX */ 81 | 52, 32, 43, 22, 53, 54, 38,SYM, 36, 55, 20, 56, 31, 57, 58, 59, /* DX */ 82 | 60, 14, 61, 62, 33, 63, 64, 39, 35, 18, 42, 37, 65, 17, 66, 40, /* EX */ 83 | 67, 32, 43, 22, 68, 69, 38,SYM, 36, 70, 20, 71, 31, 72, 73, 74, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | // Hands the table and CodepageName.ISO_8859_1 to the IrishModel base constructor. 87 | public Iso_8859_1_IrishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_1) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Irish/Iso_8859_9_IrishModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. 
If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangIrishModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Irish 44 | { 45 | public class Iso_8859_9_IrishModel : IrishModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-27 00:33:40.158624 49 | // Byte-to-order table for Irish text in ISO-8859-9: one entry per byte value, 16 per row (row = high nibble, column = low nibble). 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage return / line feed (0x0D and 0x0A in the table below). 54 | // SYM: symbol (punctuation) that does not belong to a word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing orders. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 
66 | // readonly: the field is assigned only here; nothing in this class reassigns it. 67 | private static readonly byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ 73 | 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ 75 | 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ 77 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ 78 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM,148,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ 80 | 149, 14,150,151, 33,152,153, 39, 35, 18, 42, 37,154, 17,155, 40, /* CX */ 81 | 156, 32, 43, 22,157,158, 38,SYM, 36,159, 20,160, 31,161,162,163, /* DX */ 82 | 164, 14,165,166, 33,167,168, 39, 35, 18, 42, 37,169, 17,170, 40, /* EX */ 83 | 171, 32, 43, 22,172,173, 38,SYM, 36,174, 20,175, 31, 41,176,177, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | // Hands the table and CodepageName.ISO_8859_9 to the IrishModel base constructor. 87 | public Iso_8859_9_IrishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_9) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Polish/Ibm852_PolishModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. 
You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangPolishModel.cpp 40 | * and adjusted to language specific support. 
41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Polish 44 | { 45 | public class Ibm852_PolishModel : PolishModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-21 17:21:04.405363 49 | // Byte-to-order table for Polish text in IBM852: one entry per byte value, 16 per row (row = high nibble, column = low nibble). 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage return / line feed (0x0D and 0x0A in the table below). 54 | // SYM: symbol (punctuation) that does not belong to a word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing orders. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | // readonly: the field is assigned only here; nothing in this class reassigns it. 67 | private static readonly byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ 73 | 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ 75 | 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | 47, 39, 34, 54, 40, 78, 30, 47, 19, 58, 49, 49, 77, 32, 40, 30, /* 8X */ 77 | 34, 79, 80, 55, 38, 74, 74, 28, 28, 38, 39, 76, 76, 19,SYM, 44, /* 9X */ 78 | 35, 37, 24, 51, 25, 25, 45, 45, 23, 23,SYM, 32, 44, 56,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM, 35, 54, 46, 56,SYM,SYM,SYM,SYM, 27, 27,SYM, /* BX */ 80 | 
SYM,SYM,SYM,SYM,SYM,SYM, 53, 53,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ 81 | 70, 70, 69, 58, 69, 81, 37, 77, 46,SYM,SYM,SYM,SYM, 65, 82,SYM, /* DX */ 82 | 24, 57, 55, 29, 29, 83, 41, 41, 84, 51, 85, 86, 60, 60, 65,SYM, /* EX */ 83 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 87, 50, 50,SYM,SYM, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | // Hands the table and CodepageName.IBM852 to the PolishModel base constructor. 87 | public Ibm852_PolishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Russian/Ibm855_RussianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. 
If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Russian 44 | { 45 | public class Ibm855_RussianModel : RussianModel 46 | { // Byte-to-order table for Russian text in IBM855: one entry per byte value, 16 per row (trailing comment = first byte of the row, hex). 47 | private static readonly byte[] CHAR_TO_ORDER_MAP = { 48 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 49 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 50 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 51 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 52 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 53 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 54 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 55 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 56 | 191,192,193,194, 68,195,196,197,198,199,200,201,202,203,204,205, //80 57 | 206,207,208,209,210,211,212,213,214,215,216,217, 27, 59, 54, 70, //90 58 | 3, 37, 21, 44, 28, 58, 13, 41, 2, 48, 39, 53, 19, 46,218,219, //A0 59 | 220,221,222,223,224, 26, 55, 4, 42,225,226,227,228, 23, 60,229, //B0 60 | 230,231,232,233,234,235, 11, 
36,236,237,238,239,240,241,242,243, //C0 61 | 8, 49, 12, 38, 5, 31, 1, 34, 15,244,245,246,247, 35, 16,248, //D0 62 | 43, 9, 45, 7, 32, 6, 40, 14, 52, 24, 56, 10, 33, 17, 61,249, //E0 63 | 250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,NUM,CTR, //F0 64 | }; 65 | // Hands the table and CodepageName.IBM855 to the RussianModel base constructor. 66 | public Ibm855_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM855) 67 | { 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Russian/Ibm866_RussianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. 
If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Russian 44 | { 45 | public class Ibm866_RussianModel : RussianModel 46 | { // Byte-to-order table for Russian text in IBM866: one entry per byte value, 16 per row (trailing comment = first byte of the row, hex). 47 | private static readonly byte[] CHAR_TO_ORDER_MAP = { 48 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 49 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 50 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 51 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 52 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 53 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 54 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 55 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 56 | 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, //80 57 | 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, //90 58 | 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, //A0 59 | 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, //B0 60 | 
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, //C0 61 | 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, //D0 62 | 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, //E0 63 | 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, //F0 64 | }; 65 | // Hands the table and CodepageName.IBM866 to the RussianModel base constructor. 66 | public Ibm866_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM866) 67 | { 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Russian/Iso_8859_5_RussianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. 
If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Russian 44 | { 45 | public class Iso_8859_5_RussianModel : RussianModel 46 | { // Byte-to-order table for Russian text in ISO-8859-5: one entry per byte value, 16 per row (row = high nibble, column = low nibble). 47 | private static readonly byte[] CHAR_TO_ORDER_MAP = { 48 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 49 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 50 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 51 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 52 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, /* 4X */ 53 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, /* 5X */ 54 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, /* 6X */ 55 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, /* 7X */ 56 | 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, /* 8X */ 57 | 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, /* 9X */ 58 | 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, /* AX */ 59 | 37, 44, 33, 46, 41, 48, 56, 51, 42, 
60, 36, 49, 38, 31, 34, 35, /* BX */ 60 | 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, /* CX */ 61 | 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, /* DX */ 62 | 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, /* EX */ 63 | 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, /* FX */ 64 | }; 65 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 66 | // Hands the table and CodepageName.ISO_8859_5 to the RussianModel base constructor. 67 | public Iso_8859_5_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_5) 68 | { 69 | } 70 | } 71 | } -------------------------------------------------------------------------------- /src/Core/Models/SingleByte/Russian/Koi8r_Model.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 
20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp 40 | * and adjusted to language specific support. 
namespace UtfUnknown.Core.Models.SingleByte.Russian
{
    /// <summary>
    /// Russian language model bound to the KOI8-R code page.
    /// </summary>
    public class Koi8r_Model : RussianModel
    {
        // Byte-to-order lookup table: 256 entries, one per byte value.
        // Each row holds 16 consecutive byte values; the trailing comment
        // gives the byte value of the first entry in that row.
        // CTR, RET, SYM, NUM are symbolic category markers declared outside
        // this class; the plain numbers are character orders.
        private static readonly byte[] CHAR_TO_ORDER_MAP =
        {
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0x00
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x10
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x20
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x30
            SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 0x40
            155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, // 0x50
            SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 0x60
             67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, // 0x70
            191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 0x80
            207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, // 0x90
            223,224,225, 68,226,227,228,229,230,231,232,233,234,235,236,237, // 0xA0
            238,239,240,241,242,243,244,245,246,247,248,249,250,251,NUM,SYM, // 0xB0
             27,  3, 21, 28, 13,  2, 39, 19, 26,  4, 23, 11,  8, 12,  5,  1, // 0xC0
             15, 16,  9,  7,  6, 14, 24, 10, 17, 18, 20, 25, 30, 29, 22, 54, // 0xD0
             59, 37, 44, 58, 41, 48, 53, 46, 55, 42, 60, 36, 49, 38, 31, 34, // 0xE0
             35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, // 0xF0
        };

        /// <summary>
        /// Creates the model by handing the KOI8-R lookup table and the
        /// <c>CodepageName.KOI8_R</c> name to the <c>RussianModel</c> base class.
        /// </summary>
        public Koi8r_Model()
            : base(CHAR_TO_ORDER_MAP, CodepageName.KOI8_R)
        {
        }
    }
}
Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 
namespace UtfUnknown.Core.Models.SingleByte.Russian
{
    /// <summary>
    /// Russian language model bound to the Windows-1251 code page.
    /// </summary>
    public class Windows_1251_RussianModel : RussianModel
    {
        // Byte-to-order lookup table: 256 entries, one per byte value.
        // Each row holds 16 consecutive byte values; the trailing comment
        // gives the byte value of the first entry in that row.
        // CTR, RET, SYM, NUM, ILL are symbolic category markers declared
        // outside this class; the plain numbers are character orders.
        private static readonly byte[] CHAR_TO_ORDER_MAP =
        {
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0x00
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x10
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x20
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x30
            SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 0x40
            155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, // 0x50
            SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 0x60
             67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, // 0x70
            191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 0x80
            207,208,209,210,211,212,213,214,ILL,216,217,218,219,220,221,222, // 0x90
            223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, // 0xA0
            239,240,241,242,243,244,245,246, 68,247,248,249,250,251,NUM,SYM, // 0xB0
             37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, // 0xC0
             45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, // 0xD0
              3, 21, 10, 19, 13,  2, 24, 20,  4, 23, 11,  8, 12,  5,  1, 15, // 0xE0
              9,  7,  6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, // 0xF0
        };

        /// <summary>
        /// Creates the model by handing the Windows-1251 lookup table and the
        /// <c>CodepageName.WINDOWS_1251</c> name to the <c>RussianModel</c> base class.
        /// </summary>
        public Windows_1251_RussianModel()
            : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1251)
        {
        }
    }
}
namespace UtfUnknown.Core.Models.SingleByte.Russian
{
    /// <summary>
    /// Russian language model bound to the x-mac-cyrillic code page.
    /// </summary>
    public class X_Mac_Cyrillic_RussianModel : RussianModel
    {
        // Byte-to-order lookup table: 256 entries, one per byte value.
        // Each row holds 16 consecutive byte values; the trailing comment
        // gives the byte value of the first entry in that row.
        // CTR, RET, SYM, NUM are symbolic category markers declared outside
        // this class; the plain numbers are character orders.
        private static readonly byte[] CHAR_TO_ORDER_MAP =
        {
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0x00
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x10
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x20
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x30
            SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 0x40
            155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, // 0x50
            SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 0x60
             67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, // 0x70
             37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, // 0x80
             45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, // 0x90
            191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 0xA0
            207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, // 0xB0
            223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, // 0xC0
            239,240,241,242,243,244,245,246,247,248,249,250,251,NUM, 68, 16, // 0xD0
              3, 21, 10, 19, 13,  2, 24, 20,  4, 23, 11,  8, 12,  5,  1, 15, // 0xE0
              9,  7,  6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,CTR, // 0xF0
        };

        /// <summary>
        /// Creates the model by handing the x-mac-cyrillic lookup table and the
        /// <c>CodepageName.X_MAC_CYRILLIC</c> name to the <c>RussianModel</c> base class.
        /// </summary>
        public X_Mac_Cyrillic_RussianModel()
            : base(CHAR_TO_ORDER_MAP, CodepageName.X_MAC_CYRILLIC)
        {
        }
    }
}
namespace UtfUnknown.Core.Models.SingleByte.Slovak
{
    /// <summary>
    /// Slovak language model bound to the IBM852 code page.
    /// </summary>
    public class Ibm852_SlovakModel : SlovakModel
    {
        // Generated by BuildLangModel.py
        // On: 2016-09-21 13:33:10.331339

        // Character Mapping Table:
        // ILL: illegal character.
        // CTR: control character specific to the charset.
        // RET: carriage/return.
        // SYM: symbol (punctuation) that does not belong to word.
        // NUM: 0 - 9.

        // Other characters are ordered by probabilities
        // (0 is the most common character in the language).

        // Orders are generic to a language. So the codepoint with order X in
        // CHARSET1 maps to the same character as the codepoint with the same
        // order X in CHARSET2 for the same language.
        // As such, it is possible to get missing order. For instance the
        // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
        // even though they are both used for French. Same for the euro sign.

        // Declared readonly, like the other single-byte models, so the shared
        // table reference cannot be swapped out after type initialization.
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM,  1, 20, 15, 11,  2, 29, 30, 17,  4, 18,  7, 10, 12,  3,  0, /* 4X */
             13, 40,  6,  8,  5, 14,  9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM,  1, 20, 15, 11,  2, 29, 30, 17,  4, 18,  7, 10, 12,  3,  0, /* 6X */
             13, 40,  6,  8,  5, 14,  9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */
             51, 46, 25, 62, 38, 48, 47, 51, 49, 54, 50, 50, 63, 64, 38, 47, /* 8X */
             25, 42, 42, 32, 43, 33, 33, 65, 66, 43, 46, 31, 31, 49,SYM, 24, /* 9X */
             21, 23, 35, 27, 67, 68, 26, 26, 69, 70,SYM, 71, 24, 59,SYM,SYM, /* AX */
            SYM,SYM,SYM,SYM,SYM, 21, 72, 41, 59,SYM,SYM,SYM,SYM, 61, 61,SYM, /* BX */
            SYM,SYM,SYM,SYM,SYM,SYM, 56, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
             55, 55, 39, 54, 39, 36, 23, 73, 41,SYM,SYM,SYM,SYM, 74, 48,SYM, /* DX */
             35, 58, 32, 52, 52, 36, 28, 28, 44, 27, 44, 60, 22, 22, 75,SYM, /* EX */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 60, 45, 45,SYM,SYM, /* FX */
        };
        /*X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

        /// <summary>
        /// Creates the model by handing the IBM852 lookup table and the
        /// <c>CodepageName.IBM852</c> name to the <c>SlovakModel</c> base class.
        /// </summary>
        public Ibm852_SlovakModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
        {
        }
    }
}
You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangSloveneModel.cpp 40 | * and adjusted to language specific support. 
namespace UtfUnknown.Core.Models.SingleByte.Slovene
{
    /// <summary>
    /// Slovene language model bound to the IBM852 code page.
    /// </summary>
    public class Ibm852_SloveneModel : SloveneModel
    {
        // Generated by BuildLangModel.py
        // On: 2016-09-28 22:06:46.134717

        // Character Mapping Table:
        // ILL: illegal character.
        // CTR: control character specific to the charset.
        // RET: carriage/return.
        // SYM: symbol (punctuation) that does not belong to word.
        // NUM: 0 - 9.

        // Other characters are ordered by probabilities
        // (0 is the most common character in the language).

        // Orders are generic to a language. So the codepoint with order X in
        // CHARSET1 maps to the same character as the codepoint with the same
        // order X in CHARSET2 for the same language.
        // As such, it is possible to get missing order. For instance the
        // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
        // even though they are both used for French. Same for the euro sign.

        // Declared readonly, like the other single-byte models, so the shared
        // table reference cannot be swapped out after type initialization.
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM,  0, 18, 19, 13,  1, 24, 17, 20,  2,  8, 12,  9, 14,  4,  3, /* 4X */
             11, 28,  5,  6,  7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM,  0, 18, 19, 13,  1, 24, 17, 20,  2,  8, 12,  9, 14,  4,  3, /* 6X */
             11, 28,  5,  6,  7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,CTR, /* 7X */
             34,249, 29,249,249,249, 37, 34,249, 36,249,249,249,249,249, 37, /* 8X */
             29,249,249, 35,249,249,249,249,249,249,249,249,249,249,SYM, 21, /* 9X */
             32, 30, 31, 39,249,249, 23, 23,249,249,SYM,249, 21,249,SYM,SYM, /* AX */
            SYM,SYM,SYM,SYM,SYM, 32,249,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* BX */
            SYM,SYM,SYM,SYM,SYM,SYM,249,249,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
            249,249,249, 36,249,249, 30,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* DX */
             31,249, 35,249,249,249, 22, 22,249, 39,249,249, 40, 40,249,SYM, /* EX */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,249,249,249,SYM,SYM, /* FX */
        };
        /*X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

        /// <summary>
        /// Creates the model by handing the IBM852 lookup table and the
        /// <c>CodepageName.IBM852</c> name to the <c>SloveneModel</c> base class.
        /// </summary>
        public Ibm852_SloveneModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
        {
        }
    }
}
namespace UtfUnknown.Core.Models.SingleByte.Thai
{
    /// <summary>
    /// Thai language model bound to the TIS-620 code page.
    /// </summary>
    public class Tis_620_ThaiModel : ThaiModel
    {
        // Generated by BuildLangModel.py
        // On: 2015-12-04 03:05:06.182099
        //
        // Character Mapping Table:
        //   ILL: illegal character.
        //   CTR: control character specific to the charset.
        //   RET: carriage/return.
        //   SYM: symbol (punctuation) that does not belong to word.
        //   NUM: 0 - 9.
        //
        // Other characters are ordered by probabilities
        // (0 is the most common character in the language).
        //
        // Orders are generic to a language. So the codepoint with order X in
        // CHARSET1 maps to the same character as the codepoint with the same
        // order X in CHARSET2 for the same language.
        // As such, it is possible to get missing order. For instance the
        // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
        // even though they are both used for French. Same for the euro sign.
        //
        // The table has 256 entries, one per byte value; each row holds 16
        // consecutive byte values and the trailing comment gives the byte
        // value of the first entry in that row.
        private static readonly byte[] CHAR_TO_ORDER_MAP =
        {
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0x00
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x10
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x20
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x30
            SYM, 66, 70, 67, 80, 78, 87, 85, 73, 79, 93, 88, 84, 68, 77, 81, // 0x40
             75,101, 74, 61, 71, 86, 96, 90,103,100, 99,SYM,SYM,SYM,SYM,SYM, // 0x50
            SYM, 35, 64, 48, 52, 32, 60, 65, 54, 36, 97, 76, 46, 56, 41, 40, // 0x60
             59,104, 43, 45, 44, 55, 72, 82, 94, 57, 92,SYM,SYM,SYM,SYM,CTR, // 0x70
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x80
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x90
            ILL,  3, 23,105, 15,106, 89,  5, 21, 63, 26, 31,102, 42, 69, 58, // 0xA0
             49, 91, 83, 34,  9, 17, 30, 12, 39,  1, 16, 19, 33, 62, 22, 47, // 0xB0
             38,  7, 10,  2, 50, 11,107,  8, 28, 37, 13, 18, 98,  4, 53, 95, // 0xC0
             14,SYM,  0, 29,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,ILL,ILL,ILL,SYM, // 0xD0
              6, 20, 27, 24, 25,108, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,109, // 0xE0
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,110,111,ILL,ILL,ILL,ILL, // 0xF0
        };

        /// <summary>
        /// Creates the model by handing the TIS-620 lookup table and the
        /// <c>CodepageName.TIS_620</c> name to the <c>ThaiModel</c> base class.
        /// </summary>
        public Tis_620_ThaiModel()
            : base(CHAR_TO_ORDER_MAP, CodepageName.TIS_620)
        {
        }
    }
}
using System;

namespace UtfUnknown.Core.Models
{
    /// <summary>
    /// State machine model: the tables a <c>CodingStateMachine</c> needs to
    /// classify bytes and move between states for one encoding.
    /// </summary>
    public abstract class StateMachineModel
    {
        /// <summary>
        /// Start state. <c>CodingStateMachine</c> begins (and is reset) here;
        /// while in this state it looks up the length of the character that
        /// starts with the next byte in <see cref="charLenTable"/>.
        /// </summary>
        public const int START = 0;

        /// <summary>
        /// Error state. A prober that receives this state gives up on the
        /// encoding (for example <c>Big5Prober</c> switches to <c>NotMe</c>).
        /// </summary>
        public const int ERROR = 1;

        /// <summary>
        /// "It's me" state. A prober that receives this state reports a
        /// positive match (for example <c>Big5Prober</c> switches to <c>FoundIt</c>).
        /// </summary>
        public const int ITSME = 2;

        /// <summary>Packed table queried by <see cref="GetClass"/> to map a byte to its class.</summary>
        public BitPackage classTable;

        /// <summary>
        /// Packed transition table; <c>CodingStateMachine</c> indexes it with
        /// <c>state * ClassFactor + byteClass</c>.
        /// </summary>
        public BitPackage stateTable;

        /// <summary>Character length indexed by the class of a character's first byte.</summary>
        public int[] charLenTable;

        /// <summary>Name given to this model at construction.</summary>
        public string Name { get; }

        /// <summary>
        /// Multiplier applied to the current state when indexing <see cref="stateTable"/>.
        /// </summary>
        public int ClassFactor { get; }

        /// <summary>
        /// Stores the tables and metadata supplied by the concrete model.
        /// Protected because an abstract type can only be constructed through
        /// a derived class.
        /// </summary>
        /// <param name="classTable">Packed byte-to-class table.</param>
        /// <param name="classFactor">Multiplier used when indexing <paramref name="stateTable"/>.</param>
        /// <param name="stateTable">Packed state transition table.</param>
        /// <param name="charLenTable">Character length per byte class.</param>
        /// <param name="name">Name exposed through <see cref="Name"/>.</param>
        protected StateMachineModel(BitPackage classTable, int classFactor,
            BitPackage stateTable, int[] charLenTable, string name)
        {
            this.classTable = classTable;
            ClassFactor = classFactor;
            this.stateTable = stateTable;
            this.charLenTable = charLenTable;
            Name = name;
        }

        /// <summary>
        /// Returns the class of a byte by unpacking <see cref="classTable"/>
        /// at the byte's value.
        /// </summary>
        /// <param name="b">Input byte.</param>
        /// <returns>The value stored in <see cref="classTable"/> for <paramref name="b"/>.</returns>
        public int GetClass(byte b)
        {
            return classTable.Unpack((int)b);
        }
    }
}
using UtfUnknown.Core.Models;

namespace UtfUnknown.Core.Probers
{
    /// <summary>
    /// Parallel state machine for the Coding Scheme Method
    /// </summary>
    public class CodingStateMachine
    {
        private readonly StateMachineModel model;
        private int currentState;
        private int currentCharLen;

        /// <summary>
        /// Binds the machine to <paramref name="model"/> and places it in the
        /// <c>StateMachineModel.START</c> state.
        /// </summary>
        public CodingStateMachine(StateMachineModel model)
        {
            this.model = model;
            currentState = StateMachineModel.START;
        }

        /// <summary>
        /// Feeds one byte to the machine and returns the state it moves to.
        /// </summary>
        /// <param name="b">Next input byte.</param>
        /// <returns>The new current state, as unpacked from the model's state table.</returns>
        public int NextState(byte b)
        {
            int byteClass = model.GetClass(b);

            // A byte seen while in START is the first byte of a character,
            // so its class also determines the character length.
            bool atCharacterStart = currentState == StateMachineModel.START;
            if (atCharacterStart)
            {
                currentCharLen = model.charLenTable[byteClass];
            }

            // The transition table is indexed by (state, byte class).
            int transitionIndex = currentState * model.ClassFactor + byteClass;
            currentState = model.stateTable.Unpack(transitionIndex);

            return currentState;
        }

        /// <summary>
        /// Puts the machine back into the <c>StateMachineModel.START</c> state.
        /// </summary>
        public void Reset() => currentState = StateMachineModel.START;

        /// <summary>
        /// Character length recorded the last time a byte was processed from
        /// the START state.
        /// </summary>
        public int CurrentCharLen => currentCharLen;

        /// <summary>
        /// Name of the underlying model.
        /// </summary>
        public string ModelName => model.Name;
    }
}
If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */

using System.Text;

using UtfUnknown.Core.Analyzers.Chinese;
using UtfUnknown.Core.Models;
using UtfUnknown.Core.Models.MultiByte.Chinese;

namespace UtfUnknown.Core.Probers.MultiByte.Chinese
{
    /// <summary>
    /// Prober for the Big5 charset. Runs the input through a coding state machine
    /// built on <see cref="BIG5SMModel"/> and feeds completed characters to a
    /// <see cref="BIG5DistributionAnalyser"/>, which supplies the confidence.
    /// </summary>
    public class Big5Prober : CharsetProber
    {
        private CodingStateMachine codingSM;
        private BIG5DistributionAnalyser distributionAnalyser;

        // Two-byte scratch buffer for a character that straddles two HandleData calls:
        // [0] is the final byte of the previous window, [1] the first byte of the current one.
        private byte[] lastChar = new byte[2];

        /// <summary>
        /// Creates the prober and puts it into its initial (detecting) state.
        /// </summary>
        public Big5Prober()
        {
            codingSM = new CodingStateMachine(new BIG5SMModel());
            distributionAnalyser = new BIG5DistributionAnalyser();
            Reset();
        }

        /// <summary>
        /// Feeds the bytes <c>buf[offset .. offset + len - 1]</c> to the prober.
        /// </summary>
        /// <param name="buf">Buffer holding the input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="len">Number of bytes to examine; an empty window is accepted.</param>
        /// <returns>The probing state after the window has been processed.</returns>
        public override ProbingState HandleData(byte[] buf, int offset, int len)
        {
            int max = offset + len;

            for (int i = offset; i < max; i++)
            {
                var codingState = codingSM.NextState(buf[i]);
                if (codingState == StateMachineModel.ERROR)
                {
                    state = ProbingState.NotMe;
                    break;
                }

                if (codingState == StateMachineModel.ITSME)
                {
                    state = ProbingState.FoundIt;
                    break;
                }

                if (codingState == StateMachineModel.START)
                {
                    // The machine is back in the start state: hand the bytes ending
                    // at index i to the distribution analyser.
                    int charLen = codingSM.CurrentCharLen;
                    if (i == offset)
                    {
                        // The preceding byte belongs to the previous window; it was
                        // saved in lastChar[0] at the end of the previous call.
                        lastChar[1] = buf[offset];
                        distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                    }
                    else
                    {
                        distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
                    }
                }
            }

            // Save the last byte of this window for the next call. With an empty window
            // there is no such byte: buf[max - 1] would be buf[offset - 1], which is
            // outside the window (and out of range when offset is 0), so keep the
            // previously saved byte instead.
            if (len > 0)
            {
                lastChar[0] = buf[max - 1];
            }

            if (state == ProbingState.Detecting
                && distributionAnalyser.GotEnoughData()
                && GetConfidence() > SHORTCUT_THRESHOLD)
            {
                state = ProbingState.FoundIt;
            }

            return state;
        }

        /// <summary>
        /// Resets the state machine, the probing state and the distribution analyser.
        /// </summary>
        public override void Reset()
        {
            codingSM.Reset();
            state = ProbingState.Detecting;
            distributionAnalyser.Reset();
        }

        /// <summary>
        /// Returns the charset name this prober detects.
        /// </summary>
        public override string GetCharsetName()
        {
            return CodepageName.BIG5;
        }

        /// <summary>
        /// Returns the confidence reported by the distribution analyser.
        /// </summary>
        /// <param name="status">Not used by this prober.</param>
        public override float GetConfidence(StringBuilder status = null)
        {
            return distributionAnalyser.GetConfidence();
        }
    }
}
-------------------------------------------------------------------------------- /src/Core/Probers/MultiByte/Chinese/EUCTWProber.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved.
20 | * 21 | * Contributor(s): 22 | * Shy Shalom 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */

using System.Text;

using UtfUnknown.Core.Analyzers.Chinese;
using UtfUnknown.Core.Models;
using UtfUnknown.Core.Models.MultiByte.Chinese;

namespace UtfUnknown.Core.Probers.MultiByte.Chinese
{
    /// <summary>
    /// Prober for the EUC-TW charset. Runs the input through a coding state machine
    /// built on <see cref="EUCTWSMModel"/> and feeds completed characters to an
    /// <see cref="EUCTWDistributionAnalyser"/>, which supplies the confidence.
    /// </summary>
    public class EUCTWProber : CharsetProber
    {
        private CodingStateMachine codingSM;
        private EUCTWDistributionAnalyser distributionAnalyser;

        // Two-byte scratch buffer for a character that straddles two HandleData calls:
        // [0] is the final byte of the previous window, [1] the first byte of the current one.
        private byte[] lastChar = new byte[2];

        /// <summary>
        /// Creates the prober and puts it into its initial (detecting) state.
        /// </summary>
        public EUCTWProber()
        {
            codingSM = new CodingStateMachine(new EUCTWSMModel());
            distributionAnalyser = new EUCTWDistributionAnalyser();
            Reset();
        }

        /// <summary>
        /// Feeds the bytes <c>buf[offset .. offset + len - 1]</c> to the prober.
        /// </summary>
        /// <param name="buf">Buffer holding the input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="len">Number of bytes to examine; an empty window is accepted.</param>
        /// <returns>The probing state after the window has been processed.</returns>
        public override ProbingState HandleData(byte[] buf, int offset, int len)
        {
            int max = offset + len;

            // Start at offset, not 0: bytes in front of the window are not part of this
            // call's input, and the i == offset test below relies on the loop beginning
            // at the first byte of the window.
            for (int i = offset; i < max; i++)
            {
                int codingState = codingSM.NextState(buf[i]);
                if (codingState == StateMachineModel.ERROR)
                {
                    state = ProbingState.NotMe;
                    break;
                }

                if (codingState == StateMachineModel.ITSME)
                {
                    state = ProbingState.FoundIt;
                    break;
                }

                if (codingState == StateMachineModel.START)
                {
                    // The machine is back in the start state: hand the bytes ending
                    // at index i to the distribution analyser.
                    int charLen = codingSM.CurrentCharLen;
                    if (i == offset)
                    {
                        // The preceding byte belongs to the previous window; it was
                        // saved in lastChar[0] at the end of the previous call.
                        lastChar[1] = buf[offset];
                        distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                    }
                    else
                    {
                        distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
                    }
                }
            }

            // Save the last byte of this window for the next call. With an empty window
            // there is no such byte: buf[max - 1] would be buf[offset - 1], which is
            // outside the window (and out of range when offset is 0), so keep the
            // previously saved byte instead.
            if (len > 0)
            {
                lastChar[0] = buf[max - 1];
            }

            if (state == ProbingState.Detecting
                && distributionAnalyser.GotEnoughData()
                && GetConfidence() > SHORTCUT_THRESHOLD)
            {
                state = ProbingState.FoundIt;
            }

            return state;
        }

        /// <summary>
        /// Returns the charset name this prober detects.
        /// </summary>
        public override string GetCharsetName()
        {
            return CodepageName.EUC_TW;
        }

        /// <summary>
        /// Resets the state machine, the probing state and the distribution analyser.
        /// </summary>
        public override void Reset()
        {
            codingSM.Reset();
            state = ProbingState.Detecting;
            distributionAnalyser.Reset();
        }

        /// <summary>
        /// Returns the confidence reported by the distribution analyser.
        /// </summary>
        /// <param name="status">Not used by this prober.</param>
        public override float GetConfidence(StringBuilder status = null)
        {
            return distributionAnalyser.GetConfidence();
        }
    }
}
-------------------------------------------------------------------------------- /src/Core/Probers/MultiByte/UTF8Prober.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code.
15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * Shy Shalom 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 
36 | * 37 | * ***** END LICENSE BLOCK ***** */

using System.Text;

using UtfUnknown.Core.Models;
using UtfUnknown.Core.Models.MultiByte;

namespace UtfUnknown.Core.Probers.MultiByte
{
    /// <summary>
    /// Prober for UTF-8. Validates the byte stream with a coding state machine built on
    /// <see cref="UTF8_SMModel"/> and derives its confidence from the number of
    /// multi-byte characters seen so far.
    /// </summary>
    public class UTF8Prober : CharsetProber
    {
        // Factor applied to the remaining doubt for each multi-byte character seen.
        private const float ONE_CHAR_PROB = 0.50f;

        private CodingStateMachine codingSM;
        private int numOfMBChar;

        /// <summary>
        /// Creates the prober and puts it into its initial (detecting) state.
        /// </summary>
        public UTF8Prober()
        {
            numOfMBChar = 0;
            codingSM = new CodingStateMachine(new UTF8_SMModel());
            Reset();
        }

        /// <summary>
        /// Returns the charset name this prober detects.
        /// </summary>
        public override string GetCharsetName()
        {
            return CodepageName.UTF8;
        }

        /// <summary>
        /// Resets the state machine, the multi-byte character counter and the probing state.
        /// </summary>
        public override void Reset()
        {
            codingSM.Reset();
            numOfMBChar = 0;
            state = ProbingState.Detecting;
        }

        /// <summary>
        /// Feeds the bytes <c>buf[offset .. offset + len - 1]</c> to the prober.
        /// </summary>
        /// <param name="buf">Buffer holding the input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="len">Number of bytes to examine.</param>
        /// <returns>The probing state after the window has been processed.</returns>
        public override ProbingState HandleData(byte[] buf, int offset, int len)
        {
            int end = offset + len;

            for (int pos = offset; pos < end; pos++)
            {
                int next = codingSM.NextState(buf[pos]);

                if (next == StateMachineModel.ERROR)
                {
                    state = ProbingState.NotMe;
                    break;
                }

                if (next == StateMachineModel.ITSME)
                {
                    state = ProbingState.FoundIt;
                    break;
                }

                // Back in the start state: count the character if it was multi-byte.
                if (next == StateMachineModel.START && codingSM.CurrentCharLen >= 2)
                {
                    numOfMBChar++;
                }
            }

            if (state == ProbingState.Detecting && GetConfidence() > SHORTCUT_THRESHOLD)
            {
                state = ProbingState.FoundIt;
            }

            return state;
        }

        /// <summary>
        /// Returns 0.99 once six or more multi-byte characters have been seen; below that,
        /// one minus a doubt that starts at 0.99 and is halved per multi-byte character.
        /// </summary>
        /// <param name="status">Not used by this prober.</param>
        public override float GetConfidence(StringBuilder status = null)
        {
            if (numOfMBChar >= 6)
            {
                return 0.99f;
            }

            float unlike = 0.99f;
            int remaining = numOfMBChar;
            while (remaining > 0)
            {
                unlike *= ONE_CHAR_PROB;
                remaining--;
            }

            float confidence = 1.0f - unlike;
            return confidence;
        }
    }
}
-------------------------------------------------------------------------------- /src/Core/Probers/ProbingState.cs: --------------------------------------------------------------------------------
namespace UtfUnknown.Core.Probers
{
    /// <summary>
    /// The answer a prober has reached so far.
    /// </summary>
    public enum ProbingState
    {
        /// <summary>
        /// No sure answer yet, but caller can ask for confidence
        /// </summary>
        Detecting = 0,

        /// <summary>
        /// Positive answer
        /// </summary>
        FoundIt = 1,

        /// <summary>
        /// Negative answer
        /// </summary>
        NotMe = 2
    }
}
-------------------------------------------------------------------------------- /src/DetectionDetail.cs: --------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Text;
using UtfUnknown.Core;
using UtfUnknown.Core.Probers;

[assembly: InternalsVisibleTo("UtfUnknown.Tests, PublicKey=" +
                              "002400000480000094000000060200000024000052534131000400000100010029f6b4defac763" +
                              "66721687460b44b7619e8e19a411f785279316fdae2f6965edfa4a460304fe8b4ed796d5356a1c" +
                              "225131b9087983d9ff9530df9307eab17d88cd4f1005a45f6f35523445d1ff7323322f3060cffc" +
                              "0d70d0cb1b4b7d46081bbead31844927aaadb0508b64bf298de5abe5ea5cca8b92490c961b7b75" +
                              "13c2c2a9")]
namespace UtfUnknown
{
    /// <summary>
    /// Detailed result of a detection
    /// </summary>
    public class DetectionDetail
    {
        /// <summary>
        /// A dictionary for replacing a codepage name that is unsupported in .NET with the nearly identical supported one.
        /// </summary>
        private static readonly Dictionary<string, string> FixedToSupportCodepageName =
            new Dictionary<string, string>
            {
                // CP949 is superset of ks_c_5601-1987 (see https://github.com/CharsetDetector/UTF-unknown/pull/74#issuecomment-550362133)
                {CodepageName.CP949, CodepageName.KS_C_5601_1987},
                {CodepageName.ISO_2022_CN, CodepageName.X_CP50227},
            };

        /// <summary>
        /// New result
        /// </summary>
        /// <param name="encodingShortName">Short name of the detected encoding; also used to look up <see cref="Encoding"/>.</param>
        /// <param name="confidence">Confidence of the detection.</param>
        /// <param name="prober">The prober that produced the result, if any.</param>
        /// <param name="time">The time spent, if measured.</param>
        /// <param name="statusLog">Status log text, if any.</param>
        public DetectionDetail(string encodingShortName, float confidence, CharsetProber prober = null,
            TimeSpan? time = null, string statusLog = null)
        {
            EncodingName = encodingShortName;
            Confidence = confidence;
            Encoding = GetEncoding(encodingShortName);
            Prober = prober;
            Time = time;
            StatusLog = statusLog;
        }

        /// <summary>
        /// New Result, taking the charset name, confidence and status dump from <paramref name="prober"/>.
        /// </summary>
        /// <param name="prober">The prober to read the result from.</param>
        /// <param name="time">The time spent, if measured.</param>
        public DetectionDetail(CharsetProber prober, TimeSpan? time = null)
            : this(prober.GetCharsetName(), prober.GetConfidence(), prober, time, prober.DumpStatus())
        {
        }

        /// <summary>
        /// The (short) name of the detected encoding. For full details, check <see cref="Encoding"/>
        /// </summary>
        public string EncodingName { get; }

        /// <summary>
        /// The detected encoding.
        /// </summary>
        public Encoding Encoding { get; set; }

        /// <summary>
        /// The confidence of the found encoding. Between 0 and 1.
        /// </summary>
        public float Confidence { get; set; }

        /// <summary>
        /// The used prober for detection
        /// </summary>
        public CharsetProber Prober { get; set; }

        /// <summary>
        /// A Byte Order Mark was detected
        /// </summary>
        public bool HasBOM { get; set; }

        /// <summary>
        /// The time spent
        /// </summary>
        public TimeSpan? Time { get; set; }

        /// <summary>
        /// The status log text passed to the constructor.
        /// </summary>
        public string StatusLog { get; set; }

        /// <summary>
        /// Returns a one-line description with the encoding name, confidence and BOM flag.
        /// </summary>
        public override string ToString()
        {
            return $"Detected {EncodingName} with confidence of {Confidence}. (BOM: {HasBOM})";
        }

        /// <summary>
        /// Resolves an encoding short name to an <see cref="System.Text.Encoding"/>.
        /// </summary>
        /// <param name="encodingShortName">The short name; replaced first when it has an entry in the substitution table.</param>
        /// <returns>
        /// The encoding. When the name is rejected with <see cref="ArgumentException"/> or
        /// <see cref="NotSupportedException"/>, the result of the code-pages provider on targets
        /// that have it, otherwise <c>null</c>.
        /// </returns>
        internal static Encoding GetEncoding(string encodingShortName)
        {
            var encodingName = FixedToSupportCodepageName.TryGetValue(encodingShortName, out var supportCodepageName)
                ? supportCodepageName
                : encodingShortName;
            try
            {
                return Encoding.GetEncoding(encodingName);
            }
            catch (Exception exception) when
                (exception is ArgumentException || // unsupported name
                 exception is NotSupportedException)
            {
#if NETSTANDARD && !NETSTANDARD1_0 || NETCOREAPP3_0
                return CodePagesEncodingProvider.Instance.GetEncoding(encodingName);
#else
                return null;
#endif
            }
        }
    }
}
-------------------------------------------------------------------------------- /src/DetectionResult.cs: --------------------------------------------------------------------------------
using System.Collections.Generic;
using System.Linq;

namespace UtfUnknown
{
    /// <summary>
    /// Result of a detection.
    /// </summary>
    public class DetectionResult
    {
        /// <summary>
        /// Empty
        /// </summary>
        public DetectionResult()
        {
        }

        /// <summary>
        /// Multiple results
        /// </summary>
        /// <param name="details">The details to expose through <see cref="Details"/>.</param>
        public DetectionResult(IList<DetectionDetail> details)
        {
            Details = details;
        }

        /// <summary>
        /// Single result
        /// </summary>
        /// <param name="detectionDetail">The only detail of this result.</param>
        public DetectionResult(DetectionDetail detectionDetail)
        {
            Details = new List<DetectionDetail> { detectionDetail };
        }

        /// <summary>
        /// Get the best Detection: the first entry of <see cref="Details"/>,
        /// or <c>null</c> when there are no details.
        /// </summary>
        public DetectionDetail Detected => Details?.FirstOrDefault();

        /// <summary>
        /// All results
        /// </summary>
        public IList<DetectionDetail> Details { get; set; }

        /// <summary>
        /// Returns a multi-line description listing the best detection and all details.
        /// </summary>
        public override string ToString()
        {
            // Details is null after the parameterless constructor; string.Join throws
            // ArgumentNullException for a null sequence, so substitute an empty one.
            IEnumerable<string> detailLines = Details?.Select(d => d.ToString()) ?? Enumerable.Empty<string>();

            return $"{nameof(Detected)}: {Detected}, \n{nameof(Details)}:\n - {string.Join("\n- ", detailLines)}";
        }
    }
}
-------------------------------------------------------------------------------- /src/UTF-unknown.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | net40;netstandard1.0;netstandard1.3;netstandard2.0;netcoreapp3.0 5 | 6 | 7 | 8 | UtfUnknown 9 | UTF.Unknown 10 | 2.0.0 11 | 12 | Full 13 | 14 | 15 | 16 | Library 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | Julian Verdurmen, Rustam Sayfutdinov, Rudi Pettazzi, Shy Shalom 28 | en-US 29 | UTF Unknown 30 | Detect character set for files, streams and other bytes. 31 | 32 | This package is based on Ude and since version 2 also on uchardet, which are ports of the Mozilla Universal Charset Detector.
33 | 34 | Features: 35 | - Easy to use API 36 | - Supports frameworks: 37 | - .NET 5+ 38 | - .NET Standard 1.0+ 39 | - .NET Core 3.0+ 40 | - .NET Framework 4.0+ 41 | - Strong named 42 | - XML documentation included 43 | 44 | Compared to Ude: 45 | 46 | - Refactor of API, namespaces and deadcode removal 47 | - Added some docs 48 | - Improve error handling 49 | - Improved unit tests 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | charset;detection;unicode;ascii;netstandard;chardet 59 | 60 | - See https://github.com/CharsetDetector/UTF-unknown/releases 61 | 62 | https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/master/logo.png 63 | https://github.com/CharsetDetector/UTF-unknown 64 | https://github.com/CharsetDetector/UTF-unknown/blob/master/license/MPL-1.1.txt 65 | false 66 | git 67 | https://github.com/CharsetDetector/UTF-unknown 68 | UtfUnknown 69 | True 70 | UtfUnknown.snk 71 | 72 | 73 | 74 | bin\$(Configuration)\$(TargetFramework)\UtfUnknown.xml 75 | 1701;1702;1705,1570,1591 76 | 2.0.0.0 77 | 2.0.0.0 78 | 79 | 80 | -------------------------------------------------------------------------------- /src/UtfUnknown.snk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/src/UtfUnknown.snk -------------------------------------------------------------------------------- /tests/BitPackageTest.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. 
You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * Kohei TAKETA (Java port) 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 
36 | * 37 | * ***** END LICENSE BLOCK ***** */

using NUnit.Framework;
using UtfUnknown.Core;

namespace UtfUnknown.Tests
{
    /// <summary>
    /// Tests for packing and unpacking 4-bit units with <see cref="BitPackage"/>.
    /// </summary>
    public class BitPackageTest
    {
        /// <summary>
        /// Packing eight identical 4-bit values yields the expected 32-bit integer.
        /// </summary>
        [Test]
        public void TestPack()
        {
            // Assert.AreEqual takes (expected, actual); keeping that order makes
            // failure messages report the two values the right way round.
            Assert.AreEqual(0, BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0));
            Assert.AreEqual(286331153, BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1));
            Assert.AreEqual(572662306, BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2));
            Assert.AreEqual(-1, BitPackage.Pack4bits(15, 15, 15, 15, 15, 15, 15, 15));
        }

        /// <summary>
        /// Unpacking index i from two packed words holding 0..15 returns i.
        /// </summary>
        [Test]
        public void TestUnpack()
        {
            int[] data = new int[] {
                BitPackage.Pack4bits(0, 1, 2, 3, 4, 5, 6, 7),
                BitPackage.Pack4bits(8, 9, 10, 11, 12, 13, 14, 15)
            };

            BitPackage pkg = new BitPackage(
                BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS,
                data);

            for (int i = 0; i < 16; i++)
            {
                int n = pkg.Unpack(i);
                Assert.AreEqual(i, n);
            }
        }
    }
}
-------------------------------------------------------------------------------- /tests/Data/README.md: -------------------------------------------------------------------------------- 1 | These text fragments have been copied taken from the following sources: 2 | 3 | - Wikipedia - http://wikipedia.org 4 | - Project Gutenberg - http://www.gutenberg.org 5 | 6 | The test files are automatically discovered. 7 | The directory name should be the expected encoding. 8 | If there is a `(`, then it's the name before it.
9 | -------------------------------------------------------------------------------- /tests/Data/big5/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/big5/1.txt -------------------------------------------------------------------------------- /tests/Data/cp949/cp949_1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/cp949/cp949_1.txt -------------------------------------------------------------------------------- /tests/Data/cp949/cp949_2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/cp949/cp949_2.txt -------------------------------------------------------------------------------- /tests/Data/euc-jp/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/euc-jp/1.txt -------------------------------------------------------------------------------- /tests/Data/euc-kr/euc1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/euc-kr/euc1.txt -------------------------------------------------------------------------------- /tests/Data/euc-kr/euc2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/euc-kr/euc2.txt 
-------------------------------------------------------------------------------- /tests/Data/gb18030/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/gb18030/1.txt -------------------------------------------------------------------------------- /tests/Data/ibm852/lang_ce_ibm852.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/ibm852/lang_ce_ibm852.txt -------------------------------------------------------------------------------- /tests/Data/ibm855/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/ibm855/1.txt -------------------------------------------------------------------------------- /tests/Data/ibm855/2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/ibm855/2.txt -------------------------------------------------------------------------------- /tests/Data/ibm866/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/ibm866/1.txt -------------------------------------------------------------------------------- /tests/Data/iso-2022-jp/1.txt: -------------------------------------------------------------------------------- 1 | ======================================================================== 2 | $B%3%s%=!<%k(J $B%"%W%j%1!<%7%g%s(J : universalchardet $B%W%m%8%'%/%H$N35MW(J 3 | 
======================================================================== 4 | 5 | $B$3$N(J universalchardet $B%"%W%j%1!<%7%g%s$O!"(JAppWizard $B$K$h$C$F:n@.$5$l$^$7$?!#(J 6 | 7 | $B$3$N%U%!%$%k$K$O!"(Juniversalchardet $B%"%W%j%1!<%7%g%s$r9=@.$9$k3F%U%!%$%k$N(J 8 | $BFbMF$N35N,$,5-=R$5$l$F$$$^$9!#(J 9 | 10 | 11 | universalchardet.vcproj 12 | $B$3$l$O!"%"%W%j%1!<%7%g%s(J $B%&%#%6!<%I$G@8@.$5$l$k(J VC++ $B%W%m%8%'%/%H$N%a%$%s$N(J 13 | $B%W%m%8%'%/%H(J $B%U%!%$%k$G$9!#(J 14 | $B%U%!%$%k$r@8@.$7$?(J Visual C++ $B$N%P!<%8%g%s>pJs$H!"%"%W%j%1!<%7%g%s(J 15 | $B%&%#%6!<%I$GA*Br$7$?%W%i%C%H%U%)!<%`!"9=@.!"$*$h$S%W%m%8%'%/%H$N5!G=$K4X$9$k(J 16 | $B>pJs$,5-=R$5$l$F$$$^$9!#(J 17 | 18 | universalchardet.cpp 19 | $B$3$l$O!"%a%$%s$N%"%W%j%1!<%7%g%s(J $B%=!<%9(J $B%U%!%$%k$G$9!#(J 20 | 21 | ///////////////////////////////////////////////////////////////////////////// 22 | $B$=$NB>$NI8=`%U%!%$%k(J : 23 | 24 | StdAfx.h, StdAfx.cpp 25 | $B$3$l$i$N%U%!%$%k$O!"%3%s%Q%$%k:Q$_%X%C%@!<(J (PCH) $B%U%!%$%k(J 26 | universalchardet.pch $B$H%W%j%3%s%Q%$%k:Q$_7?%U%!%$%k(J StdAfx.obj $B$r(J 27 | $B%S%k%I$9$k$?$a$K;HMQ$7$^$9!#(J 28 | 29 | ///////////////////////////////////////////////////////////////////////////// 30 | $B$=$NB>$N%a%b(J : 31 | 32 | AppWizard $B$G$O(J "TODO:" $B%3%a%s%H$r;HMQ$7$F!"%f!<%6!<$,DI2C$^$?$O%+%9%?%^%$%:$9$k(J 33 | $B%=!<%9ItJ,$r<($7$^$9!#(J 34 | 35 | ///////////////////////////////////////////////////////////////////////////// 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /tests/Data/iso-2022-kr/iso1.txt: -------------------------------------------------------------------------------- 1 | $)C?*;g@{ ?9F4Q, ?*;g@{ @N0#@87N<-@G ?9J4B4Y. GQ19 ;g6w5i@: 4k03 !.:OGQ(]Ay[)!/@L6s0m :N8#8g, B*0T !.:O!/@87N :N8& 6'55 @V4Y. 3*@L0! 89@: <<4k?!<-4B 0#H$ !.@L:O(l$]A)!/@L6s4B G%Gv@; >21b55 GO8g, A&GQ@{@87N ':OA6<1'@L6s :N8#1b55 GQ4Y. 0z0E 4kGQ9N19 A$:N?!<-4B :OA6<1(:OGQ)@; ?> 2@LAv >J0T 5G>z4Y.[3] 0#H$ @O:N 9]0xAV@G :821b55 GQ4Y. 
4 | 5 | 4kGQ9N19?!<-4B A6<1@L3* :OA6<1@L6u G%Gv@; 1bGGGO8g[CbC3 GJ?d], A6<1@: A6<1 ?UA63* 4\1:A6<1 5n@; 0!8.E04B 8;7N >44Y. 32:O0#@G 137y0! H0<:H-5G8i<- F/:0GQ ;vC$0! >x4B !.:OCx!/@L3* !.:OBJ!/@L6u G%Gv55 89@L >2@L0m @V4B C_<<@L4Y. 4kGQ9N19?!<-4B GQ19 @|@o @L@|@G G`A$ 18?*@N Fr>H3255!$Fr>H:O55!$GT0f3255!$GT0f:O55!$H2GX558& !.@L:O 555(l$]AgiT3)!/6s :N8#1b55 GQ4Y. 6 | 7 | 0x=D@{@N ?5>n 8mD*@: DPRK(Democratic People's Republic of Korea)@L8g :8Ek 'A6<19]55(GQ9]55) :OBJ'@; @G9LGO4B North Korea6s0m :N8%4Y. 8 | -------------------------------------------------------------------------------- /tests/Data/iso-8859-1/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-1/1.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-1/3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-1/3.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-1/4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-1/4.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-11/lang_th_iso-8859-11.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-11/lang_th_iso-8859-11.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-13/lang_et_iso-8859-13.txt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-13/lang_et_iso-8859-13.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-15/lang_da_iso-8859-15.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-15/lang_da_iso-8859-15.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-2/lang_ce_iso-8859-2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-2/lang_ce_iso-8859-2.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-3/lang_eo_iso-8859-3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-3/lang_eo_iso-8859-3.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-4/lang_et_iso-8859-4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-4/lang_et_iso-8859-4.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-5/lang_ru_iso-8859-5.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-5/lang_ru_iso-8859-5.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-6/lang_ar_iso-8859-6.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-6/lang_ar_iso-8859-6.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-7/greek.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-7/greek.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-7/lang_le_iso-8859-7.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-7/lang_le_iso-8859-7.txt -------------------------------------------------------------------------------- /tests/Data/iso-8859-9/lang_tr_iso-8859-9.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-9/lang_tr_iso-8859-9.txt -------------------------------------------------------------------------------- /tests/Data/koi8-r/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/koi8-r/1.txt -------------------------------------------------------------------------------- /tests/Data/koi8-r/2.txt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/koi8-r/2.txt -------------------------------------------------------------------------------- /tests/Data/shift-jis/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/shift-jis/1.txt -------------------------------------------------------------------------------- /tests/Data/shift-jis/2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/shift-jis/2.txt -------------------------------------------------------------------------------- /tests/Data/shift-jis/3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/shift-jis/3.txt -------------------------------------------------------------------------------- /tests/Data/shift-jis/4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/shift-jis/4.txt -------------------------------------------------------------------------------- /tests/Data/tis-620/lang_th_tis-620.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/tis-620/lang_th_tis-620.txt -------------------------------------------------------------------------------- /tests/Data/utf-16be/lang_fr_utf-16.be: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/utf-16be/lang_fr_utf-16.be -------------------------------------------------------------------------------- /tests/Data/utf-16le/lang_ko_utf-16.le: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/utf-16le/lang_ko_utf-16.le -------------------------------------------------------------------------------- /tests/Data/utf-32le/lang_fr_utf-32.le: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/utf-32le/lang_fr_utf-32.le -------------------------------------------------------------------------------- /tests/Data/utf-8/1.txt: -------------------------------------------------------------------------------- 1 | 역사적 예수 연구자들은 복음서나 사도들의 서신서 속의 교리적 예수가 아닌, 역사적 인간으로서의 예수를 추구한다. 20세기 이후 역사적 예수에 대한 연구는 마커스 보그, 가톨릭 수사 출신으로 환속한 도미닉 크로산 등의 예수 세미나 운동 시작을 통해 진행되고 있다. 대한민국에서는 한국 기독교 연구소(소장 김준우)에서 크로산 등의 신학 문서들을 출판하여, 역사적 예수에 대한 연구 성과들을 소개하고 있다. 2 | -------------------------------------------------------------------------------- /tests/Data/utf-8/2.txt: -------------------------------------------------------------------------------- 1 | 북조선 사람들은 흔히 자국을 조선(朝鮮)이나 공화국(共和國)이라고 부른다. 지역적으로 한반도의 남쪽(대한민국)을 의미하는 남조선(南朝鮮)에 대응하여 북조선(北朝鮮)이라고도 부른다. 2 | 3 | 대한민국에서는 남·북의 대치 상황과 맞물려 공식 명칭인 '조선민주주의인민공화국'을 잘 사용하지 않는다. 한국 사람들은 대개 ‘북한(北韓)’이라고 부르며, 짧게 ‘북’으로 부를 때도 있다. 나이가 많은 세대에서는 간혹 ‘이북(以北)’이라는 표현을 쓰기도 하며, 제한적으로 '북조선'이라 부르기도 한다. 과거 대한민국 정부에서는 북조선(북한)을 옛 소련의 괴뢰정권으로 비하하는 ‘북괴(北傀)’로 비칭하였으나 관계 개선과 함께 잘 쓰이지 않게 되었다.[3] 간혹 일부 반공주의 보수단체들은 북괴라는 표현을 쓰기도 한다. 4 | 5 | 대한민국에서는 조선이나 북조선이란 표현을 기피하며[출처 필요], 조선은 조선 왕조나 단군조선 등을 가리키는 말로 쓴다. 
남북간의 교류가 활성화되면서 특별한 색채가 없는 ‘북측’이나 ‘북쪽’이란 표현도 많이 쓰이고 있는 추세이다. 대한민국에서는 한국 전쟁 이전의 행정 구역인 평안남도·평안북도·함경남도·함경북도·황해도를 ‘이북 5도(以北五道)’라 부르기도 한다. 6 | 7 | 공식적인 영어 명칭은 DPRK(Democratic People's Republic of Korea)이며 보통 '조선반도(한반도) 북쪽'을 의미하는 North Korea라고 부른다. 8 | -------------------------------------------------------------------------------- /tests/Data/utf-8/3.txt: -------------------------------------------------------------------------------- 1 | \\\\\\\{ssss } siaaaaaaaaa ssssi à è ì 2 | -------------------------------------------------------------------------------- /tests/Data/utf-8/4.txt: -------------------------------------------------------------------------------- 1 | ======================================================================== 2 | コンソール アプリケーション : universalchardet プロジェクトの概要 3 | ======================================================================== 4 | 5 | この universalchardet アプリケーションは、AppWizard によって作成されました。 6 | 7 | このファイルには、universalchardet アプリケーションを構成する各ファイルの 8 | 内容の概略が記述されています。 9 | 10 | 11 | universalchardet.vcproj 12 | これは、アプリケーション ウィザードで生成される VC++ プロジェクトのメインの 13 | プロジェクト ファイルです。 14 | ファイルを生成した Visual C++ のバージョン情報と、アプリケーション 15 | ウィザードで選択したプラットフォーム、構成、およびプロジェクトの機能に関する 16 | 情報が記述されています。 17 | 18 | universalchardet.cpp 19 | これは、メインのアプリケーション ソース ファイルです。 20 | 21 | ///////////////////////////////////////////////////////////////////////////// 22 | その他の標準ファイル : 23 | 24 | StdAfx.h, StdAfx.cpp 25 | これらのファイルは、コンパイル済みヘッダー (PCH) ファイル 26 | universalchardet.pch とプリコンパイル済み型ファイル StdAfx.obj を 27 | ビルドするために使用します。 28 | 29 | ///////////////////////////////////////////////////////////////////////////// 30 | その他のメモ : 31 | 32 | AppWizard では "TODO:" コメントを使用して、ユーザーが追加またはカスタマイズする 33 | ソース部分を示します。 34 | 35 | ///////////////////////////////////////////////////////////////////////////// 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /tests/Data/utf-8/5.txt: 
-------------------------------------------------------------------------------- 1 | 仙人洞文化係話到萬年大源盆地一隻叫仙人洞嗰溶洞發現嗰史前文化。九十年代到許裡尋到嘍距今距今1萬年嗰穀植矽石標本,咁一吖子就搦人類栽禾嗰歷史提早嘍5000年,仙人洞遺跡也就成為世界頭上嗰「稻作之源」。 2 | 3 | 萬年仙人洞人種出世界首棵水稻 贛鄱是世界的稻作起源中心區 4 | 5 | 1隻分類: 江西嗰歷史 6 | -------------------------------------------------------------------------------- /tests/Data/utf-8/greek.txt: -------------------------------------------------------------------------------- 1 | Η ελληνική αποτελεί τη μητρική γλώσσα περίπου 12 εκατομμυρίων ανθρώπων, κυρίως στην Ελλάδα και στην Κύπρο. Αποτελεί επίσης την μητρική γλώσσα αυτοχθόνων πληθυσμών στην Αλβανία, στη Βουλγαρία, στην ΠΓΔΜ και στην Τουρκία. Εξαιτίας της μετανάστευσης η γλώσσα μιλιέται ακόμα σε χώρες-προορισμούς ελληνόφωνων πληθυσμών μεταξύ των οποίων η Αυστραλία, ο Καναδάς, η Γερμανία, το Ηνωμένο Βασίλειο, η Ρωσία, η Σερβία και οι Ηνωμένες Πολιτείες. Συνολικά υπολογίζεται ότι ο συνολικός αριθμός ανθρώπων που μιλάνε τα ελληνικά σαν πρώτη ή δεύτερη γλώσσα είναι γύρω στα 20 εκατομμύρια. 2 | -------------------------------------------------------------------------------- /tests/Data/utf-8/he1.txt: -------------------------------------------------------------------------------- 1 | השם עבר מופיע בתנ"ך כשמו של סבו של אברהם אבינו. המושג "עברי" נזכר בתנ"ך פעמים רבות, אולם שפתם של העברים אינה נקראת עברית. כיום מכנים את שפת התנ"ך "לשון המקרא" (או "לשון הקודש") כדי להבדיל אותה מלשון חז"ל המכונה גם "לשון חכמים", שהיא בעצם ניב מאוחר של עברית. המונח כתב עברי מציין בלשונם של חז"ל דווקא את הכתב הארמי על שם "עבר הנהר". 2 | 3 | הקובץ המפורסם ביותר שנכתב בשפה העברית הוא התנ"ך, אם כי בו עצמו לא נזכר שמה של השפה. עם זאת, במלכים ב' יח, כו, ובישעיהו לו, יא, מסופר כי שליחי חזקיהו המלך מבקשים מרבשקה, שליחו של סנחריב מלך אשור, לדבר עמם ב"ארמית" ולא ב"יהודית", כדי שהעם (שכנראה לא דיבר ארמית) לא יבין את דבריהם, ונראה שזה היה שמה של השפה, או לפחות שמו של הניב שדובר באזור ירושלים. 
4 | -------------------------------------------------------------------------------- /tests/Data/utf-8/he2.txt: -------------------------------------------------------------------------------- 1 | העברית היא שפה המשתייכת לקבוצת הלשונות השמיות הצפון מערביות, ומהווה את אחד הדיאלקטים של השפה הכנענית. שפה זו הייתה מדוברת החל מהאלף ה-2 לפני הספירה באזור הקרוי הלבנט, שהיום נמצא בשטחן של לבנון, סוריה, ארץ ישראל וירדן. טקסטים מהתקופה הזו שהתגלו בירדן ובלבנון חושפים קרבה רבה בין השפה העברית לשפה הפיניקית והמואבית. 2 | 3 | בעברית נכתבו רוב ספרי התנ"ך, כל המשנה, רוב הספרים החיצוניים ורוב המגילות הגנוזות. המקרא נכתב בעברית מקראית, ואילו המשנה נכתבה בניב הקרוי לשון חז"ל. בתקופה מסוימת בסוף המאה ה-2 לספירה או קצת מאוחר יותר (החוקרים חלוקים בשאלה זו) פסקו רוב היהודים מלהשתמש בעברית כבשפת דיבור. מאות שנים לאחר חתימת המשנה כאשר חדלו היהודים להשתמש בעברית כבר נכתבו התלמודים בארמית. עם זאת ישנן עדויות שאף במאה ה-8 לספירה שפת הדיבור בטבריה שם פעלו בעלי המסורה הייתה עברית. 4 | 5 | גם כשהשפה העברית לא שימשה שפת דיבור, עדיין שימשה לאורך הדורות, במה שמכונה תקופת הביניים של העברית, כשפת הכתב העיקרית של היהודים, בעיקר בעניינים הלכתיים: כתיבת פרוטוקולים של בתי דין, קובצי הלכות, פרשנות לכתבי קודש ועוד. גם כתיבת מכתבים וחוזים בין גברים יהודים נעשתה לעתים קרובות בעברית. ספרות הלכתית לנשים בקהילות אשכנזיות נכתבה ביידיש (למשל ספר ההלכות "צאינה וראינה"), כיוון שהנשים, בניגוד לגברים, לא למדו עברית. חיבורים יהודיים בעלי אופי חילוני או לא-הלכתי נכתבו בשפות יהודיות או בשפות זרות, לדוגמה: הרמב"ם כתב את ספרו "משנה תורה" בעברית, על אף שספרו הפילוסופי המפורסם "מורה נבוכים" שיועד למשכילי זמנו נכתב בערבית יהודית. עם זאת, "מורה נבוכים", כמו ספרים אחרים בנושאים חילוניים, תורגמו לעברית כשהיה בהם עניין לקהילות יהודיות דוברות שפות אחרות. אחת המשפחות היהודיות המפורסמות שעסקו בתרגום מערבית-יהודית לעברית בימי הביניים היא משפחת אבן תיבון. 
6 | -------------------------------------------------------------------------------- /tests/Data/utf-8/he3.txt: -------------------------------------------------------------------------------- 1 | אין כמעט ניבים אזוריים עבריים. למעשה, השפה הנשמעת בפי דוברים ילידיים זהה כמעט בכל חלקי ישראל. אפשר להבחין בשוני בין הניבים המדוברים בפי עדות יהודיות שונות (אתנולקטים), אולם שוני זה מתבטא בעיקר בפונולוגיה, ולא בתחביר או במורפולוגיה. שוני מסוים בתחביר ובמורפולוגיה קיים בין ניבים מעמדיים של השפה (סוציולקטים), אולם שוני זה אינו גדול (יחסית). 2 | -------------------------------------------------------------------------------- /tests/Data/utf-8/russian.txt: -------------------------------------------------------------------------------- 1 | В гимназии он не был в числе первых учеников (исключение составляли математика и латынь). Укоренившаяся система механического заучивания материала учащимися (которая, как он считал, наносит вред самому духу учёбы и творческому мышлению), а также авторитарное отношение учителей к ученикам вызывало у Альберта Эйнштейна неприятие, поэтому он часто вступал в споры со своими преподавателями. 2 | 3 | После окончательного разорения отца семейства в 1894 году Эйнштейны переехали из Мюнхена в итальянский город Павию, близ Милана. Сам Альберт оставался в Мюнхене ещё некоторое время, чтобы окончить все шесть классов гимназии. Так и не получив аттестата зрелости, в 1895 году он присоединился к своей семье в Милане. 4 | 5 | Осенью 1895 г. Альберт Эйнштейн прибыл в Швейцарию, чтобы сдать вступительные экзамены в Высшее техническое училище (Политехникум) в Цюрихе и стать преподавателем физики. Блестяще проявив себя на экзамене по математике, он в то же время провалил экзамены по ботанике и французскому языку, что не позволило ему поступить в Цюрихский Политехникум. Однако директор училища посоветовал молодому человеку поступить в выпускной класс школы в Аарау (Швейцария), чтобы получить аттестат и повторить поступление. 
6 | -------------------------------------------------------------------------------- /tests/Data/windows-1250/lang_ce_windows-1250.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1250/lang_ce_windows-1250.txt -------------------------------------------------------------------------------- /tests/Data/windows-1251/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1251/1.txt -------------------------------------------------------------------------------- /tests/Data/windows-1252 (latin1)/2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1252 (latin1)/2.txt -------------------------------------------------------------------------------- /tests/Data/windows-1253/lang_le_windows-1253.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1253/lang_le_windows-1253.txt -------------------------------------------------------------------------------- /tests/Data/windows-1255/he1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1255/he1.txt -------------------------------------------------------------------------------- /tests/Data/windows-1255/he2.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1255/he2.txt -------------------------------------------------------------------------------- /tests/Data/windows-1255/he3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1255/he3.txt -------------------------------------------------------------------------------- /tests/Data/windows-1256/lang_ar_windows-1256.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1256/lang_ar_windows-1256.txt -------------------------------------------------------------------------------- /tests/Data/windows-1257/lang_et_windows-1257.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1257/lang_et_windows-1257.txt -------------------------------------------------------------------------------- /tests/Data/windows-1258/lang_vi_windows-1258.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1258/lang_vi_windows-1258.txt -------------------------------------------------------------------------------- /tests/Data/x-mac-ce/lang_cs_mac-centraleurope.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/x-mac-ce/lang_cs_mac-centraleurope.txt 
-------------------------------------------------------------------------------- /tests/Data/x-mac-cyrillic/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/x-mac-cyrillic/1.txt -------------------------------------------------------------------------------- /tests/DataUnsupported/euc-tw/euc-tw1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/DataUnsupported/euc-tw/euc-tw1.txt -------------------------------------------------------------------------------- /tests/DataUnsupported/iso-8859-10/lang_lv_iso-8859-10.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/DataUnsupported/iso-8859-10/lang_lv_iso-8859-10.txt -------------------------------------------------------------------------------- /tests/DataUnsupported/iso-8859-16/lang_sl_iso-8859-16.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/DataUnsupported/iso-8859-16/lang_sl_iso-8859-16.txt -------------------------------------------------------------------------------- /tests/DataUnsupported/viscii/lang_vi_viscii.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/DataUnsupported/viscii/lang_vi_viscii.txt -------------------------------------------------------------------------------- /tests/DetectionDetailTests.cs: 
--------------------------------------------------------------------------------
using System.Collections.Generic;
using System.Linq;
using System.Reflection;
using NUnit.Framework;
using UtfUnknown.Core;

namespace UtfUnknown.Tests
{
    /// <summary>
    /// Tests for <c>DetectionDetail.GetEncoding</c>: every codepage name declared on
    /// <c>CodepageName</c> (except the ones listed as unsupported) must resolve to an
    /// encoding, and an unknown name must yield <c>null</c>.
    /// </summary>
    [TestFixture]
    public class DetectionDetailTests
    {
        /// <summary>
        /// Runs once per entry of <see cref="EncodingNames"/> and checks that the
        /// name resolves to a non-null encoding.
        /// </summary>
        /// <param name="codepageName">Codepage name taken from <c>CodepageName</c>.</param>
        [TestCaseSource(nameof(EncodingNames))]
        public void DetectionDetailGetEncodingIsNotNull(string codepageName)
        {
            var encoding = DetectionDetail.GetEncoding(codepageName);
            Assert.IsNotNull(encoding);
        }

        // Codepage names excluded from the "must resolve" test above.
        // NOTE: must stay declared before EncodingNames - static field initializers run
        // in textual order and EncodingNames reads this set while it is being built.
        private static readonly HashSet<string> UnsupportedEncodings = new HashSet<string>
        {
            CodepageName.ISO_8859_10,
            CodepageName.ISO_8859_16,
            CodepageName.EUC_TW,
            CodepageName.VISCII,
            CodepageName.X_ISO_10646_UCS_4_2143,
            CodepageName.X_ISO_10646_UCS_4_3412,
        };

        // Values of all non-public static fields of CodepageName, minus the unsupported
        // ones. BindingFlags.CreateInstance was dropped: it only applies to member
        // invocation and has no effect on a field lookup.
        private static readonly IReadOnlyList<string> EncodingNames = typeof(CodepageName)
            .GetFields(BindingFlags.NonPublic | BindingFlags.Static)
            .Select(x => x.GetValue(null).ToString())
            .Where(x => !UnsupportedEncodings.Contains(x))
            .ToList();

        /// <summary>
        /// An unknown encoding name must produce <c>null</c> rather than an exception.
        /// </summary>
        [Test]
        public void GetEncodingShouldHandleIncorrectEncoding()
        {
            // Arrange
            string encoding = "wrong";

            // Act
            var result = DetectionDetail.GetEncoding(encoding);

            // Assert
            Assert.IsNull(result);
        }
    }
}

--------------------------------------------------------------------------------
/tests/EncodingJsonConverter.cs:
--------------------------------------------------------------------------------
using System;
using System.Text;
using Newtonsoft.Json;

namespace UtfUnknown.Tests
{
    /// <summary>
    /// Json.NET converter that serializes an <see cref="Encoding"/> as its
    /// <see cref="Encoding.WebName"/> and reads it back via
    /// <see cref="Encoding.GetEncoding(string)"/>.
    /// </summary>
    public class EncodingJsonConverter : JsonConverter
    {
        /// <summary>Handles <see cref="Encoding"/> and every type derived from it.</summary>
        public override bool CanConvert(Type objectType)
        {
            return typeof(Encoding).IsAssignableFrom(objectType);
        }

        /// <summary>Writes the encoding's web name, or JSON <c>null</c> for a null value.</summary>
        public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer)
        {
            if (value == null)
            {
                // Defensive: avoids a NullReferenceException if the converter is called directly.
                writer.WriteNull();
                return;
            }

            writer.WriteValue(((Encoding)value).WebName);
        }

        /// <summary>
        /// Resolves the encoding named by the current token. A JSON <c>null</c> token
        /// yields <c>null</c> instead of passing null to <c>Encoding.GetEncoding</c>,
        /// which would throw <see cref="ArgumentNullException"/>.
        /// </summary>
        public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer)
        {
            if (reader.TokenType == JsonToken.Null || reader.Value == null)
            {
                return null;
            }

            return Encoding.GetEncoding((string)reader.Value);
        }
    }
}

--------------------------------------------------------------------------------
/tests/UTF-unknown.Tests.csproj:
--------------------------------------------------------------------------------
1 |  2 | 3 | 4 | net452;netcoreapp2.1;netcoreapp3.0 5 | UtfUnknown.Tests 6 | UtfUnknown.Tests 7 | true 8 | ..\src\UtfUnknown.snk 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
--------------------------------------------------------------------------------