├── .editorconfig
├── .gitattributes
├── .github
├── dependabot.yml
├── release-drafter.yml
└── workflows
│ ├── codeql-analysis.yml
│ └── main.yml
├── .gitignore
├── README.md
├── SECURITY.md
├── UTF-unknown.sln
├── UTF-unknown.sln.DotSettings
├── appveyor.yml
├── example
├── ConsoleExample.csproj
├── DetectFile.cs
└── app.config
├── license
├── MPL-1.1.txt
├── gpl-2.0.txt
└── lgpl-2.1.txt
├── logo.png
├── src
├── CharsetDetector.cs
├── Core
│ ├── Analyzers
│ │ ├── CharDistributionAnalyser.cs
│ │ └── MultiByte
│ │ │ ├── Chinese
│ │ │ ├── BIG5DistributionAnalyser.cs
│ │ │ ├── EUCTWDistributionAnalyser.cs
│ │ │ └── GB18030DistributionAnalyser.cs
│ │ │ ├── Japanese
│ │ │ ├── EUCJPContextAnalyser.cs
│ │ │ ├── EUCJPDistributionAnalyser.cs
│ │ │ ├── JapaneseContextAnalyser.cs
│ │ │ ├── SJISContextAnalyser.cs
│ │ │ └── SJISDistributionAnalyser.cs
│ │ │ └── Korean
│ │ │ └── EUCKRDistributionAnalyser.cs
│ ├── BitPackage.cs
│ ├── CodepageName.cs
│ ├── InputState.cs
│ ├── Models
│ │ ├── MultiByte
│ │ │ ├── Chinese
│ │ │ │ ├── BIG5SMModel.cs
│ │ │ │ ├── EUCTWSMModel.cs
│ │ │ │ ├── GB18030_SMModel.cs
│ │ │ │ ├── HZ_GB_2312_SMModel.cs
│ │ │ │ └── Iso_2022_CN_SMModel.cs
│ │ │ ├── Japanese
│ │ │ │ ├── EUCJPSMModel.cs
│ │ │ │ ├── Iso_2022_JP_SMModel.cs
│ │ │ │ └── SJIS_SMModel.cs
│ │ │ ├── Korean
│ │ │ │ ├── CP949SMModel.cs
│ │ │ │ ├── EUCKRSMModel.cs
│ │ │ │ └── Iso_2022_KR_SMModel.cs
│ │ │ ├── UCS2BE_SMModel.cs
│ │ │ ├── UCS2LE_SMModel.cs
│ │ │ └── UTF8_SMModel.cs
│ │ ├── SequenceModel.cs
│ │ ├── SingleByte
│ │ │ ├── Arabic
│ │ │ │ ├── ArabicModel.cs
│ │ │ │ ├── Iso_8859_6_ArabicModel.cs
│ │ │ │ └── Windows_1256_ArabicModel.cs
│ │ │ ├── Bulgarian
│ │ │ │ ├── BulgarianModel.cs
│ │ │ │ ├── Iso_8859_5_BulgarianModel.cs
│ │ │ │ └── Windows_1251_BulgarianModel.cs
│ │ │ ├── Croatian
│ │ │ │ ├── CroatianModel.cs
│ │ │ │ ├── Ibm852_CroatianModel.cs
│ │ │ │ ├── Iso_8859_13_CroatianModel.cs
│ │ │ │ ├── Iso_8859_16_CroatianModel.cs
│ │ │ │ ├── Iso_8859_2_CroatianModel.cs
│ │ │ │ ├── Mac_Centraleurope_CroatianModel.cs
│ │ │ │ └── Windows_1250_CroatianModel.cs
│ │ │ ├── Czech
│ │ │ │ ├── CzechModel.cs
│ │ │ │ ├── Ibm852_CzechModel.cs
│ │ │ │ ├── Iso_8859_2_CzechModel.cs
│ │ │ │ ├── Mac_Centraleurope_CzechModel.cs
│ │ │ │ └── Windows_1250_CzechModel.cs
│ │ │ ├── Danish
│ │ │ │ ├── DanishModel.cs
│ │ │ │ ├── Iso_8859_15_DanishModel.cs
│ │ │ │ ├── Iso_8859_1_DanishModel.cs
│ │ │ │ └── Windows_1252_DanishModel.cs
│ │ │ ├── Esperanto
│ │ │ │ ├── EsperantoModel.cs
│ │ │ │ └── Iso_8859_3_EsperantoModel.cs
│ │ │ ├── Estonian
│ │ │ │ ├── EstonianModel.cs
│ │ │ │ ├── Iso_8859_13_EstonianModel.cs
│ │ │ │ ├── Iso_8859_15_EstonianModel.cs
│ │ │ │ ├── Iso_8859_4_EstonianModel.cs
│ │ │ │ ├── Windows_1252_EstonianModel.cs
│ │ │ │ └── Windows_1257_EstonianModel.cs
│ │ │ ├── Finnish
│ │ │ │ ├── FinnishModel.cs
│ │ │ │ ├── Iso_8859_13_FinnishModel.cs
│ │ │ │ ├── Iso_8859_15_FinnishModel.cs
│ │ │ │ ├── Iso_8859_1_FinnishModel.cs
│ │ │ │ ├── Iso_8859_4_FinnishModel.cs
│ │ │ │ ├── Iso_8859_9_FinnishModel.cs
│ │ │ │ └── Windows_1252_FinnishModel.cs
│ │ │ ├── French
│ │ │ │ ├── FrenchModel.cs
│ │ │ │ ├── Iso_8859_15_FrenchModel.cs
│ │ │ │ ├── Iso_8859_1_FrenchModel.cs
│ │ │ │ └── Windows_1252_FrenchModel.cs
│ │ │ ├── German
│ │ │ │ ├── GermanModel.cs
│ │ │ │ ├── Iso_8859_1_GermanModel.cs
│ │ │ │ └── Windows_1252_GermanModel.cs
│ │ │ ├── Greek
│ │ │ │ ├── GreekModel.cs
│ │ │ │ ├── Iso_8859_7_GreekModel.cs
│ │ │ │ └── Windows_1253_GreekModel.cs
│ │ │ ├── Hebrew
│ │ │ │ ├── HebrewModel.cs
│ │ │ │ └── Windows_1255_HebrewModel.cs
│ │ │ ├── Hungarian
│ │ │ │ ├── HungarianModel.cs
│ │ │ │ ├── Iso_8859_2_HungarianModel.cs
│ │ │ │ └── Windows_1250_HungarianModel.cs
│ │ │ ├── Irish
│ │ │ │ ├── IrishModel.cs
│ │ │ │ ├── Iso_8859_15_IrishModel.cs
│ │ │ │ ├── Iso_8859_1_IrishModel.cs
│ │ │ │ ├── Iso_8859_9_IrishModel.cs
│ │ │ │ └── Windows_1252_IrishModel.cs
│ │ │ ├── Italian
│ │ │ │ ├── Iso_8859_15_ItalianModel.cs
│ │ │ │ ├── Iso_8859_1_ItalianModel.cs
│ │ │ │ ├── Iso_8859_3_ItalianModel.cs
│ │ │ │ ├── Iso_8859_9_ItalianModel.cs
│ │ │ │ ├── ItalianModel.cs
│ │ │ │ └── Windows_1252_ItalianModel.cs
│ │ │ ├── Latvian
│ │ │ │ ├── Iso_8859_10_LatvianModel.cs
│ │ │ │ ├── Iso_8859_13_LatvianModel.cs
│ │ │ │ ├── Iso_8859_4_LatvianModel.cs
│ │ │ │ └── LatvianModel.cs
│ │ │ ├── Lithuanian
│ │ │ │ ├── Iso_8859_10_LithuanianModel.cs
│ │ │ │ ├── Iso_8859_13_LithuanianModel.cs
│ │ │ │ ├── Iso_8859_4_LithuanianModel.cs
│ │ │ │ └── LithuanianModel.cs
│ │ │ ├── Maltese
│ │ │ │ ├── Iso_8859_3_MalteseModel.cs
│ │ │ │ └── MalteseModel.cs
│ │ │ ├── Polish
│ │ │ │ ├── Ibm852_PolishModel.cs
│ │ │ │ ├── Iso_8859_13_PolishModel.cs
│ │ │ │ ├── Iso_8859_16_PolishModel.cs
│ │ │ │ ├── Iso_8859_2_PolishModel.cs
│ │ │ │ ├── Mac_Centraleurope_PolishModel.cs
│ │ │ │ ├── PolishModel.cs
│ │ │ │ └── Windows_1250_PolishModel.cs
│ │ │ ├── Portuguese
│ │ │ │ ├── Iso_8859_15_PortugueseModel.cs
│ │ │ │ ├── Iso_8859_1_PortugueseModel.cs
│ │ │ │ ├── Iso_8859_9_PortugueseModel.cs
│ │ │ │ ├── PortugueseModel.cs
│ │ │ │ └── Windows_1252_PortugueseModel.cs
│ │ │ ├── Romanian
│ │ │ │ ├── Ibm852_RomanianModel.cs
│ │ │ │ ├── Iso_8859_16_RomanianModel.cs
│ │ │ │ ├── Iso_8859_2_RomanianModel.cs
│ │ │ │ ├── RomanianModel.cs
│ │ │ │ └── Windows_1250_RomanianModel.cs
│ │ │ ├── Russian
│ │ │ │ ├── Ibm855_RussianModel.cs
│ │ │ │ ├── Ibm866_RussianModel.cs
│ │ │ │ ├── Iso_8859_5_RussianModel.cs
│ │ │ │ ├── Koi8r_Model.cs
│ │ │ │ ├── RussianModel.cs
│ │ │ │ ├── Windows_1251_RussianModel.cs
│ │ │ │ └── X_Mac_Cyrillic_RussianModel.cs
│ │ │ ├── Slovak
│ │ │ │ ├── Ibm852_SlovakModel.cs
│ │ │ │ ├── Iso_8859_2_SlovakModel.cs
│ │ │ │ ├── Mac_Centraleurope_SlovakModel.cs
│ │ │ │ ├── SlovakModel.cs
│ │ │ │ └── Windows_1250_SlovakModel.cs
│ │ │ ├── Slovene
│ │ │ │ ├── Ibm852_SloveneModel.cs
│ │ │ │ ├── Iso_8859_16_SloveneModel.cs
│ │ │ │ ├── Iso_8859_2_SloveneModel.cs
│ │ │ │ ├── Mac_Centraleurope_SloveneModel.cs
│ │ │ │ ├── SloveneModel.cs
│ │ │ │ └── Windows_1250_SloveneModel.cs
│ │ │ ├── Spanish
│ │ │ │ ├── Iso_8859_15_SpanishModel.cs
│ │ │ │ ├── Iso_8859_1_SpanishModel.cs
│ │ │ │ ├── SpanishModel.cs
│ │ │ │ └── Windows_1252_SpanishModel.cs
│ │ │ ├── Swedish
│ │ │ │ ├── Iso_8859_15_SwedishModel.cs
│ │ │ │ ├── Iso_8859_1_SwedishModel.cs
│ │ │ │ ├── Iso_8859_4_SwedishModel.cs
│ │ │ │ ├── Iso_8859_9_SwedishModel.cs
│ │ │ │ ├── SwedishModel.cs
│ │ │ │ └── Windows_1252_SwedishModel.cs
│ │ │ ├── Thai
│ │ │ │ ├── Iso_8859_11_ThaiModel.cs
│ │ │ │ ├── ThaiModel.cs
│ │ │ │ └── Tis_620_ThaiModel.cs
│ │ │ ├── Turkish
│ │ │ │ ├── Iso_8859_3_TurkishModel.cs
│ │ │ │ ├── Iso_8859_9_TurkishModel.cs
│ │ │ │ └── TurkishModel.cs
│ │ │ └── Vietnamese
│ │ │ │ ├── VietnameseModel.cs
│ │ │ │ ├── Viscii_VietnameseModel.cs
│ │ │ │ └── Windows_1258_VietnameseModel.cs
│ │ └── StateMachineModel.cs
│ └── Probers
│ │ ├── CharsetProber.cs
│ │ ├── CodingStateMachine.cs
│ │ ├── EscCharsetProber.cs
│ │ ├── HebrewProber.cs
│ │ ├── Latin1Prober.cs
│ │ ├── MBCSGroupProber.cs
│ │ ├── MultiByte
│ │ ├── Chinese
│ │ │ ├── Big5Prober.cs
│ │ │ ├── EUCTWProber.cs
│ │ │ └── GB18030Prober.cs
│ │ ├── Japanese
│ │ │ ├── EUCJPProber.cs
│ │ │ └── SJISProber.cs
│ │ ├── Korean
│ │ │ ├── CP949Prober.cs
│ │ │ └── EUCKRProber.cs
│ │ └── UTF8Prober.cs
│ │ ├── ProbingState.cs
│ │ ├── SBCSGroupProber.cs
│ │ └── SingleByteCharSetProber.cs
├── DetectionDetail.cs
├── DetectionResult.cs
├── UTF-unknown.csproj
└── UtfUnknown.snk
└── tests
├── BitPackageTest.cs
├── CharsetDetectorTest.cs
├── CharsetDetectorTestBatch.cs
├── Data
├── README.md
├── big5
│ └── 1.txt
├── cp949
│ ├── cp949_1.txt
│ └── cp949_2.txt
├── euc-jp
│ └── 1.txt
├── euc-kr
│ ├── euc1.txt
│ └── euc2.txt
├── gb18030
│ └── 1.txt
├── ibm852
│ └── lang_ce_ibm852.txt
├── ibm855
│ ├── 1.txt
│ └── 2.txt
├── ibm866
│ └── 1.txt
├── iso-2022-jp
│ └── 1.txt
├── iso-2022-kr
│ ├── iso1.txt
│ └── iso2.txt
├── iso-8859-1
│ ├── 1.txt
│ ├── 3.txt
│ └── 4.txt
├── iso-8859-11
│ └── lang_th_iso-8859-11.txt
├── iso-8859-13
│ └── lang_et_iso-8859-13.txt
├── iso-8859-15
│ └── lang_da_iso-8859-15.txt
├── iso-8859-2
│ └── lang_ce_iso-8859-2.txt
├── iso-8859-3
│ └── lang_eo_iso-8859-3.txt
├── iso-8859-4
│ └── lang_et_iso-8859-4.txt
├── iso-8859-5
│ └── lang_ru_iso-8859-5.txt
├── iso-8859-6
│ └── lang_ar_iso-8859-6.txt
├── iso-8859-7
│ ├── greek.txt
│ └── lang_le_iso-8859-7.txt
├── iso-8859-9
│ └── lang_tr_iso-8859-9.txt
├── koi8-r
│ ├── 1.txt
│ └── 2.txt
├── shift-jis
│ ├── 1.txt
│ ├── 2.txt
│ ├── 3.txt
│ └── 4.txt
├── tis-620
│ └── lang_th_tis-620.txt
├── utf-16be
│ └── lang_fr_utf-16.be
├── utf-16le
│ └── lang_ko_utf-16.le
├── utf-32le
│ └── lang_fr_utf-32.le
├── utf-8
│ ├── 1.txt
│ ├── 2.txt
│ ├── 3.txt
│ ├── 4.txt
│ ├── 5.txt
│ ├── emoji.html.txt
│ ├── greek.txt
│ ├── he1.txt
│ ├── he2.txt
│ ├── he3.txt
│ └── russian.txt
├── windows-1250
│ └── lang_ce_windows-1250.txt
├── windows-1251
│ └── 1.txt
├── windows-1252 (latin1)
│ └── 2.txt
├── windows-1253
│ └── lang_le_windows-1253.txt
├── windows-1255
│ ├── he1.txt
│ ├── he2.txt
│ └── he3.txt
├── windows-1256
│ └── lang_ar_windows-1256.txt
├── windows-1257
│ └── lang_et_windows-1257.txt
├── windows-1258
│ └── lang_vi_windows-1258.txt
├── x-mac-ce
│ └── lang_cs_mac-centraleurope.txt
└── x-mac-cyrillic
│ └── 1.txt
├── DataUnsupported
├── euc-tw
│ └── euc-tw1.txt
├── iso-8859-10
│ └── lang_lv_iso-8859-10.txt
├── iso-8859-16
│ └── lang_sl_iso-8859-16.txt
└── viscii
│ └── lang_vi_viscii.txt
├── DetectionDetailTests.cs
├── EncodingJsonConverter.cs
└── UTF-unknown.Tests.csproj
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is needed for earlier builds of msysgit that do not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "nuget"
4 | directory: "/"
5 | schedule:
6 | interval: "daily"
7 |
--------------------------------------------------------------------------------
/.github/release-drafter.yml:
--------------------------------------------------------------------------------
1 | name-template: 'Version $NEXT_MINOR_VERSION'
2 | tag-template: 'v$NEXT_MINOR_VERSION'
3 | version-template: '$MAJOR.$MINOR'
4 | categories:
5 | - title: '🚀 Features'
6 | labels:
7 | - 'feature'
8 | - title: '👍 Enhancements'
9 | labels:
10 | - 'enhancement'
11 | - 'documentation'
12 | - title: '🐛 Bug Fixes'
13 | labels:
14 | - 'fix'
15 | - 'bugfix'
16 | - 'bug'
17 | - title: '🔧 Maintenance'
18 | labels:
19 | - 'refactor'
20 | - 'test'
21 | - 'tests'
22 | exclude-labels:
23 | - 'skip-changelog'
24 | - 'build'
25 | change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
26 | template: |
27 | $CHANGES
28 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ master ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ master ]
20 | schedule:
21 | - cron: '26 18 * * 6'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: [ 'csharp' ]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
37 | # Learn more:
38 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
39 |
40 | steps:
41 | - name: Checkout repository
42 | uses: actions/checkout@v2
43 |
44 | # Initializes the CodeQL tools for scanning.
45 | - name: Initialize CodeQL
46 | uses: github/codeql-action/init@v1
47 | with:
48 | languages: ${{ matrix.language }}
49 | # If you wish to specify custom queries, you can do so here or in a config file.
50 | # By default, queries listed here will override any specified in a config file.
51 | # Prefix the list here with "+" to use these queries and those in the config file.
52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
53 |
54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
55 | # If this step fails, then you should remove it and run the build manually (see below)
56 | - name: Autobuild
57 | uses: github/codeql-action/autobuild@v1
58 |
59 | # ℹ️ Command-line programs to run using the OS shell.
60 | # 📚 https://git.io/JvXDl
61 |
62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
63 | # and modify them (or add more) to build your code if your project
64 | # uses a compiled language
65 |
66 | #- run: |
67 | # make bootstrap
68 | # make release
69 |
70 | - name: Perform CodeQL Analysis
71 | uses: github/codeql-action/analyze@v1
72 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: Release Drafter
2 |
3 | on:
4 | push:
5 | # branches to consider in the event; optional, defaults to all
6 | branches:
7 | - master
8 |
9 | jobs:
10 | update_release_draft:
11 | runs-on: ubuntu-latest
12 | steps:
13 | # Drafts your next Release notes as Pull Requests are merged into "master"
14 | - uses: release-drafter/release-drafter@v5
15 | env:
16 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
17 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.sublime-*
2 | *.vshost.*
3 | log.txt
4 | NLogMerged.api.xml
5 | packages/
6 | StyleCop.Cache
7 | test-results/
8 |
9 | # User-specific files
10 | *.rsuser
11 | *.suo
12 | *.user
13 | *.userosscache
14 | *.sln.docstates
15 |
16 | # User-specific files (MonoDevelop/Xamarin Studio)
17 | *.userprefs
18 |
19 | # User-specific files (IntelliJ IDEA)
20 | .idea/
21 |
22 | # Build results
23 | [Dd]ebug/
24 | [Dd]ebugPublic/
25 | [Rr]elease/
26 | [Rr]eleases/
27 | x64/
28 | x86/
29 | bld/
30 | [Bb]in/
31 | [Oo]bj/
32 | [Ll]og/
33 | [Ll]ogs/
34 | [Bb]uild/
35 |
36 | # Visual Studio 2015/2017 cache/options directory
37 | .vs/
38 |
39 | # Visual Studio 2017 auto generated files
40 | Generated\ Files/
41 |
42 | # MSTest test Results
43 | [Tt]est[Rr]esult*/
44 | [Bb]uild[Ll]og.*
45 |
46 | # NUnit
47 | *.VisualState.xml
48 | TestResult.xml
49 | nunit-*.xml
50 |
51 | # StyleCop
52 | StyleCopReport.xml
53 |
54 | # Files built by Visual Studio
55 | *_i.c
56 | *_p.c
57 | *_h.h
58 | *.ilk
59 | *.meta
60 | *.obj
61 | *.iobj
62 | *.pch
63 | *.pdb
64 | *.ipdb
65 | *.pgc
66 | *.pgd
67 | *.rsp
68 | *.sbr
69 | *.tlb
70 | *.tli
71 | *.tlh
72 | *.tmp
73 | *.tmp_proj
74 | *_wpftmp.csproj
75 | *.log
76 | *.vspscc
77 | *.vssscc
78 | .builds
79 | *.pidb
80 | *.svclog
81 | *.scc
82 |
83 | # Visual Studio profiler
84 | *.psess
85 | *.vsp
86 | *.vspx
87 | *.sap
88 |
89 | # Visual Studio Trace Files
90 | *.e2e
91 |
92 | # ReSharper is a .NET coding add-in
93 | _ReSharper*/
94 | *.[Rr]e[Ss]harper
95 | *.DotSettings.user
96 |
97 | # JustCode is a .NET coding add-in
98 | .JustCode
99 |
100 | # DotCover is a Code Coverage Tool
101 | *.dotCover
102 |
103 | # AxoCover is a Code Coverage Tool
104 | .axoCover/*
105 | !.axoCover/settings.json
106 |
107 | # Visual Studio code coverage results
108 | *.coverage
109 | *.coveragexml
110 |
111 | # NuGet Packages
112 | *.nupkg
113 | # NuGet Symbol Packages
114 | *.snupkg
115 | # The packages folder can be ignored because of Package Restore
116 | **/[Pp]ackages/*
117 | # except build/, which is used as an MSBuild target.
118 | !**/[Pp]ackages/build/
119 | # Uncomment if necessary however generally it will be regenerated when needed
120 | #!**/[Pp]ackages/repositories.config
121 | # NuGet v3's project.json files produces more ignorable files
122 | *.nuget.props
123 | *.nuget.targets
124 |
125 | # Visual Studio cache files
126 | # files ending in .cache can be ignored
127 | *.[Cc]ache
128 | # but keep track of directories ending in .cache
129 | !?*.[Cc]ache/
130 |
131 | # Others
132 | ClientBin/
133 | ~$*
134 | *~
135 | *.dbmdl
136 | *.dbproj.schemaview
137 | *.jfm
138 | *.pfx
139 | *.publishsettings
140 | orleans.codegen.cs
141 |
142 | # Backup & report files from converting an old project file
143 | # to a newer Visual Studio version. Backup files are not needed,
144 | # because we have git ;-)
145 | _UpgradeReport_Files/
146 | Backup*/
147 | UpgradeLog*.XML
148 | UpgradeLog*.htm
149 | ServiceFabricBackup/
150 | *.rptproj.bak
151 |
152 | # MSBuild Binary and Structured Log
153 | *.binlog
154 |
155 | # MFractors (Xamarin productivity tool) working folder
156 | .mfractor/
157 |
158 | # Local History for Visual Studio
159 | .localhistory/
160 |
161 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
162 | MigrationBackup/
163 |
164 | # Ionide (cross platform F# VS Code tools) working folder
165 | .ionide/
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | ## Supported Versions
4 |
5 | Currently supported with security updates:
6 |
7 | | Version | Supported |
8 | | ------- | ------------------ |
9 | | 2.x.x | :white_check_mark: |
10 | | 1.x.x | :x: |
11 |
12 | ## Reporting a Vulnerability
13 |
14 | Please open a GitHub issue without disclosing any details of the vulnerability, and leave your email address so that I can reach you privately. I will contact you ASAP. Thanks!
15 |
--------------------------------------------------------------------------------
/UTF-unknown.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.2.32616.157
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UTF-unknown", "src\UTF-unknown.csproj", "{64CA7BA7-EFD9-4475-BB66-40B187622A73}"
7 | EndProject
8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ConsoleExample", "example\ConsoleExample.csproj", "{386C6ABF-44EA-4418-B90E-E8D21E4C2475}"
9 | EndProject
10 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UTF-unknown.Tests", "Tests\UTF-unknown.Tests.csproj", "{1922DCC9-A45F-4627-9087-CD492BBF7F38}"
11 | EndProject
12 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{052846B2-CA56-482F-B477-6E33523C091E}"
13 | ProjectSection(SolutionItems) = preProject
14 | .editorconfig = .editorconfig
15 | EndProjectSection
16 | EndProject
17 | Global
18 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
19 | Debug|Any CPU = Debug|Any CPU
20 | Release|Any CPU = Release|Any CPU
21 | EndGlobalSection
22 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
23 | {64CA7BA7-EFD9-4475-BB66-40B187622A73}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
24 | {64CA7BA7-EFD9-4475-BB66-40B187622A73}.Debug|Any CPU.Build.0 = Debug|Any CPU
25 | {64CA7BA7-EFD9-4475-BB66-40B187622A73}.Release|Any CPU.ActiveCfg = Release|Any CPU
26 | {64CA7BA7-EFD9-4475-BB66-40B187622A73}.Release|Any CPU.Build.0 = Release|Any CPU
27 | {386C6ABF-44EA-4418-B90E-E8D21E4C2475}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
28 | {386C6ABF-44EA-4418-B90E-E8D21E4C2475}.Debug|Any CPU.Build.0 = Debug|Any CPU
29 | {386C6ABF-44EA-4418-B90E-E8D21E4C2475}.Release|Any CPU.ActiveCfg = Release|Any CPU
30 | {386C6ABF-44EA-4418-B90E-E8D21E4C2475}.Release|Any CPU.Build.0 = Release|Any CPU
31 | {1922DCC9-A45F-4627-9087-CD492BBF7F38}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
32 | {1922DCC9-A45F-4627-9087-CD492BBF7F38}.Debug|Any CPU.Build.0 = Debug|Any CPU
33 | {1922DCC9-A45F-4627-9087-CD492BBF7F38}.Release|Any CPU.ActiveCfg = Release|Any CPU
34 | {1922DCC9-A45F-4627-9087-CD492BBF7F38}.Release|Any CPU.Build.0 = Release|Any CPU
35 | EndGlobalSection
36 | GlobalSection(SolutionProperties) = preSolution
37 | HideSolutionNode = FALSE
38 | EndGlobalSection
39 | GlobalSection(ExtensibilityGlobals) = postSolution
40 | SolutionGuid = {0C7AF656-EF20-4880-8EB9-9BF101340A03}
41 | EndGlobalSection
42 | EndGlobal
43 |
--------------------------------------------------------------------------------
/UTF-unknown.sln.DotSettings:
--------------------------------------------------------------------------------
1 |
2 | True
3 | True
4 | True
5 | True
--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
1 | environment:
2 | info_version: 2.5.1
3 |
4 | version: 2.0.{build}
5 |
6 | clone_folder: c:\utfUnknown
7 | image: Visual Studio 2019
8 | configuration: Release
9 | platform: Any CPU
10 | nuget:
11 | project_feed: true
12 | init:
13 | - git config --global core.autocrlf true
14 | build_script:
15 | - ps: dotnet build -c Release
16 | test_script:
17 | - ps: cd .\tests\
18 | - ps: dotnet test
19 | - ps: cd ..
20 | after_build:
21 | - ps: msbuild -t:Pack .\src\ -p:Configuration=Release -p:IncludeSymbols=true -p:SymbolPackageFormat=snupkg -p:ContinuousIntegrationBuild=true -p:EmbedUntrackedSources=true -verbosity:minimal
22 | - ps: Get-ChildItem '**\test-diag.log' | % { Push-AppveyorArtifact $_.FullName -FileName $_.Name }
23 | artifacts:
24 | - path: '**\*.nupkg'
25 | - path: '**\*.snupkg'
26 | - path: '**\test-diag.log'
27 | dotnet_csproj:
28 | patch: true
29 | file: '**\*.csproj'
30 | version: $(info_version)
31 | package_version: $(info_version)
32 | assembly_version: 2.0.0.0
33 | file_version: '{version}'
34 | informational_version: $(info_version)
35 | deploy:
36 | - provider: NuGet
37 | api_key:
38 | secure: iB1GljKDgO1ynQjVNyXRQY1Ib3nOuCvV+UkPGK44U/5tIhKvZm7ZSgUG5CeQj2/z
39 | on:
40 | branch: master
41 |
--------------------------------------------------------------------------------
/example/ConsoleExample.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | netcoreapp3.0
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/example/DetectFile.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.IO;
3 | using UtfUnknown;
4 |
namespace ConsoleExample
{
    /// <summary>
    /// Minimal console front end for the charset detector.
    /// </summary>
    public class DetectFile
    {
        /// <summary>
        /// Command line example: detects the encoding of the given file.
        /// </summary>
        /// <param name="args">
        /// Command line arguments; the first element is the name of the file to inspect.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                // Name the expected argument so the usage hint is actually actionable.
                Console.WriteLine("Usage: ConsoleExample <filename>");
                return;
            }

            var filename = args[0];
            if (!File.Exists(filename))
            {
                Console.WriteLine($"File not found: {filename}");
                return;
            }

            var result = CharsetDetector.DetectFromFile(filename);

            // Detected is null when the detector could not settle on any encoding.
            var message = result.Detected != null
                ? $"Detected encoding {result.Detected.Encoding.WebName} with confidence {result.Detected.Confidence}."
                : $"Detection failed: {filename}";
            Console.WriteLine(message);
        }
    }
}
36 |
--------------------------------------------------------------------------------
/example/app.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/logo.png
--------------------------------------------------------------------------------
/src/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core.Analyzers.Japanese
{
    /// <summary>
    /// Context analyser for EUC-JP input: reports the byte length of the character
    /// at a given position and, for hiragana, its zero-based order.
    /// </summary>
    public class EUCJPContextAnalyser : JapaneseContextAnalyser
    {
        private const byte HIRAGANA_FIRST_BYTE = 0xA4;

        // EUC-JP single-shift lead bytes: SS2 starts a 2-byte sequence,
        // SS3 starts a 3-byte sequence.
        private const byte SINGLE_SHIFT_2 = 0x8E;
        private const byte SINGLE_SHIFT_3 = 0x8F;

        /// <summary>
        /// Determines the byte length of the character starting at <paramref name="offset"/>
        /// and returns its hiragana order.
        /// </summary>
        /// <param name="buf">Buffer holding the EUC-JP encoded bytes.</param>
        /// <param name="offset">Index of the character's first byte.</param>
        /// <param name="charLen">Receives the character length in bytes (1, 2 or 3).</param>
        /// <returns>
        /// The hiragana order (trail byte minus 0xA1) or -1 when the character is not hiragana.
        /// </returns>
        protected override int GetOrder(byte[] buf, int offset, out int charLen)
        {
            byte high = buf[offset];

            // Find out the current char's byte length.
            if (high == SINGLE_SHIFT_2 || high >= 0xA1 && high <= 0xFE)
            {
                charLen = 2;
            }
            else if (high == SINGLE_SHIFT_3)
            {
                // The previous check against 0xBF could never match: 0xBF is already
                // covered by the 0xA1..0xFE two-byte range above. The three-byte lead
                // byte in EUC-JP is SS3 (0x8F).
                charLen = 3;
            }
            else
            {
                charLen = 1;
            }

            return HiraganaOrder(buf, offset);
        }

        /// <summary>
        /// Returns the hiragana order of the character starting at <paramref name="offset"/>.
        /// </summary>
        /// <param name="buf">Buffer holding the EUC-JP encoded bytes.</param>
        /// <param name="offset">Index of the character's first byte.</param>
        /// <returns>
        /// The hiragana order (trail byte minus 0xA1) or -1 when the character is not hiragana.
        /// </returns>
        protected override int GetOrder(byte[] buf, int offset)
        {
            return HiraganaOrder(buf, offset);
        }

        /// <summary>
        /// Shared hiragana lookup: only lead byte 0xA4 with a trail byte in 0xA1..0xF3 counts.
        /// </summary>
        private static int HiraganaOrder(byte[] buf, int offset)
        {
            // We are only interested in hiragana. A lead byte sitting on the very last
            // position of the buffer has no trail byte to inspect, so it is reported
            // as "not hiragana" instead of reading past the end of the array.
            if (buf[offset] == HIRAGANA_FIRST_BYTE && offset + 1 < buf.Length)
            {
                byte low = buf[offset + 1];
                if (low >= 0xA1 && low <= 0xF3)
                {
                    return low - 0xA1;
                }
            }

            return -1;
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core.Analyzers.Japanese
{
    public class EUCJPDistributionAnalyser : SJISDistributionAnalyser
    {
        /// <summary>
        /// Computes the distribution order of the two-byte character at
        /// <paramref name="offset"/>.
        /// First byte range: 0xa0 -- 0xfe, second byte range: 0xa1 -- 0xfe.
        /// No validation is performed here; the state machine has already done that.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded bytes.</param>
        /// <param name="offset">Index of the character's first byte.</param>
        /// <returns>
        /// 94 * (first byte - 0xA1) + (second byte - 0xA1), or -1 when the first byte
        /// is below 0xA0.
        /// </returns>
        public override int GetOrder(byte[] buf, int offset)
        {
            byte lead = buf[offset];
            if (lead < 0xA0)
            {
                return -1;
            }

            // Each lead byte selects a row of 94 cells; the trail byte selects the cell.
            int row = lead - 0xA1;
            int cell = buf[offset + 1] - 0xA1;
            return 94 * row + cell;
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Analyzers/MultiByte/Japanese/SJISContextAnalyser.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core.Analyzers.Japanese
{
    /// <summary>
    /// Context analyser for Shift-JIS input: reports the byte length of the character
    /// at a given position and, for hiragana, its zero-based order.
    /// </summary>
    public class SJISContextAnalyser : JapaneseContextAnalyser
    {
        private const byte HIRAGANA_FIRST_BYTE = 0x82;

        /// <summary>
        /// Determines the byte length of the character starting at <paramref name="offset"/>
        /// and returns its hiragana order.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded bytes.</param>
        /// <param name="offset">Index of the character's first byte.</param>
        /// <param name="charLen">Receives the character length in bytes (1 or 2).</param>
        /// <returns>
        /// The hiragana order (trail byte minus 0x9F) or -1 when the character is not hiragana.
        /// </returns>
        protected override int GetOrder(byte[] buf, int offset, out int charLen)
        {
            byte lead = buf[offset];

            // Lead bytes in 0x81..0x9F and 0xE0..0xFC start a two-byte character.
            bool startsDoubleByte = (lead >= 0x81 && lead <= 0x9F)
                                    || (lead >= 0xe0 && lead <= 0xFC);
            charLen = startsDoubleByte ? 2 : 1;

            return HiraganaOrder(buf, offset);
        }

        /// <summary>
        /// Returns the hiragana order of the character starting at <paramref name="offset"/>.
        /// </summary>
        /// <param name="buf">Buffer holding the encoded bytes.</param>
        /// <param name="offset">Index of the character's first byte.</param>
        /// <returns>
        /// The hiragana order (trail byte minus 0x9F) or -1 when the character is not hiragana.
        /// </returns>
        protected override int GetOrder(byte[] buf, int offset)
        {
            return HiraganaOrder(buf, offset);
        }

        /// <summary>
        /// Shared hiragana lookup: only lead byte 0x82 with a trail byte in 0x9F..0xF1 counts.
        /// </summary>
        private static int HiraganaOrder(byte[] buf, int offset)
        {
            if (buf[offset] != HIRAGANA_FIRST_BYTE)
            {
                return -1;
            }

            byte trail = buf[offset + 1];
            return trail >= 0x9F && trail <= 0xF1 ? trail - 0x9F : -1;
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/BitPackage.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Kohei TAKETA (Java port)
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
namespace UtfUnknown.Core
{
    /// <summary>
    /// Read-only accessor for an <c>int[]</c> whose 32-bit elements each hold
    /// several fixed-width units (4, 8 or 16 bits wide).
    /// </summary>
    /// <remarks>
    /// Unit <c>i</c> is read from element <c>i &gt;&gt; indexShift</c>, at bit offset
    /// <c>(i &amp; shiftMask) &lt;&lt; bitShift</c>, and masked with <c>unitMask</c>.
    /// The <c>*_4BITS</c>, <c>*_8BITS</c> and <c>*_16BITS</c> fields are matching
    /// parameter sets for the layouts produced by <see cref="Pack4bits"/>,
    /// <see cref="Pack8bits"/> and <see cref="Pack16bits"/>.
    /// </remarks>
    public class BitPackage
    {
        // The layout parameters are read-only: they describe fixed bit layouts,
        // and a writable public static would let any caller silently corrupt
        // every lookup in the process.

        // Right shift that turns a unit index into an array index
        // (8, 4 and 2 units per int respectively).
        public static readonly int INDEX_SHIFT_4BITS = 3;
        public static readonly int INDEX_SHIFT_8BITS = 2;
        public static readonly int INDEX_SHIFT_16BITS = 1;

        // Mask that extracts the position of a unit inside its int.
        public static readonly int SHIFT_MASK_4BITS = 7;
        public static readonly int SHIFT_MASK_8BITS = 3;
        public static readonly int SHIFT_MASK_16BITS = 1;

        // Left shift that turns that position into a bit offset
        // (position * 4, * 8 and * 16 respectively).
        public static readonly int BIT_SHIFT_4BITS = 2;
        public static readonly int BIT_SHIFT_8BITS = 3;
        public static readonly int BIT_SHIFT_16BITS = 4;

        // Mask that keeps exactly one unit after shifting.
        public static readonly int UNIT_MASK_4BITS = 0x0000000F;
        public static readonly int UNIT_MASK_8BITS = 0x000000FF;
        public static readonly int UNIT_MASK_16BITS = 0x0000FFFF;

        private readonly int indexShift;
        private readonly int shiftMask;
        private readonly int bitShift;
        private readonly int unitMask;
        private readonly int[] data;

        /// <summary>
        /// Creates an accessor over <paramref name="data"/> using the given layout.
        /// </summary>
        /// <param name="indexShift">Right shift applied to a unit index to get the array index.</param>
        /// <param name="shiftMask">Mask applied to a unit index to get its position inside the int.</param>
        /// <param name="bitShift">Left shift applied to that position to get the bit offset.</param>
        /// <param name="unitMask">Mask applied after shifting to isolate one unit.</param>
        /// <param name="data">The packed values; the array is referenced, not copied.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="data"/> is null.</exception>
        public BitPackage(int indexShift, int shiftMask,
            int bitShift, int unitMask, int[] data)
        {
            // Fail here rather than with a NullReferenceException on the first Unpack.
            if (data == null)
            {
                throw new System.ArgumentNullException("data");
            }

            this.indexShift = indexShift;
            this.shiftMask = shiftMask;
            this.bitShift = bitShift;
            this.unitMask = unitMask;
            this.data = data;
        }

        /// <summary>
        /// Packs two 16-bit values into one int: <paramref name="a"/> in the low
        /// half, <paramref name="b"/> in the high half.
        /// </summary>
        /// <returns><c>(b &lt;&lt; 16) | a</c>; the arguments are not masked.</returns>
        public static int Pack16bits(int a, int b)
        {
            return ((b << 16) | a);
        }

        /// <summary>
        /// Packs four 8-bit values into one int, <paramref name="a"/> in the
        /// lowest byte through <paramref name="d"/> in the highest.
        /// </summary>
        public static int Pack8bits(int a, int b, int c, int d)
        {
            return Pack16bits((b << 8) | a, (d << 8) | c);
        }

        /// <summary>
        /// Packs eight 4-bit values into one int, <paramref name="a"/> in the
        /// lowest nibble through <paramref name="h"/> in the highest.
        /// </summary>
        public static int Pack4bits(int a, int b, int c, int d,
            int e, int f, int g, int h)
        {
            return Pack8bits((b << 4) | a, (d << 4) | c,
                (f << 4) | e, (h << 4) | g);
        }

        /// <summary>
        /// Returns the unit stored at logical index <paramref name="i"/>.
        /// </summary>
        /// <param name="i">Index of the unit, counted across the whole array.</param>
        /// <returns>The unit value, already masked with the unit mask.</returns>
        public int Unpack(int i)
        {
            int word = data[i >> indexShift];
            int bitOffset = (i & shiftMask) << bitShift;

            // >> on int is an arithmetic shift; the mask discards any
            // sign-extension bits when the top unit of a word is read.
            return (word >> bitOffset) & unitMask;
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/InputState.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core
{
    /// <summary>
    /// Classification of the input bytes, as described on each member.
    /// </summary>
    internal enum InputState
    {
        /// <summary>
        /// Plain ASCII. Being value 0, this is also the default value of the enum.
        /// </summary>
        PureASCII = 0,

        /// <summary>
        /// Found escape character or HZ "~{"
        /// </summary>
        EscASCII = 1,

        /// <summary>
        /// Non-ASCII byte (high byte)
        /// </summary>
        Highbyte = 2
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/MultiByte/Chinese/BIG5SMModel.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core.Models.MultiByte.Chinese
{
    /// <summary>
    /// Table data for <see cref="CodepageName.BIG5"/>, handed to the
    /// <see cref="StateMachineModel"/> base constructor.
    /// </summary>
    public class BIG5SMModel : StateMachineModel
    {
        // One 4-bit class id per byte value, eight ids per packed int;
        // the row comments give the byte range each row covers.
        private static readonly int[] ByteClasses =
        {
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17
            BitPackage.Pack4bits(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 1), // 78 - 7f
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 80 - 87
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 88 - 8f
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 90 - 97
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 98 - 9f
            BitPackage.Pack4bits(4, 3, 3, 3, 3, 3, 3, 3), // a0 - a7
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // a8 - af
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // b0 - b7
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // b8 - bf
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // c0 - c7
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // c8 - cf
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // d0 - d7
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // d8 - df
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // e0 - e7
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // e8 - ef
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // f0 - f7
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 0)  // f8 - ff
        };

        // Packed 4-bit transition entries (ERROR, START, ITSME or a numeric state);
        // the row comments give the entry indices.
        // NOTE(review): how the base class indexes this table is not visible here.
        private static readonly int[] Transitions =
        {
            BitPackage.Pack4bits(ERROR, START, START, 3, ERROR, ERROR, ERROR, ERROR),     // 00-07
            BitPackage.Pack4bits(ERROR, ERROR, ITSME, ITSME, ITSME, ITSME, ITSME, ERROR), // 08-0f
            BitPackage.Pack4bits(ERROR, START, START, START, START, START, START, START)  // 10-17
        };

        // One entry per class id used in ByteClasses.
        private static readonly int[] CharLengths = { 0, 1, 1, 2, 0 };

        /// <summary>
        /// Builds the model from the static tables above.
        /// </summary>
        public BIG5SMModel()
            : base(
                FourBitTable(ByteClasses),
                5, // same as the number of entries in CharLengths
                FourBitTable(Transitions),
                CharLengths,
                CodepageName.BIG5)
        {
        }

        // Wraps a packed table in a BitPackage that reads it as 4-bit units.
        private static BitPackage FourBitTable(int[] packed)
        {
            return new BitPackage(
                BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS,
                packed);
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/MultiByte/Chinese/EUCTWSMModel.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core.Models.MultiByte.Chinese
{
    /// <summary>
    /// Table data for <see cref="CodepageName.EUC_TW"/>, handed to the
    /// <see cref="StateMachineModel"/> base constructor.
    /// </summary>
    public class EUCTWSMModel : StateMachineModel
    {
        // One 4-bit class id per byte value, eight ids per packed int;
        // the row comments give the byte range each row covers.
        private static readonly int[] ByteClasses =
        {
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 00 - 07
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 0, 0), // 08 - 0f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 10 - 17
            BitPackage.Pack4bits(2, 2, 2, 0, 2, 2, 2, 2), // 18 - 1f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 20 - 27
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 28 - 2f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 30 - 37
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 38 - 3f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 78 - 7f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 80 - 87
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 6, 0), // 88 - 8f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 90 - 97
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 98 - 9f
            BitPackage.Pack4bits(0, 3, 4, 4, 4, 4, 4, 4), // a0 - a7
            BitPackage.Pack4bits(5, 5, 1, 1, 1, 1, 1, 1), // a8 - af
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // b0 - b7
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // b8 - bf
            BitPackage.Pack4bits(1, 1, 3, 1, 3, 3, 3, 3), // c0 - c7
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // c8 - cf
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // d0 - d7
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // d8 - df
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // e0 - e7
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // e8 - ef
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // f0 - f7
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 0)  // f8 - ff
        };

        // Packed 4-bit transition entries (ERROR, START, ITSME or a numeric state);
        // the row comments give the entry indices.
        // NOTE(review): how the base class indexes this table is not visible here.
        private static readonly int[] Transitions =
        {
            BitPackage.Pack4bits(ERROR, ERROR, START, 3, 3, 3, 4, ERROR),                 // 00-07
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ERROR, ERROR, ITSME, ITSME), // 08-0f
            BitPackage.Pack4bits(ITSME, ITSME, ITSME, ITSME, ITSME, ERROR, START, ERROR), // 10-17
            BitPackage.Pack4bits(START, START, START, ERROR, ERROR, ERROR, ERROR, ERROR), // 18-1f
            BitPackage.Pack4bits(5, ERROR, ERROR, ERROR, START, ERROR, START, START),     // 20-27
            BitPackage.Pack4bits(START, ERROR, START, START, START, START, START, START)  // 28-2f
        };

        // One entry per class id used in ByteClasses.
        private static readonly int[] CharLengths = { 0, 0, 1, 2, 2, 2, 3 };

        /// <summary>
        /// Builds the model from the static tables above.
        /// </summary>
        public EUCTWSMModel()
            : base(
                FourBitTable(ByteClasses),
                7, // same as the number of entries in CharLengths
                FourBitTable(Transitions),
                CharLengths,
                CodepageName.EUC_TW)
        {
        }

        // Wraps a packed table in a BitPackage that reads it as 4-bit units.
        private static BitPackage FourBitTable(int[] packed)
        {
            return new BitPackage(
                BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS,
                packed);
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/MultiByte/Chinese/GB18030_SMModel.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core.Models.MultiByte.Chinese
{
    /// <summary>
    /// Table data for <see cref="CodepageName.GB18030"/>, handed to the
    /// <see cref="StateMachineModel"/> base constructor.
    /// </summary>
    public class GB18030_SMModel : StateMachineModel
    {
        // One 4-bit class id per byte value, eight ids per packed int;
        // the row comments give the byte range each row covers.
        private static readonly int[] ByteClasses =
        {
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17
            BitPackage.Pack4bits(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // 30 - 37
            BitPackage.Pack4bits(3, 3, 1, 1, 1, 1, 1, 1), // 38 - 3f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 4), // 78 - 7f
            BitPackage.Pack4bits(5, 6, 6, 6, 6, 6, 6, 6), // 80 - 87
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // 88 - 8f
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // 90 - 97
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // 98 - 9f
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // a0 - a7
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // a8 - af
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // b0 - b7
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // b8 - bf
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // c0 - c7
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // c8 - cf
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // d0 - d7
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // d8 - df
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // e0 - e7
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // e8 - ef
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 6), // f0 - f7
            BitPackage.Pack4bits(6, 6, 6, 6, 6, 6, 6, 0)  // f8 - ff
        };

        // Packed 4-bit transition entries (ERROR, START, ITSME or a numeric state);
        // the row comments give the entry indices.
        // NOTE(review): how the base class indexes this table is not visible here.
        private static readonly int[] Transitions =
        {
            BitPackage.Pack4bits(ERROR, START, START, START, START, START, 3, ERROR),     // 00-07
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ERROR, ERROR, ITSME, ITSME), // 08-0f
            BitPackage.Pack4bits(ITSME, ITSME, ITSME, ITSME, ITSME, ERROR, ERROR, START), // 10-17
            BitPackage.Pack4bits(4, ERROR, START, START, ERROR, ERROR, ERROR, ERROR),     // 18-1f
            BitPackage.Pack4bits(ERROR, ERROR, 5, ERROR, ERROR, ERROR, ITSME, ERROR),     // 20-27
            BitPackage.Pack4bits(ERROR, ERROR, START, START, START, START, START, START)  // 28-2f
        };

        // Strictly speaking a class 6 sequence is either 2 or 4 bytes long.
        // The length is only consumed by frequency analysis, which validates
        // each code range on its own, so recording 2 for class 6 is safe.
        private static readonly int[] CharLengths = { 0, 1, 1, 1, 1, 1, 2 };

        /// <summary>
        /// Builds the model from the static tables above.
        /// </summary>
        public GB18030_SMModel()
            : base(
                FourBitTable(ByteClasses),
                7, // same as the number of entries in CharLengths
                FourBitTable(Transitions),
                CharLengths,
                CodepageName.GB18030)
        {
        }

        // Wraps a packed table in a BitPackage that reads it as 4-bit units.
        private static BitPackage FourBitTable(int[] packed)
        {
            return new BitPackage(
                BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS,
                packed);
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/MultiByte/Chinese/Iso_2022_CN_SMModel.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core.Models.MultiByte.Chinese
{
    /// <summary>
    /// Table data for <see cref="CodepageName.ISO_2022_CN"/>, handed to the
    /// <see cref="StateMachineModel"/> base constructor.
    /// </summary>
    public class Iso_2022_CN_SMModel : StateMachineModel
    {
        // One 4-bit class id per byte value, eight ids per packed int;
        // the row comments give the byte range each row covers.
        private static readonly int[] ByteClasses =
        {
            BitPackage.Pack4bits(2, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 08 - 0f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
            BitPackage.Pack4bits(0, 0, 0, 1, 0, 0, 0, 0), // 18 - 1f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 20 - 27
            BitPackage.Pack4bits(0, 3, 0, 0, 0, 0, 0, 0), // 28 - 2f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
            BitPackage.Pack4bits(0, 0, 0, 4, 0, 0, 0, 0), // 40 - 47
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 80 - 87
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 88 - 8f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 90 - 97
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 98 - 9f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2)  // f8 - ff
        };

        // Packed 4-bit transition entries (ERROR, START, ITSME or a numeric state);
        // the row comments give the entry indices.
        // NOTE(review): how the base class indexes this table is not visible here.
        private static readonly int[] Transitions =
        {
            BitPackage.Pack4bits(START, 3, ERROR, START, START, START, START, START),     // 00-07
            BitPackage.Pack4bits(START, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR), // 08-0f
            BitPackage.Pack4bits(ERROR, ERROR, ITSME, ITSME, ITSME, ITSME, ITSME, ITSME), // 10-17
            BitPackage.Pack4bits(ITSME, ITSME, ITSME, ERROR, ERROR, ERROR, 4, ERROR),     // 18-1f
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ITSME, ERROR, ERROR, ERROR, ERROR), // 20-27
            BitPackage.Pack4bits(5, 6, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR),         // 28-2f
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ITSME, ERROR, ERROR, ERROR, ERROR), // 30-37
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ERROR, ITSME, ERROR, START)  // 38-3f
        };

        // All nine entries are zero for this model.
        private static readonly int[] CharLengths = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };

        /// <summary>
        /// Builds the model from the static tables above.
        /// </summary>
        public Iso_2022_CN_SMModel()
            : base(
                FourBitTable(ByteClasses),
                9, // same as the number of entries in CharLengths
                FourBitTable(Transitions),
                CharLengths,
                CodepageName.ISO_2022_CN)
        {
        }

        // Wraps a packed table in a BitPackage that reads it as 4-bit units.
        private static BitPackage FourBitTable(int[] packed)
        {
            return new BitPackage(
                BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS,
                packed);
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/MultiByte/Japanese/EUCJPSMModel.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core.Models.MultiByte.Japanese
{
    /// <summary>
    /// Table data for <see cref="CodepageName.EUC_JP"/>, handed to the
    /// <see cref="StateMachineModel"/> base constructor.
    /// </summary>
    public class EUCJPSMModel : StateMachineModel
    {
        // One 4-bit class id per byte value, eight ids per packed int;
        // the row comments give the byte range each row covers.
        private static readonly int[] ByteClasses =
        {
            // An older variant of the first row gave byte 0x00 class 5 instead of 4.
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 00 - 07
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 5, 5), // 08 - 0f
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 10 - 17
            BitPackage.Pack4bits(4, 4, 4, 5, 4, 4, 4, 4), // 18 - 1f
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 20 - 27
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 28 - 2f
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 30 - 37
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 38 - 3f
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 40 - 47
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 48 - 4f
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 50 - 57
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 58 - 5f
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 60 - 67
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 68 - 6f
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 70 - 77
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // 78 - 7f
            BitPackage.Pack4bits(5, 5, 5, 5, 5, 5, 5, 5), // 80 - 87
            BitPackage.Pack4bits(5, 5, 5, 5, 5, 5, 1, 3), // 88 - 8f
            BitPackage.Pack4bits(5, 5, 5, 5, 5, 5, 5, 5), // 90 - 97
            BitPackage.Pack4bits(5, 5, 5, 5, 5, 5, 5, 5), // 98 - 9f
            BitPackage.Pack4bits(5, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // e0 - e7
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // e8 - ef
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // f0 - f7
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 5)  // f8 - ff
        };

        // Packed 4-bit transition entries (ERROR, START, ITSME or a numeric state);
        // the row comments give the entry indices.
        // NOTE(review): how the base class indexes this table is not visible here.
        private static readonly int[] Transitions =
        {
            BitPackage.Pack4bits(3, 4, 3, 5, START, ERROR, ERROR, ERROR),                 // 00-07
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME), // 08-0f
            BitPackage.Pack4bits(ITSME, ITSME, START, ERROR, START, ERROR, ERROR, ERROR), // 10-17
            BitPackage.Pack4bits(ERROR, ERROR, START, ERROR, ERROR, ERROR, 3, ERROR),     // 18-1f
            BitPackage.Pack4bits(3, ERROR, ERROR, ERROR, START, START, START, START)      // 20-27
        };

        // One entry per class id used in ByteClasses.
        private static readonly int[] CharLengths = { 2, 2, 2, 3, 1, 0 };

        /// <summary>
        /// Builds the model from the static tables above.
        /// </summary>
        public EUCJPSMModel()
            : base(
                FourBitTable(ByteClasses),
                6, // same as the number of entries in CharLengths
                FourBitTable(Transitions),
                CharLengths,
                CodepageName.EUC_JP)
        {
        }

        // Wraps a packed table in a BitPackage that reads it as 4-bit units.
        private static BitPackage FourBitTable(int[] packed)
        {
            return new BitPackage(
                BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS,
                packed);
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/MultiByte/Japanese/Iso_2022_JP_SMModel.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core.Models.MultiByte.Japanese
{
    /// <summary>
    /// Table data for <see cref="CodepageName.ISO_2022_JP"/>, handed to the
    /// <see cref="StateMachineModel"/> base constructor.
    /// </summary>
    public class Iso_2022_JP_SMModel : StateMachineModel
    {
        // One 4-bit class id per byte value, eight ids per packed int;
        // the row comments give the byte range each row covers.
        private static readonly int[] ByteClasses =
        {
            BitPackage.Pack4bits(2, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 2, 2), // 08 - 0f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
            BitPackage.Pack4bits(0, 0, 0, 1, 0, 0, 0, 0), // 18 - 1f
            BitPackage.Pack4bits(0, 0, 0, 0, 7, 0, 0, 0), // 20 - 27
            BitPackage.Pack4bits(3, 0, 0, 0, 0, 0, 0, 0), // 28 - 2f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
            BitPackage.Pack4bits(6, 0, 4, 0, 8, 0, 0, 0), // 40 - 47
            BitPackage.Pack4bits(0, 9, 5, 0, 0, 0, 0, 0), // 48 - 4f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 80 - 87
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 88 - 8f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 90 - 97
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 98 - 9f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2)  // f8 - ff
        };

        // Packed 4-bit transition entries (ERROR, START, ITSME or a numeric state);
        // the row comments give the entry indices.
        // NOTE(review): how the base class indexes this table is not visible here.
        private static readonly int[] Transitions =
        {
            BitPackage.Pack4bits(START, 3, ERROR, START, START, START, START, START),     // 00-07
            BitPackage.Pack4bits(START, START, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR), // 08-0f
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME), // 10-17
            BitPackage.Pack4bits(ITSME, ITSME, ITSME, ITSME, ITSME, ITSME, ERROR, ERROR), // 18-1f
            BitPackage.Pack4bits(ERROR, 5, ERROR, ERROR, ERROR, 4, ERROR, ERROR),         // 20-27
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, 6, ITSME, ERROR, ITSME, ERROR),     // 28-2f
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ERROR, ERROR, ITSME, ITSME), // 30-37
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ITSME, ERROR, ERROR, ERROR, ERROR), // 38-3f
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ERROR, START, START)  // 40-47
        };

        // All ten entries are zero for this model.
        private static readonly int[] CharLengths = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

        /// <summary>
        /// Builds the model from the static tables above.
        /// </summary>
        public Iso_2022_JP_SMModel()
            : base(
                FourBitTable(ByteClasses),
                10, // same as the number of entries in CharLengths
                FourBitTable(Transitions),
                CharLengths,
                CodepageName.ISO_2022_JP)
        {
        }

        // Wraps a packed table in a BitPackage that reads it as 4-bit units.
        private static BitPackage FourBitTable(int[] packed)
        {
            return new BitPackage(
                BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS,
                packed);
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/MultiByte/Japanese/SJIS_SMModel.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core.Models.MultiByte.Japanese
{
    /// <summary>
    /// Table data for <see cref="CodepageName.SHIFT_JIS"/>, handed to the
    /// <see cref="StateMachineModel"/> base constructor.
    /// </summary>
    public class SJIS_SMModel : StateMachineModel
    {
        // One 4-bit class id per byte value, eight ids per packed int;
        // the row comments give the byte range each row covers.
        private static readonly int[] ByteClasses =
        {
            // An older variant of the first row gave byte 0x00 class 0 instead of 1.
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17
            BitPackage.Pack4bits(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 1), // 78 - 7f
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // 80 - 87
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // 88 - 8f
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // 90 - 97
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // 98 - 9f
            // 0xa0 is not a legal Shift_JIS byte, yet some pages contain it.
            // It is classified like its neighbours (class 2) to stay forgiving.
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 3, 3, 3), // e0 - e7
            BitPackage.Pack4bits(3, 3, 3, 3, 3, 4, 4, 4), // e8 - ef
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 4, 4, 4), // f0 - f7
            BitPackage.Pack4bits(4, 4, 4, 4, 4, 0, 0, 0)  // f8 - ff
        };

        // Packed 4-bit transition entries (ERROR, START, ITSME or a numeric state);
        // the row comments give the entry indices.
        // NOTE(review): how the base class indexes this table is not visible here.
        private static readonly int[] Transitions =
        {
            BitPackage.Pack4bits(ERROR, START, START, 3, ERROR, ERROR, ERROR, ERROR),     // 00-07
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME), // 08-0f
            BitPackage.Pack4bits(ITSME, ITSME, ERROR, ERROR, START, START, START, START)  // 10-17
        };

        // Six entries; ByteClasses above only uses class ids 0 through 4.
        private static readonly int[] CharLengths = { 0, 1, 1, 2, 0, 0 };

        /// <summary>
        /// Builds the model from the static tables above.
        /// </summary>
        public SJIS_SMModel()
            : base(
                FourBitTable(ByteClasses),
                6, // same as the number of entries in CharLengths
                FourBitTable(Transitions),
                CharLengths,
                CodepageName.SHIFT_JIS)
        {
        }

        // Wraps a packed table in a BitPackage that reads it as 4-bit units.
        private static BitPackage FourBitTable(int[] packed)
        {
            return new BitPackage(
                BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS,
                packed);
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/MultiByte/Korean/EUCKRSMModel.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core.Models.MultiByte.Korean
{
    /// <summary>
    /// Table data for <see cref="CodepageName.EUC_KR"/>, handed to the
    /// <see cref="StateMachineModel"/> base constructor.
    /// </summary>
    public class EUCKRSMModel : StateMachineModel
    {
        // One 4-bit class id per byte value, eight ids per packed int;
        // the row comments give the byte range each row covers.
        private static readonly int[] ByteClasses =
        {
            // An older variant of the first row gave byte 0x00 class 0 instead of 1.
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17
            BitPackage.Pack4bits(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 40 - 47
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 48 - 4f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 50 - 57
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 58 - 5f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 60 - 67
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 68 - 6f
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 70 - 77
            BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 78 - 7f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 80 - 87
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 88 - 8f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 90 - 97
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 98 - 9f
            BitPackage.Pack4bits(0, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 3, 3, 3), // a8 - af
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
            BitPackage.Pack4bits(2, 3, 2, 2, 2, 2, 2, 2), // c8 - cf
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 0)  // f8 - ff
        };

        // Packed 4-bit transition entries (ERROR, START, ITSME or a numeric state);
        // the row comments give the entry indices.
        // NOTE(review): how the base class indexes this table is not visible here.
        private static readonly int[] Transitions =
        {
            BitPackage.Pack4bits(ERROR, START, 3, ERROR, ERROR, ERROR, ERROR, ERROR),     // 00-07
            BitPackage.Pack4bits(ITSME, ITSME, ITSME, ITSME, ERROR, ERROR, START, START)  // 08-0f
        };

        // One entry per class id used in ByteClasses.
        private static readonly int[] CharLengths = { 0, 1, 2, 0 };

        /// <summary>
        /// Builds the model from the static tables above.
        /// </summary>
        public EUCKRSMModel()
            : base(
                FourBitTable(ByteClasses),
                4, // same as the number of entries in CharLengths
                FourBitTable(Transitions),
                CharLengths,
                CodepageName.EUC_KR)
        {
        }

        // Wraps a packed table in a BitPackage that reads it as 4-bit units.
        private static BitPackage FourBitTable(int[] packed)
        {
            return new BitPackage(
                BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS,
                packed);
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/MultiByte/Korean/Iso_2022_KR_SMModel.cs:
--------------------------------------------------------------------------------
namespace UtfUnknown.Core.Models.MultiByte.Korean
{
    /// <summary>
    /// Table data for <see cref="CodepageName.ISO_2022_KR"/>, handed to the
    /// <see cref="StateMachineModel"/> base constructor.
    /// </summary>
    public class Iso_2022_KR_SMModel : StateMachineModel
    {
        // One 4-bit class id per byte value, eight ids per packed int;
        // the row comments give the byte range each row covers.
        private static readonly int[] ByteClasses =
        {
            BitPackage.Pack4bits(2, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 08 - 0f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
            BitPackage.Pack4bits(0, 0, 0, 1, 0, 0, 0, 0), // 18 - 1f
            BitPackage.Pack4bits(0, 0, 0, 0, 3, 0, 0, 0), // 20 - 27
            BitPackage.Pack4bits(0, 4, 0, 0, 0, 0, 0, 0), // 28 - 2f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
            BitPackage.Pack4bits(0, 0, 0, 5, 0, 0, 0, 0), // 40 - 47
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
            BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 80 - 87
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 88 - 8f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 90 - 97
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 98 - 9f
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7
            BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2)  // f8 - ff
        };

        // Packed 4-bit transition entries (ERROR, START, ITSME or a numeric state);
        // the row comments give the entry indices.
        // NOTE(review): how the base class indexes this table is not visible here.
        private static readonly int[] Transitions =
        {
            BitPackage.Pack4bits(START, 3, ERROR, START, START, START, ERROR, ERROR),     // 00-07
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME), // 08-0f
            BitPackage.Pack4bits(ITSME, ITSME, ERROR, ERROR, ERROR, 4, ERROR, ERROR),     // 10-17
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, 5, ERROR, ERROR, ERROR),     // 18-1f
            BitPackage.Pack4bits(ERROR, ERROR, ERROR, ITSME, START, START, START, START)  // 20-27
        };

        // All six entries are zero for this model.
        private static readonly int[] CharLengths = { 0, 0, 0, 0, 0, 0 };

        /// <summary>
        /// Builds the model from the static tables above.
        /// </summary>
        public Iso_2022_KR_SMModel()
            : base(
                FourBitTable(ByteClasses),
                6, // same as the number of entries in CharLengths
                FourBitTable(Transitions),
                CharLengths,
                CodepageName.ISO_2022_KR)
        {
        }

        // Wraps a packed table in a BitPackage that reads it as 4-bit units.
        private static BitPackage FourBitTable(int[] packed)
        {
            return new BitPackage(
                BitPackage.INDEX_SHIFT_4BITS,
                BitPackage.SHIFT_MASK_4BITS,
                BitPackage.BIT_SHIFT_4BITS,
                BitPackage.UNIT_MASK_4BITS,
                packed);
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/MultiByte/UCS2BE_SMModel.cs:
--------------------------------------------------------------------------------
1 | namespace UtfUnknown.Core.Models.MultiByte
2 | {
3 |     // Table-driven state-machine model registered under CodepageName.UTF16_BE.
4 |     public class UCS2BE_SMModel : StateMachineModel
5 |     {
6 |         // Byte-class table. Each row packs the classes of eight consecutive byte
7 |         // values (the range is given in the row comment), so 32 rows cover 00-ff.
8 |         // Reading each row left to right, the only bytes with a non-zero class
9 |         // are 0x0A (1), 0x0D (2), 0x1B (3), 0x29-0x2D (3), 0xFE (4) and 0xFF (5).
10 |         private static readonly int[] UCS2BE_cls = {
11 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 00 - 07
12 |             BitPackage.Pack4bits(0,0,1,0,0,2,0,0), // 08 - 0f
13 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
14 |             BitPackage.Pack4bits(0,0,0,3,0,0,0,0), // 18 - 1f
15 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
16 |             BitPackage.Pack4bits(0,3,3,3,3,3,0,0), // 28 - 2f
17 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
18 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
19 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 40 - 47
20 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
21 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
22 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
23 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
24 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
25 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
26 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
27 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87
28 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f
29 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97
30 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f
31 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a0 - a7
32 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a8 - af
33 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b0 - b7
34 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b8 - bf
35 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c0 - c7
36 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c8 - cf
37 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d0 - d7
38 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d8 - df
39 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7
40 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef
41 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7
42 |             BitPackage.Pack4bits(0,0,0,0,0,0,4,5) // f8 - ff
43 |         };
44 |
45 |         // State-transition table, packed with the same Pack4bits helper.
46 |         // ERROR, ITSME and START are not defined in this file; presumably they are
47 |         // state constants inherited from StateMachineModel.
48 |         private static readonly int[] UCS2BE_st = {
49 |             BitPackage.Pack4bits( 5, 7, 7,ERROR, 4, 3,ERROR,ERROR),//00-07
50 |             BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
51 |             BitPackage.Pack4bits(ITSME,ITSME, 6, 6, 6, 6,ERROR,ERROR),//10-17
52 |             BitPackage.Pack4bits( 6, 6, 6, 6, 6,ITSME, 6, 6),//18-1f
53 |             BitPackage.Pack4bits( 6, 6, 6, 6, 5, 7, 7,ERROR),//20-27
54 |             BitPackage.Pack4bits( 5, 8, 6, 6,ERROR, 6, 6, 6),//28-2f
55 |             BitPackage.Pack4bits( 6, 6, 6, 6,ERROR,ERROR,START,START) //30-37
56 |         };
57 |
58 |         // Six entries, the same number as the literal 6 handed to the base
59 |         // constructor - presumably one character length per byte class (0-5).
60 |         private static readonly int[] UCS2BECharLenTable = { 2, 2, 2, 0, 2, 2 };
61 |
62 |         // Wraps a packed table in a BitPackage set up with the 4-bit layout constants.
63 |         private static BitPackage Wrap4Bits(int[] packedTable)
64 |         {
65 |             return new BitPackage(
66 |                 BitPackage.INDEX_SHIFT_4BITS,
67 |                 BitPackage.SHIFT_MASK_4BITS,
68 |                 BitPackage.BIT_SHIFT_4BITS,
69 |                 BitPackage.UNIT_MASK_4BITS,
70 |                 packedTable);
71 |         }
72 |
73 |         // Hands the wrapped class table, the value 6, the wrapped state table, the
74 |         // character-length table and the charset name to StateMachineModel.
75 |         public UCS2BE_SMModel()
76 |             : base(
77 |                 Wrap4Bits(UCS2BE_cls),
78 |                 6,
79 |                 Wrap4Bits(UCS2BE_st),
80 |                 UCS2BECharLenTable,
81 |                 CodepageName.UTF16_BE)
82 |         {
83 |         }
84 |     }
85 | }
--------------------------------------------------------------------------------
/src/Core/Models/MultiByte/UCS2LE_SMModel.cs:
--------------------------------------------------------------------------------
1 | namespace UtfUnknown.Core.Models.MultiByte
2 | {
3 |     // Table-driven state-machine model registered under CodepageName.UTF16_LE.
4 |     public class UCS2LE_SMModel : StateMachineModel
5 |     {
6 |         // Byte-class table. Each row packs the classes of eight consecutive byte
7 |         // values (the range is given in the row comment), so 32 rows cover 00-ff.
8 |         // Reading each row left to right, the only bytes with a non-zero class
9 |         // are 0x0A (1), 0x0D (2), 0x1B (3), 0x29-0x2D (3), 0xFE (4) and 0xFF (5).
10 |         private static readonly int[] UCS2LE_cls = {
11 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 00 - 07
12 |             BitPackage.Pack4bits(0,0,1,0,0,2,0,0), // 08 - 0f
13 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
14 |             BitPackage.Pack4bits(0,0,0,3,0,0,0,0), // 18 - 1f
15 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
16 |             BitPackage.Pack4bits(0,3,3,3,3,3,0,0), // 28 - 2f
17 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
18 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
19 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 40 - 47
20 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
21 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
22 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
23 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
24 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
25 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
26 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
27 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87
28 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f
29 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97
30 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f
31 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a0 - a7
32 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a8 - af
33 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b0 - b7
34 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b8 - bf
35 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c0 - c7
36 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c8 - cf
37 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d0 - d7
38 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d8 - df
39 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7
40 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef
41 |             BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7
42 |             BitPackage.Pack4bits(0,0,0,0,0,0,4,5) // f8 - ff
43 |         };
44 |
45 |         // State-transition table, packed with the same Pack4bits helper.
46 |         // ERROR, ITSME and START are not defined in this file; presumably they are
47 |         // state constants inherited from StateMachineModel.
48 |         private static readonly int[] UCS2LE_st = {
49 |             BitPackage.Pack4bits( 6, 6, 7, 6, 4, 3,ERROR,ERROR),//00-07
50 |             BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
51 |             BitPackage.Pack4bits(ITSME,ITSME, 5, 5, 5,ERROR,ITSME,ERROR),//10-17
52 |             BitPackage.Pack4bits( 5, 5, 5,ERROR, 5,ERROR, 6, 6),//18-1f
53 |             BitPackage.Pack4bits( 7, 6, 8, 8, 5, 5, 5,ERROR),//20-27
54 |             BitPackage.Pack4bits( 5, 5, 5,ERROR,ERROR,ERROR, 5, 5),//28-2f
55 |             BitPackage.Pack4bits( 5, 5, 5,ERROR, 5,ERROR,START,START) //30-37
56 |         };
57 |
58 |         // Six entries, the same number as the literal 6 handed to the base
59 |         // constructor - presumably one character length per byte class (0-5).
60 |         private static readonly int[] UCS2LECharLenTable = { 2, 2, 2, 2, 2, 2 };
61 |
62 |         // Wraps a packed table in a BitPackage set up with the 4-bit layout constants.
63 |         private static BitPackage Wrap4Bits(int[] packedTable)
64 |         {
65 |             return new BitPackage(
66 |                 BitPackage.INDEX_SHIFT_4BITS,
67 |                 BitPackage.SHIFT_MASK_4BITS,
68 |                 BitPackage.BIT_SHIFT_4BITS,
69 |                 BitPackage.UNIT_MASK_4BITS,
70 |                 packedTable);
71 |         }
72 |
73 |         // Hands the wrapped class table, the value 6, the wrapped state table, the
74 |         // character-length table and the charset name to StateMachineModel.
75 |         public UCS2LE_SMModel()
76 |             : base(
77 |                 Wrap4Bits(UCS2LE_cls),
78 |                 6,
79 |                 Wrap4Bits(UCS2LE_st),
80 |                 UCS2LECharLenTable,
81 |                 CodepageName.UTF16_LE)
82 |         {
83 |         }
84 |     }
85 | }
--------------------------------------------------------------------------------
/src/Core/Models/SequenceModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Shy Shalom
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | using System;
40 |
41 | namespace UtfUnknown.Core.Models
42 | {
43 |     /// <summary>
44 |     /// Base class for models that combine a byte-to-order map with a two-character
45 |     /// sequence precedence matrix for a single named charset.
46 |     /// </summary>
47 |     public abstract class SequenceModel
48 |     {
49 |         // Special order values that a charToOrderMap entry can hold.
50 |
51 |         // Illegal codepoints
52 |         public const byte ILL = 255;
53 |
54 |         // Control character
55 |         public const byte CTR = 254;
56 |
57 |         // Symbols and punctuation that do not belong to words
58 |         public const byte SYM = 253;
59 |
60 |         // Return/Line feeds
61 |         public const byte RET = 252;
62 |
63 |         // Numbers 0-9
64 |         public const byte NUM = 251;
65 |
66 |         // [256] table used to find a char's order
67 |         protected byte[] charToOrderMap;
68 |
69 |         // freqCharCount x freqCharCount table used to find a 2-char sequence's
70 |         // frequency
71 |         protected byte[] precedenceMatrix;
72 |
73 |         // The count of frequent characters
74 |         protected int freqCharCount;
75 |
76 |         /// <summary>Gets the count of frequent characters.</summary>
77 |         public int FreqCharCount
78 |         {
79 |             get { return freqCharCount; }
80 |         }
81 |
82 |         // freqSeqs / totalSeqs
83 |         protected float typicalPositiveRatio;
84 |
85 |         /// <summary>Gets the typical positive ratio (freqSeqs / totalSeqs).</summary>
86 |         public float TypicalPositiveRatio
87 |         {
88 |             get { return typicalPositiveRatio; }
89 |         }
90 |
91 |         /// <summary>
92 |         /// TODO not used?
93 |         /// </summary>
94 |         protected bool keepEnglishLetter;
95 |
96 |         /// <summary>
97 |         /// TODO not used?
98 |         /// </summary>
99 |         public bool KeepEnglishLetter
100 |         {
101 |             get { return keepEnglishLetter; }
102 |         }
103 |
104 |         // Name of the charset this model was built for
105 |         protected string charsetName;
106 |
107 |         /// <summary>Gets the charset name supplied at construction.</summary>
108 |         public string CharsetName
109 |         {
110 |             get { return charsetName; }
111 |         }
112 |
113 |         /// <summary>
114 |         /// Stores the supplied tables and settings as given; nothing is copied or validated.
115 |         /// Protected because an abstract class can only be constructed through a derived type.
116 |         /// </summary>
117 |         protected SequenceModel(
118 |             byte[] charToOrderMap,
119 |             byte[] precedenceMatrix,
120 |             int freqCharCount,
121 |             float typicalPositiveRatio,
122 |             bool keepEnglishLetter,
123 |             string charsetName)
124 |         {
125 |             this.charToOrderMap = charToOrderMap;
126 |             this.precedenceMatrix = precedenceMatrix;
127 |             this.freqCharCount = freqCharCount;
128 |             this.typicalPositiveRatio = typicalPositiveRatio;
129 |             this.keepEnglishLetter = keepEnglishLetter;
130 |             this.charsetName = charsetName;
131 |         }
132 |
133 |         /// <summary>Returns the charToOrderMap entry for byte <paramref name="b"/>.</summary>
134 |         public byte GetOrder(byte b)
135 |         {
136 |             return charToOrderMap[b];
137 |         }
138 |
139 |         /// <summary>
140 |         /// Returns the precedenceMatrix entry at flat index <paramref name="pos"/>.
141 |         /// NOTE(review): how callers derive pos from two orders is not visible here - confirm.
142 |         /// </summary>
143 |         public byte GetPrecedence(int pos)
144 |         {
145 |             return precedenceMatrix[pos];
146 |         }
147 |     }
148 | }
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Bulgarian/Iso_8859_5_BulgarianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangBulgarianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Bulgarian
44 | {
45 |     // Bulgarian language model bound to the CodepageName.ISO_8859_5 charset.
46 |     public class Iso_8859_5_BulgarianModel : BulgarianModel
47 |     {
48 |         // Character Mapping Table: 16 rows of 16 entries, one entry per byte value.
49 |         // CTR: control characters that usually do not exist in any text
50 |         // RET: carriage return / line feed
51 |         // SYM: symbol (punctuation) that does not belong to a word
52 |         // NUM: 0 - 9
53 |         //
54 |         // This table is modified based on win1251BulgarianCharToOrderMap, so
55 |         // only orders < 64 are sure to be valid.
56 |         //
57 |         // Declared readonly: the array reference is shared by every instance and is
58 |         // never reassigned in this class.
59 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
60 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
61 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
62 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
63 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
64 |             SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, /* 4X */
65 |             110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, /* 5X */
66 |             SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, /* 6X */
67 |             116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, /* 7X */
68 |             194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209, /* 8X */
69 |             210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225, /* 9X */
70 |             81,226,227,228,229,230,105,231,232,233,234,235,236, 45,237,238, /* AX */
71 |             31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, /* BX */
72 |             39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56, /* CX */
73 |             1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, /* DX */
74 |             7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16, /* EX */
75 |             62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,NUM,SYM, /* FX */
76 |         };
77 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
78 |
79 |         // Passes the ISO-8859-5 mapping table and charset name to BulgarianModel.
80 |         public Iso_8859_5_BulgarianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_5)
81 |         {
82 |         }
83 |     }
84 | }
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Bulgarian/Windows_1251_BulgarianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangBulgarianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Bulgarian
44 | {
45 |     // Bulgarian language model bound to the CodepageName.WINDOWS_1251 charset.
46 |     public class Windows_1251_BulgarianModel : BulgarianModel
47 |     {
48 |         // Character Mapping Table: 16 rows of 16 entries, one entry per byte value.
49 |         // CTR: control characters that usually do not exist in any text
50 |         // RET: carriage return / line feed
51 |         // SYM: symbol (punctuation) that does not belong to a word
52 |         // NUM: 0 - 9
53 |         // ILL: illegal character
54 |         //
55 |         // This table is modified based on win1251BulgarianCharToOrderMap, so
56 |         // only orders < 64 are sure to be valid.
57 |         // NOTE(review): the remark above is worded identically in the ISO-8859-5
58 |         // variant of this model - confirm it is meant for this table as well.
59 |         //
60 |         // Declared readonly: the array reference is shared by every instance and is
61 |         // never reassigned in this class.
62 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
63 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
64 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
65 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
66 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
67 |             SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, /* 4X */
68 |             110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, /* 5X */
69 |             SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, /* 6X */
70 |             116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, /* 7X */
71 |             206,207,208,209,210,211,212,213,120,214,215,216,217,218,219,220, /* 8X */
72 |             221, 78, 64, 83,121, 98,117,105,ILL,223,224,225,226,227,228,229, /* 9X */
73 |             88,230,231,232,233,122, 89,106,234,235,236,237,238, 45,239,240, /* AX */
74 |             73, 80,118,114,241,242,243,244,245, 62, 58,246,247,248,249,250, /* BX */
75 |             31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, /* CX */
76 |             39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,NUM, 60, 56, /* DX */
77 |             1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, /* EX */
78 |             7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,SYM, 42, 16, /* FX */
79 |         };
80 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
81 |
82 |         // Passes the Windows-1251 mapping table and charset name to BulgarianModel.
83 |         public Windows_1251_BulgarianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1251)
84 |         {
85 |         }
86 |     }
87 | }
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Croatian/Ibm852_CroatianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCroatianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Croatian
44 | {
45 |     // Croatian language model bound to the CodepageName.IBM852 charset.
46 |     public class Ibm852_CroatianModel : CroatianModel
47 |     {
48 |         // Generated by BuildLangModel.py
49 |         // On: 2016-09-25 23:50:27.590137
50 |
51 |         // Character Mapping Table (16 rows of 16 entries, one entry per byte value):
52 |         // ILL: illegal character.
53 |         // CTR: control character specific to the charset.
54 |         // RET: carriage return / line feed.
55 |         // SYM: symbol (punctuation) that does not belong to a word.
56 |         // NUM: 0 - 9.
57 |
58 |         // Other characters are ordered by probability
59 |         // (0 is the most common character in the language).
60 |
61 |         // Orders are generic to a language. So the codepoint with order X in
62 |         // CHARSET1 maps to the same character as the codepoint with the same
63 |         // order X in CHARSET2 for the same language.
64 |         // As such, a charset's table may be missing some orders. For instance the
65 |         // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
66 |         // even though they are both used for French. Same for the euro sign.
67 |
68 |         // Declared readonly: the array reference is shared by every instance and is
69 |         // never reassigned in this class.
70 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
71 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
72 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
73 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
74 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
75 |             SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */
76 |             14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */
77 |             SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */
78 |             14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */
79 |             39, 33, 31, 43, 36,249, 25, 39, 40, 47,249,249,249,249, 36, 25, /* 8X */
80 |             31,249,249,249, 32,249,249,249,249, 32, 33,249,249, 40,SYM, 18, /* 9X */
81 |             41,249, 44, 48,249,249, 24, 24,249,249,SYM,249, 18,249,SYM,SYM, /* AX */
82 |             SYM,SYM,SYM,SYM,SYM, 41, 43,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* BX */
83 |             SYM,SYM,SYM,SYM,SYM,SYM,249,249,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
84 |             26, 26,249, 47,249,249,249,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* DX */
85 |             44,249,249,249,249,249, 23, 23,249, 48,249,249,249,249,249,SYM, /* EX */
86 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,249,249,249,SYM,SYM, /* FX */
87 |         };
88 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
89 |
90 |         // Passes the IBM852 mapping table and charset name to CroatianModel.
91 |         public Ibm852_CroatianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
92 |         {
93 |         }
94 |     }
95 | }
96 |
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Czech/Ibm852_CzechModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCzechModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Czech
44 | {
45 |     // Czech language model bound to the CodepageName.IBM852 charset.
46 |     public class Ibm852_CzechModel : CzechModel
47 |     {
48 |         // Generated by BuildLangModel.py
49 |         // On: 2016-09-21 03:28:11.733089
50 |
51 |         // Character Mapping Table (16 rows of 16 entries, one entry per byte value):
52 |         // ILL: illegal character.
53 |         // CTR: control character specific to the charset.
54 |         // RET: carriage return / line feed.
55 |         // SYM: symbol (punctuation) that does not belong to a word.
56 |         // NUM: 0 - 9.
57 |
58 |         // Other characters are ordered by probability
59 |         // (0 is the most common character in the language).
60 |
61 |         // Orders are generic to a language. So the codepoint with order X in
62 |         // CHARSET1 maps to the same character as the codepoint with the same
63 |         // order X in CHARSET2 for the same language.
64 |         // As such, a charset's table may be missing some orders. For instance the
65 |         // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
66 |         // even though they are both used for French. Same for the euro sign.
67 |
68 |         // Declared readonly: the array reference is shared by every instance and is
69 |         // never reassigned in this class.
70 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
71 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
72 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
73 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
74 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
75 |             SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */
76 |             8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */
77 |             SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */
78 |             8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */
79 |             139, 43, 24,140, 42, 31,141,142,143,144,145,146,147,148, 42,149, /* 8X */
80 |             24,150,151,152, 41, 45, 45, 46, 46, 41, 43, 38, 38,153,SYM, 25, /* 9X */
81 |             18, 11, 37, 33,154,155, 26, 26,156,157,SYM,158, 25,159,SYM,SYM, /* AX */
82 |             SYM,SYM,SYM,SYM,SYM, 18,160, 23,161,SYM,SYM,SYM,SYM,162,163,SYM, /* BX */
83 |             SYM,SYM,SYM,SYM,SYM,SYM,164,165,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
84 |             166,167, 39,168, 39, 35, 11,169, 23,SYM,SYM,SYM,SYM,170, 31,SYM, /* DX */
85 |             37,171,172,173,174, 35, 29, 29,175, 33,176,177, 28, 28,178,SYM, /* EX */
86 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,179, 27, 27,SYM,SYM, /* FX */
87 |         };
88 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
89 |
90 |         // Passes the IBM852 mapping table and charset name to CzechModel.
91 |         public Ibm852_CzechModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
92 |         {
93 |         }
94 |     }
95 | }
96 |
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Czech/Iso_8859_2_CzechModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCzechModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Czech
44 | {
45 |     // Czech language model bound to the CodepageName.ISO_8859_2 charset.
46 |     public class Iso_8859_2_CzechModel : CzechModel
47 |     {
48 |         // Generated by BuildLangModel.py
49 |         // On: 2016-09-21 03:28:11.733089
50 |
51 |         // Character Mapping Table (16 rows of 16 entries, one entry per byte value):
52 |         // ILL: illegal character.
53 |         // CTR: control character specific to the charset.
54 |         // RET: carriage return / line feed.
55 |         // SYM: symbol (punctuation) that does not belong to a word.
56 |         // NUM: 0 - 9.
57 |
58 |         // Other characters are ordered by probability
59 |         // (0 is the most common character in the language).
60 |
61 |         // Orders are generic to a language. So the codepoint with order X in
62 |         // CHARSET1 maps to the same character as the codepoint with the same
63 |         // order X in CHARSET2 for the same language.
64 |         // As such, a charset's table may be missing some orders. For instance the
65 |         // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
66 |         // even though they are both used for French. Same for the euro sign.
67 |
68 |         // Declared readonly: the array reference is shared by every instance and is
69 |         // never reassigned in this class.
70 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
71 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
72 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
73 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
74 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
75 |             SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */
76 |             8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */
77 |             SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */
78 |             8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */
79 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
80 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
81 |             SYM,180,SYM,181,SYM, 45, 46,SYM,SYM, 29,182, 38,183,SYM, 26,184, /* AX */
82 |             SYM,185,SYM,186,SYM, 45, 46,SYM,SYM, 29,187, 38,188,SYM, 26,189, /* BX */
83 |             190, 18,191,192, 42,193,194,195, 25, 24,196,197, 23, 11,198, 39, /* CX */
84 |             199,200, 35, 37,201,202, 41,SYM, 27, 31, 33,203, 43, 28,204,205, /* DX */
85 |             206, 18,207,208, 42,209,210,211, 25, 24,212,213, 23, 11,214, 39, /* EX */
86 |             215,216, 35, 37,217,218, 41,SYM, 27, 31, 33,219, 43, 28,220,SYM, /* FX */
87 |         };
88 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
89 |
90 |         // Passes the ISO-8859-2 mapping table and charset name to CzechModel.
91 |         public Iso_8859_2_CzechModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_2)
92 |         {
93 |         }
94 |     }
95 | }
96 |
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Hebrew/Windows_1255_HebrewModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Shy Shalom
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | /*
40 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
41 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangHebrewModel.cpp
42 | * and adjusted to language specific support.
43 | */
44 |
namespace UtfUnknown.Core.Models.SingleByte.Hebrew
{
    /// <summary>
    /// Hebrew language model bound to the Windows-1255 code page: it hands the
    /// byte-to-order table for that charset to <see cref="HebrewModel"/>.
    /// </summary>
    public class Windows_1255_HebrewModel : HebrewModel
    {
        // Legend for the symbolic entries used in the table:
        //   CTR - control characters that usually do not exist in any text
        //   RET - carriage return / line feed
        //   SYM - symbol (punctuation) that does not belong to a word
        //   NUM - digits 0 - 9
        //   ILL - illegal character
        // NOTE(review): the legacy legend for this table listed the first four as
        // the raw values 255, 254, 253 and 252; confirm against the declarations
        // of the constants before relying on those numbers.

        // Windows-1255 language model, character mapping table.
        // One entry per byte value (16 rows x 16 columns); the row comment gives
        // the high nibble and the ruler below gives the low nibble.
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            //0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85, /* 4X */
             78,121, 86, 71, 67,102,107, 84,114,103,115,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49, /* 6X */
             66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,SYM,SYM,SYM,SYM,SYM, /* 7X */
            124,ILL,203,204,205, 40, 58,206,207,208,ILL,210,ILL,ILL,ILL,ILL, /* 8X */
            ILL, 83, 52, 47, 46, 72, 32, 94,216,113,ILL,109,ILL,ILL,ILL,ILL, /* 9X */
             34,116,222,118,100,223,224,117,119,104,125,225,226, 87, 99,227, /* AX */
            106,122,123,228, 55,229,230,101,231,232,120,233, 48, 39, 57,234, /* BX */
             30, 59, 41, 88, 33, 37, 36, 31, 29, 35,235, 62, 28,236,126,237, /* CX */
            238, 38, 45,239,240,241,242,243,127,ILL,ILL,ILL,ILL,ILL,ILL,ILL, /* DX */
              9,  8, 20, 16,  3,  2, 24, 14, 22,  1, 25, 15,  4, 11,  6, 23, /* EX */
             12, 19, 13, 26, 18, 27, 21, 17,  7, 10,  5,ILL,ILL,128, 96,ILL, /* FX */
        };

        /// <summary>
        /// Creates the model by passing the Windows-1255 mapping table and the
        /// Windows-1255 code page name to the <see cref="HebrewModel"/> constructor.
        /// </summary>
        public Windows_1255_HebrewModel() : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1255)
        {
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Irish/Iso_8859_15_IrishModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangIrishModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
namespace UtfUnknown.Core.Models.SingleByte.Irish
{
    /// <summary>
    /// Irish language model bound to the ISO-8859-15 code page: it hands the
    /// byte-to-order table for that charset to <see cref="IrishModel"/>.
    /// </summary>
    public class Iso_8859_15_IrishModel : IrishModel
    {
        // Generated by BuildLangModel.py
        // On: 2016-09-27 00:33:40.158624

        // Character Mapping Table:
        // ILL: illegal character.
        // CTR: control character specific to the charset.
        // RET: carriage/return.
        // SYM: symbol (punctuation) that does not belong to word.
        // NUM: 0 - 9.

        // Other characters are ordered by probabilities
        // (0 is the most common character in the language).

        // Orders are generic to a language. So the codepoint with order X in
        // CHARSET1 maps to the same character as the codepoint with the same
        // order X in CHARSET2 for the same language.
        // As such, it is possible to get missing order. For instance the
        // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
        // even though they are both used for French. Same for the euro sign.

        // One entry per byte value (16 rows x 16 columns); the row comment gives
        // the high nibble and the ruler below gives the low nibble.
        // readonly: the table is static, so the reference must never be re-pointed.
        // Note that readonly does not protect the individual array elements.
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            //0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM,  0, 16,  8, 11,  5, 19, 12,  3,  1, 27, 25,  9, 13,  2, 10, /* 4X */
             21, 30,  4,  6,  7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM,  0, 16,  8, 11,  5, 19, 12,  3,  1, 27, 25,  9, 13,  2, 10, /* 6X */
             21, 30,  4,  6,  7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
            SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 34,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
            SYM,SYM,SYM,SYM,112,113,SYM,SYM,114,SYM,SYM,SYM,115,116,117,SYM, /* BX */
            118, 14,119,120, 33,121,122, 39, 35, 18, 42, 37,123, 17,124, 40, /* CX */
            125, 32, 43, 22,126,127, 38,SYM, 36,128, 20,129, 31,130,131,132, /* DX */
            133, 14,134,135, 33,136,137, 39, 35, 18, 42, 37,138, 17,139, 40, /* EX */
            140, 32, 43, 22,141,142, 38,SYM, 36,143, 20,144, 31,145,146,147, /* FX */
        };

        /// <summary>
        /// Creates the model by passing the ISO-8859-15 mapping table and the
        /// ISO-8859-15 code page name to the <see cref="IrishModel"/> constructor.
        /// </summary>
        public Iso_8859_15_IrishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_15)
        {
        }
    }
}
92 |
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Irish/Iso_8859_1_IrishModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangIrishModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
namespace UtfUnknown.Core.Models.SingleByte.Irish
{
    /// <summary>
    /// Irish language model bound to the ISO-8859-1 code page: it hands the
    /// byte-to-order table for that charset to <see cref="IrishModel"/>.
    /// </summary>
    public class Iso_8859_1_IrishModel : IrishModel
    {
        // Generated by BuildLangModel.py
        // On: 2016-09-27 00:33:40.158624

        // Character Mapping Table:
        // ILL: illegal character.
        // CTR: control character specific to the charset.
        // RET: carriage/return.
        // SYM: symbol (punctuation) that does not belong to word.
        // NUM: 0 - 9.

        // Other characters are ordered by probabilities
        // (0 is the most common character in the language).

        // Orders are generic to a language. So the codepoint with order X in
        // CHARSET1 maps to the same character as the codepoint with the same
        // order X in CHARSET2 for the same language.
        // As such, it is possible to get missing order. For instance the
        // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
        // even though they are both used for French. Same for the euro sign.

        // One entry per byte value (16 rows x 16 columns); the row comment gives
        // the high nibble and the ruler below gives the low nibble.
        // readonly: the table is static, so the reference must never be re-pointed.
        // Note that readonly does not protect the individual array elements.
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            //0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM,  0, 16,  8, 11,  5, 19, 12,  3,  1, 27, 25,  9, 13,  2, 10, /* 4X */
             21, 30,  4,  6,  7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM,  0, 16,  8, 11,  5, 19, 12,  3,  1, 27, 25,  9, 13,  2, 10, /* 6X */
             21, 30,  4,  6,  7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
            SYM,SYM,SYM,SYM,SYM, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
             45, 14, 46, 47, 33, 48, 49, 39, 35, 18, 42, 37, 50, 17, 51, 40, /* CX */
             52, 32, 43, 22, 53, 54, 38,SYM, 36, 55, 20, 56, 31, 57, 58, 59, /* DX */
             60, 14, 61, 62, 33, 63, 64, 39, 35, 18, 42, 37, 65, 17, 66, 40, /* EX */
             67, 32, 43, 22, 68, 69, 38,SYM, 36, 70, 20, 71, 31, 72, 73, 74, /* FX */
        };

        /// <summary>
        /// Creates the model by passing the ISO-8859-1 mapping table and the
        /// ISO-8859-1 code page name to the <see cref="IrishModel"/> constructor.
        /// </summary>
        public Iso_8859_1_IrishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_1)
        {
        }
    }
}
92 |
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Irish/Iso_8859_9_IrishModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangIrishModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
namespace UtfUnknown.Core.Models.SingleByte.Irish
{
    /// <summary>
    /// Irish language model bound to the ISO-8859-9 code page: it hands the
    /// byte-to-order table for that charset to <see cref="IrishModel"/>.
    /// </summary>
    public class Iso_8859_9_IrishModel : IrishModel
    {
        // Generated by BuildLangModel.py
        // On: 2016-09-27 00:33:40.158624

        // Character Mapping Table:
        // ILL: illegal character.
        // CTR: control character specific to the charset.
        // RET: carriage/return.
        // SYM: symbol (punctuation) that does not belong to word.
        // NUM: 0 - 9.

        // Other characters are ordered by probabilities
        // (0 is the most common character in the language).

        // Orders are generic to a language. So the codepoint with order X in
        // CHARSET1 maps to the same character as the codepoint with the same
        // order X in CHARSET2 for the same language.
        // As such, it is possible to get missing order. For instance the
        // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
        // even though they are both used for French. Same for the euro sign.

        // One entry per byte value (16 rows x 16 columns); the row comment gives
        // the high nibble and the ruler below gives the low nibble.
        // readonly: the table is static, so the reference must never be re-pointed.
        // Note that readonly does not protect the individual array elements.
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            //0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM,  0, 16,  8, 11,  5, 19, 12,  3,  1, 27, 25,  9, 13,  2, 10, /* 4X */
             21, 30,  4,  6,  7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM,  0, 16,  8, 11,  5, 19, 12,  3,  1, 27, 25,  9, 13,  2, 10, /* 6X */
             21, 30,  4,  6,  7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
            SYM,SYM,SYM,SYM,SYM,148,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
            149, 14,150,151, 33,152,153, 39, 35, 18, 42, 37,154, 17,155, 40, /* CX */
            156, 32, 43, 22,157,158, 38,SYM, 36,159, 20,160, 31,161,162,163, /* DX */
            164, 14,165,166, 33,167,168, 39, 35, 18, 42, 37,169, 17,170, 40, /* EX */
            171, 32, 43, 22,172,173, 38,SYM, 36,174, 20,175, 31, 41,176,177, /* FX */
        };

        /// <summary>
        /// Creates the model by passing the ISO-8859-9 mapping table and the
        /// ISO-8859-9 code page name to the <see cref="IrishModel"/> constructor.
        /// </summary>
        public Iso_8859_9_IrishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_9)
        {
        }
    }
}
92 |
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Polish/Ibm852_PolishModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangPolishModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
namespace UtfUnknown.Core.Models.SingleByte.Polish
{
    /// <summary>
    /// Polish language model bound to the IBM852 code page: it hands the
    /// byte-to-order table for that charset to <see cref="PolishModel"/>.
    /// </summary>
    public class Ibm852_PolishModel : PolishModel
    {
        // Generated by BuildLangModel.py
        // On: 2016-09-21 17:21:04.405363

        // Character Mapping Table:
        // ILL: illegal character.
        // CTR: control character specific to the charset.
        // RET: carriage/return.
        // SYM: symbol (punctuation) that does not belong to word.
        // NUM: 0 - 9.

        // Other characters are ordered by probabilities
        // (0 is the most common character in the language).

        // Orders are generic to a language. So the codepoint with order X in
        // CHARSET1 maps to the same character as the codepoint with the same
        // order X in CHARSET2 for the same language.
        // As such, it is possible to get missing order. For instance the
        // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
        // even though they are both used for French. Same for the euro sign.

        // One entry per byte value (16 rows x 16 columns); the row comment gives
        // the high nibble and the ruler below gives the low nibble.
        // readonly: the table is static, so the reference must never be re-pointed.
        // Note that readonly does not protect the individual array elements.
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            //0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM,  0, 20, 11, 14,  3, 26, 21, 22,  1, 18,  7, 15, 16,  5,  2, /* 4X */
             13, 36,  4,  6, 10, 17, 31,  9, 33, 12,  8,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM,  0, 20, 11, 14,  3, 26, 21, 22,  1, 18,  7, 15, 16,  5,  2, /* 6X */
             13, 36,  4,  6, 10, 17, 31,  9, 33, 12,  8,SYM,SYM,SYM,SYM,CTR, /* 7X */
             47, 39, 34, 54, 40, 78, 30, 47, 19, 58, 49, 49, 77, 32, 40, 30, /* 8X */
             34, 79, 80, 55, 38, 74, 74, 28, 28, 38, 39, 76, 76, 19,SYM, 44, /* 9X */
             35, 37, 24, 51, 25, 25, 45, 45, 23, 23,SYM, 32, 44, 56,SYM,SYM, /* AX */
            SYM,SYM,SYM,SYM,SYM, 35, 54, 46, 56,SYM,SYM,SYM,SYM, 27, 27,SYM, /* BX */
            SYM,SYM,SYM,SYM,SYM,SYM, 53, 53,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
             70, 70, 69, 58, 69, 81, 37, 77, 46,SYM,SYM,SYM,SYM, 65, 82,SYM, /* DX */
             24, 57, 55, 29, 29, 83, 41, 41, 84, 51, 85, 86, 60, 60, 65,SYM, /* EX */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 87, 50, 50,SYM,SYM, /* FX */
        };

        /// <summary>
        /// Creates the model by passing the IBM852 mapping table and the IBM852
        /// code page name to the <see cref="PolishModel"/> constructor.
        /// </summary>
        public Ibm852_PolishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
        {
        }
    }
}
92 |
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Russian/Ibm855_RussianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
namespace UtfUnknown.Core.Models.SingleByte.Russian
{
    /// <summary>
    /// Russian language model bound to the IBM855 code page: it hands the
    /// byte-to-order table for that charset to <see cref="RussianModel"/>.
    /// </summary>
    public class Ibm855_RussianModel : RussianModel
    {
        // Character mapping table, one entry per byte value (16 rows x 16 columns).
        // The row comment gives the value of the first byte in the row and the
        // ruler below gives the low nibble.
        // CTR, RET, SYM and NUM are symbolic markers that are in scope without
        // qualification; presumably they are declared on a base class - TODO confirm.
        // Named CHAR_TO_ORDER_MAP to match the other single-byte models.
        // NOTE(review): the last row ends with NUM,CTR for bytes 0xFE and 0xFF,
        // which looks like residue from converting raw numbers to symbols; the
        // values are kept as they were - confirm against the upstream table.
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            //0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30
            SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
            155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50
            SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
             67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70
            191,192,193,194, 68,195,196,197,198,199,200,201,202,203,204,205, //80
            206,207,208,209,210,211,212,213,214,215,216,217, 27, 59, 54, 70, //90
              3, 37, 21, 44, 28, 58, 13, 41,  2, 48, 39, 53, 19, 46,218,219, //A0
            220,221,222,223,224, 26, 55,  4, 42,225,226,227,228, 23, 60,229, //B0
            230,231,232,233,234,235, 11, 36,236,237,238,239,240,241,242,243, //C0
              8, 49, 12, 38,  5, 31,  1, 34, 15,244,245,246,247, 35, 16,248, //D0
             43,  9, 45,  7, 32,  6, 40, 14, 52, 24, 56, 10, 33, 17, 61,249, //E0
            250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,NUM,CTR, //F0
        };

        /// <summary>
        /// Creates the model by passing the IBM855 mapping table and the IBM855
        /// code page name to the <see cref="RussianModel"/> constructor.
        /// </summary>
        public Ibm855_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM855)
        {
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Russian/Ibm866_RussianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
namespace UtfUnknown.Core.Models.SingleByte.Russian
{
    /// <summary>
    /// Russian language model bound to the IBM866 code page: it hands the
    /// byte-to-order table for that charset to <see cref="RussianModel"/>.
    /// </summary>
    public class Ibm866_RussianModel : RussianModel
    {
        // Character mapping table, one entry per byte value (16 rows x 16 columns).
        // The row comment gives the value of the first byte in the row and the
        // ruler below gives the low nibble.
        // CTR, RET, SYM and NUM are symbolic markers that are in scope without
        // qualification; presumably they are declared on a base class - TODO confirm.
        // NOTE(review): the last row ends with NUM,CTR for bytes 0xFE and 0xFF,
        // which looks like residue from converting raw numbers to symbols -
        // confirm against the upstream table.
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            //0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30
            SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
            155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50
            SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
             67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70
             37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, //80
             45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, //90
              3, 21, 10, 19, 13,  2, 24, 20,  4, 23, 11,  8, 12,  5,  1, 15, //A0
            191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, //B0
            207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, //C0
            223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, //D0
              9,  7,  6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, //E0
            239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, //F0
        };

        /// <summary>
        /// Creates the model by passing the IBM866 mapping table and the IBM866
        /// code page name to the <see cref="RussianModel"/> constructor.
        /// </summary>
        public Ibm866_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM866)
        {
        }
    }
}
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Russian/Iso_8859_5_RussianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Russian
44 | {
45 | public class Iso_8859_5_RussianModel : RussianModel
46 | {
47 | // ISO-8859-5 lookup table indexed by byte value: class markers (CTR/RET/SYM/NUM) or order numbers.
48 | private static readonly byte[] CHAR_TO_ORDER_MAP = new byte[] {
49 | // Columns X0..XF are the low hex digit of the byte; each row is labelled with its byte range.
50 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0x00-0x0F
51 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x10-0x1F
52 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x20-0x2F
53 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x30-0x3F
54 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 0x40-0x4F
55 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, // 0x50-0x5F
56 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 0x60-0x6F
57 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, // 0x70-0x7F
58 | 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 0x80-0x8F
59 | 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, // 0x90-0x9F
60 | 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, // 0xA0-0xAF
61 | 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, // 0xB0-0xBF
62 | 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, // 0xC0-0xCF
63 | 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, // 0xD0-0xDF
64 | 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, // 0xE0-0xEF
65 | 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, // 0xF0-0xFF
66 | };
67 |
68 | // Passes the ISO-8859-5 table and CodepageName.ISO_8859_5 to the RussianModel base constructor.
69 | public Iso_8859_5_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_5) { }
70 | }
71 | }
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Russian/Koi8r_Model.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Russian
44 | {
45 | public class Koi8r_Model : RussianModel
46 | {
47 | // KOI8-R lookup table indexed by byte value: class markers (CTR/RET/SYM/NUM) or order numbers.
48 | private static readonly byte[] CHAR_TO_ORDER_MAP = new byte[] {
49 | // Columns X0..XF are the low hex digit of the byte; each row is labelled with its byte range.
50 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0x00-0x0F
51 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x10-0x1F
52 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x20-0x2F
53 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x30-0x3F
54 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 0x40-0x4F
55 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, // 0x50-0x5F
56 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 0x60-0x6F
57 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, // 0x70-0x7F
58 | 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 0x80-0x8F
59 | 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, // 0x90-0x9F
60 | 223,224,225, 68,226,227,228,229,230,231,232,233,234,235,236,237, // 0xA0-0xAF
61 | 238,239,240,241,242,243,244,245,246,247,248,249,250,251,NUM,SYM, // 0xB0-0xBF
62 | 27, 3, 21, 28, 13, 2, 39, 19, 26, 4, 23, 11, 8, 12, 5, 1, // 0xC0-0xCF
63 | 15, 16, 9, 7, 6, 14, 24, 10, 17, 18, 20, 25, 30, 29, 22, 54, // 0xD0-0xDF
64 | 59, 37, 44, 58, 41, 48, 53, 46, 55, 42, 60, 36, 49, 38, 31, 34, // 0xE0-0xEF
65 | 35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, // 0xF0-0xFF
66 | };
67 |
68 | // Passes the KOI8-R table and CodepageName.KOI8_R to the RussianModel base constructor.
69 | public Koi8r_Model() : base(CHAR_TO_ORDER_MAP, CodepageName.KOI8_R) { }
70 | }
71 | }
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Russian/Windows_1251_RussianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Russian
44 | {
45 | public class Windows_1251_RussianModel : RussianModel
46 | {
47 | // Windows-1251 lookup table indexed by byte value: class markers (CTR/RET/SYM/NUM/ILL) or order numbers.
48 | private static readonly byte[] CHAR_TO_ORDER_MAP = new byte[] {
49 | // Columns X0..XF are the low hex digit of the byte; each row is labelled with its byte range.
50 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0x00-0x0F
51 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x10-0x1F
52 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x20-0x2F
53 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x30-0x3F
54 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 0x40-0x4F
55 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, // 0x50-0x5F
56 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 0x60-0x6F
57 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, // 0x70-0x7F
58 | 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 0x80-0x8F
59 | 207,208,209,210,211,212,213,214,ILL,216,217,218,219,220,221,222, // 0x90-0x9F
60 | 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, // 0xA0-0xAF
61 | 239,240,241,242,243,244,245,246, 68,247,248,249,250,251,NUM,SYM, // 0xB0-0xBF
62 | 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, // 0xC0-0xCF
63 | 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, // 0xD0-0xDF
64 | 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, // 0xE0-0xEF
65 | 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, // 0xF0-0xFF
66 | };
67 |
68 | // Passes the Windows-1251 table and CodepageName.WINDOWS_1251 to the RussianModel base constructor.
69 | public Windows_1251_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1251) { }
70 | }
71 | }
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Russian/X_Mac_Cyrillic_RussianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Russian
44 | {
45 | public class X_Mac_Cyrillic_RussianModel : RussianModel
46 | {
47 | // x-mac-cyrillic lookup table indexed by byte value: class markers (CTR/RET/SYM/NUM) or order numbers.
48 | private static readonly byte[] CHAR_TO_ORDER_MAP = new byte[] {
49 | // Columns X0..XF are the low hex digit of the byte; each row is labelled with its byte range.
50 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0x00-0x0F
51 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x10-0x1F
52 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x20-0x2F
53 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x30-0x3F
54 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 0x40-0x4F
55 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, // 0x50-0x5F
56 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 0x60-0x6F
57 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, // 0x70-0x7F
58 | 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, // 0x80-0x8F
59 | 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, // 0x90-0x9F
60 | 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 0xA0-0xAF
61 | 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, // 0xB0-0xBF
62 | 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, // 0xC0-0xCF
63 | 239,240,241,242,243,244,245,246,247,248,249,250,251,NUM, 68, 16, // 0xD0-0xDF
64 | 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, // 0xE0-0xEF
65 | 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,CTR, // 0xF0-0xFF
66 | };
67 |
68 | // Passes the x-mac-cyrillic table and CodepageName.X_MAC_CYRILLIC to the RussianModel base constructor.
69 | public X_Mac_Cyrillic_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.X_MAC_CYRILLIC) { }
70 | }
71 | }
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Slovak/Ibm852_SlovakModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangSlovakModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Slovak
44 | {
45 | public class Ibm852_SlovakModel : SlovakModel
46 | {
47 | // Generated by BuildLangModel.py
48 | // On: 2016-09-21 13:33:10.331339
49 |
50 | // Character Mapping Table:
51 | // ILL: illegal character.
52 | // CTR: control character specific to the charset.
53 | // RET: carriage/return.
54 | // SYM: symbol (punctuation) that does not belong to word.
55 | // NUM: 0 - 9.
56 |
57 | // Other characters are ordered by probabilities
58 | // (0 is the most common character in the language).
59 |
60 | // Orders are generic to a language. So the codepoint with order X in
61 | // CHARSET1 maps to the same character as the codepoint with the same
62 | // order X in CHARSET2 for the same language.
63 | // As such, it is possible to get missing order. For instance the
64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
65 | // even though they are both used for French. Same for the euro sign.
66 | // The table is static and never reassigned, so the field is readonly.
67 | private static readonly byte[] CHAR_TO_ORDER_MAP = {
68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
72 | SYM, 1, 20, 15, 11, 2, 29, 30, 17, 4, 18, 7, 10, 12, 3, 0, /* 4X */
73 | 13, 40, 6, 8, 5, 14, 9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */
74 | SYM, 1, 20, 15, 11, 2, 29, 30, 17, 4, 18, 7, 10, 12, 3, 0, /* 6X */
75 | 13, 40, 6, 8, 5, 14, 9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */
76 | 51, 46, 25, 62, 38, 48, 47, 51, 49, 54, 50, 50, 63, 64, 38, 47, /* 8X */
77 | 25, 42, 42, 32, 43, 33, 33, 65, 66, 43, 46, 31, 31, 49,SYM, 24, /* 9X */
78 | 21, 23, 35, 27, 67, 68, 26, 26, 69, 70,SYM, 71, 24, 59,SYM,SYM, /* AX */
79 | SYM,SYM,SYM,SYM,SYM, 21, 72, 41, 59,SYM,SYM,SYM,SYM, 61, 61,SYM, /* BX */
80 | SYM,SYM,SYM,SYM,SYM,SYM, 56, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
81 | 55, 55, 39, 54, 39, 36, 23, 73, 41,SYM,SYM,SYM,SYM, 74, 48,SYM, /* DX */
82 | 35, 58, 32, 52, 52, 36, 28, 28, 44, 27, 44, 60, 22, 22, 75,SYM, /* EX */
83 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 60, 45, 45,SYM,SYM, /* FX */
84 | };
85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
86 |
87 | public Ibm852_SlovakModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
88 | {
89 | }
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Slovene/Ibm852_SloveneModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangSloveneModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Slovene
44 | {
45 | public class Ibm852_SloveneModel : SloveneModel
46 | {
47 | // Generated by BuildLangModel.py
48 | // On: 2016-09-28 22:06:46.134717
49 |
50 | // Character Mapping Table:
51 | // ILL: illegal character.
52 | // CTR: control character specific to the charset.
53 | // RET: carriage/return.
54 | // SYM: symbol (punctuation) that does not belong to word.
55 | // NUM: 0 - 9.
56 |
57 | // Other characters are ordered by probabilities
58 | // (0 is the most common character in the language).
59 |
60 | // Orders are generic to a language. So the codepoint with order X in
61 | // CHARSET1 maps to the same character as the codepoint with the same
62 | // order X in CHARSET2 for the same language.
63 | // As such, it is possible to get missing order. For instance the
64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
65 | // even though they are both used for French. Same for the euro sign.
66 | // The table is static and never reassigned, so the field is readonly.
67 | private static readonly byte[] CHAR_TO_ORDER_MAP = {
68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
72 | SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 4X */
73 | 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,SYM, /* 5X */
74 | SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 6X */
75 | 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,CTR, /* 7X */
76 | 34,249, 29,249,249,249, 37, 34,249, 36,249,249,249,249,249, 37, /* 8X */
77 | 29,249,249, 35,249,249,249,249,249,249,249,249,249,249,SYM, 21, /* 9X */
78 | 32, 30, 31, 39,249,249, 23, 23,249,249,SYM,249, 21,249,SYM,SYM, /* AX */
79 | SYM,SYM,SYM,SYM,SYM, 32,249,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* BX */
80 | SYM,SYM,SYM,SYM,SYM,SYM,249,249,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
81 | 249,249,249, 36,249,249, 30,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* DX */
82 | 31,249, 35,249,249,249, 22, 22,249, 39,249,249, 40, 40,249,SYM, /* EX */
83 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,249,249,249,SYM,SYM, /* FX */
84 | };
85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
86 |
87 | public Ibm852_SloveneModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
88 | {
89 | }
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/src/Core/Models/SingleByte/Thai/Tis_620_ThaiModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangThaiModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Thai
44 | {
45 | public class Tis_620_ThaiModel : ThaiModel
46 | {
47 | // Generated by BuildLangModel.py
48 | // On: 2015-12-04 03:05:06.182099
49 | //
50 | // Character Mapping Table:
51 | // ILL: illegal character.
52 | // CTR: control character specific to the charset.
53 | // RET: carriage/return.
54 | // SYM: symbol (punctuation) that does not belong to word.
55 | // NUM: 0 - 9.
56 | //
57 | // Other characters are ordered by probabilities
58 | // (0 is the most common character in the language).
59 | //
60 | // Orders are generic to a language. So the codepoint with order X in
61 | // CHARSET1 maps to the same character as the codepoint with the same
62 | // order X in CHARSET2 for the same language.
63 | // As such, it is possible to get missing order. For instance the
64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
65 | // even though they are both used for French. Same for the euro sign.
66 |
67 | private static readonly byte[] CHAR_TO_ORDER_MAP = new byte[] {
68 | // Columns X0..XF are the low hex digit of the byte; each row is labelled with its byte range.
69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0x00-0x0F
70 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x10-0x1F
71 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x20-0x2F
72 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 0x30-0x3F
73 | SYM, 66, 70, 67, 80, 78, 87, 85, 73, 79, 93, 88, 84, 68, 77, 81, // 0x40-0x4F
74 | 75,101, 74, 61, 71, 86, 96, 90,103,100, 99,SYM,SYM,SYM,SYM,SYM, // 0x50-0x5F
75 | SYM, 35, 64, 48, 52, 32, 60, 65, 54, 36, 97, 76, 46, 56, 41, 40, // 0x60-0x6F
76 | 59,104, 43, 45, 44, 55, 72, 82, 94, 57, 92,SYM,SYM,SYM,SYM,CTR, // 0x70-0x7F
77 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x80-0x8F
78 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 0x90-0x9F
79 | ILL, 3, 23,105, 15,106, 89, 5, 21, 63, 26, 31,102, 42, 69, 58, // 0xA0-0xAF
80 | 49, 91, 83, 34, 9, 17, 30, 12, 39, 1, 16, 19, 33, 62, 22, 47, // 0xB0-0xBF
81 | 38, 7, 10, 2, 50, 11,107, 8, 28, 37, 13, 18, 98, 4, 53, 95, // 0xC0-0xCF
82 | 14,SYM, 0, 29,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,ILL,ILL,ILL,SYM, // 0xD0-0xDF
83 | 6, 20, 27, 24, 25,108, 51,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,109, // 0xE0-0xEF
84 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,110,111,ILL,ILL,ILL,ILL, // 0xF0-0xFF
85 | };
86 |
87 | // Passes the TIS-620 table and CodepageName.TIS_620 to the ThaiModel base constructor.
88 | public Tis_620_ThaiModel() : base(CHAR_TO_ORDER_MAP, CodepageName.TIS_620)
89 | {
90 | }
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/src/Core/Models/StateMachineModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Kohei TAKETA (Java port)
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | using System;
40 |
41 | namespace UtfUnknown.Core.Models
42 | {
/// <summary>
/// State machine model: bundles the packed lookup tables and metadata that
/// a <c>CodingStateMachine</c> reads while stepping through bytes.
/// </summary>
public abstract class StateMachineModel
{
    /// <summary>
    /// Start state. The coding state machine begins here and returns here on reset.
    /// </summary>
    public const int START = 0;

    /// <summary>
    /// Error state. Probers treat reaching it as "not this charset".
    /// </summary>
    public const int ERROR = 1;

    /// <summary>
    /// "It's me" state. Probers treat reaching it as a positive identification.
    /// </summary>
    public const int ITSME = 2;

    /// <summary>Packed table queried by <see cref="GetClass"/> with the byte value.</summary>
    public BitPackage classTable;

    /// <summary>
    /// Packed transition table, indexed by
    /// <c>currentState * ClassFactor + byteClass</c>.
    /// </summary>
    public BitPackage stateTable;

    /// <summary>Character length per byte class, read for the first byte of a character.</summary>
    public int[] charLenTable;

    /// <summary>
    /// Stores the supplied tables and metadata; no validation is performed.
    /// </summary>
    /// <param name="classTable">Packed byte-to-class table.</param>
    /// <param name="classFactor">Multiplier applied to the state when indexing <paramref name="stateTable"/>.</param>
    /// <param name="stateTable">Packed state transition table.</param>
    /// <param name="charLenTable">Character length for each byte class.</param>
    /// <param name="name">Name reported through <see cref="Name"/>.</param>
    public StateMachineModel(BitPackage classTable, int classFactor,
        BitPackage stateTable, int[] charLenTable, string name)
    {
        Name = name;
        ClassFactor = classFactor;
        this.classTable = classTable;
        this.stateTable = stateTable;
        this.charLenTable = charLenTable;
    }

    /// <summary>Name given to the model at construction.</summary>
    public string Name { get; }

    /// <summary>Multiplier used to turn a state into an offset in <see cref="stateTable"/>.</summary>
    public int ClassFactor { get; }

    /// <summary>
    /// Looks up the class of a byte in <see cref="classTable"/>.
    /// </summary>
    /// <param name="b">Byte to classify.</param>
    /// <returns>The value unpacked from the class table at index <paramref name="b"/>.</returns>
    public int GetClass(byte b) => classTable.Unpack((int)b);
}
86 | }
--------------------------------------------------------------------------------
/src/Core/Probers/CodingStateMachine.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is mozilla.org code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Shy Shalom
23 | * Kohei TAKETA (Java port)
24 | * Rudi Pettazzi (C# port)
25 | *
26 | * Alternatively, the contents of this file may be used under the terms of
27 | * either the GNU General Public License Version 2 or later (the "GPL"), or
28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 | * in which case the provisions of the GPL or the LGPL are applicable instead
30 | * of those above. If you wish to allow use of your version of this file only
31 | * under the terms of either the GPL or the LGPL, and not to allow others to
32 | * use your version of this file under the terms of the MPL, indicate your
33 | * decision by deleting the provisions above and replace them with the notice
34 | * and other provisions required by the GPL or the LGPL. If you do not delete
35 | * the provisions above, a recipient may use your version of this file under
36 | * the terms of any one of the MPL, the GPL or the LGPL.
37 | *
38 | * ***** END LICENSE BLOCK ***** */
39 |
40 | using UtfUnknown.Core.Models;
41 |
42 | namespace UtfUnknown.Core.Probers
43 | {
/// <summary>
/// Parallel state machine for the Coding Scheme Method. Steps through a
/// <see cref="StateMachineModel"/> one byte at a time.
/// </summary>
public class CodingStateMachine
{
    private readonly StateMachineModel model;
    private int currentState = StateMachineModel.START;
    private int currentCharLen;

    /// <summary>
    /// Creates a machine positioned at <see cref="StateMachineModel.START"/>.
    /// </summary>
    /// <param name="model">Model supplying the class, state and length tables.</param>
    public CodingStateMachine(StateMachineModel model)
    {
        this.model = model;
    }

    /// <summary>
    /// Character length recorded the last time a byte was consumed while in the start state.
    /// </summary>
    public int CurrentCharLen => currentCharLen;

    /// <summary>Name of the underlying model.</summary>
    public string ModelName => model.Name;

    /// <summary>
    /// Consumes one byte and advances the machine.
    /// </summary>
    /// <param name="b">Next input byte.</param>
    /// <returns>The state reached after consuming <paramref name="b"/>.</returns>
    public int NextState(byte b)
    {
        int byteClass = model.GetClass(b);

        // A byte seen in the start state is the first byte of a character,
        // so its class also tells us the character's length.
        bool atCharacterStart = currentState == StateMachineModel.START;
        if (atCharacterStart)
        {
            currentCharLen = model.charLenTable[byteClass];
        }

        // The transition table is laid out as one row of ClassFactor entries per state.
        int tableIndex = currentState * model.ClassFactor + byteClass;
        currentState = model.stateTable.Unpack(tableIndex);

        return currentState;
    }

    /// <summary>
    /// Puts the machine back into the start state. The recorded character length is left untouched.
    /// </summary>
    public void Reset() => currentState = StateMachineModel.START;
}
92 | }
--------------------------------------------------------------------------------
/src/Core/Probers/MultiByte/Chinese/Big5Prober.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Shy Shalom
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | using System.Text;
40 |
41 | using UtfUnknown.Core.Analyzers.Chinese;
42 | using UtfUnknown.Core.Models;
43 | using UtfUnknown.Core.Models.MultiByte.Chinese;
44 |
45 | namespace UtfUnknown.Core.Probers.MultiByte.Chinese
46 | {
/// <summary>
/// Prober for the Big5 charset. Runs a coding state machine over the input
/// and feeds every completed character to a distribution analyser, whose
/// confidence is reported as the prober's confidence.
/// </summary>
public class Big5Prober : CharsetProber
{
    private CodingStateMachine codingSM;
    private BIG5DistributionAnalyser distributionAnalyser;

    // lastChar[0] holds the final byte of the previous buffer so that a
    // character split across two HandleData calls can be reassembled.
    private byte[] lastChar = new byte[2];

    /// <summary>
    /// Creates the prober with a Big5 state machine model and distribution analyser.
    /// </summary>
    public Big5Prober()
    {
        codingSM = new CodingStateMachine(new BIG5SMModel());
        distributionAnalyser = new BIG5DistributionAnalyser();
        Reset();
    }

    /// <summary>
    /// Feeds <paramref name="len"/> bytes of <paramref name="buf"/>, starting at
    /// <paramref name="offset"/>, to the prober.
    /// </summary>
    /// <param name="buf">Input buffer.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine; an empty window is accepted.</param>
    /// <returns>The probing state after the bytes have been processed.</returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int max = offset + len;

        for (int i = offset; i < max; i++)
        {
            var codingState = codingSM.NextState(buf[i]);
            if (codingState == StateMachineModel.ERROR)
            {
                state = ProbingState.NotMe;
                break;
            }
            if (codingState == StateMachineModel.ITSME)
            {
                state = ProbingState.FoundIt;
                break;
            }
            if (codingState == StateMachineModel.START)
            {
                // Back in the start state: a complete character has just ended at index i.
                int charLen = codingSM.CurrentCharLen;
                if (i == offset)
                {
                    // The character began in the previous buffer; combine the
                    // remembered byte with the first byte of this one.
                    lastChar[1] = buf[offset];
                    distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                }
                else
                {
                    distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
                }
            }
        }

        // Only remember a trailing byte when this call actually covered one.
        // With an empty window, buf[max - 1] lies outside the requested range
        // (and is index -1 when offset is 0).
        if (len > 0)
        {
            lastChar[0] = buf[max - 1];
        }

        if (state == ProbingState.Detecting)
        {
            if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
            {
                state = ProbingState.FoundIt;
            }
        }

        return state;
    }

    /// <summary>
    /// Resets the state machine and the distribution analyser and returns to the detecting state.
    /// </summary>
    public override void Reset()
    {
        codingSM.Reset();
        state = ProbingState.Detecting;
        distributionAnalyser.Reset();
    }

    /// <summary>Returns the Big5 codepage name.</summary>
    public override string GetCharsetName()
    {
        return CodepageName.BIG5;
    }

    /// <summary>
    /// Returns the confidence computed by the distribution analyser.
    /// </summary>
    /// <param name="status">Not used by this prober.</param>
    public override float GetConfidence(StringBuilder status = null)
    {
        return distributionAnalyser.GetConfidence();
    }
}
119 | }
120 |
--------------------------------------------------------------------------------
/src/Core/Probers/MultiByte/Chinese/EUCTWProber.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Shy Shalom
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | using System.Text;
40 |
41 | using UtfUnknown.Core.Analyzers.Chinese;
42 | using UtfUnknown.Core.Models;
43 | using UtfUnknown.Core.Models.MultiByte.Chinese;
44 |
45 | namespace UtfUnknown.Core.Probers.MultiByte.Chinese
46 | {
/// <summary>
/// Prober for the EUC-TW charset. Runs a coding state machine over the input
/// and feeds every completed character to a distribution analyser, whose
/// confidence is reported as the prober's confidence.
/// </summary>
public class EUCTWProber : CharsetProber
{
    private CodingStateMachine codingSM;
    private EUCTWDistributionAnalyser distributionAnalyser;

    // lastChar[0] holds the final byte of the previous buffer so that a
    // character split across two HandleData calls can be reassembled.
    private byte[] lastChar = new byte[2];

    /// <summary>
    /// Creates the prober with an EUC-TW state machine model and distribution analyser.
    /// </summary>
    public EUCTWProber()
    {
        codingSM = new CodingStateMachine(new EUCTWSMModel());
        distributionAnalyser = new EUCTWDistributionAnalyser();
        Reset();
    }

    /// <summary>
    /// Feeds <paramref name="len"/> bytes of <paramref name="buf"/>, starting at
    /// <paramref name="offset"/>, to the prober.
    /// </summary>
    /// <param name="buf">Input buffer.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine; an empty window is accepted.</param>
    /// <returns>The probing state after the bytes have been processed.</returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int max = offset + len;

        // Start at offset, not 0: bytes before the window are not part of this call.
        for (int i = offset; i < max; i++)
        {
            int codingState = codingSM.NextState(buf[i]);
            if (codingState == StateMachineModel.ERROR)
            {
                state = ProbingState.NotMe;
                break;
            }

            if (codingState == StateMachineModel.ITSME)
            {
                state = ProbingState.FoundIt;
                break;
            }

            if (codingState == StateMachineModel.START)
            {
                // Back in the start state: a complete character has just ended at index i.
                int charLen = codingSM.CurrentCharLen;
                if (i == offset)
                {
                    // The character began in the previous buffer; combine the
                    // remembered byte with the first byte of this one.
                    lastChar[1] = buf[offset];
                    distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                }
                else
                {
                    distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
                }
            }
        }

        // Only remember a trailing byte when this call actually covered one.
        // With an empty window, buf[max - 1] lies outside the requested range
        // (and is index -1 when offset is 0).
        if (len > 0)
        {
            lastChar[0] = buf[max - 1];
        }

        if (state == ProbingState.Detecting)
        {
            if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
            {
                state = ProbingState.FoundIt;
            }
        }

        return state;
    }

    /// <summary>Returns the EUC-TW codepage name.</summary>
    public override string GetCharsetName()
    {
        return CodepageName.EUC_TW;
    }

    /// <summary>
    /// Resets the state machine and the distribution analyser and returns to the detecting state.
    /// </summary>
    public override void Reset()
    {
        codingSM.Reset();
        state = ProbingState.Detecting;
        distributionAnalyser.Reset();
    }

    /// <summary>
    /// Returns the confidence computed by the distribution analyser.
    /// </summary>
    /// <param name="status">Not used by this prober.</param>
    public override float GetConfidence(StringBuilder status = null)
    {
        return distributionAnalyser.GetConfidence();
    }
}
121 | }
122 |
--------------------------------------------------------------------------------
/src/Core/Probers/MultiByte/UTF8Prober.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Shy Shalom
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | using System.Text;
40 |
41 | using UtfUnknown.Core.Models;
42 | using UtfUnknown.Core.Models.MultiByte;
43 |
44 | namespace UtfUnknown.Core.Probers.MultiByte
45 | {
/// <summary>
/// Prober for UTF-8. Validates the byte stream with a coding state machine
/// and derives its confidence from the number of multi-byte characters seen.
/// </summary>
public class UTF8Prober : CharsetProber
{
    // Factor applied to the "unlikely" probability for each multi-byte character seen.
    private const float ONE_CHAR_PROB = 0.50f;

    private readonly CodingStateMachine codingSM;
    private int numOfMBChar;

    /// <summary>
    /// Creates the prober with a UTF-8 state machine model and resets it.
    /// </summary>
    public UTF8Prober()
    {
        codingSM = new CodingStateMachine(new UTF8_SMModel());
        Reset();
    }

    /// <summary>Returns the UTF-8 codepage name.</summary>
    public override string GetCharsetName()
    {
        return CodepageName.UTF8;
    }

    /// <summary>
    /// Resets the state machine, clears the multi-byte character count and
    /// returns to the detecting state.
    /// </summary>
    public override void Reset()
    {
        codingSM.Reset();
        numOfMBChar = 0;
        state = ProbingState.Detecting;
    }

    /// <summary>
    /// Feeds <paramref name="len"/> bytes of <paramref name="buf"/>, starting at
    /// <paramref name="offset"/>, to the prober.
    /// </summary>
    /// <param name="buf">Input buffer.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine.</param>
    /// <returns>The probing state after the bytes have been processed.</returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int end = offset + len;
        int pos = offset;

        while (pos < end)
        {
            int codingState = codingSM.NextState(buf[pos++]);

            if (codingState == StateMachineModel.ERROR)
            {
                state = ProbingState.NotMe;
                break;
            }

            if (codingState == StateMachineModel.ITSME)
            {
                state = ProbingState.FoundIt;
                break;
            }

            // Back in the start state means a character just completed;
            // count it when it was encoded with two or more bytes.
            if (codingState == StateMachineModel.START && codingSM.CurrentCharLen >= 2)
            {
                numOfMBChar++;
            }
        }

        if (state == ProbingState.Detecting && GetConfidence() > SHORTCUT_THRESHOLD)
        {
            state = ProbingState.FoundIt;
        }

        return state;
    }

    /// <summary>
    /// Computes the confidence from the multi-byte character count:
    /// 0.99 once six or more have been seen, otherwise
    /// <c>1 - 0.99 * ONE_CHAR_PROB^count</c>.
    /// </summary>
    /// <param name="status">Not used by this prober.</param>
    public override float GetConfidence(StringBuilder status = null)
    {
        if (numOfMBChar >= 6)
        {
            return 0.99f;
        }

        float unlike = 0.99f;
        int remaining = numOfMBChar;
        while (remaining-- > 0)
        {
            unlike *= ONE_CHAR_PROB;
        }

        float confidence = 1.0f - unlike;
        return confidence;
    }
}
126 | }
--------------------------------------------------------------------------------
/src/Core/Probers/ProbingState.cs:
--------------------------------------------------------------------------------
1 | namespace UtfUnknown.Core.Probers
2 | {
/// <summary>
/// State a prober reports after handling data.
/// </summary>
public enum ProbingState
{
    /// <summary>No sure answer yet, but the caller can ask for the confidence.</summary>
    Detecting = 0,

    /// <summary>Positive answer.</summary>
    FoundIt = 1,

    /// <summary>Negative answer.</summary>
    NotMe = 2
}
18 | }
--------------------------------------------------------------------------------
/src/DetectionDetail.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Runtime.CompilerServices;
4 | using System.Text;
5 | using UtfUnknown.Core;
6 | using UtfUnknown.Core.Probers;
7 |
8 | [assembly: InternalsVisibleTo("UtfUnknown.Tests, PublicKey=" +
9 | "002400000480000094000000060200000024000052534131000400000100010029f6b4defac763" +
10 | "66721687460b44b7619e8e19a411f785279316fdae2f6965edfa4a460304fe8b4ed796d5356a1c" +
11 | "225131b9087983d9ff9530df9307eab17d88cd4f1005a45f6f35523445d1ff7323322f3060cffc" +
12 | "0d70d0cb1b4b7d46081bbead31844927aaadb0508b64bf298de5abe5ea5cca8b92490c961b7b75" +
13 | "13c2c2a9")]
14 | namespace UtfUnknown
15 | {
/// <summary>
/// Detailed result of a detection.
/// </summary>
public class DetectionDetail
{
    /// <summary>
    /// Maps codepage names that .NET does not support to a nearly identical
    /// codepage name that it does support.
    /// </summary>
    private static readonly Dictionary<string, string> FixedToSupportCodepageName =
        new Dictionary<string, string>
        {
            // CP949 is superset of ks_c_5601-1987 (see https://github.com/CharsetDetector/UTF-unknown/pull/74#issuecomment-550362133)
            {CodepageName.CP949, CodepageName.KS_C_5601_1987},
            {CodepageName.ISO_2022_CN, CodepageName.X_CP50227},
        };

    /// <summary>
    /// New result.
    /// </summary>
    /// <param name="encodingShortName">Short name of the detected encoding; also used to resolve <see cref="Encoding"/>.</param>
    /// <param name="confidence">Confidence of the detection.</param>
    /// <param name="prober">Prober that produced the result, if any.</param>
    /// <param name="time">Time spent on the detection, if measured.</param>
    /// <param name="statusLog">Status log text, if any.</param>
    public DetectionDetail(string encodingShortName, float confidence, CharsetProber prober = null,
        TimeSpan? time = null, string statusLog = null)
    {
        EncodingName = encodingShortName;
        Confidence = confidence;
        Encoding = GetEncoding(encodingShortName);
        Prober = prober;
        Time = time;
        StatusLog = statusLog;
    }

    /// <summary>
    /// New result taken from a prober: its charset name, confidence and dumped status.
    /// </summary>
    /// <param name="prober">Prober to read the result from.</param>
    /// <param name="time">Time spent on the detection, if measured.</param>
    public DetectionDetail(CharsetProber prober, TimeSpan? time = null)
        : this(prober.GetCharsetName(), prober.GetConfidence(), prober, time, prober.DumpStatus())
    {
    }

    /// <summary>
    /// The (short) name of the detected encoding. For full details, check <see cref="Encoding"/>.
    /// </summary>
    public string EncodingName { get; }

    /// <summary>
    /// The detected encoding. May be null when the name cannot be resolved to an encoding.
    /// </summary>
    public Encoding Encoding { get; set; }

    /// <summary>
    /// The confidence of the found encoding. Between 0 and 1.
    /// </summary>
    public float Confidence { get; set; }

    /// <summary>
    /// The prober used for the detection.
    /// </summary>
    public CharsetProber Prober { get; set; }

    /// <summary>
    /// A Byte Order Mark was detected.
    /// </summary>
    public bool HasBOM { get; set; }

    /// <summary>
    /// The time spent.
    /// </summary>
    public TimeSpan? Time { get; set; }

    /// <summary>
    /// Status log text supplied at construction.
    /// </summary>
    public string StatusLog { get; set; }

    /// <summary>
    /// Returns a one-line summary with the encoding name, confidence and BOM flag.
    /// </summary>
    public override string ToString()
    {
        return $"Detected {EncodingName} with confidence of {Confidence}. (BOM: {HasBOM})";
    }

    /// <summary>
    /// Resolves a short encoding name to an <see cref="System.Text.Encoding"/>, first
    /// replacing names listed in <see cref="FixedToSupportCodepageName"/>.
    /// </summary>
    /// <param name="encodingShortName">Short name of the encoding.</param>
    /// <returns>
    /// The encoding. When the name is rejected as unsupported, the result of the
    /// code pages provider lookup on targets that have it, otherwise null.
    /// </returns>
    internal static Encoding GetEncoding(string encodingShortName)
    {
        var encodingName = FixedToSupportCodepageName.TryGetValue(encodingShortName, out var supportCodepageName)
            ? supportCodepageName
            : encodingShortName;
        try
        {
            return Encoding.GetEncoding(encodingName);
        }
        catch (Exception exception) when
            (exception is ArgumentException || // unsupported name
             exception is NotSupportedException)
        {
#if NETSTANDARD && !NETSTANDARD1_0 || NETCOREAPP3_0
            return CodePagesEncodingProvider.Instance.GetEncoding(encodingName);
#else
            return null;
#endif
        }
    }
}
112 | }
--------------------------------------------------------------------------------
/src/DetectionResult.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Linq;
3 |
4 | namespace UtfUnknown
5 | {
/// <summary>
/// Result of a detection.
/// </summary>
public class DetectionResult
{
    /// <summary>
    /// Empty result; <see cref="Details"/> is left null.
    /// </summary>
    public DetectionResult()
    {
    }

    /// <summary>
    /// Multiple results.
    /// </summary>
    /// <param name="details">All detection details; the first entry is reported as <see cref="Detected"/>.</param>
    public DetectionResult(IList<DetectionDetail> details)
    {
        Details = details;
    }

    /// <summary>
    /// Single result.
    /// </summary>
    /// <param name="detectionDetail">The only detection detail.</param>
    public DetectionResult(DetectionDetail detectionDetail)
    {
        Details = new List<DetectionDetail> { detectionDetail };
    }

    /// <summary>
    /// Get the best detection: the first entry of <see cref="Details"/>, or null
    /// when there are no details.
    /// </summary>
    public DetectionDetail Detected => Details?.FirstOrDefault();

    /// <summary>
    /// All results.
    /// </summary>
    public IList<DetectionDetail> Details { get; set; }

    /// <summary>
    /// Returns the best detection followed by one line per detail.
    /// An empty result (null <see cref="Details"/>) yields an empty list
    /// instead of throwing.
    /// </summary>
    public override string ToString()
    {
        // string.Join throws ArgumentNullException for a null sequence, so
        // fall back to an empty one when there are no details.
        var detailLines = Details?.Select(d => d.ToString()) ?? Enumerable.Empty<string>();
        var joinedDetails = string.Join("\n- ", detailLines);

        return $"{nameof(Detected)}: {Detected}, \n{nameof(Details)}:\n - {joinedDetails}";
    }
}
50 | }
--------------------------------------------------------------------------------
/src/UTF-unknown.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net40;netstandard1.0;netstandard1.3;netstandard2.0;netcoreapp3.0
5 |
6 |
7 |
8 | UtfUnknown
9 | UTF.Unknown
10 | 2.0.0
11 |
12 | Full
13 |
14 |
15 |
16 | Library
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 | Julian Verdurmen, Rustam Sayfutdinov, Rudi Pettazzi, Shy Shalom
28 | en-US
29 | UTF Unknown
30 | Detect character set for files, streams and other bytes.
31 |
32 | This package is based on Ude and since version 2 also on uchardet, which are ports of the Mozilla Universal Charset Detector.
33 |
34 | Features:
35 | - Easy to use API
36 | - Supports frameworks:
37 | - .NET 5+
38 | - .NET Standard 1.0+
39 | - .NET Core 3.0+
40 | - .NET Framework 4.0+
41 | - Strong named
42 | - XML documentation included
43 |
44 | Compared to Ude:
45 |
46 | - Refactor of API, namespaces and deadcode removal
47 | - Added some docs
48 | - Improve error handling
49 | - Improved unit tests
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 | charset;detection;unicode;ascii;netstandard;chardet
59 |
60 | - See https://github.com/CharsetDetector/UTF-unknown/releases
61 |
62 | https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/master/logo.png
63 | https://github.com/CharsetDetector/UTF-unknown
64 | https://github.com/CharsetDetector/UTF-unknown/blob/master/license/MPL-1.1.txt
65 | false
66 | git
67 | https://github.com/CharsetDetector/UTF-unknown
68 | UtfUnknown
69 | True
70 | UtfUnknown.snk
71 |
72 |
73 |
74 | bin\$(Configuration)\$(TargetFramework)\UtfUnknown.xml
75 | 1701;1702;1705,1570,1591
76 | 2.0.0.0
77 | 2.0.0.0
78 |
79 |
80 |
--------------------------------------------------------------------------------
/src/UtfUnknown.snk:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/src/UtfUnknown.snk
--------------------------------------------------------------------------------
/tests/BitPackageTest.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Kohei TAKETA (Java port)
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | using NUnit.Framework;
40 | using UtfUnknown.Core;
41 |
42 | namespace UtfUnknown.Tests
43 | {
/// <summary>
/// Tests for packing values with <c>BitPackage.Pack4bits</c> and reading
/// them back with <c>BitPackage.Unpack</c>.
/// </summary>
public class BitPackageTest
{
    /// <summary>
    /// Packing eight identical 4-bit values yields the expected 32-bit integer.
    /// </summary>
    [Test]
    public void TestPack()
    {
        // NUnit's signature is AreEqual(expected, actual); keeping that order
        // makes failure messages report the right value as "expected".
        Assert.AreEqual(0, BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0));
        Assert.AreEqual(286331153, BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1));       // 0x11111111
        Assert.AreEqual(572662306, BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2));       // 0x22222222
        Assert.AreEqual(-1, BitPackage.Pack4bits(15, 15, 15, 15, 15, 15, 15, 15));      // all bits set
    }

    /// <summary>
    /// Sixteen values packed into two integers unpack to their own index.
    /// </summary>
    [Test]
    public void TestUnpack()
    {
        int[] data = new int[]
        {
            BitPackage.Pack4bits(0, 1, 2, 3, 4, 5, 6, 7),
            BitPackage.Pack4bits(8, 9, 10, 11, 12, 13, 14, 15)
        };

        BitPackage pkg = new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            data);

        for (int i = 0; i < 16; i++)
        {
            int n = pkg.Unpack(i);
            Assert.AreEqual(i, n);
        }
    }
}
76 | }
77 |
--------------------------------------------------------------------------------
/tests/Data/README.md:
--------------------------------------------------------------------------------
1 | These text fragments have been taken from the following sources:
2 |
3 | - Wikipedia - http://wikipedia.org
4 | - Project Gutenberg - http://www.gutenberg.org
5 |
6 | The test files are automatically discovered.
7 | The directory name should be the expected encoding.
8 | If the directory name contains a `(`, the expected encoding is the part of the name before it.
9 |
--------------------------------------------------------------------------------
/tests/Data/big5/1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/big5/1.txt
--------------------------------------------------------------------------------
/tests/Data/cp949/cp949_1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/cp949/cp949_1.txt
--------------------------------------------------------------------------------
/tests/Data/cp949/cp949_2.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/cp949/cp949_2.txt
--------------------------------------------------------------------------------
/tests/Data/euc-jp/1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/euc-jp/1.txt
--------------------------------------------------------------------------------
/tests/Data/euc-kr/euc1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/euc-kr/euc1.txt
--------------------------------------------------------------------------------
/tests/Data/euc-kr/euc2.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/euc-kr/euc2.txt
--------------------------------------------------------------------------------
/tests/Data/gb18030/1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/gb18030/1.txt
--------------------------------------------------------------------------------
/tests/Data/ibm852/lang_ce_ibm852.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/ibm852/lang_ce_ibm852.txt
--------------------------------------------------------------------------------
/tests/Data/ibm855/1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/ibm855/1.txt
--------------------------------------------------------------------------------
/tests/Data/ibm855/2.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/ibm855/2.txt
--------------------------------------------------------------------------------
/tests/Data/ibm866/1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/ibm866/1.txt
--------------------------------------------------------------------------------
/tests/Data/iso-2022-jp/1.txt:
--------------------------------------------------------------------------------
1 | ========================================================================
2 | $B%3%s%=!<%k(J $B%"%W%j%1!<%7%g%s(J : universalchardet $B%W%m%8%'%/%H$N35MW(J
3 | ========================================================================
4 |
5 | $B$3$N(J universalchardet $B%"%W%j%1!<%7%g%s$O!"(JAppWizard $B$K$h$C$F:n@.$5$l$^$7$?!#(J
6 |
7 | $B$3$N%U%!%$%k$K$O!"(Juniversalchardet $B%"%W%j%1!<%7%g%s$r9=@.$9$k3F%U%!%$%k$N(J
8 | $BFbMF$N35N,$,5-=R$5$l$F$$$^$9!#(J
9 |
10 |
11 | universalchardet.vcproj
12 | $B$3$l$O!"%"%W%j%1!<%7%g%s(J $B%&%#%6!<%I$G@8@.$5$l$k(J VC++ $B%W%m%8%'%/%H$N%a%$%s$N(J
13 | $B%W%m%8%'%/%H(J $B%U%!%$%k$G$9!#(J
14 | $B%U%!%$%k$r@8@.$7$?(J Visual C++ $B$N%P!<%8%g%s>pJs$H!"%"%W%j%1!<%7%g%s(J
15 | $B%&%#%6!<%I$GA*Br$7$?%W%i%C%H%U%)!<%`!"9=@.!"$*$h$S%W%m%8%'%/%H$N5!G=$K4X$9$k(J
16 | $B>pJs$,5-=R$5$l$F$$$^$9!#(J
17 |
18 | universalchardet.cpp
19 | $B$3$l$O!"%a%$%s$N%"%W%j%1!<%7%g%s(J $B%=!<%9(J $B%U%!%$%k$G$9!#(J
20 |
21 | /////////////////////////////////////////////////////////////////////////////
22 | $B$=$NB>$NI8=`%U%!%$%k(J :
23 |
24 | StdAfx.h, StdAfx.cpp
25 | $B$3$l$i$N%U%!%$%k$O!"%3%s%Q%$%k:Q$_%X%C%@!<(J (PCH) $B%U%!%$%k(J
26 | universalchardet.pch $B$H%W%j%3%s%Q%$%k:Q$_7?%U%!%$%k(J StdAfx.obj $B$r(J
27 | $B%S%k%I$9$k$?$a$K;HMQ$7$^$9!#(J
28 |
29 | /////////////////////////////////////////////////////////////////////////////
30 | $B$=$NB>$N%a%b(J :
31 |
32 | AppWizard $B$G$O(J "TODO:" $B%3%a%s%H$r;HMQ$7$F!"%f!<%6!<$,DI2C$^$?$O%+%9%?%^%$%:$9$k(J
33 | $B%=!<%9ItJ,$r<($7$^$9!#(J
34 |
35 | /////////////////////////////////////////////////////////////////////////////
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/tests/Data/iso-2022-kr/iso1.txt:
--------------------------------------------------------------------------------
1 | $)C?*;g@{ ?9F4Q, ?*;g@{ @N0#@87N<-@G ?9J4B4Y. GQ19 ;g6w5i@: 4k03 !.:OGQ(]Ay[)!/@L6s0m :N8#8g, B*0T !.:O!/@87N :N8& 6'55 @V4Y. 3*@L0! 89@: <<4k?!<-4B 0#H$ !.@L:O(l$]A)!/@L6s4B G%Gv@; >21b55 GO8g, A&GQ@{@87N ':OA6<1'@L6s :N8#1b55 GQ4Y. 0z0E 4kGQ9N19 A$:N?!<-4B :OA6<1(:OGQ)@; ?> 2@LAv >J0T 5G>z4Y.[3] 0#H$ @O:N 9]0xAV@G :821b55 GQ4Y.
4 |
5 | 4kGQ9N19?!<-4B A6<1@L3* :OA6<1@L6u G%Gv@; 1bGGGO8g[CbC3 GJ?d], A6<1@: A6<1 ?UA63* 4\1:A6<1 5n@; 0!8.E04B 8;7N >44Y. 32:O0#@G 137y0! H0<:H-5G8i<- F/:0GQ ;vC$0! >x4B !.:OCx!/@L3* !.:OBJ!/@L6u G%Gv55 89@L >2@L0m @V4B C_<<@L4Y. 4kGQ9N19?!<-4B GQ19 @|@o @L@|@G G`A$ 18?*@N Fr>H3255!$Fr>H:O55!$GT0f3255!$GT0f:O55!$H2GX558& !.@L:O 555(l$]AgiT3)!/6s :N8#1b55 GQ4Y.
6 |
7 | 0x=D@{@N ?5>n 8mD*@: DPRK(Democratic People's Republic of Korea)@L8g :8Ek 'A6<19]55(GQ9]55) :OBJ'@; @G9LGO4B North Korea6s0m :N8%4Y.
8 |
--------------------------------------------------------------------------------
/tests/Data/iso-8859-1/1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-1/1.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-1/3.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-1/3.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-1/4.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-1/4.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-11/lang_th_iso-8859-11.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-11/lang_th_iso-8859-11.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-13/lang_et_iso-8859-13.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-13/lang_et_iso-8859-13.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-15/lang_da_iso-8859-15.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-15/lang_da_iso-8859-15.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-2/lang_ce_iso-8859-2.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-2/lang_ce_iso-8859-2.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-3/lang_eo_iso-8859-3.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-3/lang_eo_iso-8859-3.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-4/lang_et_iso-8859-4.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-4/lang_et_iso-8859-4.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-5/lang_ru_iso-8859-5.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-5/lang_ru_iso-8859-5.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-6/lang_ar_iso-8859-6.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-6/lang_ar_iso-8859-6.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-7/greek.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-7/greek.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-7/lang_le_iso-8859-7.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-7/lang_le_iso-8859-7.txt
--------------------------------------------------------------------------------
/tests/Data/iso-8859-9/lang_tr_iso-8859-9.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/iso-8859-9/lang_tr_iso-8859-9.txt
--------------------------------------------------------------------------------
/tests/Data/koi8-r/1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/koi8-r/1.txt
--------------------------------------------------------------------------------
/tests/Data/koi8-r/2.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/koi8-r/2.txt
--------------------------------------------------------------------------------
/tests/Data/shift-jis/1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/shift-jis/1.txt
--------------------------------------------------------------------------------
/tests/Data/shift-jis/2.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/shift-jis/2.txt
--------------------------------------------------------------------------------
/tests/Data/shift-jis/3.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/shift-jis/3.txt
--------------------------------------------------------------------------------
/tests/Data/shift-jis/4.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/shift-jis/4.txt
--------------------------------------------------------------------------------
/tests/Data/tis-620/lang_th_tis-620.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/tis-620/lang_th_tis-620.txt
--------------------------------------------------------------------------------
/tests/Data/utf-16be/lang_fr_utf-16.be:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/utf-16be/lang_fr_utf-16.be
--------------------------------------------------------------------------------
/tests/Data/utf-16le/lang_ko_utf-16.le:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/utf-16le/lang_ko_utf-16.le
--------------------------------------------------------------------------------
/tests/Data/utf-32le/lang_fr_utf-32.le:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/utf-32le/lang_fr_utf-32.le
--------------------------------------------------------------------------------
/tests/Data/utf-8/1.txt:
--------------------------------------------------------------------------------
1 | 역사적 예수 연구자들은 복음서나 사도들의 서신서 속의 교리적 예수가 아닌, 역사적 인간으로서의 예수를 추구한다. 20세기 이후 역사적 예수에 대한 연구는 마커스 보그, 가톨릭 수사 출신으로 환속한 도미닉 크로산 등의 예수 세미나 운동 시작을 통해 진행되고 있다. 대한민국에서는 한국 기독교 연구소(소장 김준우)에서 크로산 등의 신학 문서들을 출판하여, 역사적 예수에 대한 연구 성과들을 소개하고 있다.
2 |
--------------------------------------------------------------------------------
/tests/Data/utf-8/2.txt:
--------------------------------------------------------------------------------
1 | 북조선 사람들은 흔히 자국을 조선(朝鮮)이나 공화국(共和國)이라고 부른다. 지역적으로 한반도의 남쪽(대한민국)을 의미하는 남조선(南朝鮮)에 대응하여 북조선(北朝鮮)이라고도 부른다.
2 |
3 | 대한민국에서는 남·북의 대치 상황과 맞물려 공식 명칭인 '조선민주주의인민공화국'을 잘 사용하지 않는다. 한국 사람들은 대개 ‘북한(北韓)’이라고 부르며, 짧게 ‘북’으로 부를 때도 있다. 나이가 많은 세대에서는 간혹 ‘이북(以北)’이라는 표현을 쓰기도 하며, 제한적으로 '북조선'이라 부르기도 한다. 과거 대한민국 정부에서는 북조선(북한)을 옛 소련의 괴뢰정권으로 비하하는 ‘북괴(北傀)’로 비칭하였으나 관계 개선과 함께 잘 쓰이지 않게 되었다.[3] 간혹 일부 반공주의 보수단체들은 북괴라는 표현을 쓰기도 한다.
4 |
5 | 대한민국에서는 조선이나 북조선이란 표현을 기피하며[출처 필요], 조선은 조선 왕조나 단군조선 등을 가리키는 말로 쓴다. 남북간의 교류가 활성화되면서 특별한 색채가 없는 ‘북측’이나 ‘북쪽’이란 표현도 많이 쓰이고 있는 추세이다. 대한민국에서는 한국 전쟁 이전의 행정 구역인 평안남도·평안북도·함경남도·함경북도·황해도를 ‘이북 5도(以北五道)’라 부르기도 한다.
6 |
7 | 공식적인 영어 명칭은 DPRK(Democratic People's Republic of Korea)이며 보통 '조선반도(한반도) 북쪽'을 의미하는 North Korea라고 부른다.
8 |
--------------------------------------------------------------------------------
/tests/Data/utf-8/3.txt:
--------------------------------------------------------------------------------
1 | \\\\\\\{ssss } siaaaaaaaaa ssssi à è ì
2 |
--------------------------------------------------------------------------------
/tests/Data/utf-8/4.txt:
--------------------------------------------------------------------------------
1 | ========================================================================
2 | コンソール アプリケーション : universalchardet プロジェクトの概要
3 | ========================================================================
4 |
5 | この universalchardet アプリケーションは、AppWizard によって作成されました。
6 |
7 | このファイルには、universalchardet アプリケーションを構成する各ファイルの
8 | 内容の概略が記述されています。
9 |
10 |
11 | universalchardet.vcproj
12 | これは、アプリケーション ウィザードで生成される VC++ プロジェクトのメインの
13 | プロジェクト ファイルです。
14 | ファイルを生成した Visual C++ のバージョン情報と、アプリケーション
15 | ウィザードで選択したプラットフォーム、構成、およびプロジェクトの機能に関する
16 | 情報が記述されています。
17 |
18 | universalchardet.cpp
19 | これは、メインのアプリケーション ソース ファイルです。
20 |
21 | /////////////////////////////////////////////////////////////////////////////
22 | その他の標準ファイル :
23 |
24 | StdAfx.h, StdAfx.cpp
25 | これらのファイルは、コンパイル済みヘッダー (PCH) ファイル
26 | universalchardet.pch とプリコンパイル済み型ファイル StdAfx.obj を
27 | ビルドするために使用します。
28 |
29 | /////////////////////////////////////////////////////////////////////////////
30 | その他のメモ :
31 |
32 | AppWizard では "TODO:" コメントを使用して、ユーザーが追加またはカスタマイズする
33 | ソース部分を示します。
34 |
35 | /////////////////////////////////////////////////////////////////////////////
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/tests/Data/utf-8/5.txt:
--------------------------------------------------------------------------------
1 | 仙人洞文化係話到萬年大源盆地一隻叫仙人洞嗰溶洞發現嗰史前文化。九十年代到許裡尋到嘍距今距今1萬年嗰穀植矽石標本,咁一吖子就搦人類栽禾嗰歷史提早嘍5000年,仙人洞遺跡也就成為世界頭上嗰「稻作之源」。
2 |
3 | 萬年仙人洞人種出世界首棵水稻 贛鄱是世界的稻作起源中心區
4 |
5 | 1隻分類: 江西嗰歷史
6 |
--------------------------------------------------------------------------------
/tests/Data/utf-8/greek.txt:
--------------------------------------------------------------------------------
1 | Η ελληνική αποτελεί τη μητρική γλώσσα περίπου 12 εκατομμυρίων ανθρώπων, κυρίως στην Ελλάδα και στην Κύπρο. Αποτελεί επίσης την μητρική γλώσσα αυτοχθόνων πληθυσμών στην Αλβανία, στη Βουλγαρία, στην ΠΓΔΜ και στην Τουρκία. Εξαιτίας της μετανάστευσης η γλώσσα μιλιέται ακόμα σε χώρες-προορισμούς ελληνόφωνων πληθυσμών μεταξύ των οποίων η Αυστραλία, ο Καναδάς, η Γερμανία, το Ηνωμένο Βασίλειο, η Ρωσία, η Σερβία και οι Ηνωμένες Πολιτείες. Συνολικά υπολογίζεται ότι ο συνολικός αριθμός ανθρώπων που μιλάνε τα ελληνικά σαν πρώτη ή δεύτερη γλώσσα είναι γύρω στα 20 εκατομμύρια.
2 |
--------------------------------------------------------------------------------
/tests/Data/utf-8/he1.txt:
--------------------------------------------------------------------------------
1 | השם עבר מופיע בתנ"ך כשמו של סבו של אברהם אבינו. המושג "עברי" נזכר בתנ"ך פעמים רבות, אולם שפתם של העברים אינה נקראת עברית. כיום מכנים את שפת התנ"ך "לשון המקרא" (או "לשון הקודש") כדי להבדיל אותה מלשון חז"ל המכונה גם "לשון חכמים", שהיא בעצם ניב מאוחר של עברית. המונח כתב עברי מציין בלשונם של חז"ל דווקא את הכתב הארמי על שם "עבר הנהר".
2 |
3 | הקובץ המפורסם ביותר שנכתב בשפה העברית הוא התנ"ך, אם כי בו עצמו לא נזכר שמה של השפה. עם זאת, במלכים ב' יח, כו, ובישעיהו לו, יא, מסופר כי שליחי חזקיהו המלך מבקשים מרבשקה, שליחו של סנחריב מלך אשור, לדבר עמם ב"ארמית" ולא ב"יהודית", כדי שהעם (שכנראה לא דיבר ארמית) לא יבין את דבריהם, ונראה שזה היה שמה של השפה, או לפחות שמו של הניב שדובר באזור ירושלים.
4 |
--------------------------------------------------------------------------------
/tests/Data/utf-8/he2.txt:
--------------------------------------------------------------------------------
1 | העברית היא שפה המשתייכת לקבוצת הלשונות השמיות הצפון מערביות, ומהווה את אחד הדיאלקטים של השפה הכנענית. שפה זו הייתה מדוברת החל מהאלף ה-2 לפני הספירה באזור הקרוי הלבנט, שהיום נמצא בשטחן של לבנון, סוריה, ארץ ישראל וירדן. טקסטים מהתקופה הזו שהתגלו בירדן ובלבנון חושפים קרבה רבה בין השפה העברית לשפה הפיניקית והמואבית.
2 |
3 | בעברית נכתבו רוב ספרי התנ"ך, כל המשנה, רוב הספרים החיצוניים ורוב המגילות הגנוזות. המקרא נכתב בעברית מקראית, ואילו המשנה נכתבה בניב הקרוי לשון חז"ל. בתקופה מסוימת בסוף המאה ה-2 לספירה או קצת מאוחר יותר (החוקרים חלוקים בשאלה זו) פסקו רוב היהודים מלהשתמש בעברית כבשפת דיבור. מאות שנים לאחר חתימת המשנה כאשר חדלו היהודים להשתמש בעברית כבר נכתבו התלמודים בארמית. עם זאת ישנן עדויות שאף במאה ה-8 לספירה שפת הדיבור בטבריה שם פעלו בעלי המסורה הייתה עברית.
4 |
5 | גם כשהשפה העברית לא שימשה שפת דיבור, עדיין שימשה לאורך הדורות, במה שמכונה תקופת הביניים של העברית, כשפת הכתב העיקרית של היהודים, בעיקר בעניינים הלכתיים: כתיבת פרוטוקולים של בתי דין, קובצי הלכות, פרשנות לכתבי קודש ועוד. גם כתיבת מכתבים וחוזים בין גברים יהודים נעשתה לעתים קרובות בעברית. ספרות הלכתית לנשים בקהילות אשכנזיות נכתבה ביידיש (למשל ספר ההלכות "צאינה וראינה"), כיוון שהנשים, בניגוד לגברים, לא למדו עברית. חיבורים יהודיים בעלי אופי חילוני או לא-הלכתי נכתבו בשפות יהודיות או בשפות זרות, לדוגמה: הרמב"ם כתב את ספרו "משנה תורה" בעברית, על אף שספרו הפילוסופי המפורסם "מורה נבוכים" שיועד למשכילי זמנו נכתב בערבית יהודית. עם זאת, "מורה נבוכים", כמו ספרים אחרים בנושאים חילוניים, תורגמו לעברית כשהיה בהם עניין לקהילות יהודיות דוברות שפות אחרות. אחת המשפחות היהודיות המפורסמות שעסקו בתרגום מערבית-יהודית לעברית בימי הביניים היא משפחת אבן תיבון.
6 |
--------------------------------------------------------------------------------
/tests/Data/utf-8/he3.txt:
--------------------------------------------------------------------------------
1 | אין כמעט ניבים אזוריים עבריים. למעשה, השפה הנשמעת בפי דוברים ילידיים זהה כמעט בכל חלקי ישראל. אפשר להבחין בשוני בין הניבים המדוברים בפי עדות יהודיות שונות (אתנולקטים), אולם שוני זה מתבטא בעיקר בפונולוגיה, ולא בתחביר או במורפולוגיה. שוני מסוים בתחביר ובמורפולוגיה קיים בין ניבים מעמדיים של השפה (סוציולקטים), אולם שוני זה אינו גדול (יחסית).
2 |
--------------------------------------------------------------------------------
/tests/Data/utf-8/russian.txt:
--------------------------------------------------------------------------------
1 | В гимназии он не был в числе первых учеников (исключение составляли математика и латынь). Укоренившаяся система механического заучивания материала учащимися (которая, как он считал, наносит вред самому духу учёбы и творческому мышлению), а также авторитарное отношение учителей к ученикам вызывало у Альберта Эйнштейна неприятие, поэтому он часто вступал в споры со своими преподавателями.
2 |
3 | После окончательного разорения отца семейства в 1894 году Эйнштейны переехали из Мюнхена в итальянский город Павию, близ Милана. Сам Альберт оставался в Мюнхене ещё некоторое время, чтобы окончить все шесть классов гимназии. Так и не получив аттестата зрелости, в 1895 году он присоединился к своей семье в Милане.
4 |
5 | Осенью 1895 г. Альберт Эйнштейн прибыл в Швейцарию, чтобы сдать вступительные экзамены в Высшее техническое училище (Политехникум) в Цюрихе и стать преподавателем физики. Блестяще проявив себя на экзамене по математике, он в то же время провалил экзамены по ботанике и французскому языку, что не позволило ему поступить в Цюрихский Политехникум. Однако директор училища посоветовал молодому человеку поступить в выпускной класс школы в Аарау (Швейцария), чтобы получить аттестат и повторить поступление.
6 |
--------------------------------------------------------------------------------
/tests/Data/windows-1250/lang_ce_windows-1250.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1250/lang_ce_windows-1250.txt
--------------------------------------------------------------------------------
/tests/Data/windows-1251/1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1251/1.txt
--------------------------------------------------------------------------------
/tests/Data/windows-1252 (latin1)/2.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1252 (latin1)/2.txt
--------------------------------------------------------------------------------
/tests/Data/windows-1253/lang_le_windows-1253.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1253/lang_le_windows-1253.txt
--------------------------------------------------------------------------------
/tests/Data/windows-1255/he1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1255/he1.txt
--------------------------------------------------------------------------------
/tests/Data/windows-1255/he2.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1255/he2.txt
--------------------------------------------------------------------------------
/tests/Data/windows-1255/he3.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1255/he3.txt
--------------------------------------------------------------------------------
/tests/Data/windows-1256/lang_ar_windows-1256.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1256/lang_ar_windows-1256.txt
--------------------------------------------------------------------------------
/tests/Data/windows-1257/lang_et_windows-1257.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1257/lang_et_windows-1257.txt
--------------------------------------------------------------------------------
/tests/Data/windows-1258/lang_vi_windows-1258.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/windows-1258/lang_vi_windows-1258.txt
--------------------------------------------------------------------------------
/tests/Data/x-mac-ce/lang_cs_mac-centraleurope.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/x-mac-ce/lang_cs_mac-centraleurope.txt
--------------------------------------------------------------------------------
/tests/Data/x-mac-cyrillic/1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/Data/x-mac-cyrillic/1.txt
--------------------------------------------------------------------------------
/tests/DataUnsupported/euc-tw/euc-tw1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/DataUnsupported/euc-tw/euc-tw1.txt
--------------------------------------------------------------------------------
/tests/DataUnsupported/iso-8859-10/lang_lv_iso-8859-10.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/DataUnsupported/iso-8859-10/lang_lv_iso-8859-10.txt
--------------------------------------------------------------------------------
/tests/DataUnsupported/iso-8859-16/lang_sl_iso-8859-16.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/DataUnsupported/iso-8859-16/lang_sl_iso-8859-16.txt
--------------------------------------------------------------------------------
/tests/DataUnsupported/viscii/lang_vi_viscii.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharsetDetector/UTF-unknown/8b3b42ab1ba220aa074ea92bc5f69b248cf783f8/tests/DataUnsupported/viscii/lang_vi_viscii.txt
--------------------------------------------------------------------------------
/tests/DetectionDetailTests.cs:
--------------------------------------------------------------------------------
using System.Collections.Generic;
using System.Linq;
using System.Reflection;
using NUnit.Framework;
using UtfUnknown.Core;

namespace UtfUnknown.Tests
{
    /// <summary>
    /// Tests for <c>DetectionDetail.GetEncoding</c>, driven by the codepage names
    /// declared as fields on <see cref="CodepageName"/>.
    /// </summary>
    [TestFixture]
    public class DetectionDetailTests
    {
        /// <summary>
        /// Checks that <c>DetectionDetail.GetEncoding</c> yields a non-null result
        /// for every name listed in <see cref="EncodingNames"/>.
        /// </summary>
        /// <param name="codepageName">A codepage name read from a <see cref="CodepageName"/> field.</param>
        [TestCaseSource(nameof(EncodingNames))]
        public void DetectionDetailGetEncodingIsNotNull(string codepageName)
        {
            var encoding = DetectionDetail.GetEncoding(codepageName);

            Assert.IsNotNull(encoding);
        }

        /// <summary>
        /// Codepage names excluded from <see cref="EncodingNames"/>.
        /// NOTE(review): presumably these have no matching encoding on the test runtimes - confirm.
        /// </summary>
        // Must be declared before EncodingNames: static field initializers run in
        // textual order, and EncodingNames reads this set eagerly while initializing.
        private static readonly HashSet<string> UnsupportedEncodings = new HashSet<string>
        {
            CodepageName.ISO_8859_10,
            CodepageName.ISO_8859_16,
            CodepageName.EUC_TW,
            CodepageName.VISCII,
            CodepageName.X_ISO_10646_UCS_4_2143,
            CodepageName.X_ISO_10646_UCS_4_3412,
        };

        /// <summary>
        /// Values of all non-public static fields of <see cref="CodepageName"/>,
        /// minus the entries in <see cref="UnsupportedEncodings"/>.
        /// </summary>
        // BindingFlags.CreateInstance was dropped: it is an InvokeMember flag and
        // does not influence which fields GetFields returns.
        private static readonly IReadOnlyList<string> EncodingNames = typeof(CodepageName)
            .GetFields(BindingFlags.NonPublic | BindingFlags.Static)
            .Select(x => x.GetValue(null).ToString())
            .Where(x => !UnsupportedEncodings.Contains(x))
            .ToList();

        /// <summary>
        /// Checks that an unrecognised encoding name produces <c>null</c>.
        /// </summary>
        [Test]
        public void GetEncodingShouldHandleIncorrectEncoding()
        {
            // Arrange
            string encoding = "wrong";

            // Act
            var result = DetectionDetail.GetEncoding(encoding);

            // Assert
            Assert.IsNull(result);
        }
    }
}
--------------------------------------------------------------------------------
/tests/EncodingJsonConverter.cs:
--------------------------------------------------------------------------------
using System;
using System.Text;
using Newtonsoft.Json;

namespace UtfUnknown.Tests
{
    /// <summary>
    /// Json.NET converter that stores an <see cref="Encoding"/> as its
    /// <see cref="Encoding.WebName"/> string and restores it through
    /// <see cref="Encoding.GetEncoding(string)"/>.
    /// </summary>
    public class EncodingJsonConverter : JsonConverter
    {
        /// <summary>
        /// Reports whether this converter handles <paramref name="objectType"/>.
        /// </summary>
        /// <param name="objectType">The type being serialized or deserialized.</param>
        /// <returns><c>true</c> for <see cref="Encoding"/> and any type derived from it.</returns>
        public override bool CanConvert(Type objectType)
        {
            return typeof(Encoding).IsAssignableFrom(objectType);
        }

        /// <summary>
        /// Writes the encoding's <see cref="Encoding.WebName"/> as a JSON string,
        /// or a JSON null when <paramref name="value"/> is <c>null</c>.
        /// </summary>
        /// <param name="writer">The writer receiving the value.</param>
        /// <param name="value">The <see cref="Encoding"/> to write; may be <c>null</c>.</param>
        /// <param name="serializer">Unused.</param>
        public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer)
        {
            if (value == null)
            {
                writer.WriteNull();
                return;
            }

            writer.WriteValue(((Encoding)value).WebName);
        }

        /// <summary>
        /// Reads an encoding name from the current token and resolves it.
        /// </summary>
        /// <param name="reader">The reader positioned on the value token.</param>
        /// <param name="objectType">Unused.</param>
        /// <param name="existingValue">Unused.</param>
        /// <param name="serializer">Unused.</param>
        /// <returns>
        /// The resolved <see cref="Encoding"/>, or <c>null</c> when the token is a JSON null.
        /// </returns>
        public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer)
        {
            // A JSON null must round-trip to a null encoding; passing it on to
            // Encoding.GetEncoding would throw ArgumentNullException.
            if (reader.TokenType == JsonToken.Null || reader.Value == null)
            {
                return null;
            }

            return Encoding.GetEncoding((string)reader.Value);
        }
    }
}
--------------------------------------------------------------------------------
/tests/UTF-unknown.Tests.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
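4 | <TargetFrameworks>net452;netcoreapp2.1;netcoreapp3.0</TargetFrameworks>
5 | <AssemblyName>UtfUnknown.Tests</AssemblyName>
6 | <RootNamespace>UtfUnknown.Tests</RootNamespace>
7 | <SignAssembly>true</SignAssembly>
8 | <AssemblyOriginatorKeyFile>..\src\UtfUnknown.snk</AssemblyOriginatorKeyFile>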
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------