├── form.png
├── App
└── EncodingChecker.exe
├── sources
├── EncodingChecker
│ ├── AppIcon.ico
│ ├── Properties
│ │ ├── Settings.settings
│ │ ├── Settings.Designer.cs
│ │ ├── AssemblyInfo.cs
│ │ └── Resources.Designer.cs
│ ├── UtfUnknown
│ │ ├── Core
│ │ │ ├── InputState.cs
│ │ │ ├── Probers
│ │ │ │ ├── ProbingState.cs
│ │ │ │ ├── CodingStateMachine.cs
│ │ │ │ └── MultiByte
│ │ │ │ │ ├── UTF8Prober.cs
│ │ │ │ │ └── Chinese
│ │ │ │ │ ├── EUCTWProber.cs
│ │ │ │ │ └── Big5Prober.cs
│ │ │ ├── Analyzers
│ │ │ │ └── MultiByte
│ │ │ │ │ └── Japanese
│ │ │ │ │ ├── EUCJPDistributionAnalyser.cs
│ │ │ │ │ ├── SJISContextAnalyser.cs
│ │ │ │ │ └── EUCJPContextAnalyser.cs
│ │ │ ├── Models
│ │ │ │ ├── MultiByte
│ │ │ │ │ ├── Korean
│ │ │ │ │ │ ├── EUCKRSMModel.cs
│ │ │ │ │ │ └── Iso_2022_KR_SMModel.cs
│ │ │ │ │ ├── Chinese
│ │ │ │ │ │ ├── BIG5SMModel.cs
│ │ │ │ │ │ ├── EUCTWSMModel.cs
│ │ │ │ │ │ ├── Iso_2022_CN_SMModel.cs
│ │ │ │ │ │ └── GB18030_SMModel.cs
│ │ │ │ │ ├── Japanese
│ │ │ │ │ │ ├── SJIS_SMModel.cs
│ │ │ │ │ │ ├── EUCJPSMModel.cs
│ │ │ │ │ │ └── Iso_2022_JP_SMModel.cs
│ │ │ │ │ ├── UCS2BE_SMModel.cs
│ │ │ │ │ └── UCS2LE_SMModel.cs
│ │ │ │ ├── StateMachineModel.cs
│ │ │ │ ├── SingleByte
│ │ │ │ │ ├── Russian
│ │ │ │ │ │ ├── Ibm866_RussianModel.cs
│ │ │ │ │ │ ├── Ibm855_RussianModel.cs
│ │ │ │ │ │ ├── Koi8r_Model.cs
│ │ │ │ │ │ ├── Iso_8859_5_RussianModel.cs
│ │ │ │ │ │ ├── Windows_1251_RussianModel.cs
│ │ │ │ │ │ └── X_Mac_Cyrillic_RussianModel.cs
│ │ │ │ │ ├── Bulgarian
│ │ │ │ │ │ ├── Iso_8859_5_BulgarianModel.cs
│ │ │ │ │ │ └── Windows_1251_BulgarianModel.cs
│ │ │ │ │ ├── Hebrew
│ │ │ │ │ │ └── Windows_1255_HebrewModel.cs
│ │ │ │ │ ├── Czech
│ │ │ │ │ │ ├── Ibm852_CzechModel.cs
│ │ │ │ │ │ ├── Iso_8859_2_CzechModel.cs
│ │ │ │ │ │ └── Windows_1250_CzechModel.cs
│ │ │ │ │ ├── Polish
│ │ │ │ │ │ └── Ibm852_PolishModel.cs
│ │ │ │ │ ├── Slovak
│ │ │ │ │ │ └── Ibm852_SlovakModel.cs
│ │ │ │ │ ├── Irish
│ │ │ │ │ │ ├── Iso_8859_1_IrishModel.cs
│ │ │ │ │ │ ├── Iso_8859_9_IrishModel.cs
│ │ │ │ │ │ └── Iso_8859_15_IrishModel.cs
│ │ │ │ │ ├── Slovene
│ │ │ │ │ │ └── Ibm852_SloveneModel.cs
│ │ │ │ │ ├── Croatian
│ │ │ │ │ │ └── Ibm852_CroatianModel.cs
│ │ │ │ │ ├── Romanian
│ │ │ │ │ │ └── Ibm852_RomanianModel.cs
│ │ │ │ │ ├── Arabic
│ │ │ │ │ │ └── Iso_8859_6_ArabicModel.cs
│ │ │ │ │ ├── Danish
│ │ │ │ │ │ └── Iso_8859_1_DanishModel.cs
│ │ │ │ │ ├── French
│ │ │ │ │ │ └── Iso_8859_1_FrenchModel.cs
│ │ │ │ │ └── German
│ │ │ │ │ │ └── Iso_8859_1_GermanModel.cs
│ │ │ │ └── SequenceModel.cs
│ │ │ └── BitPackage.cs
│ │ ├── DetectionResult.cs
│ │ └── DetectionDetail.cs
│ ├── Program.cs
│ ├── AboutForm.cs
│ ├── ListViewColumnSorter.cs
│ ├── Settings.cs
│ ├── TextEncoding.cs
│ └── ListViewExtensions.cs
└── EncodingChecker.sln
├── appveyor.yml
├── README.md
└── .gitattributes
/form.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amrali-eg/EncodingChecker/HEAD/form.png
--------------------------------------------------------------------------------
/App/EncodingChecker.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amrali-eg/EncodingChecker/HEAD/App/EncodingChecker.exe
--------------------------------------------------------------------------------
/sources/EncodingChecker/AppIcon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amrali-eg/EncodingChecker/HEAD/sources/EncodingChecker/AppIcon.ico
--------------------------------------------------------------------------------
/sources/EncodingChecker/Properties/Settings.settings:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
1 | version: 2.0.{build}
2 | image: Visual Studio 2019
3 | configuration: Release
4 | platform: Any CPU
5 | build:
6 | project: sources/EncodingChecker.sln
7 | verbosity: minimal
8 |
9 | after_build:
10 | - 7z a EncodingChecker.zip %APPVEYOR_BUILD_FOLDER%/sources/EncodingChecker/bin/Release/*
11 |
12 | artifacts:
13 | - path: EncodingChecker.zip
14 | name: EncodingChecker
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/InputState.cs:
--------------------------------------------------------------------------------
1 | namespace UtfUnknown.Core
2 | {
3 | enum InputState
4 | {
5 | PureASCII=0, // presumably: only plain ASCII bytes seen so far — confirm against the detector
6 |
7 | /// <summary>
8 | /// Found escape character or HZ "~{"
9 | /// </summary>
10 | EscASCII = 1,
11 |
12 | /// <summary>
13 | /// non-ascii byte (high-byte)
14 | /// </summary>
15 | Highbyte = 2
16 | };
17 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Probers/ProbingState.cs:
--------------------------------------------------------------------------------
1 | namespace UtfUnknown.Core.Probers
2 | {
3 | public enum ProbingState
4 | {
5 | /// <summary>
6 | /// No sure answer yet, but caller can ask for confidence
7 | /// </summary>
8 | Detecting = 0,
9 | /// <summary>
10 | /// Positive answer
11 | /// </summary>
12 | FoundIt = 1,
13 | /// <summary>
14 | /// Negative answer
15 | /// </summary>
16 | NotMe = 2
17 | };
18 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs:
--------------------------------------------------------------------------------
1 | namespace UtfUnknown.Core.Analyzers.Japanese
2 | {
3 |     /// <summary>
4 |     /// Distribution analyser for EUC-JP. It only overrides how a byte pair is
5 |     /// mapped to an order; everything else comes from <see cref="SJISDistributionAnalyser"/>.
6 |     /// </summary>
7 |     public class EUCJPDistributionAnalyser : SJISDistributionAnalyser
8 |     {
9 |         /// <summary>
10 |         /// Maps the two-byte character starting at <paramref name="offset"/> to its order.
11 |         /// first byte range: 0xa0 -- 0xfe
12 |         /// second byte range: 0xa1 -- 0xfe
13 |         /// no validation needed here. State machine has done that
14 |         /// </summary>
15 |         /// <param name="buf">Buffer holding the character.</param>
16 |         /// <param name="offset">Index of the character's first byte in <paramref name="buf"/>.</param>
17 |         /// <returns>
18 |         /// <c>94 * (first - 0xA1) + (second - 0xA1)</c>, or -1 when the first byte is below 0xA0.
19 |         /// </returns>
20 |         public override int GetOrder(byte[] buf, int offset)
21 |         {
22 |             byte lead = buf[offset];
23 |             if (lead < 0xA0)
24 |             {
25 |                 return -1;
26 |             }
27 |
28 |             // Both bytes are rebased on 0xA1; each row holds 94 cells.
29 |             int row = lead - 0xA1;
30 |             int cell = buf[offset + 1] - 0xA1;
31 |             return 94 * row + cell;
32 |         }
33 |     }
34 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Threading;
3 | using System.Windows.Forms;
4 |
5 | namespace EncodingChecker
6 | {
7 |     /// <summary>
8 |     /// Hosts the application's entry point.
9 |     /// </summary>
10 |     internal static class Program
11 |     {
12 |         /// <summary>
13 |         /// Registers the UI-thread exception handler, configures WinForms rendering
14 |         /// and runs the message loop on a new <see cref="MainForm"/>.
15 |         /// </summary>
16 |         [STAThread]
17 |         private static void Main()
18 |         {
19 |             Application.ThreadException += OnApplicationThreadException;
20 |             Application.EnableVisualStyles();
21 |             Application.SetCompatibleTextRenderingDefault(false);
22 |
23 |             MainForm mainForm = new MainForm();
24 |             Application.Run(mainForm);
25 |         }
26 |
27 |         /// <summary>
28 |         /// Shows the message of an exception raised through
29 |         /// <see cref="Application.ThreadException"/> in an error dialog.
30 |         /// </summary>
31 |         private static void OnApplicationThreadException(object sender, ThreadExceptionEventArgs e)
32 |             => MessageBox.Show(e.Exception.Message, @"Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
33 |     }
34 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/AboutForm.cs:
--------------------------------------------------------------------------------
1 | using System.Diagnostics;
2 | using System.Windows.Forms;
3 |
4 | namespace EncodingChecker
5 | {
6 |     /// <summary>
7 |     /// "About" dialog; its link labels open project-related web pages.
8 |     /// </summary>
9 |     public partial class AboutForm : Form
10 |     {
11 |         public AboutForm()
12 |         {
13 |             InitializeComponent();
14 |         }
15 |
16 |         /// <summary>
17 |         /// Attaches the target URL to the first link of every link label.
18 |         /// </summary>
19 |         private void OnFormLoad(object sender, System.EventArgs e)
20 |         {
21 |             lblHomepage.Links[0].LinkData = "https://github.com/amrali-eg/EncodingChecker";
22 |             lblAuthor.Links[0].LinkData = "https://github.com/JeevanJames";
23 |             lblLicense.Links[0].LinkData = "http://www.mozilla.org/MPL/MPL-1.1.html";
24 |             lblCreditsUde.Links[0].LinkData = "https://github.com/CharsetDetector/UTF-unknown";
25 |             lblCreditsCodePlex.Links[0].LinkData = "http://encodingchecker.codeplex.com";
26 |         }
27 |
28 |         /// <summary>
29 |         /// Opens the URL stored in the clicked link's <c>LinkData</c> through the shell.
30 |         /// Links that carry no string URL are ignored.
31 |         /// </summary>
32 |         private void OnLinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
33 |         {
34 |             // LinkData is only assigned in OnFormLoad; a link without it has nothing to open.
35 |             string url = e.Link.LinkData as string;
36 |             if (string.IsNullOrEmpty(url))
37 |             {
38 |                 return;
39 |             }
40 |
41 |             ProcessStartInfo startInfo = new ProcessStartInfo(url) { UseShellExecute = true };
42 |
43 |             // Only the launch matters; release the process handle right away.
44 |             // Process.Start may return null, which 'using' tolerates.
45 |             using (Process.Start(startInfo))
46 |             {
47 |             }
48 |         }
49 |     }
50 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.30309.148
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "EncodingChecker", "EncodingChecker\EncodingChecker.csproj", "{134E6B14-A7BE-4CED-8332-3A2CA6023EE1}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Any CPU = Debug|Any CPU
11 | Release|Any CPU = Release|Any CPU
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {134E6B14-A7BE-4CED-8332-3A2CA6023EE1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | {134E6B14-A7BE-4CED-8332-3A2CA6023EE1}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | {134E6B14-A7BE-4CED-8332-3A2CA6023EE1}.Release|Any CPU.ActiveCfg = Release|Any CPU
17 | {134E6B14-A7BE-4CED-8332-3A2CA6023EE1}.Release|Any CPU.Build.0 = Release|Any CPU
18 | EndGlobalSection
19 | GlobalSection(SolutionProperties) = preSolution
20 | HideSolutionNode = FALSE
21 | EndGlobalSection
22 | GlobalSection(ExtensibilityGlobals) = postSolution
23 | SolutionGuid = {0E70C6C9-020B-4479-A5CB-2A85A2137B6C}
24 | EndGlobalSection
25 | EndGlobal
26 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/Properties/Settings.Designer.cs:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------------------------
2 | // <auto-generated>
3 | // This code was generated by a tool.
4 | // Runtime Version:4.0.30319.42000
5 | //
6 | // Changes to this file may cause incorrect behavior and will be lost if
7 | // the code is regenerated.
8 | // </auto-generated>
9 | //------------------------------------------------------------------------------
10 |
11 | namespace EncodingChecker.Properties
12 | {
13 |
14 |
15 | [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
16 | [global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "11.0.0.0")]
17 | internal sealed partial class Settings : global::System.Configuration.ApplicationSettingsBase
18 | {
19 |
20 | private static Settings defaultInstance = ((Settings)(global::System.Configuration.ApplicationSettingsBase.Synchronized(new Settings())));
21 |
22 | public static Settings Default
23 | {
24 | get
25 | {
26 | return defaultInstance;
27 | }
28 | }
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Analyzers/MultiByte/Japanese/SJISContextAnalyser.cs:
--------------------------------------------------------------------------------
1 | namespace UtfUnknown.Core.Analyzers.Japanese
2 | {
3 |     /// <summary>
4 |     /// Shift_JIS implementation of the order lookups required by
5 |     /// <see cref="JapaneseContextAnalyser"/>; only hiragana receive an order.
6 |     /// </summary>
7 |     public class SJISContextAnalyser : JapaneseContextAnalyser
8 |     {
9 |         private const byte HIRAGANA_FIRST_BYTE = 0x82;
10 |
11 |         /// <summary>
12 |         /// Reports the byte length of the character at <paramref name="offset"/>
13 |         /// and returns its hiragana order.
14 |         /// </summary>
15 |         /// <param name="buf">Buffer holding the character.</param>
16 |         /// <param name="offset">Index of the character's first byte.</param>
17 |         /// <param name="charLen">2 when the first byte is in 0x81-0x9F or 0xE0-0xFC, otherwise 1.</param>
18 |         /// <returns>The hiragana order (second byte - 0x9F), or -1 for anything else.</returns>
19 |         protected override int GetOrder(byte[] buf, int offset, out int charLen)
20 |         {
21 |             byte lead = buf[offset];
22 |             bool isDoubleByteLead = (lead >= 0x81 && lead <= 0x9F)
23 |                                     || (lead >= 0xE0 && lead <= 0xFC);
24 |             charLen = isDoubleByteLead ? 2 : 1;
25 |
26 |             return HiraganaOrder(buf, offset);
27 |         }
28 |
29 |         /// <summary>
30 |         /// Returns the hiragana order of the character at <paramref name="offset"/>.
31 |         /// </summary>
32 |         /// <returns>The hiragana order (second byte - 0x9F), or -1 for anything else.</returns>
33 |         protected override int GetOrder(byte[] buf, int offset)
34 |         {
35 |             return HiraganaOrder(buf, offset);
36 |         }
37 |
38 |         /// <summary>
39 |         /// Shared lookup: first byte must be 0x82 and second byte within 0x9F-0xF1.
40 |         /// </summary>
41 |         private static int HiraganaOrder(byte[] buf, int offset)
42 |         {
43 |             if (buf[offset] != HIRAGANA_FIRST_BYTE)
44 |             {
45 |                 return -1;
46 |             }
47 |
48 |             byte trail = buf[offset + 1];
49 |             return trail >= 0x9F && trail <= 0xF1 ? trail - 0x9F : -1;
50 |         }
51 |     }
52 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/ListViewColumnSorter.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections;
3 | using System.Windows.Forms;
4 |
5 | namespace EncodingChecker
6 | {
7 |     /// <summary>
8 |     /// Compares <see cref="ListViewItem"/> rows by the text of one column,
9 |     /// ignoring case, in the direction given by <see cref="Order"/>.
10 |     /// </summary>
11 |     public class ListViewColumnSorter : IComparer
12 |     {
13 |         private readonly CaseInsensitiveComparer _objectCompare;
14 |
15 |         /// <summary>Index of the sub-item (column) whose text is compared.</summary>
16 |         public int SortColumn { get; set; }
17 |
18 |         /// <summary>Sort direction; <see cref="SortOrder.None"/> treats all rows as equal.</summary>
19 |         public SortOrder Order { get; set; }
20 |
21 |         /// <summary>Creates a sorter on column 0 with no sort order.</summary>
22 |         public ListViewColumnSorter()
23 |         {
24 |             SortColumn = 0;
25 |             Order = SortOrder.None;
26 |             _objectCompare = new CaseInsensitiveComparer();
27 |         }
28 |
29 |         /// <summary>
30 |         /// Compares two list view items by the text in <see cref="SortColumn"/>.
31 |         /// </summary>
32 |         /// <param name="x">First <see cref="ListViewItem"/>.</param>
33 |         /// <param name="y">Second <see cref="ListViewItem"/>.</param>
34 |         /// <returns>
35 |         /// The case-insensitive comparison result for ascending order, its negation
36 |         /// for descending order, and 0 when no order is set.
37 |         /// </returns>
38 |         /// <exception cref="ArgumentNullException"><paramref name="x"/> or <paramref name="y"/> is null.</exception>
39 |         /// <exception cref="InvalidCastException">An argument is not a <see cref="ListViewItem"/>.</exception>
40 |         public int Compare(object x, object y)
41 |         {
42 |             ListViewItem first = (ListViewItem)x;
43 |             if (first == null)
44 |             {
45 |                 throw new ArgumentNullException(nameof(x));
46 |             }
47 |
48 |             ListViewItem second = (ListViewItem)y;
49 |             if (second == null)
50 |             {
51 |                 throw new ArgumentNullException(nameof(y));
52 |             }
53 |
54 |             // Nothing to compare when sorting is switched off.
55 |             if (Order != SortOrder.Ascending && Order != SortOrder.Descending)
56 |             {
57 |                 return 0;
58 |             }
59 |
60 |             int compareResult = _objectCompare.Compare(GetSortText(first), GetSortText(second));
61 |             return Order == SortOrder.Ascending ? compareResult : -compareResult;
62 |         }
63 |
64 |         /// <summary>
65 |         /// Text of the sort column for <paramref name="item"/>; rows that have no
66 |         /// sub-item at that index compare as an empty string instead of throwing.
67 |         /// </summary>
68 |         private string GetSortText(ListViewItem item)
69 |         {
70 |             int column = SortColumn;
71 |             return column >= 0 && column < item.SubItems.Count
72 |                 ? item.SubItems[column].Text
73 |                 : string.Empty;
74 |         }
75 |     }
76 | }
43 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs:
--------------------------------------------------------------------------------
1 | namespace UtfUnknown.Core.Analyzers.Japanese
2 | {
3 |     /// <summary>
4 |     /// EUC-JP implementation of the order lookups required by
5 |     /// <see cref="JapaneseContextAnalyser"/>; only hiragana receive an order.
6 |     /// </summary>
7 |     public class EUCJPContextAnalyser : JapaneseContextAnalyser
8 |     {
9 |         private const byte HIRAGANA_FIRST_BYTE = 0xA4;
10 |
11 |         /// <summary>
12 |         /// Reports the byte length of the character at <paramref name="offset"/>
13 |         /// and returns its hiragana order.
14 |         /// </summary>
15 |         /// <param name="buf">Buffer holding the character.</param>
16 |         /// <param name="offset">Index of the character's first byte.</param>
17 |         /// <param name="charLen">
18 |         /// 3 for the 0x8F (SS3) prefix, 2 for 0x8E (SS2) or a first byte in 0xA1-0xFE, otherwise 1.
19 |         /// </param>
20 |         /// <returns>The hiragana order (second byte - 0xA1), or -1 for anything else.</returns>
21 |         protected override int GetOrder(byte[] buf, int offset, out int charLen)
22 |         {
23 |             byte high = buf[offset];
24 |
25 |             //find out current char's byte length
26 |             if (high == 0x8F)
27 |             {
28 |                 // Three-byte sequences start with SS3 (0x8F). The previous test used 0xBF,
29 |                 // which lies inside 0xA1-0xFE and therefore could never be reached.
30 |                 charLen = 3;
31 |             }
32 |             else if (high == 0x8E || high >= 0xA1 && high <= 0xFE)
33 |             {
34 |                 charLen = 2;
35 |             }
36 |             else
37 |             {
38 |                 charLen = 1;
39 |             }
40 |
41 |             // return its order if it is hiragana
42 |             if (high == HIRAGANA_FIRST_BYTE)
43 |             {
44 |                 byte low = buf[offset + 1];
45 |                 if (low >= 0xA1 && low <= 0xF3)
46 |                 {
47 |                     return low - 0xA1;
48 |                 }
49 |             }
50 |             return -1;
51 |         }
52 |
53 |         /// <summary>
54 |         /// Returns the hiragana order of the character at <paramref name="offset"/>.
55 |         /// </summary>
56 |         /// <returns>The hiragana order (second byte - 0xA1), or -1 for anything else.</returns>
57 |         protected override int GetOrder(byte[] buf, int offset)
58 |         {
59 |             // We are only interested in Hiragana
60 |             if (buf[offset] == HIRAGANA_FIRST_BYTE)
61 |             {
62 |                 byte low = buf[offset + 1];
63 |                 if (low >= 0xA1 && low <= 0xF3)
64 |                 {
65 |                     return low - 0xA1;
66 |                 }
67 |             }
68 |             return -1;
69 |         }
70 |     }
71 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/DetectionResult.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 |
5 | namespace UtfUnknown
6 | {
7 |     /// <summary>
8 |     /// Result of a detection.
9 |     /// </summary>
10 |     public class DetectionResult
11 |     {
12 |         /// <summary>
13 |         /// Empty
14 |         /// </summary>
15 |         public DetectionResult()
16 |         {
17 |         }
18 |
19 |         /// <summary>
20 |         /// Multiple results
21 |         /// </summary>
22 |         /// <param name="details">All detections; the first one is reported as <see cref="Detected"/>.</param>
23 |         public DetectionResult(IList<DetectionDetail> details)
24 |         {
25 |             Details = details;
26 |         }
27 |
28 |         /// <summary>
29 |         /// Single result
30 |         /// </summary>
31 |         /// <param name="detectionDetail">The only detection of this result.</param>
32 |         public DetectionResult(DetectionDetail detectionDetail)
33 |         {
34 |             Details = new List<DetectionDetail> { detectionDetail };
35 |         }
36 |
37 |         /// <summary>
38 |         /// Get the best Detection
39 |         /// </summary>
40 |         public DetectionDetail Detected => Details?.FirstOrDefault();
41 |
42 |         /// <summary>
43 |         /// All results
44 |         /// </summary>
45 |         public IList<DetectionDetail> Details { get; set; }
46 |
47 |         /// <summary>
48 |         /// Formats the best detection followed by every detail on its own line.
49 |         /// An empty result (no <see cref="Details"/>) produces an empty detail list.
50 |         /// </summary>
51 |         public override string ToString()
52 |         {
53 |             // string.Join throws ArgumentNullException for a null sequence, so an
54 |             // empty result must be mapped to an empty sequence first.
55 |             IEnumerable<string> lines = Details?.Select(d => d?.ToString()) ?? Enumerable.Empty<string>();
56 |             string joined = string.Join("\n- ", lines);
57 |
58 |             return $"{nameof(Detected)}: {Detected}, \n{nameof(Details)}:\n - {joined}";
59 |         }
60 |     }
61 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | // General Information about an assembly is controlled through the following
6 | // set of attributes. Change these attribute values to modify the information
7 | // associated with an assembly.
8 | [assembly: AssemblyTitle("File Encoding Checker")]
9 | [assembly: AssemblyDescription("GUI tool to check the encoding of a text file")]
10 | [assembly: AssemblyConfiguration("")]
11 | [assembly: AssemblyCompany("Jeevan James")]
12 | [assembly: AssemblyProduct("File Encoding Checker")]
13 | [assembly: AssemblyCopyright("Copyright © Jeevan James 2020")]
14 | [assembly: AssemblyTrademark("")]
15 | [assembly: AssemblyCulture("")]
16 |
17 | // Setting ComVisible to false makes the types in this assembly not visible
18 | // to COM components. If you need to access a type in this assembly from
19 | // COM, set the ComVisible attribute to true on that type.
20 | [assembly: ComVisible(false)]
21 |
22 | // The following GUID is for the ID of the typelib if this project is exposed to COM
23 | [assembly: Guid("134e6b14-a7be-4ced-8332-3a2ca6023ee1")]
24 |
25 | // Version information for an assembly consists of the following four values:
26 | //
27 | // Major Version
28 | // Minor Version
29 | // Build Number
30 | // Revision
31 | //
32 | // You can specify all the values or you can default the Build and Revision Numbers
33 | // by using the '*' as shown below:
34 | // [assembly: AssemblyVersion("1.0.*")]
35 | [assembly: AssemblyVersion("2.0.1.0")]
36 | [assembly: AssemblyFileVersion("2.0.1.0")]
37 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/Settings.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.ObjectModel;
3 | using System.Windows.Forms;
4 |
5 | namespace EncodingChecker
6 | {
7 |     /// <summary>
8 |     /// Serializable container for the user's settings.
9 |     /// </summary>
10 |     [Serializable]
11 |     public sealed class Settings
12 |     {
13 |         public WindowPosition WindowPosition = new WindowPosition();
14 |
15 |         public RecentDirectories RecentDirectories = new RecentDirectories();
16 |         public bool IncludeSubdirectories = true;
17 |
18 |         public string FileMasks;
19 |         public string[] ValidCharsets;
20 |     }
21 |
22 |     /// <summary>
23 |     /// Stored bounds of a window; -1 marks a value that has not been recorded.
24 |     /// </summary>
25 |     [Serializable]
26 |     public sealed class WindowPosition
27 |     {
28 |         public int Left = -1;
29 |         public int Top = -1;
30 |         public int Width = -1;
31 |         public int Height = -1;
32 |         public bool Maximized;
33 |
34 |         /// <summary>
35 |         /// Applies the stored bounds to <paramref name="form"/> when the position is
36 |         /// non-negative and the size is positive; otherwise leaves the form untouched.
37 |         /// <see cref="Maximized"/> is not applied here.
38 |         /// </summary>
39 |         public void ApplyTo(Form form)
40 |         {
41 |             if (Left >= 0 && Top >= 0 && Width > 0 && Height > 0)
42 |             {
43 |                 form.SetBounds(Left, Top, Width, Height);
44 |             }
45 |         }
46 |     }
47 |
48 |     /// <summary>
49 |     /// Most-recently-used list of directories: newest first, compared without
50 |     /// regard to case, and limited to <see cref="MaxEntries"/> items.
51 |     /// </summary>
52 |     [Serializable]
53 |     public sealed class RecentDirectories : Collection<string>
54 |     {
55 |         private const int MaxEntries = 10;
56 |
57 |         /// <summary>
58 |         /// Moves or adds <paramref name="item"/> to the front of the list. The requested
59 |         /// <paramref name="index"/> is intentionally ignored so that every add or insert
60 |         /// keeps the most recent entry first.
61 |         /// </summary>
62 |         protected override void InsertItem(int index, string item)
63 |         {
64 |             // Drop earlier occurrences. The static Equals is null-safe, so a stored
65 |             // null entry no longer raises NullReferenceException.
66 |             for (int i = Count - 1; i >= 0; i--)
67 |             {
68 |                 if (string.Equals(this[i], item, StringComparison.OrdinalIgnoreCase))
69 |                 {
70 |                     RemoveAt(i);
71 |                 }
72 |             }
73 |
74 |             base.InsertItem(0, item);
75 |
76 |             // Trim the oldest entries beyond the limit.
77 |             while (Count > MaxEntries)
78 |             {
79 |                 RemoveAt(Count - 1);
80 |             }
81 |         }
82 |     }
83 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [AppVeyor build status](https://ci.appveyor.com/project/amrali-eg/encodingchecker/branch/master)
2 |
3 | # EncodingChecker v2.0
4 | File Encoding Checker is a GUI tool that allows you to validate the text encoding of one or more files. The tool can display the encoding for all selected files, or only the files that do not have the encodings you specify.
5 |
6 | File Encoding Checker requires Microsoft .NET Framework 4 to run.
7 |
8 | 
9 |
10 | ## Fixed issues
11 | Sorting the results by clicking a column header is working now.
12 |
13 | Display the sort arrow in the column header for the results list view.
14 |
15 | When viewing a directory, some files matching the file masks were not listed.
16 |
17 | Improved performance of the list view control for faster processing of results.
18 |
19 | Added feature to export selected results to a text file.
20 |
21 | Switched to UtfUnknown library for better encoding detection (Multiple bugs from Ude fixed).
22 |
23 | Validating the detected file encoding to avoid errors during conversion of files.
24 |
25 | UTF-16 text files without byte-order-mark (BOM) can be detected by heuristics.
26 |
27 | ## Credits
28 | The original project [EncodingChecker](https://archive.codeplex.com/?p=encodingchecker) on CodePlex was written by [Jeevan James](https://github.com/JeevanJames).
29 |
30 | For encoding detection, File Encoding Checker now uses the [UtfUnknown](https://github.com/CharsetDetector/UTF-unknown) library, which is a C# port of [uchardet](https://gitlab.freedesktop.org/uchardet/uchardet) library - A C++ port of the original [Mozilla Universal Charset Detector](https://dxr.mozilla.org/mozilla/source/extensions/universalchardet/).
31 |
32 | ## Supported Charsets
33 | File Encoding Checker currently supports over forty charsets.
34 |
35 | * ASCII
36 | * UTF-7 (with a BOM)
37 | * UTF-8 (with or without a BOM)
38 | * UTF-16 BE or LE (with or without a BOM)
39 | * UTF-32 BE or LE (with a BOM)
40 | * Arabic: iso-8859-6, windows-1256.
41 | * Baltic: iso-8859-4, windows-1257.
42 | * Central European: ibm852, iso-8859-2, windows-1250, x-mac-ce.
43 | * Chinese (Traditional and Simplified): big5, GB18030, hz-gb-2312, x-cp50227.
44 | * Cyrillic (primarily Russian): IBM855, cp866, iso-8859-5, koi8-r, windows-1251, x-mac-cyrillic.
45 | * Estonian: iso-8859-13.
46 | * Greek: iso-8859-7, windows-1253.
47 | * Hebrew: iso-8859-8, windows-1255.
48 | * Japanese: euc-jp, iso-2022-jp, shift_jis.
49 | * Korean: euc-kr, iso-2022-kr, ks_c_5601-1987 (cp949).
50 | * Thai: windows-874 (aliases TIS-620 and iso-8859-11 in .NET)
51 | * Turkish: iso-8859-3, iso-8859-9.
52 | * Western European: iso-8859-1, iso-8859-15, windows-1252.
53 | * Vietnamese: windows-1258.
54 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Common settings that generally should always be used with your language specific settings
2 |
3 | # https://www.davidlaing.com/2012/09/19/customise-your-gitattributes-to-become-a-git-ninja/
4 | # https://docs.github.com/en/github/using-git/configuring-git-to-handle-line-endings
5 | # https://git-scm.com/docs/gitattributes
6 |
7 | # Auto detect text files and perform LF normalization
8 | * text=auto
9 |
10 | #
11 | # The above will handle all files NOT found below
12 | #
13 |
14 | # Custom for Visual Studio
15 | *.cs text eol=crlf diff=csharp
16 | *.resx text eol=crlf
17 | *.csproj text eol=crlf merge=union
18 | *.vbproj text merge=union
19 | *.fsproj text merge=union
20 | *.dbproj text merge=union
21 | *.sln text eol=crlf merge=union
22 |
23 | # Source code
24 | *.bash text eol=lf
25 | *.bat text eol=crlf
26 | *.cmd text eol=crlf
27 | *.htm text diff=html
28 | *.html text diff=html
29 | *.php text diff=php
30 | *.ps1 text eol=crlf
31 | *.py text diff=python
32 | *.rb text diff=ruby
33 | *.scss text diff=css
34 | *.sh text eol=lf
35 | *.xhtml text diff=html
36 |
37 | # Documents
38 | *.doc diff=astextplain
39 | *.DOC diff=astextplain
40 | *.docx diff=astextplain
41 | *.DOCX diff=astextplain
42 | *.dot diff=astextplain
43 | *.DOT diff=astextplain
44 | *.pdf diff=astextplain
45 | *.PDF diff=astextplain
46 | *.rtf diff=astextplain
47 | *.RTF diff=astextplain
48 |
49 | #
50 | ## These files are binary and should be left untouched
51 | #
52 |
53 | # Graphics
54 | *.ai binary
55 | *.bmp binary
56 | *.eps binary
57 | *.gif binary
58 | *.gifv binary
59 | *.ico binary
60 | *.jng binary
61 | *.jp2 binary
62 | *.jpg binary
63 | *.jpeg binary
64 | *.jpx binary
65 | *.jxr binary
66 | *.pdf binary
67 | *.png binary
68 | *.psb binary
69 | *.psd binary
70 | # SVG treated as text by default.
71 | *.svg text
72 | # If you want to treat it as binary,
73 | # use the following line instead.
74 | # *.svg binary
75 | *.svgz binary
76 | *.tif binary
77 | *.tiff binary
78 | *.wbmp binary
79 | *.webp binary
80 |
81 | # Archives
82 | *.7z binary
83 | *.gz binary
84 | *.jar binary
85 | *.rar binary
86 | *.tar binary
87 | *.zip binary
88 |
89 | # Executables
90 | *.exe binary
91 | *.pyc binary
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/Properties/Resources.Designer.cs:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------------------------
2 | // <auto-generated>
3 | // This code was generated by a tool.
4 | // Runtime Version:4.0.30319.42000
5 | //
6 | // Changes to this file may cause incorrect behavior and will be lost if
7 | // the code is regenerated.
8 | // </auto-generated>
9 | //------------------------------------------------------------------------------
10 |
11 | namespace EncodingChecker.Properties
12 | {
13 |
14 |
15 | /// <summary>
16 | /// A strongly-typed resource class, for looking up localized strings, etc.
17 | /// </summary>
18 | // This class was auto-generated by the StronglyTypedResourceBuilder
19 | // class via a tool like ResGen or Visual Studio.
20 | // To add or remove a member, edit your .ResX file then rerun ResGen
21 | // with the /str option, or rebuild your VS project.
22 | [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0")]
23 | [global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
24 | [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
25 | internal class Resources
26 | {
27 |
28 | private static global::System.Resources.ResourceManager resourceMan;
29 |
30 | private static global::System.Globalization.CultureInfo resourceCulture;
31 |
32 | [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")]
33 | internal Resources()
34 | {
35 | }
36 |
37 | /// <summary>
38 | /// Returns the cached ResourceManager instance used by this class.
39 | /// </summary>
40 | [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
41 | internal static global::System.Resources.ResourceManager ResourceManager
42 | {
43 | get
44 | {
45 | if ((resourceMan == null))
46 | {
47 | global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("EncodingChecker.Properties.Resources", typeof(Resources).Assembly);
48 | resourceMan = temp;
49 | }
50 | return resourceMan;
51 | }
52 | }
53 |
54 | /// <summary>
55 | /// Overrides the current thread's CurrentUICulture property for all
56 | /// resource lookups using this strongly typed resource class.
57 | /// </summary>
58 | [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
59 | internal static global::System.Globalization.CultureInfo Culture
60 | {
61 | get
62 | {
63 | return resourceCulture;
64 | }
65 | set
66 | {
67 | resourceCulture = value;
68 | }
69 | }
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Korean/EUCKRSMModel.cs:
--------------------------------------------------------------------------------
1 | namespace UtfUnknown.Core.Models.MultiByte.Korean
2 | {
3 | public class EUCKRSMModel : StateMachineModel // state-machine tables registered under CodepageName.EUC_KR
4 | {
5 | private readonly static int[] EUCKR_cls = { // byte value -> class; each entry packs eight consecutive byte values (range in the trailing comment)
6 | //BitPacket.Pack4bits(0,1,1,1,1,1,1,1), // 00 - 07
7 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
8 | BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
9 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
10 | BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
11 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
12 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
13 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37
14 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f
15 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 40 - 47
16 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 48 - 4f
17 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 50 - 57
18 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 58 - 5f
19 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 60 - 67
20 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 68 - 6f
21 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 70 - 77
22 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 78 - 7f
23 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87
24 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f
25 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97
26 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f
27 | BitPackage.Pack4bits(0,2,2,2,2,2,2,2), // a0 - a7
28 | BitPackage.Pack4bits(2,2,2,2,2,3,3,3), // a8 - af
29 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
32 | BitPackage.Pack4bits(2,3,2,2,2,2,2,2), // c8 - cf
33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
37 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
38 | BitPackage.Pack4bits(2,2,2,2,2,2,2,0) // f8 - ff
39 | };
40 | // Packed state table, eight entries per Pack4bits call; presumably indexed by (state, byte class) — confirm in StateMachineModel.
41 | private readonly static int[] EUCKR_st = {
42 | BitPackage.Pack4bits(ERROR,START, 3,ERROR,ERROR,ERROR,ERROR,ERROR),//00-07
43 | BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ERROR,ERROR,START,START) //08-0f
44 | };
45 | // One entry per byte class 0-3; presumably the character length implied by that class — confirm in StateMachineModel.
46 | private readonly static int[] EUCKRCharLenTable = { 0, 1, 2, 0 };
47 | // Hands both packed tables, the value 4 (equal to the number of classes used in EUCKR_cls) and the length table to the base model.
48 | public EUCKRSMModel() : base(
49 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
50 | BitPackage.SHIFT_MASK_4BITS,
51 | BitPackage.BIT_SHIFT_4BITS,
52 | BitPackage.UNIT_MASK_4BITS, EUCKR_cls),
53 | 4,
54 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
55 | BitPackage.SHIFT_MASK_4BITS,
56 | BitPackage.BIT_SHIFT_4BITS,
57 | BitPackage.UNIT_MASK_4BITS, EUCKR_st),
58 | EUCKRCharLenTable, CodepageName.EUC_KR)
59 | {
60 | // Intentionally empty: everything is passed to the base constructor.
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Chinese/BIG5SMModel.cs:
--------------------------------------------------------------------------------
1 | using UtfUnknown.Core.Models;
2 |
3 | namespace UtfUnknown.Core.Models.MultiByte.Chinese
4 | {
5 | public class BIG5SMModel : StateMachineModel // state-machine tables registered under CodepageName.BIG5
6 | {
7 | private readonly static int[] BIG5_cls = { // byte value -> class; each entry packs eight consecutive byte values (range in the trailing comment)
8 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
9 | BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
10 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
11 | BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
12 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
13 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
14 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37
15 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f
16 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47
17 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f
18 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57
19 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f
20 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67
21 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f
22 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77
23 | BitPackage.Pack4bits(2,2,2,2,2,2,2,1), // 78 - 7f
24 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 80 - 87
25 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 88 - 8f
26 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 90 - 97
27 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 98 - 9f
28 | BitPackage.Pack4bits(4,3,3,3,3,3,3,3), // a0 - a7
29 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // a8 - af
30 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // b0 - b7
31 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // b8 - bf
32 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c0 - c7
33 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c8 - cf
34 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d0 - d7
35 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d8 - df
36 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7
37 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e8 - ef
38 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // f0 - f7
39 | BitPackage.Pack4bits(3,3,3,3,3,3,3,0) // f8 - ff
40 | };
41 | // Packed state table, eight entries per Pack4bits call; presumably indexed by (state, byte class) — confirm in StateMachineModel.
42 | private readonly static int[] BIG5_st = {
43 | BitPackage.Pack4bits(ERROR,START,START, 3,ERROR,ERROR,ERROR,ERROR),//00-07
44 | BitPackage.Pack4bits(ERROR,ERROR,ITSME,ITSME,ITSME,ITSME,ITSME,ERROR),//08-0f
45 | BitPackage.Pack4bits(ERROR,START,START,START,START,START,START,START) //10-17
46 | };
47 | // One entry per byte class 0-4; presumably the character length implied by that class — confirm in StateMachineModel.
48 | private readonly static int[] BIG5CharLenTable = {0, 1, 1, 2, 0};
49 | // Hands both packed tables, the value 5 (equal to the number of classes used in BIG5_cls) and the length table to the base model.
50 | public BIG5SMModel() : base(
51 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
52 | BitPackage.SHIFT_MASK_4BITS,
53 | BitPackage.BIT_SHIFT_4BITS,
54 | BitPackage.UNIT_MASK_4BITS, BIG5_cls),
55 | 5,
56 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
57 | BitPackage.SHIFT_MASK_4BITS,
58 | BitPackage.BIT_SHIFT_4BITS,
59 | BitPackage.UNIT_MASK_4BITS, BIG5_st),
60 | BIG5CharLenTable, CodepageName.BIG5)
61 | {
62 | // Intentionally empty: everything is passed to the base constructor.
63 | }
64 | }
65 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/StateMachineModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Kohei TAKETA (Java port)
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | using System;
40 |
41 | namespace UtfUnknown.Core.Models
42 | {
43 | /// <summary>
44 | /// Tables describing one coding scheme's state machine: byte classes, state transitions and per-class character lengths.
45 | /// </summary>
46 | public abstract class StateMachineModel
47 | {
48 | /// <summary>
49 | /// Start state; CodingStateMachine begins in it and its Reset() returns to it.
50 | /// </summary>
51 | public const int START = 0;
52 |
53 | /// <summary>
54 | /// Error state (presumably reached on a byte sequence the scheme does not allow; confirm against the probers).
55 | /// </summary>
56 | public const int ERROR = 1;
57 |
58 | /// <summary>
59 | /// "It's me" state (presumably a positive identification of the scheme; confirm against the probers).
60 | /// </summary>
61 | public const int ITSME = 2;
62 | /// <summary>Packed byte-to-class table; read through <see cref="GetClass"/>.</summary>
63 | public BitPackage classTable;
64 | public BitPackage stateTable; // packed next-state table, indexed by CodingStateMachine
65 | public int[] charLenTable; // character length per byte class, indexed by CodingStateMachine
66 | /// <summary>Name supplied by the derived model (the visible models pass a CodepageName value).</summary>
67 | public string Name { get; }
68 | /// <summary>Multiplier applied to the current state when indexing <see cref="stateTable"/>.</summary>
69 | public int ClassFactor { get; }
70 | /// <summary>Stores the supplied tables unchanged. Protected because an abstract type can only be constructed by derived models.</summary>
71 | protected StateMachineModel(BitPackage classTable, int classFactor,
72 | BitPackage stateTable, int[] charLenTable, string name)
73 | {
74 | this.classTable = classTable;
75 | ClassFactor = classFactor;
76 | this.stateTable = stateTable;
77 | this.charLenTable = charLenTable;
78 | Name = name;
79 | }
80 | /// <summary>Returns the class index that <see cref="classTable"/> holds for byte <paramref name="b"/>.</summary>
81 | public int GetClass(byte b)
82 | {
83 | return classTable.Unpack((int)b);
84 | }
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Korean/Iso_2022_KR_SMModel.cs:
--------------------------------------------------------------------------------
1 | namespace UtfUnknown.Core.Models.MultiByte.Korean
2 | {
3 | public class Iso_2022_KR_SMModel : StateMachineModel // State machine tables registered under CodepageName.ISO_2022_KR.
4 | {
5 | private readonly static int[] ISO2022KR_cls = { // Byte value -> class index (0-5); eight packed entries per int, byte range in the trailing comment.
6 | BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
7 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
8 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
9 | BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
10 | BitPackage.Pack4bits(0,0,0,0,3,0,0,0), // 20 - 27
11 | BitPackage.Pack4bits(0,4,0,0,0,0,0,0), // 28 - 2f
12 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
13 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
14 | BitPackage.Pack4bits(0,0,0,5,0,0,0,0), // 40 - 47
15 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
16 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
17 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
18 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
19 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
20 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
21 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
22 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
23 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
24 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
25 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
26 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
27 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
28 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
29 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
37 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
38 | };
39 | // Next-state table; CodingStateMachine looks up entry (current state * class factor + byte class). Trailing comments give the flat index range.
40 | private readonly static int[] ISO2022KR_st = {
41 | BitPackage.Pack4bits(START, 3,ERROR,START,START,START,ERROR,ERROR), //00-07
42 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME), //08-0f
43 | BitPackage.Pack4bits(ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR,ERROR), //10-17
44 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR, 5,ERROR,ERROR,ERROR), //18-1f
45 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,START,START,START,START) //20-27
46 | };
47 | // Character length for each byte class; every class reports 0 in this model.
48 | private readonly static int[] ISO2022KRCharLenTable = {0, 0, 0, 0, 0, 0};
49 | // Wraps both tables in 4-bit BitPackage views and hands them to the base model.
50 | public Iso_2022_KR_SMModel() : base(
51 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
52 | BitPackage.SHIFT_MASK_4BITS,
53 | BitPackage.BIT_SHIFT_4BITS,
54 | BitPackage.UNIT_MASK_4BITS, ISO2022KR_cls),
55 | 6, // class factor: multiplier applied to the state when indexing ISO2022KR_st
56 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
57 | BitPackage.SHIFT_MASK_4BITS,
58 | BitPackage.BIT_SHIFT_4BITS,
59 | BitPackage.UNIT_MASK_4BITS, ISO2022KR_st),
60 | ISO2022KRCharLenTable, CodepageName.ISO_2022_KR)
61 | {
62 | // Nothing to do here: everything is passed to the base constructor.
63 | }
64 | }
65 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Japanese/SJIS_SMModel.cs:
--------------------------------------------------------------------------------
1 | using UtfUnknown.Core.Models;
2 |
3 | namespace UtfUnknown.Core.Models.MultiByte.Japanese
4 | {
5 | public class SJIS_SMModel : StateMachineModel // State machine tables registered under CodepageName.SHIFT_JIS.
6 | {
7 | private readonly static int[] SJIS_cls = { // Byte value -> class index; eight packed entries per int, byte range in the trailing comment.
8 | // Disabled alternative for 00 - 07 (byte 0x00 in class 0): BitPacket.Pack4bits(0,1,1,1,1,1,1,1)
9 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
10 | BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
11 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
12 | BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
13 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
14 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
15 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37
16 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f
17 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47
18 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f
19 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57
20 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f
21 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67
22 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f
23 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77
24 | BitPackage.Pack4bits(2,2,2,2,2,2,2,1), // 78 - 7f
25 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 80 - 87
26 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 88 - 8f
27 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 90 - 97
28 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 98 - 9f
29 | // 0xa0 is illegal in the SJIS encoding, but some pages do contain
30 | // such a byte, so the table is deliberately forgiving and gives it class 2.
31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
37 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
38 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
39 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7
40 | BitPackage.Pack4bits(3,3,3,3,3,4,4,4), // e8 - ef
41 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // f0 - f7
42 | BitPackage.Pack4bits(4,4,4,4,4,0,0,0) // f8 - ff
43 | };
44 | // Next-state table; CodingStateMachine looks up entry (current state * class factor + byte class). Trailing comments give the flat index range.
45 | private readonly static int[] SJIS_st = {
46 | BitPackage.Pack4bits(ERROR,START,START, 3,ERROR,ERROR,ERROR,ERROR),//00-07
47 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
48 | BitPackage.Pack4bits(ITSME,ITSME,ERROR,ERROR,START,START,START,START) //10-17
49 | };
50 | // Character length for each byte class; read by CodingStateMachine when a byte arrives in the START state.
51 | private readonly static int[] SJISCharLenTable = { 0, 1, 1, 2, 0, 0 };
52 | // Wraps both tables in 4-bit BitPackage views and hands them to the base model.
53 | public SJIS_SMModel() : base(
54 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
55 | BitPackage.SHIFT_MASK_4BITS,
56 | BitPackage.BIT_SHIFT_4BITS,
57 | BitPackage.UNIT_MASK_4BITS, SJIS_cls),
58 | 6, // class factor: multiplier applied to the state when indexing SJIS_st
59 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
60 | BitPackage.SHIFT_MASK_4BITS,
61 | BitPackage.BIT_SHIFT_4BITS,
62 | BitPackage.UNIT_MASK_4BITS, SJIS_st),
63 | SJISCharLenTable, CodepageName.SHIFT_JIS)
64 | {
65 | // Nothing to do here: everything is passed to the base constructor.
66 | }
67 | }
68 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Probers/CodingStateMachine.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is mozilla.org code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Shy Shalom
23 | * Kohei TAKETA (Java port)
24 | * Rudi Pettazzi (C# port)
25 | *
26 | * Alternatively, the contents of this file may be used under the terms of
27 | * either the GNU General Public License Version 2 or later (the "GPL"), or
28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 | * in which case the provisions of the GPL or the LGPL are applicable instead
30 | * of those above. If you wish to allow use of your version of this file only
31 | * under the terms of either the GPL or the LGPL, and not to allow others to
32 | * use your version of this file under the terms of the MPL, indicate your
33 | * decision by deleting the provisions above and replace them with the notice
34 | * and other provisions required by the GPL or the LGPL. If you do not delete
35 | * the provisions above, a recipient may use your version of this file under
36 | * the terms of any one of the MPL, the GPL or the LGPL.
37 | *
38 | * ***** END LICENSE BLOCK ***** */
39 |
40 | using UtfUnknown.Core.Models;
41 |
42 | namespace UtfUnknown.Core.Probers
43 | {
44 | /// <summary>
45 | /// Parallel state machine for the Coding Scheme Method: walks one StateMachineModel a byte at a time.
46 | /// </summary>
47 | public class CodingStateMachine
48 | {
49 | private readonly StateMachineModel model;
50 | private int currentState;
51 | private int currentCharLen;
52 |
53 | /// <summary>Creates a machine over <paramref name="model"/>, positioned in the start state.</summary>
54 | public CodingStateMachine(StateMachineModel model)
55 | {
56 | this.model = model;
57 | currentState = StateMachineModel.START;
58 | }
59 |
60 | /// <summary>Consumes byte <paramref name="b"/> and returns the state the machine moves to.</summary>
61 | public int NextState(byte b)
62 | {
63 | int cls = model.GetClass(b);
64 |
65 | // A byte arriving in the start state is the first byte of a
66 | // character, so its class also yields that character's length.
67 | if (StateMachineModel.START == currentState)
68 | {
69 | currentCharLen = model.charLenTable[cls];
70 | }
71 |
72 | // The state table holds ClassFactor entries per state; pick this state's entry for the class.
73 | int slot = currentState * model.ClassFactor + cls;
74 | currentState = model.stateTable.Unpack(slot);
75 | return currentState;
76 | }
77 |
78 | /// <summary>Returns the machine to the start state; the recorded character length is left as is.</summary>
79 | public void Reset()
80 | {
81 | currentState = StateMachineModel.START;
82 | }
83 |
84 | /// <summary>
85 | /// Length looked up for the most recent byte that was consumed in the start state.
86 | /// </summary>
87 | public int CurrentCharLen => currentCharLen;
88 |
89 | /// <summary>Name of the model this machine runs.</summary>
90 | public string ModelName => model.Name;
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Chinese/EUCTWSMModel.cs:
--------------------------------------------------------------------------------
1 | namespace UtfUnknown.Core.Models.MultiByte.Chinese
2 | {
3 | public class EUCTWSMModel : StateMachineModel // State machine tables registered under CodepageName.EUC_TW.
4 | {
5 | private readonly static int[] EUCTW_cls = { // Byte value -> class index (0-6); eight packed entries per int, byte range in the trailing comment.
6 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 00 - 07
7 | BitPackage.Pack4bits(2,2,2,2,2,2,0,0), // 08 - 0f
8 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 10 - 17
9 | BitPackage.Pack4bits(2,2,2,0,2,2,2,2), // 18 - 1f
10 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 20 - 27
11 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 28 - 2f
12 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 30 - 37
13 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 38 - 3f
14 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47
15 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f
16 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57
17 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f
18 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67
19 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f
20 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77
21 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 78 - 7f
22 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87
23 | BitPackage.Pack4bits(0,0,0,0,0,0,6,0), // 88 - 8f
24 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97
25 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f
26 | BitPackage.Pack4bits(0,3,4,4,4,4,4,4), // a0 - a7
27 | BitPackage.Pack4bits(5,5,1,1,1,1,1,1), // a8 - af
28 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b0 - b7
29 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b8 - bf
30 | BitPackage.Pack4bits(1,1,3,1,3,3,3,3), // c0 - c7
31 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c8 - cf
32 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d0 - d7
33 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d8 - df
34 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7
35 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e8 - ef
36 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // f0 - f7
37 | BitPackage.Pack4bits(3,3,3,3,3,3,3,0) // f8 - ff
38 | };
39 | // Next-state table; CodingStateMachine looks up entry (current state * class factor + byte class). Trailing comments give the flat index range.
40 | private readonly static int[] EUCTW_st = {
41 | BitPackage.Pack4bits(ERROR,ERROR,START, 3, 3, 3, 4,ERROR),//00-07
42 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//08-0f
43 | BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ERROR,START,ERROR),//10-17
44 | BitPackage.Pack4bits(START,START,START,ERROR,ERROR,ERROR,ERROR,ERROR),//18-1f
45 | BitPackage.Pack4bits( 5,ERROR,ERROR,ERROR,START,ERROR,START,START),//20-27
46 | BitPackage.Pack4bits(START,ERROR,START,START,START,START,START,START) //28-2f
47 | };
48 | // Character length for each byte class; read by CodingStateMachine when a byte arrives in the START state.
49 | private readonly static int[] EUCTWCharLenTable = { 0, 0, 1, 2, 2, 2, 3 };
50 | // Wraps both tables in 4-bit BitPackage views and hands them to the base model.
51 | public EUCTWSMModel() : base(
52 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
53 | BitPackage.SHIFT_MASK_4BITS,
54 | BitPackage.BIT_SHIFT_4BITS,
55 | BitPackage.UNIT_MASK_4BITS, EUCTW_cls),
56 | 7, // class factor: multiplier applied to the state when indexing EUCTW_st
57 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
58 | BitPackage.SHIFT_MASK_4BITS,
59 | BitPackage.BIT_SHIFT_4BITS,
60 | BitPackage.UNIT_MASK_4BITS, EUCTW_st),
61 | EUCTWCharLenTable, CodepageName.EUC_TW)
62 | {
63 | // Nothing to do here: everything is passed to the base constructor.
64 | }
65 | }
66 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Japanese/EUCJPSMModel.cs:
--------------------------------------------------------------------------------
1 | using UtfUnknown.Core.Models;
2 |
3 | namespace UtfUnknown.Core.Models.MultiByte.Japanese
4 | {
5 | public class EUCJPSMModel : StateMachineModel // State machine tables registered under CodepageName.EUC_JP.
6 | {
7 | private readonly static int[] EUCJP_cls = { // Byte value -> class index (0-5); eight packed entries per int, byte range in the trailing comment.
8 | // Disabled alternative for 00 - 07 (byte 0x00 in class 5): BitPacket.Pack4bits(5,4,4,4,4,4,4,4)
9 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 00 - 07
10 | BitPackage.Pack4bits(4,4,4,4,4,4,5,5), // 08 - 0f
11 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 10 - 17
12 | BitPackage.Pack4bits(4,4,4,5,4,4,4,4), // 18 - 1f
13 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 20 - 27
14 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 28 - 2f
15 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 30 - 37
16 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 38 - 3f
17 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 40 - 47
18 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 48 - 4f
19 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 50 - 57
20 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 58 - 5f
21 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 60 - 67
22 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 68 - 6f
23 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 70 - 77
24 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 78 - 7f
25 | BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 80 - 87
26 | BitPackage.Pack4bits(5,5,5,5,5,5,1,3), // 88 - 8f
27 | BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 90 - 97
28 | BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 98 - 9f
29 | BitPackage.Pack4bits(5,2,2,2,2,2,2,2), // a0 - a7
30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
37 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7
38 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef
39 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7
40 | BitPackage.Pack4bits(0,0,0,0,0,0,0,5) // f8 - ff
41 | };
42 | // Next-state table; CodingStateMachine looks up entry (current state * class factor + byte class). Trailing comments give the flat index range.
43 | private readonly static int[] EUCJP_st = {
44 | BitPackage.Pack4bits( 3, 4, 3, 5,START,ERROR,ERROR,ERROR),//00-07
45 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
46 | BitPackage.Pack4bits(ITSME,ITSME,START,ERROR,START,ERROR,ERROR,ERROR),//10-17
47 | BitPackage.Pack4bits(ERROR,ERROR,START,ERROR,ERROR,ERROR, 3,ERROR),//18-1f
48 | BitPackage.Pack4bits( 3,ERROR,ERROR,ERROR,START,START,START,START) //20-27
49 | };
50 | // Character length for each byte class; read by CodingStateMachine when a byte arrives in the START state.
51 | private readonly static int[] EUCJPCharLenTable = { 2, 2, 2, 3, 1, 0 };
52 | // Wraps both tables in 4-bit BitPackage views and hands them to the base model.
53 | public EUCJPSMModel() : base(
54 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
55 | BitPackage.SHIFT_MASK_4BITS,
56 | BitPackage.BIT_SHIFT_4BITS,
57 | BitPackage.UNIT_MASK_4BITS, EUCJP_cls),
58 | 6, // class factor: multiplier applied to the state when indexing EUCJP_st
59 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
60 | BitPackage.SHIFT_MASK_4BITS,
61 | BitPackage.BIT_SHIFT_4BITS,
62 | BitPackage.UNIT_MASK_4BITS, EUCJP_st),
63 | EUCJPCharLenTable, CodepageName.EUC_JP)
64 | {
65 | // Nothing to do here: everything is passed to the base constructor.
66 | }
67 | }
68 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/UCS2BE_SMModel.cs:
--------------------------------------------------------------------------------
1 | using UtfUnknown.Core.Models;
2 |
3 | namespace UtfUnknown.Core.Models.MultiByte
4 | {
5 | public class UCS2BE_SMModel : StateMachineModel // State machine tables registered under CodepageName.UTF16_BE.
6 | {
7 | private readonly static int[] UCS2BE_cls = { // Byte value -> class index (0-5); eight packed entries per int, byte range in the trailing comment.
8 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 00 - 07
9 | BitPackage.Pack4bits(0,0,1,0,0,2,0,0), // 08 - 0f
10 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
11 | BitPackage.Pack4bits(0,0,0,3,0,0,0,0), // 18 - 1f
12 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
13 | BitPackage.Pack4bits(0,3,3,3,3,3,0,0), // 28 - 2f
14 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
15 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
16 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 40 - 47
17 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
18 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
19 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
20 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
21 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
22 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
23 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
24 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87
25 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f
26 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97
27 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f
28 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a0 - a7
29 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a8 - af
30 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b0 - b7
31 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b8 - bf
32 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c0 - c7
33 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c8 - cf
34 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d0 - d7
35 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d8 - df
36 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7
37 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef
38 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7
39 | BitPackage.Pack4bits(0,0,0,0,0,0,4,5) // f8 - ff
40 | };
41 | // Next-state table; CodingStateMachine looks up entry (current state * class factor + byte class). Trailing comments give the flat index range.
42 | private readonly static int[] UCS2BE_st = {
43 | BitPackage.Pack4bits( 5, 7, 7,ERROR, 4, 3,ERROR,ERROR),//00-07
44 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
45 | BitPackage.Pack4bits(ITSME,ITSME, 6, 6, 6, 6,ERROR,ERROR),//10-17
46 | BitPackage.Pack4bits( 6, 6, 6, 6, 6,ITSME, 6, 6),//18-1f
47 | BitPackage.Pack4bits( 6, 6, 6, 6, 5, 7, 7,ERROR),//20-27
48 | BitPackage.Pack4bits( 5, 8, 6, 6,ERROR, 6, 6, 6),//28-2f
49 | BitPackage.Pack4bits( 6, 6, 6, 6,ERROR,ERROR,START,START) //30-37
50 | };
51 | // Character length for each byte class; read by CodingStateMachine when a byte arrives in the START state.
52 | private readonly static int[] UCS2BECharLenTable = { 2, 2, 2, 0, 2, 2 };
53 | // Wraps both tables in 4-bit BitPackage views and hands them to the base model.
54 | public UCS2BE_SMModel() : base(
55 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
56 | BitPackage.SHIFT_MASK_4BITS,
57 | BitPackage.BIT_SHIFT_4BITS,
58 | BitPackage.UNIT_MASK_4BITS, UCS2BE_cls),
59 | 6, // class factor: multiplier applied to the state when indexing UCS2BE_st
60 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
61 | BitPackage.SHIFT_MASK_4BITS,
62 | BitPackage.BIT_SHIFT_4BITS,
63 | BitPackage.UNIT_MASK_4BITS, UCS2BE_st),
64 | UCS2BECharLenTable, CodepageName.UTF16_BE)
65 | {
66 | // Nothing to do here: everything is passed to the base constructor.
67 | }
68 | }
69 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/UCS2LE_SMModel.cs:
--------------------------------------------------------------------------------
1 | using UtfUnknown.Core.Models;
2 |
3 | namespace UtfUnknown.Core.Models.MultiByte
4 | {
5 | public class UCS2LE_SMModel : StateMachineModel // State machine tables registered under CodepageName.UTF16_LE.
6 | {
7 | private readonly static int[] UCS2LE_cls = { // Byte value -> class index (0-5); eight packed entries per int, byte range in the trailing comment.
8 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 00 - 07
9 | BitPackage.Pack4bits(0,0,1,0,0,2,0,0), // 08 - 0f
10 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
11 | BitPackage.Pack4bits(0,0,0,3,0,0,0,0), // 18 - 1f
12 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
13 | BitPackage.Pack4bits(0,3,3,3,3,3,0,0), // 28 - 2f
14 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
15 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
16 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 40 - 47
17 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
18 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
19 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
20 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
21 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
22 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
23 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
24 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87
25 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f
26 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97
27 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f
28 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a0 - a7
29 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a8 - af
30 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b0 - b7
31 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b8 - bf
32 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c0 - c7
33 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c8 - cf
34 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d0 - d7
35 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d8 - df
36 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7
37 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef
38 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7
39 | BitPackage.Pack4bits(0,0,0,0,0,0,4,5) // f8 - ff
40 | };
41 | // Next-state table; CodingStateMachine looks up entry (current state * class factor + byte class). Trailing comments give the flat index range.
42 | private readonly static int[] UCS2LE_st = {
43 | BitPackage.Pack4bits( 6, 6, 7, 6, 4, 3,ERROR,ERROR),//00-07
44 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
45 | BitPackage.Pack4bits(ITSME,ITSME, 5, 5, 5,ERROR,ITSME,ERROR),//10-17
46 | BitPackage.Pack4bits( 5, 5, 5,ERROR, 5,ERROR, 6, 6),//18-1f
47 | BitPackage.Pack4bits( 7, 6, 8, 8, 5, 5, 5,ERROR),//20-27
48 | BitPackage.Pack4bits( 5, 5, 5,ERROR,ERROR,ERROR, 5, 5),//28-2f
49 | BitPackage.Pack4bits( 5, 5, 5,ERROR, 5,ERROR,START,START) //30-37
50 | };
51 | // Character length for each byte class; every class reports 2 in this model.
52 | private readonly static int[] UCS2LECharLenTable = { 2, 2, 2, 2, 2, 2 };
53 | // Wraps both tables in 4-bit BitPackage views and hands them to the base model.
54 | public UCS2LE_SMModel() : base(
55 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
56 | BitPackage.SHIFT_MASK_4BITS,
57 | BitPackage.BIT_SHIFT_4BITS,
58 | BitPackage.UNIT_MASK_4BITS, UCS2LE_cls),
59 | 6, // class factor: multiplier applied to the state when indexing UCS2LE_st
60 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
61 | BitPackage.SHIFT_MASK_4BITS,
62 | BitPackage.BIT_SHIFT_4BITS,
63 | BitPackage.UNIT_MASK_4BITS, UCS2LE_st),
64 | UCS2LECharLenTable, CodepageName.UTF16_LE)
65 | {
66 | // Nothing to do here: everything is passed to the base constructor.
67 | }
68 | }
69 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Russian/Ibm866_RussianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Russian
44 | {
45 | public class Ibm866_RussianModel : RussianModel // Russian model registered under CodepageName.IBM866.
46 | {
47 | private readonly static byte[] CHAR_TO_ORDER_MAP = { // Byte value -> order value; 16 entries per row, row offset in the trailing comment. CTR/RET/SYM/NUM are not declared here (presumably category markers inherited from a base class; confirm).
48 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00
49 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10
50 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20
51 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30
52 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
53 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50
54 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
55 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70
56 | 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, //80
57 | 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, //90
58 | 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, //a0
59 | 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, //b0
60 | 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, //c0
61 | 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, //d0
62 | 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, //e0
63 | 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, //f0
64 | };
65 | // Passes the byte-to-order map and the code page name to the shared Russian base model.
66 | public Ibm866_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM866)
67 | {
68 | }
69 | }
70 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Russian/Ibm855_RussianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Russian
44 | {
/// <summary>
/// Russian single-byte model bound to the IBM855 code page.
/// </summary>
/// <remarks>
/// Hands a 256-entry byte table (16 rows of 16 values, one row per high nibble)
/// and <c>CodepageName.IBM855</c> to <see cref="RussianModel"/>.
/// CTR, RET, SYM and NUM are constants inherited from the base hierarchy;
/// presumably they tag control, line-break, symbol and digit bytes — confirm
/// against the base class.
/// </remarks>
public class Ibm855_RussianModel : RussianModel
{
    // Indexed by raw byte value; the comment on each row gives the high nibble.
    private static readonly byte[] s_byteToOrderMap =
    {
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0X
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 1X
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 2X
        NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 3X
        SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 4X
        155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, // 5X
        SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 6X
         67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, // 7X
        191,192,193,194, 68,195,196,197,198,199,200,201,202,203,204,205, // 8X
        206,207,208,209,210,211,212,213,214,215,216,217, 27, 59, 54, 70, // 9X
          3, 37, 21, 44, 28, 58, 13, 41,  2, 48, 39, 53, 19, 46,218,219, // AX
        220,221,222,223,224, 26, 55,  4, 42,225,226,227,228, 23, 60,229, // BX
        230,231,232,233,234,235, 11, 36,236,237,238,239,240,241,242,243, // CX
          8, 49, 12, 38,  5, 31,  1, 34, 15,244,245,246,247, 35, 16,248, // DX
         43,  9, 45,  7, 32,  6, 40, 14, 52, 24, 56, 10, 33, 17, 61,249, // EX
        250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,NUM,CTR, // FX
    };

    /// <summary>
    /// Creates the model by passing the IBM855 table and code page name to the base class.
    /// </summary>
    public Ibm855_RussianModel()
        : base(s_byteToOrderMap, CodepageName.IBM855)
    {
    }
}
70 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Chinese/Iso_2022_CN_SMModel.cs:
--------------------------------------------------------------------------------
1 | using UtfUnknown.Core.Models;
2 |
3 | namespace UtfUnknown.Core.Models.MultiByte.Chinese
4 | {
/// <summary>
/// State machine model registered under <c>CodepageName.ISO_2022_CN</c>.
/// </summary>
/// <remarks>
/// All tables are stored as 4-bit units packed by <c>BitPackage.Pack4bits</c>.
/// START, ERROR and ITSME are constants inherited from the base hierarchy.
/// </remarks>
public class Iso_2022_CN_SMModel : StateMachineModel
{
    // One packed entry per eight consecutive byte values; the trailing comment
    // on each row names the byte range it covers.
    private static readonly int[] s_byteClasses =
    {
        BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
        BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
        BitPackage.Pack4bits(0,3,0,0,0,0,0,0), // 28 - 2f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
        BitPackage.Pack4bits(0,0,0,4,0,0,0,0), // 40 - 47
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2)  // f8 - ff
    };

    // Packed transition entries; the trailing comment gives the index range of
    // the eight entries in that row (hexadecimal).
    private static readonly int[] s_transitions =
    {
        BitPackage.Pack4bits(START,    3,ERROR,START,START,START,START,START), // 00-07
        BitPackage.Pack4bits(START,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR), // 08-0f
        BitPackage.Pack4bits(ERROR,ERROR,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME), // 10-17
        BitPackage.Pack4bits(ITSME,ITSME,ITSME,ERROR,ERROR,ERROR,    4,ERROR), // 18-1f
        BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR), // 20-27
        BitPackage.Pack4bits(    5,    6,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR), // 28-2f
        BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR), // 30-37
        BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START)  // 38-3f
    };

    // Nine entries, all zero; the count matches the literal 9 passed to the base constructor.
    private static readonly int[] s_charLengths = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };

    /// <summary>
    /// Builds the model from the packed tables above and the ISO-2022-CN code page name.
    /// </summary>
    public Iso_2022_CN_SMModel()
        : base(
            Wrap4Bit(s_byteClasses),
            9,
            Wrap4Bit(s_transitions),
            s_charLengths,
            CodepageName.ISO_2022_CN)
    {
    }

    // Wraps an array of Pack4bits results in a BitPackage configured with the 4-bit constants.
    private static BitPackage Wrap4Bit(int[] packedUnits)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packedUnits);
    }
}
70 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Russian/Koi8r_Model.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Russian
44 | {
/// <summary>
/// Russian single-byte model bound to the KOI8-R code page.
/// </summary>
/// <remarks>
/// Hands a 256-entry byte table (16 rows of 16 values, one row per high nibble)
/// and <c>CodepageName.KOI8_R</c> to <see cref="RussianModel"/>.
/// CTR, RET, SYM and NUM are constants inherited from the base hierarchy;
/// presumably they tag control, line-break, symbol and digit bytes — confirm
/// against the base class.
/// </remarks>
public class Koi8r_Model : RussianModel
{
    // Row comment = high nibble of the byte, column position = low nibble.
    private static readonly byte[] s_charToOrderMap =
    {
     // X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0X
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 1X
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 2X
        NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 3X
        SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 4X
        155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, // 5X
        SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 6X
         67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, // 7X
        191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 8X
        207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, // 9X
        223,224,225, 68,226,227,228,229,230,231,232,233,234,235,236,237, // AX
        238,239,240,241,242,243,244,245,246,247,248,249,250,251,NUM,SYM, // BX
         27,  3, 21, 28, 13,  2, 39, 19, 26,  4, 23, 11,  8, 12,  5,  1, // CX
         15, 16,  9,  7,  6, 14, 24, 10, 17, 18, 20, 25, 30, 29, 22, 54, // DX
         59, 37, 44, 58, 41, 48, 53, 46, 55, 42, 60, 36, 49, 38, 31, 34, // EX
         35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, // FX
    };

    /// <summary>
    /// Creates the model by passing the KOI8-R table and code page name to the base class.
    /// </summary>
    public Koi8r_Model()
        : base(s_charToOrderMap, CodepageName.KOI8_R)
    {
    }
}
71 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Russian/Iso_8859_5_RussianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Russian
44 | {
/// <summary>
/// Russian single-byte model bound to the ISO-8859-5 code page.
/// </summary>
/// <remarks>
/// Hands a 256-entry byte table (16 rows of 16 values, one row per high nibble)
/// and <c>CodepageName.ISO_8859_5</c> to <see cref="RussianModel"/>.
/// CTR, RET, SYM and NUM are constants inherited from the base hierarchy;
/// presumably they tag control, line-break, symbol and digit bytes — confirm
/// against the base class.
/// </remarks>
public class Iso_8859_5_RussianModel : RussianModel
{
    // Row comment = high nibble of the byte, column position = low nibble.
    private static readonly byte[] s_charToOrderMap =
    {
     // X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0X
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 1X
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 2X
        NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 3X
        SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 4X
        155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, // 5X
        SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 6X
         67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, // 7X
        191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 8X
        207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, // 9X
        223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, // AX
         37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, // BX
         45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, // CX
          3, 21, 10, 19, 13,  2, 24, 20,  4, 23, 11,  8, 12,  5,  1, 15, // DX
          9,  7,  6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, // EX
        239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, // FX
    };

    /// <summary>
    /// Creates the model by passing the ISO-8859-5 table and code page name to the base class.
    /// </summary>
    public Iso_8859_5_RussianModel()
        : base(s_charToOrderMap, CodepageName.ISO_8859_5)
    {
    }
}
71 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Russian/Windows_1251_RussianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Russian
44 | {
/// <summary>
/// Russian single-byte model bound to the Windows-1251 code page.
/// </summary>
/// <remarks>
/// Hands a 256-entry byte table (16 rows of 16 values, one row per high nibble)
/// and <c>CodepageName.WINDOWS_1251</c> to <see cref="RussianModel"/>.
/// CTR, RET, SYM, NUM and ILL are constants inherited from the base hierarchy;
/// presumably they tag control, line-break, symbol, digit and illegal bytes —
/// confirm against the base class.
/// </remarks>
public class Windows_1251_RussianModel : RussianModel
{
    // Row comment = high nibble of the byte, column position = low nibble.
    private static readonly byte[] s_charToOrderMap =
    {
     // X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0X
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 1X
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 2X
        NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 3X
        SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 4X
        155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, // 5X
        SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 6X
         67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, // 7X
        191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 8X
        207,208,209,210,211,212,213,214,ILL,216,217,218,219,220,221,222, // 9X
        223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, // AX
        239,240,241,242,243,244,245,246, 68,247,248,249,250,251,NUM,SYM, // BX
         37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, // CX
         45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, // DX
          3, 21, 10, 19, 13,  2, 24, 20,  4, 23, 11,  8, 12,  5,  1, 15, // EX
          9,  7,  6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, // FX
    };

    /// <summary>
    /// Creates the model by passing the Windows-1251 table and code page name to the base class.
    /// </summary>
    public Windows_1251_RussianModel()
        : base(s_charToOrderMap, CodepageName.WINDOWS_1251)
    {
    }
}
71 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Russian/X_Mac_Cyrillic_RussianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Russian
44 | {
/// <summary>
/// Russian single-byte model bound to the x-mac-cyrillic code page.
/// </summary>
/// <remarks>
/// Hands a 256-entry byte table (16 rows of 16 values, one row per high nibble)
/// and <c>CodepageName.X_MAC_CYRILLIC</c> to <see cref="RussianModel"/>.
/// CTR, RET, SYM and NUM are constants inherited from the base hierarchy;
/// presumably they tag control, line-break, symbol and digit bytes — confirm
/// against the base class.
/// </remarks>
public class X_Mac_Cyrillic_RussianModel : RussianModel
{
    // Row comment = high nibble of the byte, column position = low nibble.
    private static readonly byte[] s_charToOrderMap =
    {
     // X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, // 0X
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, // 1X
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, // 2X
        NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, // 3X
        SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 4X
        155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, // 5X
        SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 6X
         67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, // 7X
         37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, // 8X
         45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, // 9X
        191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // AX
        207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, // BX
        223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, // CX
        239,240,241,242,243,244,245,246,247,248,249,250,251,NUM, 68, 16, // DX
          3, 21, 10, 19, 13,  2, 24, 20,  4, 23, 11,  8, 12,  5,  1, 15, // EX
          9,  7,  6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,CTR, // FX
    };

    /// <summary>
    /// Creates the model by passing the x-mac-cyrillic table and code page name to the base class.
    /// </summary>
    public X_Mac_Cyrillic_RussianModel()
        : base(s_charToOrderMap, CodepageName.X_MAC_CYRILLIC)
    {
    }
}
71 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Japanese/Iso_2022_JP_SMModel.cs:
--------------------------------------------------------------------------------
1 | using UtfUnknown.Core.Models;
2 |
3 | namespace UtfUnknown.Core.Models.MultiByte.Japanese
4 | {
/// <summary>
/// State machine model registered under <c>CodepageName.ISO_2022_JP</c>.
/// </summary>
/// <remarks>
/// All tables are stored as 4-bit units packed by <c>BitPackage.Pack4bits</c>.
/// START, ERROR and ITSME are constants inherited from the base hierarchy.
/// </remarks>
public class Iso_2022_JP_SMModel : StateMachineModel
{
    // One packed entry per eight consecutive byte values; the trailing comment
    // on each row names the byte range it covers.
    private static readonly int[] s_byteClasses =
    {
        BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
        BitPackage.Pack4bits(0,0,0,0,0,0,2,2), // 08 - 0f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
        BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
        BitPackage.Pack4bits(0,0,0,0,7,0,0,0), // 20 - 27
        BitPackage.Pack4bits(3,0,0,0,0,0,0,0), // 28 - 2f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
        BitPackage.Pack4bits(6,0,4,0,8,0,0,0), // 40 - 47
        BitPackage.Pack4bits(0,9,5,0,0,0,0,0), // 48 - 4f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2)  // f8 - ff
    };

    // Packed transition entries; the trailing comment gives the index range of
    // the eight entries in that row (hexadecimal).
    private static readonly int[] s_transitions =
    {
        BitPackage.Pack4bits(START,    3,ERROR,START,START,START,START,START), // 00-07
        BitPackage.Pack4bits(START,START,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR), // 08-0f
        BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME), // 10-17
        BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ITSME,ERROR,ERROR), // 18-1f
        BitPackage.Pack4bits(ERROR,    5,ERROR,ERROR,ERROR,    4,ERROR,ERROR), // 20-27
        BitPackage.Pack4bits(ERROR,ERROR,ERROR,    6,ITSME,ERROR,ITSME,ERROR), // 28-2f
        BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ITSME), // 30-37
        BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR), // 38-3f
        BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START,START)  // 40-47
    };

    // Ten entries, all zero; the count matches the literal 10 passed to the base constructor.
    private static readonly int[] s_charLengths = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

    /// <summary>
    /// Builds the model from the packed tables above and the ISO-2022-JP code page name.
    /// </summary>
    public Iso_2022_JP_SMModel()
        : base(
            Wrap4Bit(s_byteClasses),
            10,
            Wrap4Bit(s_transitions),
            s_charLengths,
            CodepageName.ISO_2022_JP)
    {
    }

    // Wraps an array of Pack4bits results in a BitPackage configured with the 4-bit constants.
    private static BitPackage Wrap4Bit(int[] packedUnits)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packedUnits);
    }
}
72 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Chinese/GB18030_SMModel.cs:
--------------------------------------------------------------------------------
1 | using UtfUnknown.Core.Models;
2 |
3 | namespace UtfUnknown.Core.Models.MultiByte.Chinese
4 | {
/// <summary>
/// State machine model registered under <c>CodepageName.GB18030</c>.
/// </summary>
/// <remarks>
/// All tables are stored as 4-bit units packed by <c>BitPackage.Pack4bits</c>.
/// START, ERROR and ITSME are constants inherited from the base hierarchy.
/// </remarks>
public class GB18030_SMModel : StateMachineModel
{
    // One packed entry per eight consecutive byte values; the trailing comment
    // on each row names the byte range it covers.
    private static readonly int[] s_byteClasses =
    {
        BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
        BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
        BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
        BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
        BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
        BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
        BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 30 - 37
        BitPackage.Pack4bits(3,3,1,1,1,1,1,1), // 38 - 3f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77
        BitPackage.Pack4bits(2,2,2,2,2,2,2,4), // 78 - 7f
        BitPackage.Pack4bits(5,6,6,6,6,6,6,6), // 80 - 87
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 88 - 8f
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 90 - 97
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 98 - 9f
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // a0 - a7
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // a8 - af
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // b0 - b7
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // b8 - bf
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // c0 - c7
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // c8 - cf
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d0 - d7
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d8 - df
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // e0 - e7
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // e8 - ef
        BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // f0 - f7
        BitPackage.Pack4bits(6,6,6,6,6,6,6,0)  // f8 - ff
    };

    // Packed transition entries; the trailing comment gives the index range of
    // the eight entries in that row (hexadecimal).
    private static readonly int[] s_transitions =
    {
        BitPackage.Pack4bits(ERROR,START,START,START,START,START,    3,ERROR), // 00-07
        BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ITSME), // 08-0f
        BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ERROR,ERROR,START), // 10-17
        BitPackage.Pack4bits(    4,ERROR,START,START,ERROR,ERROR,ERROR,ERROR), // 18-1f
        BitPackage.Pack4bits(ERROR,ERROR,    5,ERROR,ERROR,ERROR,ITSME,ERROR), // 20-27
        BitPackage.Pack4bits(ERROR,ERROR,START,START,START,START,START,START)  // 28-2f
    };

    // Strictly speaking a class-6 character may be 2 or 4 bytes long. The two
    // cases are not told apart here: the length is only consumed by frequency
    // analysis, which validates each code range itself, so 2 is a safe value.
    // Seven entries; the count matches the literal 7 passed to the base constructor.
    private static readonly int[] s_charLengths = { 0, 1, 1, 1, 1, 1, 2 };

    /// <summary>
    /// Builds the model from the packed tables above and the GB18030 code page name.
    /// </summary>
    public GB18030_SMModel()
        : base(
            Wrap4Bit(s_byteClasses),
            7,
            Wrap4Bit(s_transitions),
            s_charLengths,
            CodepageName.GB18030)
    {
    }

    // Wraps an array of Pack4bits results in a BitPackage configured with the 4-bit constants.
    private static BitPackage Wrap4Bit(int[] packedUnits)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packedUnits);
    }
}
73 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/TextEncoding.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.IO;
3 | using System.Text;
4 | using UtfUnknown;
5 |
6 | namespace EncodingChecker
7 | {
/// <summary>
/// Helpers for checking a byte buffer against an <see cref="Encoding"/> and for
/// detecting the encoding of a file on disk.
/// </summary>
public static class TextEncoding
{
    /// <summary>
    /// Fallback installed on the decoder used by <c>Validate</c> so that
    /// undecodable input raises <see cref="DecoderFallbackException"/>.
    /// </summary>
    private static readonly DecoderExceptionFallback DecoderExceptionFallback = new DecoderExceptionFallback();

    /// <summary>
    /// Reports whether a range of <paramref name="bytes"/> decodes under
    /// <paramref name="encoding"/> without triggering the exception fallback.
    /// Technique from https://netvignettes.wordpress.com/2011/07/03/how-to-detect-encoding/
    /// </summary>
    /// <param name="encoding">Encoding whose decoder is used for the check.</param>
    /// <param name="bytes">Buffer containing the bytes to check.</param>
    /// <param name="offset">Index of the first byte to check.</param>
    /// <param name="length">
    /// Number of bytes to check. If <c>null</c>, everything from
    /// <paramref name="offset"/> to the end of the buffer is checked.
    /// </param>
    /// <returns>
    /// <c>false</c> if decoding raised <see cref="DecoderFallbackException"/>; otherwise <c>true</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="encoding"/> or <paramref name="bytes"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="offset"/> or <paramref name="length"/> is negative or larger than the
    /// buffer, or the range they describe extends past the end of the buffer.
    /// </exception>
    public static bool Validate(this Encoding encoding, byte[] bytes, int offset = 0, int? length = null)
    {
        if (encoding == null)
        {
            throw new ArgumentNullException(nameof(encoding));
        }
        if (bytes == null)
        {
            throw new ArgumentNullException(nameof(bytes));
        }
        if (offset < 0 || offset > bytes.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(offset), @"Offset is out of range.");
        }

        // An omitted length means "the rest of the buffer". Defaulting to the full
        // buffer length would make every call with a non-zero offset fail the
        // range check below.
        int count = length ?? (bytes.Length - offset);
        if (count < 0 || count > bytes.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(length), @"Length is out of range.");
        }

        // Compared as a subtraction: offset + count could overflow int when both are large.
        if (count > bytes.Length - offset)
        {
            throw new ArgumentOutOfRangeException(nameof(offset), @"The specified range is outside of the specified buffer.");
        }

        Decoder decoder = encoding.GetDecoder();
        decoder.Fallback = DecoderExceptionFallback;
        try
        {
            // Only the exception matters; the character count itself is discarded.
            // NOTE(review): this overload takes no flush argument, so a truncated
            // multi-byte sequence at the very end of the range may go unreported —
            // confirm whether that is intended.
            decoder.GetCharCount(bytes, offset, count);
        }
        catch (DecoderFallbackException)
        {
            return false;
        }
        return true;
    }

    /// <summary>
    /// Get the System.Text.Encoding of this file.
    /// </summary>
    /// <param name="filePath">Path to file</param>
    /// <param name="hasBOM">
    /// Reset to <c>false</c> on entry; set from the detection result when the
    /// charset detector identifies the encoding.
    /// </param>
    /// <returns>System.Text.Encoding (can be null if not available or not supported by .NET).</returns>
    public static Encoding GetFileEncoding(string filePath, ref bool hasBOM)
    {
        return GetFileEncoding(filePath, null, ref hasBOM);
    }

    /// <summary>
    /// Get the System.Text.Encoding of this file.
    /// </summary>
    /// <param name="filePath">Path to file</param>
    /// <param name="maxBytesToRead">
    /// max bytes to read from <paramref name="filePath"/>. If <c>null</c>, then no max
    /// </param>
    /// <param name="hasBOM">
    /// Reset to <c>false</c> on entry; set from the detection result when the
    /// charset detector identifies the encoding. It stays <c>false</c> when the
    /// UTF-16 detector returns an encoding.
    /// </param>
    /// <returns>
    /// System.Text.Encoding (can be null if not available or not supported by .NET).
    /// Also <c>null</c> when opening or reading the file throws.
    /// </returns>
    public static Encoding GetFileEncoding(string filePath, int? maxBytesToRead, ref bool hasBOM)
    {
        hasBOM = false;
        try
        {
            using (FileStream stream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            {
                // Check for possible UTF-16 encoding (LE or BE).
                Encoding encoding = Utf16Detector.DetectFromStream(stream, maxBytesToRead);
                if (encoding != null)
                {
                    return encoding;
                }

                // https://github.com/CharsetDetector/UTF-unknown
                // Rewind so the second detector sees the file from the beginning.
                stream.Position = 0L;
                var result = CharsetDetector.DetectFromStream(stream, maxBytesToRead);
                if (result.Detected != null)
                {
                    hasBOM = result.Detected.HasBOM;
                    return result.Detected.Encoding;
                }
                return null;
            }
        }
        catch
        {
            // Deliberate best effort: any failure is reported to the caller as
            // "encoding unknown" rather than as an exception.
            return null;
        }
    }
}
96 | }
97 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/BitPackage.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Kohei TAKETA (Java port)
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
namespace UtfUnknown.Core
{
    /// <summary>
    /// Wraps an int array in which every 32-bit element stores several fixed-width
    /// units (4, 8 or 16 bits wide), with the first unit in the lowest bits.
    /// </summary>
    public class BitPackage
    {
        // Presets for the three supported unit widths. They are compile-time constants
        // so no caller can reassign them and silently change how every package is read.

        // Right shift that turns a unit index into an array index (8, 4 or 2 units per int).
        public const int INDEX_SHIFT_4BITS = 3;
        public const int INDEX_SHIFT_8BITS = 2;
        public const int INDEX_SHIFT_16BITS = 1;

        // Mask that extracts the unit's position inside its int.
        public const int SHIFT_MASK_4BITS = 7;
        public const int SHIFT_MASK_8BITS = 3;
        public const int SHIFT_MASK_16BITS = 1;

        // Left shift that turns the position inside the int into a bit offset.
        public const int BIT_SHIFT_4BITS = 2;
        public const int BIT_SHIFT_8BITS = 3;
        public const int BIT_SHIFT_16BITS = 4;

        // Mask that isolates a single unit once it has been shifted down.
        public const int UNIT_MASK_4BITS = 0x0000000F;
        public const int UNIT_MASK_8BITS = 0x000000FF;
        public const int UNIT_MASK_16BITS = 0x0000FFFF;

        private readonly int indexShift;
        private readonly int shiftMask;
        private readonly int bitShift;
        private readonly int unitMask;
        private readonly int[] data;

        /// <summary>
        /// Creates a package over <paramref name="data"/>. The four numeric arguments are
        /// normally the matching *_4BITS, *_8BITS or *_16BITS presets of this class.
        /// </summary>
        /// <param name="indexShift">Right shift applied to a unit index to get the array index.</param>
        /// <param name="shiftMask">Mask applied to a unit index to get its position inside the int.</param>
        /// <param name="bitShift">Left shift applied to that position to get a bit offset.</param>
        /// <param name="unitMask">Mask applied to the shifted int to isolate the unit.</param>
        /// <param name="data">Packed values; the array is referenced, not copied.</param>
        public BitPackage(int indexShift, int shiftMask,
            int bitShift, int unitMask, int[] data)
        {
            this.indexShift = indexShift;
            this.shiftMask = shiftMask;
            this.bitShift = bitShift;
            this.unitMask = unitMask;
            this.data = data;
        }

        /// <summary>
        /// Packs two 16-bit units into one int; <paramref name="a"/> goes in the low half.
        /// </summary>
        public static int Pack16bits(int a, int b)
        {
            return (b << 16) | a;
        }

        /// <summary>
        /// Packs four 8-bit units into one int; <paramref name="a"/> goes in the lowest byte.
        /// </summary>
        public static int Pack8bits(int a, int b, int c, int d)
        {
            return Pack16bits((b << 8) | a, (d << 8) | c);
        }

        /// <summary>
        /// Packs eight 4-bit units into one int; <paramref name="a"/> goes in the lowest nibble.
        /// </summary>
        public static int Pack4bits(int a, int b, int c, int d,
            int e, int f, int g, int h)
        {
            return Pack8bits((b << 4) | a, (d << 4) | c,
                (f << 4) | e, (h << 4) | g);
        }

        /// <summary>
        /// Returns the unit stored at unit index <paramref name="i"/>.
        /// </summary>
        public int Unpack(int i)
        {
            int word = data[i >> indexShift];
            int bitOffset = (i & shiftMask) << bitShift;
            // The shift is arithmetic, but the mask discards any sign-extended bits.
            return (word >> bitOffset) & unitMask;
        }
    }
}
99 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/ListViewExtensions.cs:
--------------------------------------------------------------------------------
1 | // https://stackoverflow.com/questions/254129/how-to-i-display-a-sort-arrow-in-the-header-of-a-list-view-column-using-c
2 | using System;
3 | using System.ComponentModel;
4 | using System.Runtime.InteropServices;
5 | using System.Windows.Forms;
6 |
namespace EncodingChecker
{
    /// <summary>
    /// Extension methods that show a sort arrow in a ListView column header by
    /// sending header-control messages through user32 SendMessage.
    /// </summary>
    [EditorBrowsable(EditorBrowsableState.Never)]
    public static class ListViewExtensions
    {
        /// <summary>
        /// Header item data exchanged with HDM_GETITEM / HDM_SETITEM.
        /// Field order matters: the struct is marshalled sequentially.
        /// </summary>
        [StructLayout(LayoutKind.Sequential)]
        public struct HDITEM
        {
            public Mask mask;
            public int cxy;
            [MarshalAs(UnmanagedType.LPTStr)] public string pszText;
            public IntPtr hbm;
            public int cchTextMax;
            public Format fmt;
            public IntPtr lParam;
            // _WIN32_IE >= 0x0300
            public int iImage;
            public int iOrder;
            // _WIN32_IE >= 0x0500
            public uint type;
            public IntPtr pvFilter;
            // _WIN32_WINNT >= 0x0600
            public uint state;

            /// <summary>Selects which HDITEM members a message reads or writes.</summary>
            [Flags]
            public enum Mask
            {
                Format = 0x4, // HDI_FORMAT
            }

            /// <summary>Format flags used here to draw the sort arrow.</summary>
            [Flags]
            public enum Format
            {
                SortDown = 0x200, // HDF_SORTDOWN
                SortUp = 0x400, // HDF_SORTUP
            }
        }

        public const int LVM_FIRST = 0x1000;
        public const int LVM_GETHEADER = LVM_FIRST + 31;

        public const int HDM_FIRST = 0x1200;
        public const int HDM_GETITEM = HDM_FIRST + 11;
        public const int HDM_SETITEM = HDM_FIRST + 12;

        /// <summary>SendMessage overload taking a plain pointer-sized lParam.</summary>
        [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)]
        public static extern IntPtr SendMessage(this IntPtr hWnd, UInt32 msg, IntPtr wParam, IntPtr lParam);

        /// <summary>SendMessage overload that passes an HDITEM by reference as lParam.</summary>
        [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)]
        public static extern IntPtr SendMessage(this IntPtr hWnd, UInt32 msg, IntPtr wParam, ref HDITEM lParam);

        /// <summary>
        /// Shows the sort arrow for <paramref name="order"/> on column
        /// <paramref name="columnIndex"/> and clears the arrow on every other column.
        /// With <see cref="SortOrder.None"/> the arrow is cleared on all columns.
        /// </summary>
        /// <param name="listViewControl">List view whose header is updated.</param>
        /// <param name="columnIndex">Index of the column that carries the arrow.</param>
        /// <param name="order">Sort direction to display.</param>
        /// <exception cref="Win32Exception">A HDM_GETITEM or HDM_SETITEM message returned zero.</exception>
        public static void SetSortIcon(this ListView listViewControl, int columnIndex, SortOrder order)
        {
            IntPtr header = SendMessage(listViewControl.Handle, LVM_GETHEADER, IntPtr.Zero, IntPtr.Zero);

            for (int i = 0; i < listViewControl.Columns.Count; i++)
            {
                IntPtr itemIndex = new IntPtr(i);
                HDITEM item = new HDITEM();
                item.mask = HDITEM.Mask.Format;

                // Read the current format so only the sort flags are touched.
                if (SendMessage(header, HDM_GETITEM, itemIndex, ref item) == IntPtr.Zero)
                {
                    throw new Win32Exception();
                }

                bool carriesArrow = i == columnIndex && order != SortOrder.None;
                if (!carriesArrow)
                {
                    item.fmt &= ~(HDITEM.Format.SortDown | HDITEM.Format.SortUp);
                }
                else if (order == SortOrder.Ascending)
                {
                    item.fmt = (item.fmt & ~HDITEM.Format.SortDown) | HDITEM.Format.SortUp;
                }
                else if (order == SortOrder.Descending)
                {
                    item.fmt = (item.fmt & ~HDITEM.Format.SortUp) | HDITEM.Format.SortDown;
                }

                if (SendMessage(header, HDM_SETITEM, itemIndex, ref item) == IntPtr.Zero)
                {
                    throw new Win32Exception();
                }
            }
        }
    }
}
101 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/DetectionDetail.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Runtime.CompilerServices;
4 | using System.Text;
5 | using UtfUnknown.Core;
6 | using UtfUnknown.Core.Probers;
7 |
8 | [assembly: InternalsVisibleTo("UtfUnknown.Tests, PublicKey=" +
9 | "002400000480000094000000060200000024000052534131000400000100010029f6b4defac763" +
10 | "66721687460b44b7619e8e19a411f785279316fdae2f6965edfa4a460304fe8b4ed796d5356a1c" +
11 | "225131b9087983d9ff9530df9307eab17d88cd4f1005a45f6f35523445d1ff7323322f3060cffc" +
12 | "0d70d0cb1b4b7d46081bbead31844927aaadb0508b64bf298de5abe5ea5cca8b92490c961b7b75" +
13 | "13c2c2a9")]
namespace UtfUnknown
{
    /// <summary>
    /// Detailed result of a detection
    /// </summary>
    public class DetectionDetail
    {
        /// <summary>
        /// A dictionary for replacing a codepage name that is unsupported in .NET with the nearly identical version.
        /// </summary>
        private static readonly Dictionary<string, string> FixedToSupportCodepageName =
            new Dictionary<string, string>
            {
                // CP949 is superset of ks_c_5601-1987 (see https://github.com/CharsetDetector/UTF-unknown/pull/74#issuecomment-550362133)
                {CodepageName.CP949, CodepageName.KS_C_5601_1987},
                {CodepageName.ISO_2022_CN, CodepageName.X_CP50227},
            };

        /// <summary>
        /// New result
        /// </summary>
        /// <param name="encodingShortName">Short name of the detected encoding; also used to look up <see cref="Encoding"/>.</param>
        /// <param name="confidence">Confidence of the detection.</param>
        /// <param name="prober">Prober that produced the result, if any.</param>
        /// <param name="time">Time spent, if measured.</param>
        /// <param name="statusLog">Status log text, if any.</param>
        public DetectionDetail(string encodingShortName, float confidence, CharsetProber prober = null,
            TimeSpan? time = null, string statusLog = null)
        {
            EncodingName = encodingShortName;
            Confidence = confidence;
            Encoding = GetEncoding(encodingShortName);
            Prober = prober;
            Time = time;
            StatusLog = statusLog;
        }

        /// <summary>
        /// New Result, filled from the prober's charset name, confidence and status dump.
        /// </summary>
        /// <param name="prober">Prober that produced the result.</param>
        /// <param name="time">Time spent, if measured.</param>
        public DetectionDetail(CharsetProber prober, TimeSpan? time = null)
            : this(prober.GetCharsetName(), prober.GetConfidence(), prober, time, prober.DumpStatus())
        {
        }

        /// <summary>
        /// The (short) name of the detected encoding. For full details, check <see cref="Encoding"/>
        /// </summary>
        public string EncodingName { get; }

        /// <summary>
        /// The detected encoding.
        /// </summary>
        public Encoding Encoding { get; set; }

        /// <summary>
        /// The confidence of the found encoding. Between 0 and 1.
        /// </summary>
        public float Confidence { get; set; }

        /// <summary>
        /// The used prober for detection
        /// </summary>
        public CharsetProber Prober { get; set; }

        /// <summary>
        /// A Byte Order Mark was detected
        /// </summary>
        public bool HasBOM { get; set; }

        /// <summary>
        /// The time spent
        /// </summary>
        public TimeSpan? Time { get; set; }

        /// <summary>
        /// Status log text; the prober-based constructor fills it from the prober's DumpStatus().
        /// </summary>
        public string StatusLog { get; set; }

        public override string ToString()
        {
            return $"Detected {EncodingName} with confidence of {Confidence}. (BOM: {HasBOM})";
        }

        /// <summary>
        /// Resolves a short encoding name to a <see cref="System.Text.Encoding"/>, first
        /// substituting a supported name from <see cref="FixedToSupportCodepageName"/> when one is listed.
        /// </summary>
        /// <param name="encodingShortName">Short name of the encoding.</param>
        /// <returns>
        /// The encoding. When <c>Encoding.GetEncoding</c> rejects the name, the result is null, or on
        /// the targets selected by the #if below whatever the code pages provider returns.
        /// </returns>
        internal static Encoding GetEncoding(string encodingShortName)
        {
            var encodingName = FixedToSupportCodepageName.TryGetValue(encodingShortName, out var supportCodepageName)
                ? supportCodepageName
                : encodingShortName;
            try
            {
                return Encoding.GetEncoding(encodingName);
            }
            catch (ArgumentException) // unsupported name
            {
#if NETSTANDARD && !NETSTANDARD1_0 || NETCOREAPP3_0
                return CodePagesEncodingProvider.Instance.GetEncoding(encodingName);
#else
                return null;
#endif
            }
        }
    }
}
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Bulgarian/Iso_8859_5_BulgarianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangBulgarianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Bulgarian
44 | {
45 | public class Iso_8859_5_BulgarianModel : BulgarianModel
46 | {
47 | // CTR: control characters that usually do not exist in any text
48 | // RET: line feed (0x0A) and carriage return (0x0D)
49 | // SYM: symbols (punctuation) that do not belong to a word
50 | // NUM: digits 0 - 9
51 | //
52 | // Character mapping table for ISO-8859-5: one entry per byte value, 16 per row.
53 | // This table is modified based on win1251BulgarianCharToOrderMap, so
54 | // only numbers < 64 are sure to be valid.
55 |
56 | private static byte[] CHAR_TO_ORDER_MAP = {
57 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
58 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
59 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
60 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
61 | SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, /* 4X */
62 | 110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, /* 5X */
63 | SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, /* 6X */
64 | 116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, /* 7X */
65 | 194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209, /* 8X */
66 | 210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225, /* 9X */
67 | 81,226,227,228,229,230,105,231,232,233,234,235,236, 45,237,238, /* AX */
68 | 31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, /* BX */
69 | 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56, /* CX */
70 | 1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, /* DX */
71 | 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16, /* EX */
72 | 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,NUM,SYM, /* FX */
73 | };
74 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
75 | // Passes the table above and the ISO-8859-5 code page name to the BulgarianModel constructor.
76 | public Iso_8859_5_BulgarianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_5)
77 | {
78 | }
79 | }
80 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Bulgarian/Windows_1251_BulgarianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangBulgarianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Bulgarian
44 | {
45 | public class Windows_1251_BulgarianModel : BulgarianModel
46 | {
47 | // CTR: control characters that usually do not exist in any text
48 | // RET: line feed (0x0A) and carriage return (0x0D)
49 | // SYM: symbols (punctuation) that do not belong to a word
50 | // NUM: digits 0 - 9
51 | //
52 | // Character mapping table for Windows-1251: one entry per byte value, 16 per row.
53 | // Original note: this table is modified based on win1251BulgarianCharToOrderMap, so
54 | // only numbers < 64 are sure to be valid. NOTE(review): wording matches the ISO-8859-5 model - confirm it applies here.
55 |
56 | private static byte[] CHAR_TO_ORDER_MAP = {
57 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
58 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
59 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
60 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
61 | SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, /* 4X */
62 | 110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, /* 5X */
63 | SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, /* 6X */
64 | 116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, /* 7X */
65 | 206,207,208,209,210,211,212,213,120,214,215,216,217,218,219,220, /* 8X */
66 | 221, 78, 64, 83,121, 98,117,105,ILL,223,224,225,226,227,228,229, /* 9X */
67 | 88,230,231,232,233,122, 89,106,234,235,236,237,238, 45,239,240, /* AX */
68 | 73, 80,118,114,241,242,243,244,245, 62, 58,246,247,248,249,250, /* BX */
69 | 31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, /* CX */
70 | 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,NUM, 60, 56, /* DX */
71 | 1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, /* EX */
72 | 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,SYM, 42, 16, /* FX */
73 | };
74 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
75 | // Passes the table above and the Windows-1251 code page name to the BulgarianModel constructor.
76 | public Windows_1251_BulgarianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1251)
77 | {
78 | }
79 | }
80 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Hebrew/Windows_1255_HebrewModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Shy Shalom
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | /*
40 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
41 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangHebrewModel.cpp
42 | * and adjusted to language specific support.
43 | */
44 |
45 | namespace UtfUnknown.Core.Models.SingleByte.Hebrew
46 | {
47 | public class Windows_1255_HebrewModel : HebrewModel
48 | {
49 | // CTR: control characters that usually do not exist in any text
50 | // RET: line feed (0x0A) and carriage return (0x0D)
51 | // SYM: symbols (punctuation) that do not belong to a word
52 | // NUM: digits 0 - 9
53 |
54 | // Windows-1255 language model
55 | // Character mapping table: one entry per byte value, 16 per row.
56 | private readonly static byte[]CHAR_TO_ORDER_MAP = {
57 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
58 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
59 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
60 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
61 | SYM, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85, /* 4X */
62 | 78,121, 86, 71, 67,102,107, 84,114,103,115,SYM,SYM,SYM,SYM,SYM, /* 5X */
63 | SYM, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49, /* 6X */
64 | 66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,SYM,SYM,SYM,SYM,SYM, /* 7X */
65 | 124,ILL,203,204,205, 40, 58,206,207,208,ILL,210,ILL,ILL,ILL,ILL, /* 8X */
66 | ILL, 83, 52, 47, 46, 72, 32, 94,216,113,ILL,109,ILL,ILL,ILL,ILL, /* 9X */
67 | 34,116,222,118,100,223,224,117,119,104,125,225,226, 87, 99,227, /* AX */
68 | 106,122,123,228, 55,229,230,101,231,232,120,233, 48, 39, 57,234, /* BX */
69 | 30, 59, 41, 88, 33, 37, 36, 31, 29, 35,235, 62, 28,236,126,237, /* CX */
70 | 238, 38, 45,239,240,241,242,243,127,ILL,ILL,ILL,ILL,ILL,ILL,ILL, /* DX */
71 | 9, 8, 20, 16, 3, 2, 24, 14, 22, 1, 25, 15, 4, 11, 6, 23, /* EX */
72 | 12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,ILL,ILL,128, 96,ILL, /* FX */
73 | };
74 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
75 | // Passes the table above and the Windows-1255 code page name to the HebrewModel constructor.
76 | public Windows_1255_HebrewModel() : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1255)
77 | {
78 | }
79 | }
80 | }
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Probers/MultiByte/UTF8Prober.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Shy Shalom
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | using System.Text;
40 |
41 | using UtfUnknown.Core.Models;
42 | using UtfUnknown.Core.Models.MultiByte;
43 |
namespace UtfUnknown.Core.Probers.MultiByte
{
    /// <summary>
    /// Prober for UTF-8: feeds bytes through a coding state machine built on
    /// <c>UTF8_SMModel</c> and counts the multi-byte characters it completes.
    /// </summary>
    public class UTF8Prober : CharsetProber
    {
        // Factor applied to the "not UTF-8" estimate for every multi-byte character seen.
        private static float ONE_CHAR_PROB = 0.50f;
        private CodingStateMachine codingSM;
        private int numOfMBChar;

        public UTF8Prober()
        {
            numOfMBChar = 0;
            codingSM = new CodingStateMachine(new UTF8_SMModel());
            Reset();
        }

        public override string GetCharsetName()
        {
            return CodepageName.UTF8;
        }

        /// <summary>
        /// Resets the state machine, clears the multi-byte counter and returns to the Detecting state.
        /// </summary>
        public override void Reset()
        {
            codingSM.Reset();
            numOfMBChar = 0;
            state = ProbingState.Detecting;
        }

        /// <summary>
        /// Feeds <paramref name="len"/> bytes of <paramref name="buf"/>, starting at
        /// <paramref name="offset"/>, to the state machine and updates the probing state.
        /// </summary>
        /// <returns>The probing state after the bytes have been processed.</returns>
        public override ProbingState HandleData(byte[] buf, int offset, int len)
        {
            int end = offset + len;

            for (int pos = offset; pos < end; pos++)
            {
                var smState = codingSM.NextState(buf[pos]);

                if (smState == StateMachineModel.ERROR)
                {
                    // The state machine rejected the sequence: stop feeding.
                    state = ProbingState.NotMe;
                    break;
                }
                else if (smState == StateMachineModel.ITSME)
                {
                    state = ProbingState.FoundIt;
                    break;
                }
                else if (smState == StateMachineModel.START && codingSM.CurrentCharLen >= 2)
                {
                    // Back at START after a character of two or more bytes.
                    numOfMBChar++;
                }
            }

            // Still undecided: settle early once the confidence passes the shortcut threshold.
            if (state == ProbingState.Detecting && GetConfidence() > SHORTCUT_THRESHOLD)
            {
                state = ProbingState.FoundIt;
            }

            return state;
        }

        /// <summary>
        /// Confidence derived from the number of multi-byte characters seen: 0.99 for six
        /// or more, otherwise 1 - 0.99 * ONE_CHAR_PROB^count.
        /// <paramref name="status"/> is not used by this implementation.
        /// </summary>
        public override float GetConfidence(StringBuilder status = null)
        {
            if (numOfMBChar >= 6)
            {
                return 0.99f;
            }

            float unlike = 0.99f;
            for (int remaining = numOfMBChar; remaining > 0; remaining--)
            {
                unlike *= ONE_CHAR_PROB;
            }

            float confidence = 1.0f - unlike;
            return confidence;
        }
    }
}
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SequenceModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Shy Shalom
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | using System;
40 |
namespace UtfUnknown.Core.Models
{
    /// <summary>
    /// Base class for models that hold a character-to-order table and a
    /// precedence matrix for two-character sequences.
    /// </summary>
    public abstract class SequenceModel
    {
        // Codepoints

        /// <summary>Illegal codepoints</summary>
        public const byte ILL = 255;

        /// <summary>Control character</summary>
        public const byte CTR = 254;

        /// <summary>Symbols and punctuation that do not belong to words</summary>
        public const byte SYM = 253;

        /// <summary>Return/Line feeds</summary>
        public const byte RET = 252;

        /// <summary>Numbers 0-9</summary>
        public const byte NUM = 251;

        // [256] table used to find a char's order
        protected byte[] charToOrderMap;

        // freqCharCount x freqCharCount table to find a 2-char sequence's
        // frequency
        protected byte[] precedenceMatrix;

        // The count of frequent characters
        protected int freqCharCount;

        // freqSeqs / totalSeqs
        protected float typicalPositiveRatio;

        /// <summary>
        /// TODO not used?
        /// </summary>
        protected bool keepEnglishLetter;

        protected string charsetName;

        /// <summary>
        /// Stores the given tables and parameters; the arrays are referenced, not copied.
        /// </summary>
        public SequenceModel(
            byte[] charToOrderMap,
            byte[] precedenceMatrix,
            int freqCharCount,
            float typicalPositiveRatio,
            bool keepEnglishLetter,
            string charsetName)
        {
            this.charsetName = charsetName;
            this.keepEnglishLetter = keepEnglishLetter;
            this.typicalPositiveRatio = typicalPositiveRatio;
            this.freqCharCount = freqCharCount;
            this.precedenceMatrix = precedenceMatrix;
            this.charToOrderMap = charToOrderMap;
        }

        /// <summary>The count of frequent characters.</summary>
        public int FreqCharCount => freqCharCount;

        /// <summary>freqSeqs / totalSeqs</summary>
        public float TypicalPositiveRatio => typicalPositiveRatio;

        /// <summary>
        /// TODO not used?
        /// </summary>
        public bool KeepEnglishLetter => keepEnglishLetter;

        /// <summary>Charset name passed to the constructor.</summary>
        public string CharsetName => charsetName;

        /// <summary>Returns the order stored for byte <paramref name="b"/>.</summary>
        public byte GetOrder(byte b) => charToOrderMap[b];

        /// <summary>Returns the precedence matrix entry at <paramref name="pos"/>.</summary>
        public byte GetPrecedence(int pos) => precedenceMatrix[pos];
    }
}
126 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Probers/MultiByte/Chinese/EUCTWProber.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Shy Shalom
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | using System.Text;
40 |
41 | using UtfUnknown.Core.Analyzers.Chinese;
42 | using UtfUnknown.Core.Models;
43 | using UtfUnknown.Core.Models.MultiByte.Chinese;
44 |
45 | namespace UtfUnknown.Core.Probers.MultiByte.Chinese
46 | {
/// <summary>
/// Charset prober that reports <see cref="CodepageName.EUC_TW"/>. It drives
/// a coding state machine built from <see cref="EUCTWSMModel"/> and feeds
/// each character to an <see cref="EUCTWDistributionAnalyser"/>, whose
/// confidence is the prober's confidence.
/// </summary>
public class EUCTWProber : CharsetProber
{
    private readonly CodingStateMachine codingSM;
    private readonly EUCTWDistributionAnalyser distributionAnalyser;

    // lastChar[0] keeps the final byte of the previous HandleData call and
    // lastChar[1] receives the first byte of the current one, so a character
    // split across two buffers can be handed to the analyser as one unit.
    private readonly byte[] lastChar = new byte[2];

    /// <summary>Creates the state machine and analyser, then resets the prober.</summary>
    public EUCTWProber()
    {
        codingSM = new CodingStateMachine(new EUCTWSMModel());
        distributionAnalyser = new EUCTWDistributionAnalyser();
        Reset();
    }

    /// <summary>
    /// Feeds the bytes <c>buf[offset .. offset + len - 1]</c> to the prober.
    /// </summary>
    /// <param name="buf">Buffer holding the data to examine.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine.</param>
    /// <returns>The probing state after the bytes were processed.</returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int max = offset + len;

        // Start at offset, not 0: bytes before the window are not part of
        // this call, and starting at 0 would also make the "i - 1" index
        // below negative for the first byte.
        for (int i = offset; i < max; i++)
        {
            int codingState = codingSM.NextState(buf[i]);

            if (codingState == StateMachineModel.ERROR)
            {
                state = ProbingState.NotMe;
                break;
            }

            if (codingState == StateMachineModel.ITSME)
            {
                state = ProbingState.FoundIt;
                break;
            }

            // Back in START is treated as the end of a character: pass it,
            // with the length reported by the state machine, to the analyser.
            if (codingState == StateMachineModel.START)
            {
                int charLen = codingSM.CurrentCharLen;
                if (i == offset)
                {
                    // The character began in the previous buffer; rebuild it
                    // from the saved byte plus the first byte of this one.
                    lastChar[1] = buf[offset];
                    distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                }
                else
                {
                    distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
                }
            }
        }

        // Remember the last byte for the next call. Skipped for an empty
        // window, where buf[max - 1] lies outside the data (or the array).
        if (len > 0)
        {
            lastChar[0] = buf[max - 1];
        }

        if (state == ProbingState.Detecting
            && distributionAnalyser.GotEnoughData()
            && GetConfidence() > SHORTCUT_THRESHOLD)
        {
            state = ProbingState.FoundIt;
        }

        return state;
    }

    /// <summary>Returns <see cref="CodepageName.EUC_TW"/>.</summary>
    public override string GetCharsetName()
    {
        return CodepageName.EUC_TW;
    }

    /// <summary>
    /// Resets the state machine and the analyser and puts the prober back
    /// into <see cref="ProbingState.Detecting"/>.
    /// </summary>
    public override void Reset()
    {
        codingSM.Reset();
        state = ProbingState.Detecting;
        distributionAnalyser.Reset();
    }

    /// <summary>
    /// Returns the distribution analyser's confidence.
    /// </summary>
    /// <param name="status">Not used by this prober.</param>
    public override float GetConfidence(StringBuilder status = null)
    {
        return distributionAnalyser.GetConfidence();
    }
}
121 | }
122 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Czech/Ibm852_CzechModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCzechModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Czech
44 | {
/// <summary>
/// Czech model for the IBM852 code page: passes the byte-to-order table
/// below and <see cref="CodepageName.IBM852"/> to <see cref="CzechModel"/>.
/// </summary>
public class Ibm852_CzechModel : CzechModel
{
    // Generated by BuildLangModel.py
    // On: 2016-09-21 03:28:11.733089

    // Character Mapping Table:
    // ILL: illegal character.
    // CTR: control character specific to the charset.
    // RET: carriage return / line feed.
    // SYM: symbol (punctuation) that does not belong to a word.
    // NUM: 0 - 9.

    // Other characters are ordered by probabilities
    // (0 is the most common character in the language).

    // Orders are generic to a language. So the codepoint with order X in
    // CHARSET1 maps to the same character as the codepoint with the same
    // order X in CHARSET2 for the same language.
    // As such, some orders may be missing from a table. For instance the
    // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
    // even though they are both used for French. Same for the euro sign.

    // The field is never reassigned, so it is readonly.
    private static readonly byte[] CHAR_TO_ORDER_MAP = {
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
        NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
        SYM,  3, 22, 14, 15,  1, 30, 32, 17,  4, 21, 12, 10, 16,  2,  0, /* 4X */
          8, 40,  9,  6,  5, 13,  7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */
        SYM,  3, 22, 14, 15,  1, 30, 32, 17,  4, 21, 12, 10, 16,  2,  0, /* 6X */
          8, 40,  9,  6,  5, 13,  7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */
        139, 43, 24,140, 42, 31,141,142,143,144,145,146,147,148, 42,149, /* 8X */
         24,150,151,152, 41, 45, 45, 46, 46, 41, 43, 38, 38,153,SYM, 25, /* 9X */
         18, 11, 37, 33,154,155, 26, 26,156,157,SYM,158, 25,159,SYM,SYM, /* AX */
        SYM,SYM,SYM,SYM,SYM, 18,160, 23,161,SYM,SYM,SYM,SYM,162,163,SYM, /* BX */
        SYM,SYM,SYM,SYM,SYM,SYM,164,165,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
        166,167, 39,168, 39, 35, 11,169, 23,SYM,SYM,SYM,SYM,170, 31,SYM, /* DX */
         37,171,172,173,174, 35, 29, 29,175, 33,176,177, 28, 28,178,SYM, /* EX */
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,179, 27, 27,SYM,SYM, /* FX */
    };
    /*X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

    /// <summary>Creates the model with the IBM852 table and charset name.</summary>
    public Ibm852_CzechModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
    {
    }
}
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Polish/Ibm852_PolishModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangPolishModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Polish
44 | {
/// <summary>
/// Polish model for the IBM852 code page: passes the byte-to-order table
/// below and <see cref="CodepageName.IBM852"/> to <see cref="PolishModel"/>.
/// </summary>
public class Ibm852_PolishModel : PolishModel
{
    // Generated by BuildLangModel.py
    // On: 2016-09-21 17:21:04.405363

    // Character Mapping Table:
    // ILL: illegal character.
    // CTR: control character specific to the charset.
    // RET: carriage return / line feed.
    // SYM: symbol (punctuation) that does not belong to a word.
    // NUM: 0 - 9.

    // Other characters are ordered by probabilities
    // (0 is the most common character in the language).

    // Orders are generic to a language. So the codepoint with order X in
    // CHARSET1 maps to the same character as the codepoint with the same
    // order X in CHARSET2 for the same language.
    // As such, some orders may be missing from a table. For instance the
    // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
    // even though they are both used for French. Same for the euro sign.

    // The field is never reassigned, so it is readonly.
    private static readonly byte[] CHAR_TO_ORDER_MAP = {
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
        NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
        SYM,  0, 20, 11, 14,  3, 26, 21, 22,  1, 18,  7, 15, 16,  5,  2, /* 4X */
         13, 36,  4,  6, 10, 17, 31,  9, 33, 12,  8,SYM,SYM,SYM,SYM,SYM, /* 5X */
        SYM,  0, 20, 11, 14,  3, 26, 21, 22,  1, 18,  7, 15, 16,  5,  2, /* 6X */
         13, 36,  4,  6, 10, 17, 31,  9, 33, 12,  8,SYM,SYM,SYM,SYM,CTR, /* 7X */
         47, 39, 34, 54, 40, 78, 30, 47, 19, 58, 49, 49, 77, 32, 40, 30, /* 8X */
         34, 79, 80, 55, 38, 74, 74, 28, 28, 38, 39, 76, 76, 19,SYM, 44, /* 9X */
         35, 37, 24, 51, 25, 25, 45, 45, 23, 23,SYM, 32, 44, 56,SYM,SYM, /* AX */
        SYM,SYM,SYM,SYM,SYM, 35, 54, 46, 56,SYM,SYM,SYM,SYM, 27, 27,SYM, /* BX */
        SYM,SYM,SYM,SYM,SYM,SYM, 53, 53,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
         70, 70, 69, 58, 69, 81, 37, 77, 46,SYM,SYM,SYM,SYM, 65, 82,SYM, /* DX */
         24, 57, 55, 29, 29, 83, 41, 41, 84, 51, 85, 86, 60, 60, 65,SYM, /* EX */
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 87, 50, 50,SYM,SYM, /* FX */
    };
    /*X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

    /// <summary>Creates the model with the IBM852 table and charset name.</summary>
    public Ibm852_PolishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
    {
    }
}
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Slovak/Ibm852_SlovakModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangSlovakModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Slovak
44 | {
/// <summary>
/// Slovak model for the IBM852 code page: passes the byte-to-order table
/// below and <see cref="CodepageName.IBM852"/> to <see cref="SlovakModel"/>.
/// </summary>
public class Ibm852_SlovakModel : SlovakModel
{
    // Generated by BuildLangModel.py
    // On: 2016-09-21 13:33:10.331339

    // Character Mapping Table:
    // ILL: illegal character.
    // CTR: control character specific to the charset.
    // RET: carriage return / line feed.
    // SYM: symbol (punctuation) that does not belong to a word.
    // NUM: 0 - 9.

    // Other characters are ordered by probabilities
    // (0 is the most common character in the language).

    // Orders are generic to a language. So the codepoint with order X in
    // CHARSET1 maps to the same character as the codepoint with the same
    // order X in CHARSET2 for the same language.
    // As such, some orders may be missing from a table. For instance the
    // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
    // even though they are both used for French. Same for the euro sign.

    // The field is never reassigned, so it is readonly.
    private static readonly byte[] CHAR_TO_ORDER_MAP = {
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
        NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
        SYM,  1, 20, 15, 11,  2, 29, 30, 17,  4, 18,  7, 10, 12,  3,  0, /* 4X */
         13, 40,  6,  8,  5, 14,  9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */
        SYM,  1, 20, 15, 11,  2, 29, 30, 17,  4, 18,  7, 10, 12,  3,  0, /* 6X */
         13, 40,  6,  8,  5, 14,  9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */
         51, 46, 25, 62, 38, 48, 47, 51, 49, 54, 50, 50, 63, 64, 38, 47, /* 8X */
         25, 42, 42, 32, 43, 33, 33, 65, 66, 43, 46, 31, 31, 49,SYM, 24, /* 9X */
         21, 23, 35, 27, 67, 68, 26, 26, 69, 70,SYM, 71, 24, 59,SYM,SYM, /* AX */
        SYM,SYM,SYM,SYM,SYM, 21, 72, 41, 59,SYM,SYM,SYM,SYM, 61, 61,SYM, /* BX */
        SYM,SYM,SYM,SYM,SYM,SYM, 56, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
         55, 55, 39, 54, 39, 36, 23, 73, 41,SYM,SYM,SYM,SYM, 74, 48,SYM, /* DX */
         35, 58, 32, 52, 52, 36, 28, 28, 44, 27, 44, 60, 22, 22, 75,SYM, /* EX */
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 60, 45, 45,SYM,SYM, /* FX */
    };
    /*X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

    /// <summary>Creates the model with the IBM852 table and charset name.</summary>
    public Ibm852_SlovakModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
    {
    }
}
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Czech/Iso_8859_2_CzechModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCzechModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Czech
44 | {
/// <summary>
/// Czech model for ISO-8859-2: passes the byte-to-order table below and
/// <see cref="CodepageName.ISO_8859_2"/> to <see cref="CzechModel"/>.
/// </summary>
public class Iso_8859_2_CzechModel : CzechModel
{
    // Generated by BuildLangModel.py
    // On: 2016-09-21 03:28:11.733089

    // Character Mapping Table:
    // ILL: illegal character.
    // CTR: control character specific to the charset.
    // RET: carriage return / line feed.
    // SYM: symbol (punctuation) that does not belong to a word.
    // NUM: 0 - 9.

    // Other characters are ordered by probabilities
    // (0 is the most common character in the language).

    // Orders are generic to a language. So the codepoint with order X in
    // CHARSET1 maps to the same character as the codepoint with the same
    // order X in CHARSET2 for the same language.
    // As such, some orders may be missing from a table. For instance the
    // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
    // even though they are both used for French. Same for the euro sign.

    // The field is never reassigned, so it is readonly.
    private static readonly byte[] CHAR_TO_ORDER_MAP = {
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
        NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
        SYM,  3, 22, 14, 15,  1, 30, 32, 17,  4, 21, 12, 10, 16,  2,  0, /* 4X */
          8, 40,  9,  6,  5, 13,  7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */
        SYM,  3, 22, 14, 15,  1, 30, 32, 17,  4, 21, 12, 10, 16,  2,  0, /* 6X */
          8, 40,  9,  6,  5, 13,  7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
        SYM,180,SYM,181,SYM, 45, 46,SYM,SYM, 29,182, 38,183,SYM, 26,184, /* AX */
        SYM,185,SYM,186,SYM, 45, 46,SYM,SYM, 29,187, 38,188,SYM, 26,189, /* BX */
        190, 18,191,192, 42,193,194,195, 25, 24,196,197, 23, 11,198, 39, /* CX */
        199,200, 35, 37,201,202, 41,SYM, 27, 31, 33,203, 43, 28,204,205, /* DX */
        206, 18,207,208, 42,209,210,211, 25, 24,212,213, 23, 11,214, 39, /* EX */
        215,216, 35, 37,217,218, 41,SYM, 27, 31, 33,219, 43, 28,220,SYM, /* FX */
    };
    /*X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

    /// <summary>Creates the model with the ISO-8859-2 table and charset name.</summary>
    public Iso_8859_2_CzechModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_2)
    {
    }
}
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Irish/Iso_8859_1_IrishModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangIrishModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Irish
44 | {
/// <summary>
/// Irish model for ISO-8859-1: passes the byte-to-order table below and
/// <see cref="CodepageName.ISO_8859_1"/> to <see cref="IrishModel"/>.
/// </summary>
public class Iso_8859_1_IrishModel : IrishModel
{
    // Generated by BuildLangModel.py
    // On: 2016-09-27 00:33:40.158624

    // Character Mapping Table:
    // ILL: illegal character.
    // CTR: control character specific to the charset.
    // RET: carriage return / line feed.
    // SYM: symbol (punctuation) that does not belong to a word.
    // NUM: 0 - 9.

    // Other characters are ordered by probabilities
    // (0 is the most common character in the language).

    // Orders are generic to a language. So the codepoint with order X in
    // CHARSET1 maps to the same character as the codepoint with the same
    // order X in CHARSET2 for the same language.
    // As such, some orders may be missing from a table. For instance the
    // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
    // even though they are both used for French. Same for the euro sign.

    // The field is never reassigned, so it is readonly.
    private static readonly byte[] CHAR_TO_ORDER_MAP = {
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
        NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
        SYM,  0, 16,  8, 11,  5, 19, 12,  3,  1, 27, 25,  9, 13,  2, 10, /* 4X */
         21, 30,  4,  6,  7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */
        SYM,  0, 16,  8, 11,  5, 19, 12,  3,  1, 27, 25,  9, 13,  2, 10, /* 6X */
         21, 30,  4,  6,  7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
        CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
        SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
        SYM,SYM,SYM,SYM,SYM, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
         45, 14, 46, 47, 33, 48, 49, 39, 35, 18, 42, 37, 50, 17, 51, 40, /* CX */
         52, 32, 43, 22, 53, 54, 38,SYM, 36, 55, 20, 56, 31, 57, 58, 59, /* DX */
         60, 14, 61, 62, 33, 63, 64, 39, 35, 18, 42, 37, 65, 17, 66, 40, /* EX */
         67, 32, 43, 22, 68, 69, 38,SYM, 36, 70, 20, 71, 31, 72, 73, 74, /* FX */
    };
    /*X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

    /// <summary>Creates the model with the ISO-8859-1 table and charset name.</summary>
    public Iso_8859_1_IrishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_1)
    {
    }
}
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Irish/Iso_8859_9_IrishModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangIrishModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Irish
44 | {
45 |     /// <summary>Irish language model for ISO-8859-9: maps each byte value to a character class or frequency order.</summary>
46 |     public class Iso_8859_9_IrishModel : IrishModel
47 |     {
48 |         // Generated by BuildLangModel.py
49 |         // On: 2016-09-27 00:33:40.158624
50 |         //
51 |         // Character Mapping Table:
52 |         // ILL: illegal character.
53 |         // CTR: control character specific to the charset.
54 |         // RET: carriage return or line feed.
55 |         // SYM: symbol (punctuation) that does not belong to word.
56 |         // NUM: 0 - 9.
57 |         //
58 |         // Other characters are ordered by probabilities
59 |         // (0 is the most common character in the language).
60 |         //
61 |         // Orders are generic to a language. So the codepoint with order X in
62 |         // CHARSET1 maps to the same character as the codepoint with the same
63 |         // order X in CHARSET2 for the same language.
64 |         // As such, it is possible to get missing order. For instance the
65 |         // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
66 |         // even though they are both used for French. Same for the euro sign.
67 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
68 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
69 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
70 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
71 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
72 |             SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */
73 |             21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */
74 |             SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */
75 |             21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */
76 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
77 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
78 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
79 |             SYM,SYM,SYM,SYM,SYM,148,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
80 |             149, 14,150,151, 33,152,153, 39, 35, 18, 42, 37,154, 17,155, 40, /* CX */
81 |             156, 32, 43, 22,157,158, 38,SYM, 36,159, 20,160, 31,161,162,163, /* DX */
82 |             164, 14,165,166, 33,167,168, 39, 35, 18, 42, 37,169, 17,170, 40, /* EX */
83 |             171, 32, 43, 22,172,173, 38,SYM, 36,174, 20,175, 31, 41,176,177, /* FX */
84 |         };
85 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
86 |
87 |         public Iso_8859_9_IrishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_9)
88 |         {
89 |         }
90 |     }
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Slovene/Ibm852_SloveneModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangSloveneModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Slovene
44 | {
45 |     /// <summary>Slovene language model for IBM852: maps each byte value to a character class or frequency order.</summary>
46 |     public class Ibm852_SloveneModel : SloveneModel
47 |     {
48 |         // Generated by BuildLangModel.py
49 |         // On: 2016-09-28 22:06:46.134717
50 |         //
51 |         // Character Mapping Table:
52 |         // ILL: illegal character.
53 |         // CTR: control character specific to the charset.
54 |         // RET: carriage return or line feed.
55 |         // SYM: symbol (punctuation) that does not belong to word.
56 |         // NUM: 0 - 9.
57 |         //
58 |         // Other characters are ordered by probabilities
59 |         // (0 is the most common character in the language).
60 |         //
61 |         // Orders are generic to a language. So the codepoint with order X in
62 |         // CHARSET1 maps to the same character as the codepoint with the same
63 |         // order X in CHARSET2 for the same language.
64 |         // As such, it is possible to get missing order. For instance the
65 |         // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
66 |         // even though they are both used for French. Same for the euro sign.
67 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
68 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
69 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
70 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
71 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
72 |             SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 4X */
73 |             11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,SYM, /* 5X */
74 |             SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 6X */
75 |             11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,CTR, /* 7X */
76 |             34,249, 29,249,249,249, 37, 34,249, 36,249,249,249,249,249, 37, /* 8X */
77 |             29,249,249, 35,249,249,249,249,249,249,249,249,249,249,SYM, 21, /* 9X */
78 |             32, 30, 31, 39,249,249, 23, 23,249,249,SYM,249, 21,249,SYM,SYM, /* AX */
79 |             SYM,SYM,SYM,SYM,SYM, 32,249,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* BX */
80 |             SYM,SYM,SYM,SYM,SYM,SYM,249,249,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
81 |             249,249,249, 36,249,249, 30,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* DX */
82 |             31,249, 35,249,249,249, 22, 22,249, 39,249,249, 40, 40,249,SYM, /* EX */
83 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,249,249,249,SYM,SYM, /* FX */
84 |         };
85 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
86 |
87 |         public Ibm852_SloveneModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
88 |         {
89 |         }
90 |     }
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Probers/MultiByte/Chinese/Big5Prober.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Universal charset detector code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 2001
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | * Shy Shalom
23 | * Rudi Pettazzi (C# port)
24 | *
25 | * Alternatively, the contents of this file may be used under the terms of
26 | * either the GNU General Public License Version 2 or later (the "GPL"), or
27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
29 | * of those above. If you wish to allow use of your version of this file only
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
31 | * use your version of this file under the terms of the MPL, indicate your
32 | * decision by deleting the provisions above and replace them with the notice
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
34 | * the provisions above, a recipient may use your version of this file under
35 | * the terms of any one of the MPL, the GPL or the LGPL.
36 | *
37 | * ***** END LICENSE BLOCK ***** */
38 |
39 | using System.Text;
40 |
41 | using UtfUnknown.Core.Analyzers.Chinese;
42 | using UtfUnknown.Core.Models;
43 | using UtfUnknown.Core.Models.MultiByte.Chinese;
44 |
45 | namespace UtfUnknown.Core.Probers.MultiByte.Chinese
46 | {
47 |     /// <summary>
48 |     /// Charset prober for Big5. Each byte is run through a coding state machine built
49 |     /// from <see cref="BIG5SMModel"/>, and characters the state machine completes are
50 |     /// handed to a <see cref="BIG5DistributionAnalyser"/> whose confidence is reported.
51 |     /// </summary>
52 |     public class Big5Prober : CharsetProber
53 |     {
54 |         private readonly CodingStateMachine codingSM;
55 |         private readonly BIG5DistributionAnalyser distributionAnalyser;
56 |
57 |         // lastChar[0] holds the final byte of the previous segment and lastChar[1] the
58 |         // first byte of the current one, so a character that ends on the first byte of
59 |         // a segment can still be passed to the analyser.
60 |         private readonly byte[] lastChar = new byte[2];
61 |
62 |         /// <summary>
63 |         /// Creates the state machine and the distribution analyser, then resets the prober.
64 |         /// </summary>
65 |         public Big5Prober()
66 |         {
67 |             codingSM = new CodingStateMachine(new BIG5SMModel());
68 |             distributionAnalyser = new BIG5DistributionAnalyser();
69 |             Reset();
70 |         }
71 |
72 |         /// <summary>
73 |         /// Feeds a segment of <paramref name="buf"/> to the prober.
74 |         /// </summary>
75 |         /// <param name="buf">Buffer containing the bytes to examine.</param>
76 |         /// <param name="offset">Index of the first byte to examine.</param>
77 |         /// <param name="len">Number of bytes to examine.</param>
78 |         /// <returns>The probing state after the segment has been processed.</returns>
79 |         public override ProbingState HandleData(byte[] buf, int offset, int len)
80 |         {
81 |             // An empty segment carries no information. Without this guard the
82 |             // buf[max - 1] read below would touch the byte before the segment, or throw
83 |             // IndexOutOfRangeException when offset is 0.
84 |             if (len <= 0)
85 |             {
86 |                 return state;
87 |             }
88 |
89 |             int max = offset + len;
90 |
91 |             for (int i = offset; i < max; i++)
92 |             {
93 |                 var codingState = codingSM.NextState(buf[i]);
94 |                 if (codingState == StateMachineModel.ERROR)
95 |                 {
96 |                     state = ProbingState.NotMe;
97 |                     break;
98 |                 }
99 |                 if (codingState == StateMachineModel.ITSME)
100 |                 {
101 |                     state = ProbingState.FoundIt;
102 |                     break;
103 |                 }
104 |                 if (codingState == StateMachineModel.START)
105 |                 {
106 |                     // Back at START: hand the character that just ended to the analyser.
107 |                     int charLen = codingSM.CurrentCharLen;
108 |                     if (i == offset)
109 |                     {
110 |                         // The character may have begun in the previous segment, so it
111 |                         // is analysed from the saved byte plus the first byte of this one.
112 |                         lastChar[1] = buf[offset];
113 |                         distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
114 |                     }
115 |                     else
116 |                     {
117 |                         distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
118 |                     }
119 |                 }
120 |             }
121 |
122 |             // Remember the last byte of this segment for the next call.
123 |             lastChar[0] = buf[max - 1];
124 |
125 |             // Stop early once the analyser has enough data and is confident enough.
126 |             if (state == ProbingState.Detecting
127 |                 && distributionAnalyser.GotEnoughData()
128 |                 && GetConfidence() > SHORTCUT_THRESHOLD)
129 |             {
130 |                 state = ProbingState.FoundIt;
131 |             }
132 |
133 |             return state;
134 |         }
135 |
136 |         /// <summary>
137 |         /// Resets the state machine and the distribution analyser and puts the prober
138 |         /// back into the <see cref="ProbingState.Detecting"/> state.
139 |         /// </summary>
140 |         public override void Reset()
141 |         {
142 |             codingSM.Reset();
143 |             state = ProbingState.Detecting;
144 |             distributionAnalyser.Reset();
145 |         }
146 |
147 |         /// <summary>Returns the name of the charset this prober detects.</summary>
148 |         public override string GetCharsetName()
149 |         {
150 |             return CodepageName.BIG5;
151 |         }
152 |
153 |         /// <summary>Returns the confidence reported by the distribution analyser.</summary>
154 |         /// <param name="status">Not used by this prober.</param>
155 |         public override float GetConfidence(StringBuilder status = null)
156 |         {
157 |             return distributionAnalyser.GetConfidence();
158 |         }
159 |     }
160 | }
161 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Croatian/Ibm852_CroatianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCroatianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Croatian
44 | {
45 |     /// <summary>Croatian language model for IBM852: maps each byte value to a character class or frequency order.</summary>
46 |     public class Ibm852_CroatianModel : CroatianModel
47 |     {
48 |         // Generated by BuildLangModel.py
49 |         // On: 2016-09-25 23:50:27.590137
50 |         //
51 |         // Character Mapping Table:
52 |         // ILL: illegal character.
53 |         // CTR: control character specific to the charset.
54 |         // RET: carriage return or line feed.
55 |         // SYM: symbol (punctuation) that does not belong to word.
56 |         // NUM: 0 - 9.
57 |         //
58 |         // Other characters are ordered by probabilities
59 |         // (0 is the most common character in the language).
60 |         //
61 |         // Orders are generic to a language. So the codepoint with order X in
62 |         // CHARSET1 maps to the same character as the codepoint with the same
63 |         // order X in CHARSET2 for the same language.
64 |         // As such, it is possible to get missing order. For instance the
65 |         // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
66 |         // even though they are both used for French. Same for the euro sign.
67 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
68 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
69 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
70 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
71 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
72 |             SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */
73 |             14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */
74 |             SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */
75 |             14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */
76 |             39, 33, 31, 43, 36,249, 25, 39, 40, 47,249,249,249,249, 36, 25, /* 8X */
77 |             31,249,249,249, 32,249,249,249,249, 32, 33,249,249, 40,SYM, 18, /* 9X */
78 |             41,249, 44, 48,249,249, 24, 24,249,249,SYM,249, 18,249,SYM,SYM, /* AX */
79 |             SYM,SYM,SYM,SYM,SYM, 41, 43,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* BX */
80 |             SYM,SYM,SYM,SYM,SYM,SYM,249,249,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
81 |             26, 26,249, 47,249,249,249,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* DX */
82 |             44,249,249,249,249,249, 23, 23,249, 48,249,249,249,249,249,SYM, /* EX */
83 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,249,249,249,SYM,SYM, /* FX */
84 |         };
85 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
86 |
87 |         public Ibm852_CroatianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
88 |         {
89 |         }
90 |     }
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Irish/Iso_8859_15_IrishModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangIrishModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Irish
44 | {
45 |     /// <summary>Irish language model for ISO-8859-15: maps each byte value to a character class or frequency order.</summary>
46 |     public class Iso_8859_15_IrishModel : IrishModel
47 |     {
48 |         // Generated by BuildLangModel.py
49 |         // On: 2016-09-27 00:33:40.158624
50 |         //
51 |         // Character Mapping Table:
52 |         // ILL: illegal character.
53 |         // CTR: control character specific to the charset.
54 |         // RET: carriage return or line feed.
55 |         // SYM: symbol (punctuation) that does not belong to word.
56 |         // NUM: 0 - 9.
57 |         //
58 |         // Other characters are ordered by probabilities
59 |         // (0 is the most common character in the language).
60 |         //
61 |         // Orders are generic to a language. So the codepoint with order X in
62 |         // CHARSET1 maps to the same character as the codepoint with the same
63 |         // order X in CHARSET2 for the same language.
64 |         // As such, it is possible to get missing order. For instance the
65 |         // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
66 |         // even though they are both used for French. Same for the euro sign.
67 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
68 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
69 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
70 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
71 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
72 |             SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */
73 |             21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */
74 |             SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */
75 |             21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */
76 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
77 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
78 |             SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 34,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
79 |             SYM,SYM,SYM,SYM,112,113,SYM,SYM,114,SYM,SYM,SYM,115,116,117,SYM, /* BX */
80 |             118, 14,119,120, 33,121,122, 39, 35, 18, 42, 37,123, 17,124, 40, /* CX */
81 |             125, 32, 43, 22,126,127, 38,SYM, 36,128, 20,129, 31,130,131,132, /* DX */
82 |             133, 14,134,135, 33,136,137, 39, 35, 18, 42, 37,138, 17,139, 40, /* EX */
83 |             140, 32, 43, 22,141,142, 38,SYM, 36,143, 20,144, 31,145,146,147, /* FX */
84 |         };
85 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
86 |
87 |         public Iso_8859_15_IrishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_15)
88 |         {
89 |         }
90 |     }
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Romanian/Ibm852_RomanianModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRomanianModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Romanian
44 | {
45 |     /// <summary>Romanian language model for IBM852: maps each byte value to a character class or frequency order.</summary>
46 |     public class Ibm852_RomanianModel : RomanianModel
47 |     {
48 |         // Generated by BuildLangModel.py
49 |         // On: 2016-09-28 18:58:13.757152
50 |         //
51 |         // Character Mapping Table:
52 |         // ILL: illegal character.
53 |         // CTR: control character specific to the charset.
54 |         // RET: carriage return or line feed.
55 |         // SYM: symbol (punctuation) that does not belong to word.
56 |         // NUM: 0 - 9.
57 |         //
58 |         // Other characters are ordered by probabilities
59 |         // (0 is the most common character in the language).
60 |         //
61 |         // Orders are generic to a language. So the codepoint with order X in
62 |         // CHARSET1 maps to the same character as the codepoint with the same
63 |         // order X in CHARSET2 for the same language.
64 |         // As such, it is possible to get missing order. For instance the
65 |         // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
66 |         // even though they are both used for French. Same for the euro sign.
67 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
68 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
69 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
70 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
71 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
72 |             SYM, 2, 17, 9, 11, 0, 16, 15, 23, 1, 26, 27, 6, 12, 4, 8, /* 4X */
73 |             13, 32, 3, 10, 5, 7, 21, 29, 25, 28, 22,SYM,SYM,SYM,SYM,SYM, /* 5X */
74 |             SYM, 2, 17, 9, 11, 0, 16, 15, 23, 1, 26, 27, 6, 12, 4, 8, /* 6X */
75 |             13, 32, 3, 10, 5, 7, 21, 29, 25, 28, 22,SYM,SYM,SYM,SYM,CTR, /* 7X */
76 |             42, 39, 31, 24, 33,138, 35, 42, 46, 49, 44, 44, 20,139, 33, 35, /* 8X */
77 |             31,140,141,142, 36,143,144, 56, 56, 36, 39,145,146, 46,SYM, 41, /* 9X */
78 |             30, 37, 34, 47,147,148, 40, 40,149,150,SYM,151, 41,152,SYM,SYM, /* AX */
79 |             SYM,SYM,SYM,SYM,SYM, 30, 24, 51,153,SYM,SYM,SYM,SYM,154,155,SYM, /* BX */
80 |             SYM,SYM,SYM,SYM,SYM,SYM, 14, 14,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
81 |             43, 43,156, 49,157,158, 37, 20, 51,SYM,SYM,SYM,SYM,159,160,SYM, /* DX */
82 |             34, 57,161, 52, 52,162, 38, 38,163, 47,164, 50, 54, 54,165,SYM, /* EX */
83 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 50, 55, 55,SYM,SYM, /* FX */
84 |         };
85 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
86 |
87 |         public Ibm852_RomanianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852)
88 |         {
89 |         }
90 |     }
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Arabic/Iso_8859_6_ArabicModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangArabicModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Arabic
44 | {
45 |     /// <summary>Arabic language model for ISO-8859-6: maps each byte value to a character class or frequency order.</summary>
46 |     public class Iso_8859_6_ArabicModel : ArabicModel
47 |     {
48 |         // Generated by BuildLangModel.py
49 |         // On: 2015-12-13 18:33:58.848027
50 |         //
51 |         // Character Mapping Table:
52 |         // ILL: illegal character.
53 |         // CTR: control character specific to the charset.
54 |         // RET: carriage return or line feed.
55 |         // SYM: symbol (punctuation) that does not belong to word.
56 |         // NUM: 0 - 9.
57 |         //
58 |         // Other characters are ordered by probabilities
59 |         // (0 is the most common character in the language).
60 |         //
61 |         // Orders are generic to a language. So the codepoint with order X in
62 |         // CHARSET1 maps to the same character as the codepoint with the same
63 |         // order X in CHARSET2 for the same language.
64 |         // As such, it is possible to get missing order. For instance the
65 |         // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
66 |         // even though they are both used for French. Same for the euro sign.
67 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
68 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
69 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
70 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
71 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
72 |             SYM, 52, 72, 61, 68, 74, 69, 59, 78, 60, 90, 86, 67, 65, 71, 75, /* 4X */
73 |             64, 85, 76, 55, 57, 79, 81, 70, 82, 87, 91,SYM,SYM,SYM,SYM,SYM, /* 5X */
74 |             SYM, 37, 58, 49, 47, 38, 54, 66, 46, 39, 88, 63, 45, 51, 43, 40, /* 6X */
75 |             62, 89, 42, 44, 41, 50, 77, 73, 83, 56, 80,SYM,SYM,SYM,SYM,CTR, /* 7X */
76 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
77 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
78 |             SYM,ILL,ILL,ILL,SYM,ILL,ILL,ILL,ILL,ILL,ILL,ILL,SYM,SYM,ILL,ILL, /* AX */
79 |             ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,SYM,ILL,ILL,ILL,SYM, /* BX */
80 |             ILL, 32, 34, 15, 35, 22, 31, 0, 9, 8, 7, 27, 19, 18, 25, 11, /* CX */
81 |             30, 5, 26, 12, 21, 23, 28,SYM, 33, 10, 29,ILL,ILL,ILL,ILL,ILL, /* DX */
82 |             36, 13, 14, 17, 1, 3, 6, 16, 4, 24, 2,SYM,SYM,SYM,SYM,SYM, /* EX */
83 |             SYM,SYM,SYM,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL, /* FX */
84 |         };
85 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
86 |
87 |         public Iso_8859_6_ArabicModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_6)
88 |         {
89 |         }
90 |     }
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Czech/Windows_1250_CzechModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCzechModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Czech
44 | {
45 |     /// <summary>
46 |     /// Czech language model for the Windows-1250 code page: supplies the
47 |     /// byte-to-frequency-order table to <see cref="CzechModel"/>.
48 |     /// </summary>
49 |     public class Windows_1250_CzechModel : CzechModel
50 |     {
51 |         // Generated by BuildLangModel.py on 2016-09-21 03:28:11.733089.
52 |         //
53 |         // Character mapping table, indexed by byte value (row = high nibble,
54 |         // column = low nibble). Special markers:
55 |         //   ILL: illegal character.
56 |         //   CTR: control character specific to the charset.
57 |         //   RET: line feed / carriage return.
58 |         //   SYM: symbol (punctuation) that does not belong to a word.
59 |         //   NUM: digit 0 - 9.
60 |         // Every other value is the character's frequency order for the language
61 |         // (0 is the most common). Orders are per language, not per charset, so an
62 |         // order may be missing when this charset lacks the corresponding character.
63 |         // readonly: the static reference is never reassigned (elements stay mutable).
64 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
65 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
66 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
67 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
68 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
69 |             SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */
70 |             8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */
71 |             SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */
72 |             8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */
73 |             SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM, 29,SYM, 46, 38, 26, 47, /* 8X */
74 |             ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 29,SYM, 46, 38, 26, 48, /* 9X */
75 |             SYM,SYM,SYM, 49,SYM, 50,SYM,SYM,SYM,SYM, 51,SYM,SYM,SYM,SYM, 52, /* AX */
76 |             SYM,SYM,SYM, 53,SYM,SYM,SYM,SYM,SYM, 54, 55,SYM, 45,SYM, 45, 56, /* BX */
77 |             57, 18, 58, 59, 42, 60, 61, 62, 25, 24, 63, 64, 23, 11, 65, 39, /* CX */
78 |             66, 67, 35, 37, 68, 69, 41,SYM, 27, 31, 33, 70, 43, 28, 71, 72, /* DX */
79 |             73, 18, 74, 75, 42, 76, 77, 78, 25, 24, 79, 80, 23, 11, 81, 39, /* EX */
80 |             82, 83, 35, 37, 84, 85, 41,SYM, 27, 31, 33, 86, 43, 28, 87,SYM, /* FX */
81 |         };
82 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
83 |
84 |         /// <summary>
85 |         /// Creates the model by passing the Windows-1250 order map and code page name to the base class.
86 |         /// </summary>
87 |         public Windows_1250_CzechModel() : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1250)
88 |         {
89 |         }
90 |     }
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Danish/Iso_8859_1_DanishModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangDanishModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.Danish
44 | {
45 |     /// <summary>
46 |     /// Danish language model for the ISO-8859-1 code page: supplies the
47 |     /// byte-to-frequency-order table to <see cref="DanishModel"/>.
48 |     /// </summary>
49 |     public class Iso_8859_1_DanishModel : DanishModel
50 |     {
51 |         // Generated by BuildLangModel.py on 2016-02-19 17:56:42.163975.
52 |         //
53 |         // Character mapping table, indexed by byte value (row = high nibble,
54 |         // column = low nibble). Special markers:
55 |         //   ILL: illegal character.
56 |         //   CTR: control character specific to the charset.
57 |         //   RET: line feed / carriage return.
58 |         //   SYM: symbol (punctuation) that does not belong to a word.
59 |         //   NUM: digit 0 - 9.
60 |         // Every other value is the character's frequency order for the language
61 |         // (0 is the most common). Orders are per language, not per charset, so an
62 |         // order may be missing when this charset lacks the corresponding character.
63 |         // readonly: the static reference is never reassigned (elements stay mutable).
64 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
65 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
66 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
67 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
68 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
69 |             SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */
70 |             17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
71 |             SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */
72 |             17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
73 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
74 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
75 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
76 |             SYM,SYM,SYM,SYM,SYM, 42,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
77 |             71, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 72, 34, 73, 50, /* CX */
78 |             43, 47, 51, 36, 52, 74, 30,SYM, 19, 75, 37, 44, 31, 46, 76, 48, /* DX */
79 |             77, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 78, 34, 79, 50, /* EX */
80 |             43, 47, 51, 36, 52, 80, 30,SYM, 19, 81, 37, 44, 31, 46, 82, 83, /* FX */
81 |         };
82 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
83 |
84 |         /// <summary>
85 |         /// Creates the model by passing the ISO-8859-1 order map and code page name to the base class.
86 |         /// </summary>
87 |         public Iso_8859_1_DanishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_1)
88 |         {
89 |         }
90 |     }
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/French/Iso_8859_1_FrenchModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangFrenchModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.French
44 | {
45 |     /// <summary>
46 |     /// French language model for the ISO-8859-1 code page: supplies the
47 |     /// byte-to-frequency-order table to <see cref="FrenchModel"/>.
48 |     /// </summary>
49 |     public class Iso_8859_1_FrenchModel : FrenchModel
50 |     {
51 |         // Generated by BuildLangModel.py on 2015-12-03 21:10:27.685575.
52 |         //
53 |         // Character mapping table, indexed by byte value (row = high nibble,
54 |         // column = low nibble). Special markers:
55 |         //   ILL: illegal character.
56 |         //   CTR: control character specific to the charset.
57 |         //   RET: line feed / carriage return.
58 |         //   SYM: symbol (punctuation) that does not belong to a word.
59 |         //   NUM: digit 0 - 9.
60 |         // Every other value is the character's frequency order for the language
61 |         // (0 is the most common). Orders are per language, not per charset, so an
62 |         // order may be missing here (e.g. the 'oe' ligature is not in ISO-8859-1).
63 |         // readonly: the static reference is never reassigned (elements stay mutable).
64 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
65 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
66 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
67 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
68 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
69 |             SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 4X */
70 |             12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,SYM, /* 5X */
71 |             SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 6X */
72 |             12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,CTR, /* 7X */
73 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
74 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
75 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
76 |             SYM,SYM,SYM,SYM,SYM, 67,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
77 |             24, 38, 32, 46, 49, 68, 47, 27, 23, 14, 28, 41, 69, 39, 33, 36, /* CX */
78 |             48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 70, /* DX */
79 |             24, 38, 32, 46, 49, 71, 47, 27, 23, 14, 28, 41, 72, 39, 33, 36, /* EX */
80 |             48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 73, /* FX */
81 |         };
82 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
83 |
84 |         /// <summary>
85 |         /// Creates the model by passing the ISO-8859-1 order map and code page name to the base class.
86 |         /// </summary>
87 |         public Iso_8859_1_FrenchModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_1)
88 |         {
89 |         }
90 |     }
91 | }
92 |
--------------------------------------------------------------------------------
/sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/German/Iso_8859_1_GermanModel.cs:
--------------------------------------------------------------------------------
1 | /* ***** BEGIN LICENSE BLOCK *****
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 | *
4 | * The contents of this file are subject to the Mozilla Public License Version
5 | * 1.1 (the "License"); you may not use this file except in compliance with
6 | * the License. You may obtain a copy of the License at
7 | * http://www.mozilla.org/MPL/
8 | *
9 | * Software distributed under the License is distributed on an "AS IS" basis,
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 | * for the specific language governing rights and limitations under the
12 | * License.
13 | *
14 | * The Original Code is Mozilla Communicator client code.
15 | *
16 | * The Initial Developer of the Original Code is
17 | * Netscape Communications Corporation.
18 | * Portions created by the Initial Developer are Copyright (C) 1998
19 | * the Initial Developer. All Rights Reserved.
20 | *
21 | * Contributor(s):
22 | *
23 | * Alternatively, the contents of this file may be used under the terms of
24 | * either the GNU General Public License Version 2 or later (the "GPL"), or
25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 | * in which case the provisions of the GPL or the LGPL are applicable instead
27 | * of those above. If you wish to allow use of your version of this file only
28 | * under the terms of either the GPL or the LGPL, and not to allow others to
29 | * use your version of this file under the terms of the MPL, indicate your
30 | * decision by deleting the provisions above and replace them with the notice
31 | * and other provisions required by the GPL or the LGPL. If you do not delete
32 | * the provisions above, a recipient may use your version of this file under
33 | * the terms of any one of the MPL, the GPL or the LGPL.
34 | *
35 | * ***** END LICENSE BLOCK ***** */
36 |
37 | /*
38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet
39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangGermanModel.cpp
40 | * and adjusted to language specific support.
41 | */
42 |
43 | namespace UtfUnknown.Core.Models.SingleByte.German
44 | {
45 |     /// <summary>
46 |     /// German language model for the ISO-8859-1 code page: supplies the
47 |     /// byte-to-frequency-order table to <see cref="GermanModel"/>.
48 |     /// </summary>
49 |     public class Iso_8859_1_GermanModel : GermanModel
50 |     {
51 |         // Generated by BuildLangModel.py on 2015-12-03 22:50:46.518374.
52 |         //
53 |         // Character mapping table, indexed by byte value (row = high nibble,
54 |         // column = low nibble). Special markers:
55 |         //   ILL: illegal character.
56 |         //   CTR: control character specific to the charset.
57 |         //   RET: line feed / carriage return.
58 |         //   SYM: symbol (punctuation) that does not belong to a word.
59 |         //   NUM: digit 0 - 9.
60 |         // Every other value is the character's frequency order for the language
61 |         // (0 is the most common). Orders are per language, not per charset, so an
62 |         // order may be missing when this charset lacks the corresponding character.
63 |         // readonly: the static reference is never reassigned (elements stay mutable).
64 |         private static readonly byte[] CHAR_TO_ORDER_MAP = {
65 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
66 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
67 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
68 |             NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
69 |             SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 4X */
70 |             18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */
71 |             SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 6X */
72 |             18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */
73 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
74 |             CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
75 |             SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
76 |             SYM,SYM,SYM,SYM,SYM, 65,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
77 |             41, 31, 37, 44, 22, 49, 50, 35, 32, 29, 48, 43, 57, 33, 47, 52, /* CX */
78 |             53, 39, 51, 34, 40, 55, 26,SYM, 38, 58, 46, 66, 24, 45, 67, 27, /* DX */
79 |             41, 31, 37, 44, 22, 49, 50, 35, 32, 29, 48, 43, 57, 33, 47, 52, /* EX */
80 |             53, 39, 51, 34, 40, 55, 26,SYM, 38, 58, 46, 68, 24, 45, 69, 56, /* FX */
81 |         };
82 |         /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
83 |
84 |         /// <summary>
85 |         /// Creates the model by passing the ISO-8859-1 order map and code page name to the base class.
86 |         /// </summary>
87 |         public Iso_8859_1_GermanModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_1)
88 |         {
89 |         }
90 |     }
91 | }
92 |
--------------------------------------------------------------------------------