├── form.png ├── App └── EncodingChecker.exe ├── sources ├── EncodingChecker │ ├── AppIcon.ico │ ├── Properties │ │ ├── Settings.settings │ │ ├── Settings.Designer.cs │ │ ├── AssemblyInfo.cs │ │ └── Resources.Designer.cs │ ├── UtfUnknown │ │ ├── Core │ │ │ ├── InputState.cs │ │ │ ├── Probers │ │ │ │ ├── ProbingState.cs │ │ │ │ ├── CodingStateMachine.cs │ │ │ │ └── MultiByte │ │ │ │ │ ├── UTF8Prober.cs │ │ │ │ │ └── Chinese │ │ │ │ │ ├── EUCTWProber.cs │ │ │ │ │ └── Big5Prober.cs │ │ │ ├── Analyzers │ │ │ │ └── MultiByte │ │ │ │ │ └── Japanese │ │ │ │ │ ├── EUCJPDistributionAnalyser.cs │ │ │ │ │ ├── SJISContextAnalyser.cs │ │ │ │ │ └── EUCJPContextAnalyser.cs │ │ │ ├── Models │ │ │ │ ├── MultiByte │ │ │ │ │ ├── Korean │ │ │ │ │ │ ├── EUCKRSMModel.cs │ │ │ │ │ │ └── Iso_2022_KR_SMModel.cs │ │ │ │ │ ├── Chinese │ │ │ │ │ │ ├── BIG5SMModel.cs │ │ │ │ │ │ ├── EUCTWSMModel.cs │ │ │ │ │ │ ├── Iso_2022_CN_SMModel.cs │ │ │ │ │ │ └── GB18030_SMModel.cs │ │ │ │ │ ├── Japanese │ │ │ │ │ │ ├── SJIS_SMModel.cs │ │ │ │ │ │ ├── EUCJPSMModel.cs │ │ │ │ │ │ └── Iso_2022_JP_SMModel.cs │ │ │ │ │ ├── UCS2BE_SMModel.cs │ │ │ │ │ └── UCS2LE_SMModel.cs │ │ │ │ ├── StateMachineModel.cs │ │ │ │ ├── SingleByte │ │ │ │ │ ├── Russian │ │ │ │ │ │ ├── Ibm866_RussianModel.cs │ │ │ │ │ │ ├── Ibm855_RussianModel.cs │ │ │ │ │ │ ├── Koi8r_Model.cs │ │ │ │ │ │ ├── Iso_8859_5_RussianModel.cs │ │ │ │ │ │ ├── Windows_1251_RussianModel.cs │ │ │ │ │ │ └── X_Mac_Cyrillic_RussianModel.cs │ │ │ │ │ ├── Bulgarian │ │ │ │ │ │ ├── Iso_8859_5_BulgarianModel.cs │ │ │ │ │ │ └── Windows_1251_BulgarianModel.cs │ │ │ │ │ ├── Hebrew │ │ │ │ │ │ └── Windows_1255_HebrewModel.cs │ │ │ │ │ ├── Czech │ │ │ │ │ │ ├── Ibm852_CzechModel.cs │ │ │ │ │ │ ├── Iso_8859_2_CzechModel.cs │ │ │ │ │ │ └── Windows_1250_CzechModel.cs │ │ │ │ │ ├── Polish │ │ │ │ │ │ └── Ibm852_PolishModel.cs │ │ │ │ │ ├── Slovak │ │ │ │ │ │ └── Ibm852_SlovakModel.cs │ │ │ │ │ ├── Irish │ │ │ │ │ │ ├── Iso_8859_1_IrishModel.cs │ │ │ │ │ │ ├── Iso_8859_9_IrishModel.cs │ │ │ │ │ │ └── Iso_8859_15_IrishModel.cs │ │ │ │ │ ├── Slovene │ │ │ │ │ │ └── Ibm852_SloveneModel.cs │ │ │ │ │ ├── Croatian │ │ │ │ │ │ └── Ibm852_CroatianModel.cs │ │ │ │ │ ├── Romanian │ │ │ │ │ │ └── Ibm852_RomanianModel.cs │ │ │ │ │ ├── Arabic │ │ │ │ │ │ └── Iso_8859_6_ArabicModel.cs │ │ │ │ │ ├── Danish │ │ │ │ │ │ └── Iso_8859_1_DanishModel.cs │ │ │ │ │ ├── French │ │ │ │ │ │ └── Iso_8859_1_FrenchModel.cs │ │ │ │ │ └── German │ │ │ │ │ │ └── Iso_8859_1_GermanModel.cs │ │ │ │ └── SequenceModel.cs │ │ │ └── BitPackage.cs │ │ ├── DetectionResult.cs │ │ └── DetectionDetail.cs │ ├── Program.cs │ ├── AboutForm.cs │ ├── ListViewColumnSorter.cs │ ├── Settings.cs │ ├── TextEncoding.cs │ └── ListViewExtensions.cs └── EncodingChecker.sln ├── appveyor.yml ├── README.md └── .gitattributes /form.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amrali-eg/EncodingChecker/HEAD/form.png -------------------------------------------------------------------------------- /App/EncodingChecker.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amrali-eg/EncodingChecker/HEAD/App/EncodingChecker.exe -------------------------------------------------------------------------------- /sources/EncodingChecker/AppIcon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amrali-eg/EncodingChecker/HEAD/sources/EncodingChecker/AppIcon.ico -------------------------------------------------------------------------------- /sources/EncodingChecker/Properties/Settings.settings: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | version: 2.0.{build} 2 | image: Visual Studio 2019 3 | configuration: Release 4 | platform: Any CPU 5 | build: 6 | project: sources/EncodingChecker.sln 7 | verbosity: minimal 8 | 9 | after_build: 10 | - 7z a EncodingChecker.zip %APPVEYOR_BUILD_FOLDER%/sources/EncodingChecker/bin/Release/* 11 | 12 | artifacts: 13 | - path: EncodingChecker.zip 14 | name: EncodingChecker -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/InputState.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core 2 | { 3 | enum InputState 4 | { 5 | PureASCII=0, 6 | 7 | /// 8 | /// Found escape character or HZ "~{" 9 | /// 10 | EscASCII = 1, 11 | 12 | /// 13 | /// non-ascii byte (high-byte) 14 | /// 15 | Highbyte = 2 16 | }; 17 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Probers/ProbingState.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Probers 2 | { 3 | public enum ProbingState 4 | { 5 | /// 6 | /// No sure answer yet, but caller can ask for confidence 7 | /// 8 | Detecting = 0, // 9 | /// 10 | /// Positive answer 11 | /// 12 | FoundIt = 1, 13 | /// 14 | /// Negative answer 15 | /// 16 | NotMe = 2 17 | }; 18 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Analyzers.Japanese 2 | { 3 | public class EUCJPDistributionAnalyser : SJISDistributionAnalyser 4 | { 5 | /// 6 | /// first byte range: 0xa0 -- 0xfe 7 | /// second byte range: 0xa1 -- 0xfe 8 | /// no validation needed here. State machine has done that 9 | /// 10 | public override int GetOrder(byte[] buf, int offset) 11 | { 12 | if (buf[offset] >= 0xA0) 13 | return 94 * (buf[offset] - 0xA1) + buf[offset+1] - 0xA1; 14 | else 15 | return -1; 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Threading; 3 | using System.Windows.Forms; 4 | 5 | namespace EncodingChecker 6 | { 7 | internal static class Program 8 | { 9 | [STAThread] 10 | private static void Main() 11 | { 12 | Application.ThreadException += OnApplicationThreadException; 13 | Application.EnableVisualStyles(); 14 | Application.SetCompatibleTextRenderingDefault(false); 15 | Application.Run(new MainForm()); 16 | } 17 | 18 | private static void OnApplicationThreadException(object sender, ThreadExceptionEventArgs e) 19 | { 20 | MessageBox.Show(e.Exception.Message, @"Error", MessageBoxButtons.OK, MessageBoxIcon.Error); 21 | } 22 | } 23 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/AboutForm.cs: -------------------------------------------------------------------------------- 1 | using System.Diagnostics; 2 | using System.Windows.Forms; 3 | 4 | namespace EncodingChecker 5 | { 6 | public partial class AboutForm : Form 7 | { 8 | public AboutForm() 9 | { 10 | InitializeComponent(); 11 | } 12 | 13 | private void OnFormLoad(object sender, System.EventArgs e) 14 | { 15 | lblHomepage.Links[0].LinkData = "https://github.com/amrali-eg/EncodingChecker"; 16 | lblAuthor.Links[0].LinkData = "https://github.com/JeevanJames"; 17 | lblLicense.Links[0].LinkData = "http://www.mozilla.org/MPL/MPL-1.1.html"; 18 | lblCreditsUde.Links[0].LinkData = "https://github.com/CharsetDetector/UTF-unknown"; 19 | lblCreditsCodePlex.Links[0].LinkData = "http://encodingchecker.codeplex.com"; 20 | } 21 | 22 | private void OnLinkClicked(object sender, LinkLabelLinkClickedEventArgs e) 23 | { 24 | string url = (string)e.Link.LinkData; 25 | ProcessStartInfo startInfo = new ProcessStartInfo(url) {UseShellExecute = true}; 26 | Process.Start(startInfo); 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /sources/EncodingChecker.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.30309.148 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "EncodingChecker", "EncodingChecker\EncodingChecker.csproj", "{134E6B14-A7BE-4CED-8332-3A2CA6023EE1}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {134E6B14-A7BE-4CED-8332-3A2CA6023EE1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 15 | {134E6B14-A7BE-4CED-8332-3A2CA6023EE1}.Debug|Any CPU.Build.0 = Debug|Any CPU 16 | {134E6B14-A7BE-4CED-8332-3A2CA6023EE1}.Release|Any CPU.ActiveCfg = Release|Any CPU 17 | {134E6B14-A7BE-4CED-8332-3A2CA6023EE1}.Release|Any CPU.Build.0 = Release|Any CPU 18 | EndGlobalSection 19 | GlobalSection(SolutionProperties) = preSolution 20 | HideSolutionNode = FALSE 21 | EndGlobalSection 22 | GlobalSection(ExtensibilityGlobals) = postSolution 23 | SolutionGuid = {0E70C6C9-020B-4479-A5CB-2A85A2137B6C} 24 | EndGlobalSection 25 | EndGlobal 26 | -------------------------------------------------------------------------------- /sources/EncodingChecker/Properties/Settings.Designer.cs: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // This code was generated by a tool. 4 | // Runtime Version:4.0.30319.42000 5 | // 6 | // Changes to this file may cause incorrect behavior and will be lost if 7 | // the code is regenerated. 8 | // 9 | //------------------------------------------------------------------------------ 10 | 11 | namespace EncodingChecker.Properties 12 | { 13 | 14 | 15 | [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] 16 | [global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "11.0.0.0")] 17 | internal sealed partial class Settings : global::System.Configuration.ApplicationSettingsBase 18 | { 19 | 20 | private static Settings defaultInstance = ((Settings)(global::System.Configuration.ApplicationSettingsBase.Synchronized(new Settings()))); 21 | 22 | public static Settings Default 23 | { 24 | get 25 | { 26 | return defaultInstance; 27 | } 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Analyzers/MultiByte/Japanese/SJISContextAnalyser.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Analyzers.Japanese 2 | { 3 | public class SJISContextAnalyser : JapaneseContextAnalyser 4 | { 5 | private const byte HIRAGANA_FIRST_BYTE = 0x82; 6 | 7 | protected override int GetOrder(byte[] buf, int offset, out int charLen) 8 | { 9 | //find out current char's byte length 10 | if (buf[offset] >= 0x81 && buf[offset] <= 0x9F 11 | || buf[offset] >= 0xe0 && buf[offset] <= 0xFC) 12 | charLen = 2; 13 | else 14 | charLen = 1; 15 | 16 | // return its order if it is hiragana 17 | if (buf[offset] == HIRAGANA_FIRST_BYTE) { 18 | byte low = buf[offset+1]; 19 | if (low >= 0x9F && low <= 0xF1) 20 | return low - 0x9F; 21 | } 22 | return -1; 23 | } 24 | 25 | protected override int GetOrder(byte[] buf, int offset) 26 | { 27 | // We are only interested in Hiragana 28 | if (buf[offset] == HIRAGANA_FIRST_BYTE) { 29 | byte low = buf[offset+1]; 30 | if (low >= 0x9F && low <= 0xF1) 31 | return low - 0x9F; 32 | } 33 | return -1; 34 | } 35 | 36 | } 37 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/ListViewColumnSorter.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections; 3 | using System.Windows.Forms; 4 | 5 | namespace EncodingChecker 6 | { 7 | public class ListViewColumnSorter : IComparer 8 | { 9 | private readonly CaseInsensitiveComparer _objectCompare; 10 | 11 | public int SortColumn { get; set; } 12 | 13 | public SortOrder Order { get; set; } 14 | 15 | public ListViewColumnSorter() 16 | { 17 | SortColumn = 0; 18 | Order = SortOrder.None; 19 | _objectCompare = new CaseInsensitiveComparer(); 20 | } 21 | 22 | public int Compare(object x, object y) 23 | { 24 | ListViewItem listViewItem = (ListViewItem)x; 25 | if (listViewItem == null) throw new ArgumentNullException(nameof(listViewItem)); 26 | 27 | ListViewItem listViewItem2 = (ListViewItem)y; 28 | if (listViewItem2 == null) throw new ArgumentNullException(nameof(listViewItem2)); 29 | 30 | int compareResult = _objectCompare.Compare(a: listViewItem.SubItems[index: SortColumn].Text, b: listViewItem2.SubItems[index: SortColumn].Text); 31 | if (Order == SortOrder.Ascending) 32 | { 33 | return compareResult; 34 | } 35 | if (Order == SortOrder.Descending) 36 | { 37 | return -compareResult; 38 | } 39 | return 0; 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Analyzers.Japanese 2 | { 3 | public class EUCJPContextAnalyser : JapaneseContextAnalyser 4 | { 5 | private const byte HIRAGANA_FIRST_BYTE = 0xA4; 6 | 7 | protected override int GetOrder(byte[] buf, int offset, out int charLen) 8 | { 9 | byte high = buf[offset]; 10 | 11 | //find out current char's byte length 12 | if (high == 0x8E || high >= 0xA1 && high <= 0xFE) 13 | charLen = 2; 14 | else if (high == 0xBF) 15 | charLen = 3; 16 | else 17 | charLen = 1; 18 | 19 | // return its order if it is hiragana 20 | if (high == HIRAGANA_FIRST_BYTE) { 21 | byte low = buf[offset+1]; 22 | if (low >= 0xA1 && low <= 0xF3) 23 | return low - 0xA1; 24 | } 25 | return -1; 26 | } 27 | 28 | protected override int GetOrder(byte[] buf, int offset) 29 | { 30 | // We are only interested in Hiragana 31 | if (buf[offset] == HIRAGANA_FIRST_BYTE) { 32 | byte low = buf[offset+1]; 33 | if (low >= 0xA1 && low <= 0xF3) 34 | return low - 0xA1; 35 | } 36 | return -1; 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/DetectionResult.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | 5 | namespace UtfUnknown 6 | { 7 | /// 8 | /// Result of a detection. 9 | /// 10 | public class DetectionResult 11 | { 12 | /// 13 | /// Empty 14 | /// 15 | public DetectionResult() 16 | { 17 | } 18 | 19 | /// 20 | /// Multiple results 21 | /// 22 | public DetectionResult(IList details) 23 | { 24 | Details = details; 25 | } 26 | 27 | /// 28 | /// Single result 29 | /// 30 | /// 31 | public DetectionResult(DetectionDetail detectionDetail) 32 | { 33 | Details = new List { detectionDetail }; 34 | } 35 | 36 | /// 37 | /// Get the best Detection 38 | /// 39 | public DetectionDetail Detected => Details?.FirstOrDefault(); 40 | 41 | /// 42 | /// All results 43 | /// 44 | public IList Details { get; set; } 45 | 46 | public override string ToString() 47 | { 48 | return $"{nameof(Detected)}: {Detected}, \n{nameof(Details)}:\n - {string.Join("\n- ", Details?.Select(d => d.ToString()))}"; 49 | } 50 | } 51 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 8 | [assembly: AssemblyTitle("File Encoding Checker")] 9 | [assembly: AssemblyDescription("GUI tool to check the encoding of a text file")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("Jeevan James")] 12 | [assembly: AssemblyProduct("File Encoding Checker")] 13 | [assembly: AssemblyCopyright("Copyright © Jeevan James 2020")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("134e6b14-a7be-4ced-8332-3a2ca6023ee1")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("2.0.1.0")] 36 | [assembly: AssemblyFileVersion("2.0.1.0")] 37 | -------------------------------------------------------------------------------- /sources/EncodingChecker/Settings.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.ObjectModel; 3 | using System.Windows.Forms; 4 | 5 | namespace EncodingChecker 6 | { 7 | [Serializable] 8 | public sealed class Settings 9 | { 10 | public WindowPosition WindowPosition = new WindowPosition(); 11 | 12 | public RecentDirectories RecentDirectories = new RecentDirectories(); 13 | public bool IncludeSubdirectories = true; 14 | 15 | public string FileMasks; 16 | public string[] ValidCharsets; 17 | } 18 | 19 | [Serializable] 20 | public sealed class WindowPosition 21 | { 22 | public int Left = -1; 23 | public int Top = -1; 24 | public int Width = -1; 25 | public int Height = -1; 26 | public bool Maximized; 27 | 28 | public void ApplyTo(Form form) 29 | { 30 | if (Left >= 0 && Top >= 0 && Width > 0 && Height > 0) 31 | form.SetBounds(Left, Top, Width, Height); 32 | } 33 | } 34 | 35 | [Serializable] 36 | public sealed class RecentDirectories : Collection 37 | { 38 | protected override void InsertItem(int index, string item) 39 | { 40 | for (int i = Count - 1; i >= 0; i--) 41 | { 42 | if (this[i].Equals(item, StringComparison.OrdinalIgnoreCase)) 43 | RemoveAt(i); 44 | } 45 | 46 | base.InsertItem(0, item); 47 | 48 | if (Count > 10) 49 | { 50 | for (int i = Count - 1; i >= 10; i--) 51 | RemoveAt(i); 52 | } 53 | } 54 | } 55 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build status](https://ci.appveyor.com/api/projects/status/c8arh5v18u285jmj/branch/master?svg=true)](https://ci.appveyor.com/project/amrali-eg/encodingchecker/branch/master) 2 | 3 | # EncodingChecker v2.0 4 | File Encoding Checker is a GUI tool that allows you to validate the text encoding of one or more files. The tool can display the encoding for all selected files, or only the files that do not have the encodings you specify. 5 | 6 | File Encoding Checker requires Microsoft .NET Framework 4 to run. 7 | 8 | ![form image](./form.png "File Encoding Checker Form Preview") 9 | 10 | ## Fixed issues 11 | Sorting the results by clicking a column header is working now. 12 | 13 | Display the sort arrow in the columnn header for the results list view. 14 | 15 | When viewing a directory, some files matching the file masks were not listed. 16 | 17 | Improved performance of the list view control for faster processing of results. 18 | 19 | Added feature to export selected results to a text file. 20 | 21 | Switched to UtfUnknown library for better encoding detection (Multiple bugs from Ude fixed). 22 | 23 | Validating the detected file encoding to avoid errors during conversion of files. 24 | 25 | UTF-16 text files without byte-order-mark (BOM) can be detected by heuristics. 26 | 27 | ## Credits 28 | The original project [EncodingChecker](https://archive.codeplex.com/?p=encodingchecker) on CodePlex was written by [Jeevan James](https://github.com/JeevanJames). 29 | 30 | For encoding detection, File Encoding Checker now uses the [UtfUnknown](https://github.com/CharsetDetector/UTF-unknown) library, which is a C# port of [uchardet](https://gitlab.freedesktop.org/uchardet/uchardet) library - A C++ port of the original [Mozilla Universal Charset Detector](https://dxr.mozilla.org/mozilla/source/extensions/universalchardet/). 31 | 32 | ## Supported Charsets 33 | File Encoding Checker currently supports over forty charsets. 34 | 35 | * ASCII 36 | * UTF-7 (with a BOM) 37 | * UTF-8 (with or without a BOM) 38 | * UTF-16 BE or LE (with or without a BOM) 39 | * UTF-32 BE or LE (with a BOM) 40 | * Arabic: iso-8859-6, windows-1256. 41 | * Baltic: iso-8859-4, windows-1257. 42 | * Central European: ibm852, iso-8859-2, windows-1250, x-mac-ce. 43 | * Chinese (Traditional and Simplified): big5, GB18030, hz-gb-2312, x-cp50227. 44 | * Cyrillic (primarily Russian): IBM855, cp866, iso-8859-5, koi8-r, windows-1251, x-mac-cyrillic. 45 | * Estonian: iso-8859-13. 46 | * Greek: iso-8859-7, windows-1253. 47 | * Hebrew: iso-8859-8, windows-1255. 48 | * Japanese: euc-jp, iso-2022-jp, shift_jis. 49 | * Korean: euc-kr, iso-2022-kr, ks_c_5601-1987 (cp949). 50 | * Thai: windows-874 (aliases TIS-620 and iso-8859-11 in .NET) 51 | * Turkish: iso-8859-3, iso-8859-9. 52 | * Western European: iso-8859-1, iso-8859-15, windows-1252. 53 | * Vietnamese: windows-1258. 54 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Common settings that generally should always be used with your language specific settings 2 | 3 | # https://www.davidlaing.com/2012/09/19/customise-your-gitattributes-to-become-a-git-ninja/ 4 | # https://docs.github.com/en/github/using-git/configuring-git-to-handle-line-endings 5 | # https://git-scm.com/docs/gitattributes 6 | 7 | # Auto detect text files and perform LF normalization 8 | * text=auto 9 | 10 | # 11 | # The above will handle all files NOT found below 12 | # 13 | 14 | # Custom for Visual Studio 15 | *.cs text eol=crlf diff=csharp 16 | *.resx text eol=crlf 17 | *.csproj text eol=crlf merge=union 18 | *.vbproj text merge=union 19 | *.fsproj text merge=union 20 | *.dbproj text merge=union 21 | *.sln text eol=crlf merge=union 22 | 23 | # Source code 24 | *.bash text eol=lf 25 | *.bat text eol=crlf 26 | *.cmd text eol=crlf 27 | *.htm text diff=html 28 | *.html text diff=html 29 | *.php text diff=php 30 | *.ps1 text eol=crlf 31 | *.py text diff=python 32 | *.rb text diff=ruby 33 | *.scss text diff=css 34 | *.sh text eol=lf 35 | *.xhtml text diff=html 36 | 37 | # Documents 38 | *.doc diff=astextplain 39 | *.DOC diff=astextplain 40 | *.docx diff=astextplain 41 | *.DOCX diff=astextplain 42 | *.dot diff=astextplain 43 | *.DOT diff=astextplain 44 | *.pdf diff=astextplain 45 | *.PDF diff=astextplain 46 | *.rtf diff=astextplain 47 | *.RTF diff=astextplain 48 | 49 | # 50 | ## These files are binary and should be left untouched 51 | # 52 | 53 | # Graphics 54 | *.ai binary 55 | *.bmp binary 56 | *.eps binary 57 | *.gif binary 58 | *.gifv binary 59 | *.ico binary 60 | *.jng binary 61 | *.jp2 binary 62 | *.jpg binary 63 | *.jpeg binary 64 | *.jpx binary 65 | *.jxr binary 66 | *.pdf binary 67 | *.png binary 68 | *.psb binary 69 | *.psd binary 70 | # SVG treated as an asset (binary) by default. 71 | *.svg text 72 | # If you want to treat it as binary, 73 | # use the following line instead. 74 | # *.svg binary 75 | *.svgz binary 76 | *.tif binary 77 | *.tiff binary 78 | *.wbmp binary 79 | *.webp binary 80 | 81 | # Archives 82 | *.7z binary 83 | *.gz binary 84 | *.jar binary 85 | *.rar binary 86 | *.tar binary 87 | *.zip binary 88 | 89 | # Executables 90 | *.exe binary 91 | *.pyc binary 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/Properties/Resources.Designer.cs: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // This code was generated by a tool. 4 | // Runtime Version:4.0.30319.42000 5 | // 6 | // Changes to this file may cause incorrect behavior and will be lost if 7 | // the code is regenerated. 8 | // 9 | //------------------------------------------------------------------------------ 10 | 11 | namespace EncodingChecker.Properties 12 | { 13 | 14 | 15 | /// 16 | /// A strongly-typed resource class, for looking up localized strings, etc. 17 | /// 18 | // This class was auto-generated by the StronglyTypedResourceBuilder 19 | // class via a tool like ResGen or Visual Studio. 20 | // To add or remove a member, edit your .ResX file then rerun ResGen 21 | // with the /str option, or rebuild your VS project. 22 | [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0")] 23 | [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] 24 | [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] 25 | internal class Resources 26 | { 27 | 28 | private static global::System.Resources.ResourceManager resourceMan; 29 | 30 | private static global::System.Globalization.CultureInfo resourceCulture; 31 | 32 | [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")] 33 | internal Resources() 34 | { 35 | } 36 | 37 | /// 38 | /// Returns the cached ResourceManager instance used by this class. 39 | /// 40 | [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] 41 | internal static global::System.Resources.ResourceManager ResourceManager 42 | { 43 | get 44 | { 45 | if ((resourceMan == null)) 46 | { 47 | global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("EncodingChecker.Properties.Resources", typeof(Resources).Assembly); 48 | resourceMan = temp; 49 | } 50 | return resourceMan; 51 | } 52 | } 53 | 54 | /// 55 | /// Overrides the current thread's CurrentUICulture property for all 56 | /// resource lookups using this strongly typed resource class. 57 | /// 58 | [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] 59 | internal static global::System.Globalization.CultureInfo Culture 60 | { 61 | get 62 | { 63 | return resourceCulture; 64 | } 65 | set 66 | { 67 | resourceCulture = value; 68 | } 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Korean/EUCKRSMModel.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Models.MultiByte.Korean 2 | { 3 | public class EUCKRSMModel : StateMachineModel 4 | { 5 | private readonly static int[] EUCKR_cls = { 6 | //BitPacket.Pack4bits(0,1,1,1,1,1,1,1), // 00 - 07 7 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07 8 | BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f 9 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17 10 | BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f 11 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27 12 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f 13 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37 14 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f 15 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 40 - 47 16 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 48 - 4f 17 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 50 - 57 18 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 58 - 5f 19 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 60 - 67 20 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 68 - 6f 21 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 70 - 77 22 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 78 - 7f 23 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87 24 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f 25 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97 26 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f 27 | BitPackage.Pack4bits(0,2,2,2,2,2,2,2), // a0 - a7 28 | BitPackage.Pack4bits(2,2,2,2,2,3,3,3), // a8 - af 29 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7 30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf 31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7 32 | BitPackage.Pack4bits(2,3,2,2,2,2,2,2), // c8 - cf 33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7 34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df 35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7 36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef 37 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7 38 | BitPackage.Pack4bits(2,2,2,2,2,2,2,0) // f8 - ff 39 | }; 40 | 41 | private readonly static int[] EUCKR_st = { 42 | BitPackage.Pack4bits(ERROR,START, 3,ERROR,ERROR,ERROR,ERROR,ERROR),//00-07 43 | BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ERROR,ERROR,START,START) //08-0f 44 | }; 45 | 46 | private readonly static int[] EUCKRCharLenTable = { 0, 1, 2, 0 }; 47 | 48 | public EUCKRSMModel() : base( 49 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 50 | BitPackage.SHIFT_MASK_4BITS, 51 | BitPackage.BIT_SHIFT_4BITS, 52 | BitPackage.UNIT_MASK_4BITS, EUCKR_cls), 53 | 4, 54 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 55 | BitPackage.SHIFT_MASK_4BITS, 56 | BitPackage.BIT_SHIFT_4BITS, 57 | BitPackage.UNIT_MASK_4BITS, EUCKR_st), 58 | EUCKRCharLenTable, CodepageName.EUC_KR) 59 | { 60 | 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Chinese/BIG5SMModel.cs: -------------------------------------------------------------------------------- 1 | using UtfUnknown.Core.Models; 2 | 3 | namespace UtfUnknown.Core.Models.MultiByte.Chinese 4 | { 5 | public class BIG5SMModel : StateMachineModel 6 | { 7 | private readonly static int[] BIG5_cls = { 8 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07 9 | BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f 10 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17 11 | BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f 12 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27 13 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f 14 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37 15 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f 16 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47 17 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f 18 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57 19 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f 20 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67 21 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f 22 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77 23 | BitPackage.Pack4bits(2,2,2,2,2,2,2,1), // 78 - 7f 24 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 80 - 87 25 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 88 - 8f 26 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 90 - 97 27 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 98 - 9f 28 | BitPackage.Pack4bits(4,3,3,3,3,3,3,3), // a0 - a7 29 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // a8 - af 30 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // b0 - b7 31 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // b8 - bf 32 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c0 - c7 33 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c8 - cf 34 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d0 - d7 35 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d8 - df 36 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7 37 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e8 - ef 38 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // f0 - f7 39 | BitPackage.Pack4bits(3,3,3,3,3,3,3,0) // f8 - ff 40 | }; 41 | 42 | private readonly static int[] BIG5_st = { 43 | BitPackage.Pack4bits(ERROR,START,START, 3,ERROR,ERROR,ERROR,ERROR),//00-07 44 | BitPackage.Pack4bits(ERROR,ERROR,ITSME,ITSME,ITSME,ITSME,ITSME,ERROR),//08-0f 45 | BitPackage.Pack4bits(ERROR,START,START,START,START,START,START,START) //10-17 46 | }; 47 | 48 | private readonly static int[] BIG5CharLenTable = {0, 1, 1, 2, 0}; 49 | 50 | public BIG5SMModel() : base( 51 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 52 | BitPackage.SHIFT_MASK_4BITS, 53 | BitPackage.BIT_SHIFT_4BITS, 54 | BitPackage.UNIT_MASK_4BITS, BIG5_cls), 55 | 5, 56 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 57 | BitPackage.SHIFT_MASK_4BITS, 58 | BitPackage.BIT_SHIFT_4BITS, 59 | BitPackage.UNIT_MASK_4BITS, BIG5_st), 60 | BIG5CharLenTable, CodepageName.BIG5) 61 | { 62 | 63 | } 64 | } 65 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/StateMachineModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * Kohei TAKETA (Java port) 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | using System; 40 | 41 | namespace UtfUnknown.Core.Models 42 | { 43 | /// 44 | /// State machine model 45 | /// 46 | public abstract class StateMachineModel 47 | { 48 | /// 49 | /// Start node 50 | /// 51 | public const int START = 0; 52 | 53 | /// 54 | /// Error node ? 55 | /// 56 | public const int ERROR = 1; 57 | 58 | /// 59 | /// ? 60 | /// 61 | public const int ITSME = 2; 62 | 63 | public BitPackage classTable; 64 | public BitPackage stateTable; 65 | public int[] charLenTable; 66 | 67 | public string Name { get; } 68 | 69 | public int ClassFactor { get; } 70 | 71 | public StateMachineModel(BitPackage classTable, int classFactor, 72 | BitPackage stateTable, int[] charLenTable, String name) 73 | { 74 | this.classTable = classTable; 75 | ClassFactor = classFactor; 76 | this.stateTable = stateTable; 77 | this.charLenTable = charLenTable; 78 | Name = name; 79 | } 80 | 81 | public int GetClass(byte b) 82 | { 83 | return classTable.Unpack((int)b); 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Korean/Iso_2022_KR_SMModel.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Models.MultiByte.Korean 2 | { 3 | public class Iso_2022_KR_SMModel : StateMachineModel 4 | { 5 | private readonly static int[] ISO2022KR_cls = { 6 | BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07 7 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f 8 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17 9 | BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f 10 | BitPackage.Pack4bits(0,0,0,0,3,0,0,0), // 20 - 27 11 | BitPackage.Pack4bits(0,4,0,0,0,0,0,0), // 28 - 2f 12 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37 13 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f 14 | BitPackage.Pack4bits(0,0,0,5,0,0,0,0), // 40 - 47 15 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f 16 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57 17 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f 18 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67 19 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f 20 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77 21 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f 22 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87 23 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f 24 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97 25 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f 26 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7 27 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af 28 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7 29 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf 30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7 31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf 32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7 33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df 34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7 35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef 36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7 37 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff 38 | }; 39 | 40 | private readonly static int[] ISO2022KR_st = { 41 | BitPackage.Pack4bits(START, 3,ERROR,START,START,START,ERROR,ERROR), //00-07 42 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME), //08-0f 43 | BitPackage.Pack4bits(ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR,ERROR), //10-17 44 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR, 5,ERROR,ERROR,ERROR), //18-1f 45 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,START,START,START,START) //20-27 46 | }; 47 | 48 | private readonly static int[] ISO2022KRCharLenTable = {0, 0, 0, 0, 0, 0}; 49 | 50 | public Iso_2022_KR_SMModel() : base( 51 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 52 | BitPackage.SHIFT_MASK_4BITS, 53 | BitPackage.BIT_SHIFT_4BITS, 54 | BitPackage.UNIT_MASK_4BITS, ISO2022KR_cls), 55 | 6, 56 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 57 | BitPackage.SHIFT_MASK_4BITS, 58 | BitPackage.BIT_SHIFT_4BITS, 59 | BitPackage.UNIT_MASK_4BITS, ISO2022KR_st), 60 | ISO2022KRCharLenTable, CodepageName.ISO_2022_KR) 61 | { 62 | 63 | } 64 | } 65 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Japanese/SJIS_SMModel.cs: -------------------------------------------------------------------------------- 1 | using UtfUnknown.Core.Models; 2 | 3 | namespace UtfUnknown.Core.Models.MultiByte.Japanese 4 | { 5 | public class SJIS_SMModel : StateMachineModel 6 | { 7 | private readonly static int[] SJIS_cls = { 8 | //BitPacket.Pack4bits(0,1,1,1,1,1,1,1), // 00 - 07 9 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07 10 | BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f 11 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17 12 | BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f 13 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27 14 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f 15 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37 16 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f 17 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47 18 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f 19 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57 20 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f 21 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67 22 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f 23 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77 24 | BitPackage.Pack4bits(2,2,2,2,2,2,2,1), // 78 - 7f 25 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 80 - 87 26 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 88 - 8f 27 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 90 - 97 28 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 98 - 9f 29 | //0xa0 is illegal in sjis encoding, but some pages does 30 | //contain such byte. We need to be more error forgiven. 31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7 32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af 33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7 34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf 35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7 36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf 37 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7 38 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df 39 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7 40 | BitPackage.Pack4bits(3,3,3,3,3,4,4,4), // e8 - ef 41 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // f0 - f7 42 | BitPackage.Pack4bits(4,4,4,4,4,0,0,0) // f8 - ff 43 | }; 44 | 45 | private readonly static int[] SJIS_st = { 46 | BitPackage.Pack4bits(ERROR,START,START, 3,ERROR,ERROR,ERROR,ERROR),//00-07 47 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f 48 | BitPackage.Pack4bits(ITSME,ITSME,ERROR,ERROR,START,START,START,START) //10-17 49 | }; 50 | 51 | private readonly static int[] SJISCharLenTable = { 0, 1, 1, 2, 0, 0 }; 52 | 53 | public SJIS_SMModel() : base( 54 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 55 | BitPackage.SHIFT_MASK_4BITS, 56 | BitPackage.BIT_SHIFT_4BITS, 57 | BitPackage.UNIT_MASK_4BITS, SJIS_cls), 58 | 6, 59 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 60 | BitPackage.SHIFT_MASK_4BITS, 61 | BitPackage.BIT_SHIFT_4BITS, 62 | BitPackage.UNIT_MASK_4BITS, SJIS_st), 63 | SJISCharLenTable, CodepageName.SHIFT_JIS) 64 | { 65 | 66 | } 67 | } 68 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Probers/CodingStateMachine.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is mozilla.org code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * Shy Shalom 23 | * Kohei TAKETA (Java port) 24 | * Rudi Pettazzi (C# port) 25 | * 26 | * Alternatively, the contents of this file may be used under the terms of 27 | * either the GNU General Public License Version 2 or later (the "GPL"), or 28 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 | * in which case the provisions of the GPL or the LGPL are applicable instead 30 | * of those above. If you wish to allow use of your version of this file only 31 | * under the terms of either the GPL or the LGPL, and not to allow others to 32 | * use your version of this file under the terms of the MPL, indicate your 33 | * decision by deleting the provisions above and replace them with the notice 34 | * and other provisions required by the GPL or the LGPL. If you do not delete 35 | * the provisions above, a recipient may use your version of this file under 36 | * the terms of any one of the MPL, the GPL or the LGPL. 37 | * 38 | * ***** END LICENSE BLOCK ***** */ 39 | 40 | using UtfUnknown.Core.Models; 41 | 42 | namespace UtfUnknown.Core.Probers 43 | { 44 | /// 45 | /// Parallel state machine for the Coding Scheme Method 46 | /// 47 | public class CodingStateMachine 48 | { 49 | private int currentState; 50 | private StateMachineModel model; 51 | private int currentCharLen; 52 | 53 | 54 | public CodingStateMachine(StateMachineModel model) 55 | { 56 | currentState = StateMachineModel.START; 57 | this.model = model; 58 | } 59 | 60 | public int NextState(byte b) 61 | { 62 | // for each byte we get its class, if it is first byte, 63 | // we also get byte length 64 | int byteCls = model.GetClass(b); 65 | if (currentState == StateMachineModel.START) { 66 | 67 | currentCharLen = model.charLenTable[byteCls]; 68 | } 69 | 70 | // from byte's class and stateTable, we get its next state 71 | currentState = model.stateTable.Unpack( 72 | currentState * model.ClassFactor + byteCls); 73 | 74 | return currentState; 75 | } 76 | 77 | public void Reset() 78 | { 79 | currentState = StateMachineModel.START; 80 | } 81 | 82 | public int CurrentCharLen 83 | { 84 | get { return currentCharLen; } 85 | } 86 | 87 | public string ModelName 88 | { 89 | get { return model.Name; } 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Chinese/EUCTWSMModel.cs: -------------------------------------------------------------------------------- 1 | namespace UtfUnknown.Core.Models.MultiByte.Chinese 2 | { 3 | public class EUCTWSMModel : StateMachineModel 4 | { 5 | private readonly static int[] EUCTW_cls = { 6 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 00 - 07 7 | BitPackage.Pack4bits(2,2,2,2,2,2,0,0), // 08 - 0f 8 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 10 - 17 9 | BitPackage.Pack4bits(2,2,2,0,2,2,2,2), // 18 - 1f 10 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 20 - 27 11 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 28 - 2f 12 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 30 - 37 13 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 38 - 3f 14 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47 15 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f 16 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57 17 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f 18 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67 19 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f 20 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77 21 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 78 - 7f 22 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87 23 | BitPackage.Pack4bits(0,0,0,0,0,0,6,0), // 88 - 8f 24 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97 25 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f 26 | BitPackage.Pack4bits(0,3,4,4,4,4,4,4), // a0 - a7 27 | BitPackage.Pack4bits(5,5,1,1,1,1,1,1), // a8 - af 28 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b0 - b7 29 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b8 - bf 30 | BitPackage.Pack4bits(1,1,3,1,3,3,3,3), // c0 - c7 31 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c8 - cf 32 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d0 - d7 33 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d8 - df 34 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7 35 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e8 - ef 36 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // f0 - f7 37 | BitPackage.Pack4bits(3,3,3,3,3,3,3,0) // f8 - ff 38 | }; 39 | 40 | private readonly static int[] EUCTW_st = { 41 | BitPackage.Pack4bits(ERROR,ERROR,START, 3, 3, 3, 4,ERROR),//00-07 42 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//08-0f 43 | BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ERROR,START,ERROR),//10-17 44 | BitPackage.Pack4bits(START,START,START,ERROR,ERROR,ERROR,ERROR,ERROR),//18-1f 45 | BitPackage.Pack4bits( 5,ERROR,ERROR,ERROR,START,ERROR,START,START),//20-27 46 | BitPackage.Pack4bits(START,ERROR,START,START,START,START,START,START) //28-2f 47 | }; 48 | 49 | private readonly static int[] EUCTWCharLenTable = { 0, 0, 1, 2, 2, 2, 3 }; 50 | 51 | public EUCTWSMModel() : base( 52 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 53 | BitPackage.SHIFT_MASK_4BITS, 54 | BitPackage.BIT_SHIFT_4BITS, 55 | BitPackage.UNIT_MASK_4BITS, EUCTW_cls), 56 | 7, 57 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 58 | BitPackage.SHIFT_MASK_4BITS, 59 | BitPackage.BIT_SHIFT_4BITS, 60 | BitPackage.UNIT_MASK_4BITS, EUCTW_st), 61 | EUCTWCharLenTable, CodepageName.EUC_TW) 62 | { 63 | 64 | } 65 | } 66 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Japanese/EUCJPSMModel.cs: -------------------------------------------------------------------------------- 1 | using UtfUnknown.Core.Models; 2 | 3 | namespace UtfUnknown.Core.Models.MultiByte.Japanese 4 | { 5 | public class EUCJPSMModel : StateMachineModel 6 | { 7 | private readonly static int[] EUCJP_cls = { 8 | //BitPacket.Pack4bits(5,4,4,4,4,4,4,4), // 00 - 07 9 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 00 - 07 10 | BitPackage.Pack4bits(4,4,4,4,4,4,5,5), // 08 - 0f 11 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 10 - 17 12 | BitPackage.Pack4bits(4,4,4,5,4,4,4,4), // 18 - 1f 13 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 20 - 27 14 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 28 - 2f 15 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 30 - 37 16 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 38 - 3f 17 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 40 - 47 18 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 48 - 4f 19 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 50 - 57 20 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 58 - 5f 21 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 60 - 67 22 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 68 - 6f 23 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 70 - 77 24 | BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 78 - 7f 25 | BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 80 - 87 26 | BitPackage.Pack4bits(5,5,5,5,5,5,1,3), // 88 - 8f 27 | BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 90 - 97 28 | BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 98 - 9f 29 | BitPackage.Pack4bits(5,2,2,2,2,2,2,2), // a0 - a7 30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af 31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7 32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf 33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7 34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf 35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7 36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df 37 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7 38 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef 39 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7 40 | BitPackage.Pack4bits(0,0,0,0,0,0,0,5) // f8 - ff 41 | }; 42 | 43 | private readonly static int[] EUCJP_st = { 44 | BitPackage.Pack4bits( 3, 4, 3, 5,START,ERROR,ERROR,ERROR),//00-07 45 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f 46 | BitPackage.Pack4bits(ITSME,ITSME,START,ERROR,START,ERROR,ERROR,ERROR),//10-17 47 | BitPackage.Pack4bits(ERROR,ERROR,START,ERROR,ERROR,ERROR, 3,ERROR),//18-1f 48 | BitPackage.Pack4bits( 3,ERROR,ERROR,ERROR,START,START,START,START) //20-27 49 | }; 50 | 51 | private readonly static int[] EUCJPCharLenTable = { 2, 2, 2, 3, 1, 0 }; 52 | 53 | public EUCJPSMModel() : base( 54 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 55 | BitPackage.SHIFT_MASK_4BITS, 56 | BitPackage.BIT_SHIFT_4BITS, 57 | BitPackage.UNIT_MASK_4BITS, EUCJP_cls), 58 | 6, 59 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 60 | BitPackage.SHIFT_MASK_4BITS, 61 | BitPackage.BIT_SHIFT_4BITS, 62 | BitPackage.UNIT_MASK_4BITS, EUCJP_st), 63 | EUCJPCharLenTable, CodepageName.EUC_JP) 64 | { 65 | 66 | } 67 | } 68 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/UCS2BE_SMModel.cs: -------------------------------------------------------------------------------- 1 | using UtfUnknown.Core.Models; 2 | 3 | namespace UtfUnknown.Core.Models.MultiByte 4 | { 5 | public class UCS2BE_SMModel : StateMachineModel 6 | { 7 | private readonly static int[] UCS2BE_cls = { 8 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 00 - 07 9 | BitPackage.Pack4bits(0,0,1,0,0,2,0,0), // 08 - 0f 10 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17 11 | BitPackage.Pack4bits(0,0,0,3,0,0,0,0), // 18 - 1f 12 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27 13 | BitPackage.Pack4bits(0,3,3,3,3,3,0,0), // 28 - 2f 14 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37 15 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f 16 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 40 - 47 17 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f 18 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57 19 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f 20 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67 21 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f 22 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77 23 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f 24 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87 25 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f 26 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97 27 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f 28 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a0 - a7 29 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a8 - af 30 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b0 - b7 31 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b8 - bf 32 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c0 - c7 33 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c8 - cf 34 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d0 - d7 35 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d8 - df 36 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7 37 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef 38 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7 39 | BitPackage.Pack4bits(0,0,0,0,0,0,4,5) // f8 - ff 40 | }; 41 | 42 | private readonly static int[] UCS2BE_st = { 43 | BitPackage.Pack4bits( 5, 7, 7,ERROR, 4, 3,ERROR,ERROR),//00-07 44 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f 45 | BitPackage.Pack4bits(ITSME,ITSME, 6, 6, 6, 6,ERROR,ERROR),//10-17 46 | BitPackage.Pack4bits( 6, 6, 6, 6, 6,ITSME, 6, 6),//18-1f 47 | BitPackage.Pack4bits( 6, 6, 6, 6, 5, 7, 7,ERROR),//20-27 48 | BitPackage.Pack4bits( 5, 8, 6, 6,ERROR, 6, 6, 6),//28-2f 49 | BitPackage.Pack4bits( 6, 6, 6, 6,ERROR,ERROR,START,START) //30-37 50 | }; 51 | 52 | private readonly static int[] UCS2BECharLenTable = { 2, 2, 2, 0, 2, 2 }; 53 | 54 | public UCS2BE_SMModel() : base( 55 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 56 | BitPackage.SHIFT_MASK_4BITS, 57 | BitPackage.BIT_SHIFT_4BITS, 58 | BitPackage.UNIT_MASK_4BITS, UCS2BE_cls), 59 | 6, 60 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 61 | BitPackage.SHIFT_MASK_4BITS, 62 | BitPackage.BIT_SHIFT_4BITS, 63 | BitPackage.UNIT_MASK_4BITS, UCS2BE_st), 64 | UCS2BECharLenTable, CodepageName.UTF16_BE) 65 | { 66 | 67 | } 68 | } 69 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/UCS2LE_SMModel.cs: -------------------------------------------------------------------------------- 1 | using UtfUnknown.Core.Models; 2 | 3 | namespace UtfUnknown.Core.Models.MultiByte 4 | { 5 | public class UCS2LE_SMModel : StateMachineModel 6 | { 7 | private readonly static int[] UCS2LE_cls = { 8 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 00 - 07 9 | BitPackage.Pack4bits(0,0,1,0,0,2,0,0), // 08 - 0f 10 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17 11 | BitPackage.Pack4bits(0,0,0,3,0,0,0,0), // 18 - 1f 12 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27 13 | BitPackage.Pack4bits(0,3,3,3,3,3,0,0), // 28 - 2f 14 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37 15 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f 16 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 40 - 47 17 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f 18 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57 19 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f 20 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67 21 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f 22 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77 23 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f 24 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87 25 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f 26 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97 27 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f 28 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a0 - a7 29 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // a8 - af 30 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b0 - b7 31 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // b8 - bf 32 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c0 - c7 33 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // c8 - cf 34 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d0 - d7 35 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // d8 - df 36 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7 37 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef 38 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7 39 | BitPackage.Pack4bits(0,0,0,0,0,0,4,5) // f8 - ff 40 | }; 41 | 42 | private readonly static int[] UCS2LE_st = { 43 | BitPackage.Pack4bits( 6, 6, 7, 6, 4, 3,ERROR,ERROR),//00-07 44 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f 45 | BitPackage.Pack4bits(ITSME,ITSME, 5, 5, 5,ERROR,ITSME,ERROR),//10-17 46 | BitPackage.Pack4bits( 5, 5, 5,ERROR, 5,ERROR, 6, 6),//18-1f 47 | BitPackage.Pack4bits( 7, 6, 8, 8, 5, 5, 5,ERROR),//20-27 48 | BitPackage.Pack4bits( 5, 5, 5,ERROR,ERROR,ERROR, 5, 5),//28-2f 49 | BitPackage.Pack4bits( 5, 5, 5,ERROR, 5,ERROR,START,START) //30-37 50 | }; 51 | 52 | private readonly static int[] UCS2LECharLenTable = { 2, 2, 2, 2, 2, 2 }; 53 | 54 | public UCS2LE_SMModel() : base( 55 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 56 | BitPackage.SHIFT_MASK_4BITS, 57 | BitPackage.BIT_SHIFT_4BITS, 58 | BitPackage.UNIT_MASK_4BITS, UCS2LE_cls), 59 | 6, 60 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 61 | BitPackage.SHIFT_MASK_4BITS, 62 | BitPackage.BIT_SHIFT_4BITS, 63 | BitPackage.UNIT_MASK_4BITS, UCS2LE_st), 64 | UCS2LECharLenTable, CodepageName.UTF16_LE) 65 | { 66 | 67 | } 68 | } 69 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Russian/Ibm866_RussianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Russian 44 | { 45 | public class Ibm866_RussianModel : RussianModel 46 | { 47 | private readonly static byte[] CHAR_TO_ORDER_MAP = { 48 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 49 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 50 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 51 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 52 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 53 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 54 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 55 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 56 | 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, 57 | 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, 58 | 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, 59 | 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, 60 | 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, 61 | 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, 62 | 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, 63 | 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, 64 | }; 65 | 66 | public Ibm866_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM866) 67 | { 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Russian/Ibm855_RussianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Russian 44 | { 45 | public class Ibm855_RussianModel : RussianModel 46 | { 47 | private readonly static byte[] BYTE_TO_ORDER_MAP = { 48 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 49 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 50 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 51 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 52 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 53 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 54 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 55 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 56 | 191,192,193,194, 68,195,196,197,198,199,200,201,202,203,204,205, 57 | 206,207,208,209,210,211,212,213,214,215,216,217, 27, 59, 54, 70, 58 | 3, 37, 21, 44, 28, 58, 13, 41, 2, 48, 39, 53, 19, 46,218,219, 59 | 220,221,222,223,224, 26, 55, 4, 42,225,226,227,228, 23, 60,229, 60 | 230,231,232,233,234,235, 11, 36,236,237,238,239,240,241,242,243, 61 | 8, 49, 12, 38, 5, 31, 1, 34, 15,244,245,246,247, 35, 16,248, 62 | 43, 9, 45, 7, 32, 6, 40, 14, 52, 24, 56, 10, 33, 17, 61,249, 63 | 250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,NUM,CTR, 64 | }; 65 | 66 | public Ibm855_RussianModel() : base(BYTE_TO_ORDER_MAP, CodepageName.IBM855) 67 | { 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Chinese/Iso_2022_CN_SMModel.cs: -------------------------------------------------------------------------------- 1 | using UtfUnknown.Core.Models; 2 | 3 | namespace UtfUnknown.Core.Models.MultiByte.Chinese 4 | { 5 | public class Iso_2022_CN_SMModel : StateMachineModel 6 | { 7 | private readonly static int[] ISO2022CN_cls = { 8 | BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07 9 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f 10 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17 11 | BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f 12 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27 13 | BitPackage.Pack4bits(0,3,0,0,0,0,0,0), // 28 - 2f 14 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37 15 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f 16 | BitPackage.Pack4bits(0,0,0,4,0,0,0,0), // 40 - 47 17 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f 18 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57 19 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f 20 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67 21 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f 22 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77 23 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f 24 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87 25 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f 26 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97 27 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f 28 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7 29 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af 30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7 31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf 32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7 33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf 34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7 35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df 36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7 37 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef 38 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7 39 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff 40 | }; 41 | 42 | private readonly static int[] ISO2022CN_st = { 43 | BitPackage.Pack4bits(START, 3,ERROR,START,START,START,START,START), //00-07 44 | BitPackage.Pack4bits(START,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR), //08-0f 45 | BitPackage.Pack4bits(ERROR,ERROR,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME), //10-17 46 | BitPackage.Pack4bits(ITSME,ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR), //18-1f 47 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR), //20-27 48 | BitPackage.Pack4bits( 5, 6,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR), //28-2f 49 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR), //30-37 50 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START) //38-3f 51 | }; 52 | 53 | private readonly static int[] ISO2022CNCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 54 | 55 | public Iso_2022_CN_SMModel() : base( 56 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 57 | BitPackage.SHIFT_MASK_4BITS, 58 | BitPackage.BIT_SHIFT_4BITS, 59 | BitPackage.UNIT_MASK_4BITS, ISO2022CN_cls), 60 | 9, 61 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 62 | BitPackage.SHIFT_MASK_4BITS, 63 | BitPackage.BIT_SHIFT_4BITS, 64 | BitPackage.UNIT_MASK_4BITS, ISO2022CN_st), 65 | ISO2022CNCharLenTable, CodepageName.ISO_2022_CN) 66 | { 67 | 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Russian/Koi8r_Model.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Russian 44 | { 45 | public class Koi8r_Model : RussianModel 46 | { 47 | private readonly static byte[] CHAR_TO_ORDER_MAP = { 48 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 49 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 50 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 51 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 52 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, /* 4X */ 53 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, /* 5X */ 54 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, /* 6X */ 55 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, /* 7X */ 56 | 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, /* 8X */ 57 | 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, /* 9X */ 58 | 223,224,225, 68,226,227,228,229,230,231,232,233,234,235,236,237, /* AX */ 59 | 238,239,240,241,242,243,244,245,246,247,248,249,250,251,NUM,SYM, /* BX */ 60 | 27, 3, 21, 28, 13, 2, 39, 19, 26, 4, 23, 11, 8, 12, 5, 1, /* CX */ 61 | 15, 16, 9, 7, 6, 14, 24, 10, 17, 18, 20, 25, 30, 29, 22, 54, /* DX */ 62 | 59, 37, 44, 58, 41, 48, 53, 46, 55, 42, 60, 36, 49, 38, 31, 34, /* EX */ 63 | 35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, /* FX */ 64 | }; 65 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 66 | 67 | public Koi8r_Model() : base(CHAR_TO_ORDER_MAP, CodepageName.KOI8_R) 68 | { 69 | } 70 | } 71 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Russian/Iso_8859_5_RussianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Russian 44 | { 45 | public class Iso_8859_5_RussianModel : RussianModel 46 | { 47 | private readonly static byte[] CHAR_TO_ORDER_MAP = { 48 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 49 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 50 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 51 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 52 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, /* 4X */ 53 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, /* 5X */ 54 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, /* 6X */ 55 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, /* 7X */ 56 | 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, /* 8X */ 57 | 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, /* 9X */ 58 | 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, /* AX */ 59 | 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, /* BX */ 60 | 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, /* CX */ 61 | 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, /* DX */ 62 | 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, /* EX */ 63 | 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, /* FX */ 64 | }; 65 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 66 | 67 | public Iso_8859_5_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_5) 68 | { 69 | } 70 | } 71 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Russian/Windows_1251_RussianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Russian 44 | { 45 | public class Windows_1251_RussianModel : RussianModel 46 | { 47 | private readonly static byte[] CHAR_TO_ORDER_MAP = { 48 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 49 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 50 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 51 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 52 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, /* 4X */ 53 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, /* 5X */ 54 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, /* 6X */ 55 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, /* 7X */ 56 | 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, /* 8X */ 57 | 207,208,209,210,211,212,213,214,ILL,216,217,218,219,220,221,222, /* 9X */ 58 | 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, /* AX */ 59 | 239,240,241,242,243,244,245,246, 68,247,248,249,250,251,NUM,SYM, /* BX */ 60 | 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, /* CX */ 61 | 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, /* DX */ 62 | 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, /* EX */ 63 | 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, /* FX */ 64 | }; 65 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 66 | 67 | public Windows_1251_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1251) 68 | { 69 | } 70 | } 71 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Russian/X_Mac_Cyrillic_RussianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRussianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Russian 44 | { 45 | public class X_Mac_Cyrillic_RussianModel : RussianModel 46 | { 47 | private readonly static byte[] CHAR_TO_ORDER_MAP = { 48 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 49 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 50 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 51 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 52 | SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, /* 4X */ 53 | 155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, /* 5X */ 54 | SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, /* 6X */ 55 | 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, /* 7X */ 56 | 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, /* 8X */ 57 | 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, /* 9X */ 58 | 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, /* AX */ 59 | 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, /* BX */ 60 | 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, /* CX */ 61 | 239,240,241,242,243,244,245,246,247,248,249,250,251,NUM, 68, 16, /* DX */ 62 | 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, /* EX */ 63 | 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,CTR, /* FX */ 64 | }; 65 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 66 | 67 | public X_Mac_Cyrillic_RussianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.X_MAC_CYRILLIC) 68 | { 69 | } 70 | } 71 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Japanese/Iso_2022_JP_SMModel.cs: -------------------------------------------------------------------------------- 1 | using UtfUnknown.Core.Models; 2 | 3 | namespace UtfUnknown.Core.Models.MultiByte.Japanese 4 | { 5 | public class Iso_2022_JP_SMModel : StateMachineModel 6 | { 7 | private readonly static int[] ISO2022JP_cls = { 8 | BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07 9 | BitPackage.Pack4bits(0,0,0,0,0,0,2,2), // 08 - 0f 10 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17 11 | BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f 12 | BitPackage.Pack4bits(0,0,0,0,7,0,0,0), // 20 - 27 13 | BitPackage.Pack4bits(3,0,0,0,0,0,0,0), // 28 - 2f 14 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37 15 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f 16 | BitPackage.Pack4bits(6,0,4,0,8,0,0,0), // 40 - 47 17 | BitPackage.Pack4bits(0,9,5,0,0,0,0,0), // 48 - 4f 18 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57 19 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f 20 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67 21 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f 22 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77 23 | BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f 24 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87 25 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f 26 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97 27 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f 28 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7 29 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af 30 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7 31 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf 32 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7 33 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf 34 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7 35 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df 36 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7 37 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef 38 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7 39 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff 40 | }; 41 | 42 | private readonly static int[] ISO2022JP_st = { 43 | BitPackage.Pack4bits(START, 3, ERROR,START,START,START,START,START), //00-07 44 | BitPackage.Pack4bits(START, START, ERROR,ERROR,ERROR,ERROR,ERROR,ERROR), //08-0f 45 | BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ITSME,ITSME,ITSME), //10-17 46 | BitPackage.Pack4bits(ITSME, ITSME, ITSME,ITSME,ITSME,ITSME,ERROR,ERROR), //18-1f 47 | BitPackage.Pack4bits(ERROR, 5, ERROR,ERROR,ERROR, 4,ERROR,ERROR), //20-27 48 | BitPackage.Pack4bits(ERROR, ERROR, ERROR, 6,ITSME,ERROR,ITSME,ERROR), //28-2f 49 | BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ERROR,ERROR,ITSME,ITSME), //30-37 50 | BitPackage.Pack4bits(ERROR, ERROR, ERROR,ITSME,ERROR,ERROR,ERROR,ERROR), //38-3f 51 | BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ERROR,START,START) //40-47 52 | }; 53 | 54 | private readonly static int[] ISO2022JPCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 55 | 56 | public Iso_2022_JP_SMModel() : base( 57 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 58 | BitPackage.SHIFT_MASK_4BITS, 59 | BitPackage.BIT_SHIFT_4BITS, 60 | BitPackage.UNIT_MASK_4BITS, ISO2022JP_cls), 61 | 10, 62 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 63 | BitPackage.SHIFT_MASK_4BITS, 64 | BitPackage.BIT_SHIFT_4BITS, 65 | BitPackage.UNIT_MASK_4BITS, ISO2022JP_st), 66 | ISO2022JPCharLenTable, CodepageName.ISO_2022_JP) 67 | { 68 | 69 | } 70 | 71 | } 72 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/MultiByte/Chinese/GB18030_SMModel.cs: -------------------------------------------------------------------------------- 1 | using UtfUnknown.Core.Models; 2 | 3 | namespace UtfUnknown.Core.Models.MultiByte.Chinese 4 | { 5 | public class GB18030_SMModel : StateMachineModel 6 | { 7 | private readonly static int[] GB18030_cls = { 8 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07 9 | BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f 10 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17 11 | BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f 12 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27 13 | BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f 14 | BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 30 - 37 15 | BitPackage.Pack4bits(3,3,1,1,1,1,1,1), // 38 - 3f 16 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47 17 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f 18 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57 19 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f 20 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67 21 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f 22 | BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77 23 | BitPackage.Pack4bits(2,2,2,2,2,2,2,4), // 78 - 7f 24 | BitPackage.Pack4bits(5,6,6,6,6,6,6,6), // 80 - 87 25 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 88 - 8f 26 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 90 - 97 27 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 98 - 9f 28 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // a0 - a7 29 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // a8 - af 30 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // b0 - b7 31 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // b8 - bf 32 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // c0 - c7 33 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // c8 - cf 34 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d0 - d7 35 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d8 - df 36 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // e0 - e7 37 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // e8 - ef 38 | BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // f0 - f7 39 | BitPackage.Pack4bits(6,6,6,6,6,6,6,0) // f8 - ff 40 | }; 41 | 42 | private readonly static int[] GB18030_st = { 43 | BitPackage.Pack4bits(ERROR,START,START,START,START,START, 3,ERROR),//00-07 44 | BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//08-0f 45 | BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ERROR,ERROR,START),//10-17 46 | BitPackage.Pack4bits( 4,ERROR,START,START,ERROR,ERROR,ERROR,ERROR),//18-1f 47 | BitPackage.Pack4bits(ERROR,ERROR, 5,ERROR,ERROR,ERROR,ITSME,ERROR),//20-27 48 | BitPackage.Pack4bits(ERROR,ERROR,START,START,START,START,START,START) //28-2f 49 | }; 50 | 51 | // To be accurate, the length of class 6 can be either 2 or 4. 52 | // But it is not necessary to discriminate between the two since 53 | // it is used for frequency analysis only, and we are validating 54 | // each code range there as well. So it is safe to set it to be 55 | // 2 here. 56 | private readonly static int[] GB18030CharLenTable = {0, 1, 1, 1, 1, 1, 2}; 57 | 58 | public GB18030_SMModel() : base( 59 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 60 | BitPackage.SHIFT_MASK_4BITS, 61 | BitPackage.BIT_SHIFT_4BITS, 62 | BitPackage.UNIT_MASK_4BITS, GB18030_cls), 63 | 7, 64 | new BitPackage(BitPackage.INDEX_SHIFT_4BITS, 65 | BitPackage.SHIFT_MASK_4BITS, 66 | BitPackage.BIT_SHIFT_4BITS, 67 | BitPackage.UNIT_MASK_4BITS, GB18030_st), 68 | GB18030CharLenTable, CodepageName.GB18030) 69 | { 70 | 71 | } 72 | } 73 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/TextEncoding.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using System.Text; 4 | using UtfUnknown; 5 | 6 | namespace EncodingChecker 7 | { 8 | public static class TextEncoding 9 | { 10 | /// 11 | /// https://netvignettes.wordpress.com/2011/07/03/how-to-detect-encoding/ 12 | /// 13 | private static readonly DecoderExceptionFallback DecoderExceptionFallback = new DecoderExceptionFallback(); 14 | public static bool Validate(this Encoding encoding, byte[] bytes, int offset = 0, int? length = null) 15 | { 16 | if (encoding == null) 17 | { 18 | throw new ArgumentNullException(nameof(encoding)); 19 | } 20 | if (bytes == null) 21 | { 22 | throw new ArgumentNullException(nameof(bytes)); 23 | } 24 | length = length ?? bytes.Length; 25 | if (offset < 0 || offset > bytes.Length) 26 | { 27 | throw new ArgumentOutOfRangeException(nameof(offset), @"Offset is out of range."); 28 | } 29 | if (length < 0 || length > bytes.Length) 30 | { 31 | throw new ArgumentOutOfRangeException(nameof(length), @"Length is out of range."); 32 | } 33 | else if ((offset + length) > bytes.Length) 34 | { 35 | throw new ArgumentOutOfRangeException(nameof(offset), @"The specified range is outside of the specified buffer."); 36 | } 37 | var decoder = encoding.GetDecoder(); 38 | decoder.Fallback = DecoderExceptionFallback; 39 | try 40 | { 41 | decoder.GetCharCount(bytes, offset, length.Value); 42 | } 43 | catch (DecoderFallbackException) 44 | { 45 | return false; 46 | } 47 | return true; 48 | } 49 | 50 | /// 51 | /// Get the System.Text.Encoding of this file. 52 | /// 53 | /// Path to file 54 | /// System.Text.Encoding (can be null if not available or not supported by .NET). 55 | public static Encoding GetFileEncoding(string filePath, ref bool hasBOM) 56 | { 57 | return GetFileEncoding(filePath, null, ref hasBOM); 58 | } 59 | 60 | /// 61 | /// Get the System.Text.Encoding of this file. 62 | /// 63 | /// Path to file 64 | /// max bytes to read from . If null, then no max 65 | /// System.Text.Encoding (can be null if not available or not supported by .NET). 66 | public static Encoding GetFileEncoding(string filePath, int? maxBytesToRead, ref bool hasBOM) 67 | { 68 | hasBOM = false; 69 | try 70 | { 71 | using (FileStream stream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) 72 | { 73 | // Check for possible UTF-16 encoding (LE or BE). 74 | Encoding encoding = Utf16Detector.DetectFromStream(stream, maxBytesToRead); 75 | if (encoding != null) 76 | { 77 | return encoding; 78 | } 79 | // https://github.com/CharsetDetector/UTF-unknown 80 | stream.Position = 0L; 81 | var result = CharsetDetector.DetectFromStream(stream, maxBytesToRead); 82 | if (result.Detected != null) 83 | { 84 | hasBOM = result.Detected.HasBOM; 85 | return result.Detected.Encoding; 86 | } 87 | return null; 88 | } 89 | } 90 | catch 91 | { 92 | return null; 93 | } 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/BitPackage.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * Kohei TAKETA (Java port) 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | namespace UtfUnknown.Core 40 | { 41 | public class BitPackage 42 | { 43 | public static int INDEX_SHIFT_4BITS = 3; 44 | public static int INDEX_SHIFT_8BITS = 2; 45 | public static int INDEX_SHIFT_16BITS = 1; 46 | 47 | public static int SHIFT_MASK_4BITS = 7; 48 | public static int SHIFT_MASK_8BITS = 3; 49 | public static int SHIFT_MASK_16BITS = 1; 50 | 51 | public static int BIT_SHIFT_4BITS = 2; 52 | public static int BIT_SHIFT_8BITS = 3; 53 | public static int BIT_SHIFT_16BITS = 4; 54 | 55 | public static int UNIT_MASK_4BITS = 0x0000000F; 56 | public static int UNIT_MASK_8BITS = 0x000000FF; 57 | public static int UNIT_MASK_16BITS = 0x0000FFFF; 58 | 59 | private int indexShift; 60 | private int shiftMask; 61 | private int bitShift; 62 | private int unitMask; 63 | private int[] data; 64 | 65 | public BitPackage(int indexShift, int shiftMask, 66 | int bitShift, int unitMask, int[] data) 67 | { 68 | this.indexShift = indexShift; 69 | this.shiftMask = shiftMask; 70 | this.bitShift = bitShift; 71 | this.unitMask = unitMask; 72 | this.data = data; 73 | } 74 | 75 | public static int Pack16bits(int a, int b) 76 | { 77 | return ((b << 16) | a); 78 | } 79 | 80 | public static int Pack8bits(int a, int b, int c, int d) 81 | { 82 | return Pack16bits((b << 8) | a, (d << 8) | c); 83 | } 84 | 85 | public static int Pack4bits(int a, int b, int c, int d, 86 | int e, int f, int g, int h) 87 | { 88 | return Pack8bits((b << 4) | a, (d << 4) | c, 89 | (f << 4) | e, (h << 4) | g); 90 | } 91 | 92 | public int Unpack(int i) 93 | { 94 | return (data[i >> indexShift] >> 95 | ((i & shiftMask) << bitShift)) & unitMask; 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /sources/EncodingChecker/ListViewExtensions.cs: -------------------------------------------------------------------------------- 1 | // https://stackoverflow.com/questions/254129/how-to-i-display-a-sort-arrow-in-the-header-of-a-list-view-column-using-c 2 | using System; 3 | using System.ComponentModel; 4 | using System.Runtime.InteropServices; 5 | using System.Windows.Forms; 6 | 7 | namespace EncodingChecker 8 | { 9 | [EditorBrowsable(EditorBrowsableState.Never)] 10 | public static class ListViewExtensions 11 | { 12 | [StructLayout(LayoutKind.Sequential)] 13 | public struct HDITEM 14 | { 15 | public Mask mask; 16 | public int cxy; 17 | [MarshalAs(UnmanagedType.LPTStr)] public string pszText; 18 | public IntPtr hbm; 19 | public int cchTextMax; 20 | public Format fmt; 21 | public IntPtr lParam; 22 | // _WIN32_IE >= 0x0300 23 | public int iImage; 24 | public int iOrder; 25 | // _WIN32_IE >= 0x0500 26 | public uint type; 27 | public IntPtr pvFilter; 28 | // _WIN32_WINNT >= 0x0600 29 | public uint state; 30 | 31 | [Flags] 32 | public enum Mask 33 | { 34 | Format = 0x4, // HDI_FORMAT 35 | }; 36 | 37 | [Flags] 38 | public enum Format 39 | { 40 | SortDown = 0x200, // HDF_SORTDOWN 41 | SortUp = 0x400, // HDF_SORTUP 42 | }; 43 | }; 44 | 45 | public const int LVM_FIRST = 0x1000; 46 | public const int LVM_GETHEADER = LVM_FIRST + 31; 47 | 48 | public const int HDM_FIRST = 0x1200; 49 | public const int HDM_GETITEM = HDM_FIRST + 11; 50 | public const int HDM_SETITEM = HDM_FIRST + 12; 51 | 52 | [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)] 53 | public static extern IntPtr SendMessage(this IntPtr hWnd, UInt32 msg, IntPtr wParam, IntPtr lParam); 54 | 55 | [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)] 56 | public static extern IntPtr SendMessage(this IntPtr hWnd, UInt32 msg, IntPtr wParam, ref HDITEM lParam); 57 | 58 | public static void SetSortIcon(this ListView listViewControl, int columnIndex, SortOrder order) 59 | { 60 | IntPtr columnHeader = SendMessage(listViewControl.Handle, LVM_GETHEADER, IntPtr.Zero, IntPtr.Zero); 61 | for (int columnNumber = 0; columnNumber <= listViewControl.Columns.Count - 1; columnNumber++) 62 | { 63 | var columnPtr = new IntPtr(columnNumber); 64 | var lvColumn = new HDITEM 65 | { 66 | mask = HDITEM.Mask.Format 67 | }; 68 | 69 | if (SendMessage(columnHeader, HDM_GETITEM, columnPtr, ref lvColumn) == IntPtr.Zero) 70 | { 71 | throw new Win32Exception(); 72 | } 73 | 74 | if (order != SortOrder.None && columnNumber == columnIndex) 75 | { 76 | switch (order) 77 | { 78 | case SortOrder.Ascending: 79 | lvColumn.fmt &= ~HDITEM.Format.SortDown; 80 | lvColumn.fmt |= HDITEM.Format.SortUp; 81 | break; 82 | case SortOrder.Descending: 83 | lvColumn.fmt &= ~HDITEM.Format.SortUp; 84 | lvColumn.fmt |= HDITEM.Format.SortDown; 85 | break; 86 | } 87 | } 88 | else 89 | { 90 | lvColumn.fmt &= ~HDITEM.Format.SortDown & ~HDITEM.Format.SortUp; 91 | } 92 | 93 | if (SendMessage(columnHeader, HDM_SETITEM, columnPtr, ref lvColumn) == IntPtr.Zero) 94 | { 95 | throw new Win32Exception(); 96 | } 97 | } 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/DetectionDetail.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Runtime.CompilerServices; 4 | using System.Text; 5 | using UtfUnknown.Core; 6 | using UtfUnknown.Core.Probers; 7 | 8 | [assembly: InternalsVisibleTo("UtfUnknown.Tests, PublicKey=" + 9 | "002400000480000094000000060200000024000052534131000400000100010029f6b4defac763" + 10 | "66721687460b44b7619e8e19a411f785279316fdae2f6965edfa4a460304fe8b4ed796d5356a1c" + 11 | "225131b9087983d9ff9530df9307eab17d88cd4f1005a45f6f35523445d1ff7323322f3060cffc" + 12 | "0d70d0cb1b4b7d46081bbead31844927aaadb0508b64bf298de5abe5ea5cca8b92490c961b7b75" + 13 | "13c2c2a9")] 14 | namespace UtfUnknown 15 | { 16 | /// 17 | /// Detailed result of a detection 18 | /// 19 | public class DetectionDetail 20 | { 21 | /// 22 | /// A dictionary for replace unsupported codepage name in .NET to the nearly identical version. 23 | /// 24 | private static readonly Dictionary FixedToSupportCodepageName = 25 | new Dictionary 26 | { 27 | // CP949 is superset of ks_c_5601-1987 (see https://github.com/CharsetDetector/UTF-unknown/pull/74#issuecomment-550362133) 28 | {CodepageName.CP949, CodepageName.KS_C_5601_1987}, 29 | {CodepageName.ISO_2022_CN, CodepageName.X_CP50227}, 30 | }; 31 | 32 | /// 33 | /// New result 34 | /// 35 | public DetectionDetail(string encodingShortName, float confidence, CharsetProber prober = null, 36 | TimeSpan? time = null, string statusLog = null) 37 | { 38 | EncodingName = encodingShortName; 39 | Confidence = confidence; 40 | Encoding = GetEncoding(encodingShortName); 41 | Prober = prober; 42 | Time = time; 43 | StatusLog = statusLog; 44 | } 45 | 46 | /// 47 | /// New Result 48 | /// 49 | public DetectionDetail(CharsetProber prober, TimeSpan? time = null) 50 | : this(prober.GetCharsetName(), prober.GetConfidence(), prober, time, prober.DumpStatus()) 51 | { 52 | } 53 | 54 | /// 55 | /// The (short) name of the detected encoding. For full details, check 56 | /// 57 | public string EncodingName { get; } 58 | 59 | /// 60 | /// The detected encoding. 61 | /// 62 | public Encoding Encoding { get; set; } 63 | 64 | /// 65 | /// The confidence of the found encoding. Between 0 and 1. 66 | /// 67 | public float Confidence { get; set; } 68 | 69 | /// 70 | /// The used prober for detection 71 | /// 72 | public CharsetProber Prober { get; set; } 73 | 74 | /// 75 | /// A Byte Order Mark was detected 76 | /// 77 | public bool HasBOM { get; set; } 78 | 79 | /// 80 | /// The time spend 81 | /// 82 | public TimeSpan? Time { get; set; } 83 | 84 | public string StatusLog { get; set; } 85 | 86 | public override string ToString() 87 | { 88 | return $"Detected {EncodingName} with confidence of {Confidence}. (BOM: {HasBOM})"; 89 | } 90 | 91 | internal static Encoding GetEncoding(string encodingShortName) 92 | { 93 | var encodingName = FixedToSupportCodepageName.TryGetValue(encodingShortName, out var supportCodepageName) 94 | ? supportCodepageName 95 | : encodingShortName; 96 | try 97 | { 98 | return Encoding.GetEncoding(encodingName); 99 | } 100 | catch (ArgumentException) // unsupported name 101 | { 102 | #if NETSTANDARD && !NETSTANDARD1_0 || NETCOREAPP3_0 103 | return CodePagesEncodingProvider.Instance.GetEncoding(encodingName); 104 | #else 105 | return null; 106 | #endif 107 | } 108 | } 109 | } 110 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Bulgarian/Iso_8859_5_BulgarianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangBulgarianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Bulgarian 44 | { 45 | public class Iso_8859_5_BulgarianModel : BulgarianModel 46 | { 47 | // CTR: Control characters that usually does not exist in any text 48 | // RET: Carriage/Return 49 | // SYM: symbol(punctuation) that does not belong to word 50 | // NUM: 0 - 9 51 | // 52 | // Character Mapping Table: 53 | // this table is modified base on win1251BulgarianCharToOrderMap, so 54 | // only number <64 is sure valid 55 | 56 | private static byte[] CHAR_TO_ORDER_MAP = { 57 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 58 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 59 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 60 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 61 | SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, /* 4X */ 62 | 110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, /* 5X */ 63 | SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, /* 6X */ 64 | 116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, /* 7X */ 65 | 194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209, /* 8X */ 66 | 210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225, /* 9X */ 67 | 81,226,227,228,229,230,105,231,232,233,234,235,236, 45,237,238, /* AX */ 68 | 31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, /* BX */ 69 | 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56, /* CX */ 70 | 1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, /* DX */ 71 | 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16, /* EX */ 72 | 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,NUM,SYM, /* FX */ 73 | }; 74 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 75 | 76 | public Iso_8859_5_BulgarianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_5) 77 | { 78 | } 79 | } 80 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Bulgarian/Windows_1251_BulgarianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangBulgarianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Bulgarian 44 | { 45 | public class Windows_1251_BulgarianModel : BulgarianModel 46 | { 47 | // CTR: Control characters that usually does not exist in any text 48 | // RET: Carriage/Return 49 | // SYM: symbol(punctuation) that does not belong to word 50 | // NUM: 0 - 9 51 | // 52 | // Character Mapping Table: 53 | // this table is modified base on win1251BulgarianCharToOrderMap, so 54 | // only number <64 is sure valid 55 | 56 | private static byte[] CHAR_TO_ORDER_MAP = { 57 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 58 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 59 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 60 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 61 | SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, /* 4X */ 62 | 110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, /* 5X */ 63 | SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, /* 6X */ 64 | 116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, /* 7X */ 65 | 206,207,208,209,210,211,212,213,120,214,215,216,217,218,219,220, /* 8X */ 66 | 221, 78, 64, 83,121, 98,117,105,ILL,223,224,225,226,227,228,229, /* 9X */ 67 | 88,230,231,232,233,122, 89,106,234,235,236,237,238, 45,239,240, /* AX */ 68 | 73, 80,118,114,241,242,243,244,245, 62, 58,246,247,248,249,250, /* BX */ 69 | 31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, /* CX */ 70 | 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,NUM, 60, 56, /* DX */ 71 | 1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, /* EX */ 72 | 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,SYM, 42, 16, /* FX */ 73 | }; 74 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 75 | 76 | public Windows_1251_BulgarianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1251) 77 | { 78 | } 79 | } 80 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Hebrew/Windows_1255_HebrewModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * Shy Shalom 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | /* 40 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 41 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangHebrewModel.cpp 42 | * and adjusted to language specific support. 43 | */ 44 | 45 | namespace UtfUnknown.Core.Models.SingleByte.Hebrew 46 | { 47 | public class Windows_1255_HebrewModel : HebrewModel 48 | { 49 | // 255: Control characters that usually does not exist in any text 50 | // 254: Carriage/Return 51 | // 253: symbol (punctuation) that does not belong to word 52 | // 252: 0 - 9 53 | 54 | // Windows-1255 language model 55 | // Character Mapping Table: 56 | private readonly static byte[]CHAR_TO_ORDER_MAP = { 57 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 58 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 59 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 60 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 61 | SYM, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85, /* 4X */ 62 | 78,121, 86, 71, 67,102,107, 84,114,103,115,SYM,SYM,SYM,SYM,SYM, /* 5X */ 63 | SYM, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49, /* 6X */ 64 | 66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,SYM,SYM,SYM,SYM,SYM, /* 7X */ 65 | 124,ILL,203,204,205, 40, 58,206,207,208,ILL,210,ILL,ILL,ILL,ILL, /* 8X */ 66 | ILL, 83, 52, 47, 46, 72, 32, 94,216,113,ILL,109,ILL,ILL,ILL,ILL, /* 9X */ 67 | 34,116,222,118,100,223,224,117,119,104,125,225,226, 87, 99,227, /* AX */ 68 | 106,122,123,228, 55,229,230,101,231,232,120,233, 48, 39, 57,234, /* BX */ 69 | 30, 59, 41, 88, 33, 37, 36, 31, 29, 35,235, 62, 28,236,126,237, /* CX */ 70 | 238, 38, 45,239,240,241,242,243,127,ILL,ILL,ILL,ILL,ILL,ILL,ILL, /* DX */ 71 | 9, 8, 20, 16, 3, 2, 24, 14, 22, 1, 25, 15, 4, 11, 6, 23, /* EX */ 72 | 12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,ILL,ILL,128, 96,ILL, /* FX */ 73 | }; 74 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 75 | 76 | public Windows_1255_HebrewModel() : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1255) 77 | { 78 | } 79 | } 80 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Probers/MultiByte/UTF8Prober.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * Shy Shalom 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | using System.Text; 40 | 41 | using UtfUnknown.Core.Models; 42 | using UtfUnknown.Core.Models.MultiByte; 43 | 44 | namespace UtfUnknown.Core.Probers.MultiByte 45 | { 46 | public class UTF8Prober : CharsetProber 47 | { 48 | private static float ONE_CHAR_PROB = 0.50f; 49 | private CodingStateMachine codingSM; 50 | private int numOfMBChar; 51 | 52 | public UTF8Prober() 53 | { 54 | numOfMBChar = 0; 55 | codingSM = new CodingStateMachine(new UTF8_SMModel()); 56 | Reset(); 57 | } 58 | 59 | public override string GetCharsetName() 60 | { 61 | return CodepageName.UTF8; 62 | } 63 | 64 | public override void Reset() 65 | { 66 | codingSM.Reset(); 67 | numOfMBChar = 0; 68 | state = ProbingState.Detecting; 69 | } 70 | 71 | public override ProbingState HandleData(byte[] buf, int offset, int len) 72 | { 73 | int max = offset + len; 74 | 75 | for (int i = offset; i < max; i++) 76 | { 77 | 78 | var codingState = codingSM.NextState(buf[i]); 79 | 80 | if (codingState == StateMachineModel.ERROR) 81 | { 82 | state = ProbingState.NotMe; 83 | break; 84 | } 85 | 86 | if (codingState == StateMachineModel.ITSME) 87 | { 88 | state = ProbingState.FoundIt; 89 | break; 90 | } 91 | 92 | if (codingState == StateMachineModel.START) 93 | { 94 | if (codingSM.CurrentCharLen >= 2) 95 | numOfMBChar++; 96 | } 97 | } 98 | 99 | if (state == ProbingState.Detecting) 100 | if (GetConfidence() > SHORTCUT_THRESHOLD) 101 | state = ProbingState.FoundIt; 102 | 103 | return state; 104 | } 105 | 106 | public override float GetConfidence(StringBuilder status = null) 107 | { 108 | float unlike = 0.99f; 109 | float confidence; 110 | 111 | if (numOfMBChar < 6) 112 | { 113 | for (int i = 0; i < numOfMBChar; i++) 114 | unlike *= ONE_CHAR_PROB; 115 | 116 | confidence = 1.0f - unlike; 117 | } 118 | else 119 | { 120 | confidence = 0.99f; 121 | } 122 | 123 | return confidence; 124 | } 125 | } 126 | } -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SequenceModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * Shy Shalom 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | using System; 40 | 41 | namespace UtfUnknown.Core.Models 42 | { 43 | public abstract class SequenceModel 44 | { 45 | // Codepoints 46 | 47 | // Illegal codepoints 48 | public const byte ILL = 255; 49 | // Control character 50 | public const byte CTR = 254; 51 | // Symbols and punctuation that does not belong to words 52 | public const byte SYM = 253; 53 | // Return/Line feeds 54 | public const byte RET = 252; 55 | // Numbers 0-9 56 | public const byte NUM = 251; 57 | 58 | // [256] table use to find a char's order 59 | protected byte[] charToOrderMap; 60 | 61 | // freqCharCount x freqCharCount table to find a 2-char sequence's 62 | // frequency 63 | protected byte[] precedenceMatrix; 64 | 65 | // The count of frequent characters 66 | protected int freqCharCount; 67 | 68 | public int FreqCharCount 69 | { 70 | get { return freqCharCount; } 71 | } 72 | 73 | // freqSeqs / totalSeqs 74 | protected float typicalPositiveRatio; 75 | 76 | public float TypicalPositiveRatio { 77 | get { return typicalPositiveRatio; } 78 | } 79 | 80 | 81 | /// 82 | /// TODO not used? 83 | /// 84 | protected bool keepEnglishLetter; 85 | 86 | /// 87 | /// TODO not used? 88 | /// 89 | public bool KeepEnglishLetter { 90 | get { return keepEnglishLetter; } 91 | } 92 | 93 | protected string charsetName; 94 | 95 | public string CharsetName { 96 | get { return charsetName; } 97 | } 98 | 99 | public SequenceModel( 100 | byte[] charToOrderMap, 101 | byte[] precedenceMatrix, 102 | int freqCharCount, 103 | float typicalPositiveRatio, 104 | bool keepEnglishLetter, 105 | String charsetName) 106 | { 107 | this.charToOrderMap = charToOrderMap; 108 | this.precedenceMatrix = precedenceMatrix; 109 | this.freqCharCount = freqCharCount; 110 | this.typicalPositiveRatio = typicalPositiveRatio; 111 | this.keepEnglishLetter = keepEnglishLetter; 112 | this.charsetName = charsetName; 113 | } 114 | 115 | public byte GetOrder(byte b) 116 | { 117 | return charToOrderMap[b]; 118 | } 119 | 120 | public byte GetPrecedence(int pos) 121 | { 122 | return precedenceMatrix[pos]; 123 | } 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Probers/MultiByte/Chinese/EUCTWProber.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * Shy Shalom 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | using System.Text; 40 | 41 | using UtfUnknown.Core.Analyzers.Chinese; 42 | using UtfUnknown.Core.Models; 43 | using UtfUnknown.Core.Models.MultiByte.Chinese; 44 | 45 | namespace UtfUnknown.Core.Probers.MultiByte.Chinese 46 | { 47 | public class EUCTWProber : CharsetProber 48 | { 49 | private CodingStateMachine codingSM; 50 | private EUCTWDistributionAnalyser distributionAnalyser; 51 | private byte[] lastChar = new byte[2]; 52 | 53 | public EUCTWProber() 54 | { 55 | codingSM = new CodingStateMachine(new EUCTWSMModel()); 56 | distributionAnalyser = new EUCTWDistributionAnalyser(); 57 | Reset(); 58 | } 59 | 60 | public override ProbingState HandleData(byte[] buf, int offset, int len) 61 | { 62 | int codingState; 63 | int max = offset + len; 64 | 65 | for (int i = 0; i < max; i++) 66 | { 67 | codingState = codingSM.NextState(buf[i]); 68 | if (codingState == StateMachineModel.ERROR) 69 | { 70 | state = ProbingState.NotMe; 71 | break; 72 | } 73 | 74 | if (codingState == StateMachineModel.ITSME) 75 | { 76 | state = ProbingState.FoundIt; 77 | break; 78 | } 79 | 80 | if (codingState == StateMachineModel.START) 81 | { 82 | int charLen = codingSM.CurrentCharLen; 83 | if (i == offset) 84 | { 85 | lastChar[1] = buf[offset]; 86 | distributionAnalyser.HandleOneChar(lastChar, 0, charLen); 87 | } 88 | else 89 | { 90 | distributionAnalyser.HandleOneChar(buf, i - 1, charLen); 91 | } 92 | } 93 | } 94 | 95 | lastChar[0] = buf[max - 1]; 96 | 97 | if (state == ProbingState.Detecting) 98 | if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 99 | state = ProbingState.FoundIt; 100 | 101 | return state; 102 | } 103 | 104 | public override string GetCharsetName() 105 | { 106 | return CodepageName.EUC_TW; 107 | } 108 | 109 | public override void Reset() 110 | { 111 | codingSM.Reset(); 112 | state = ProbingState.Detecting; 113 | distributionAnalyser.Reset(); 114 | } 115 | 116 | public override float GetConfidence(StringBuilder status = null) 117 | { 118 | return distributionAnalyser.GetConfidence(); 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Czech/Ibm852_CzechModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCzechModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Czech 44 | { 45 | public class Ibm852_CzechModel : CzechModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-21 03:28:11.733089 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */ 73 | 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */ 75 | 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | 139, 43, 24,140, 42, 31,141,142,143,144,145,146,147,148, 42,149, /* 8X */ 77 | 24,150,151,152, 41, 45, 45, 46, 46, 41, 43, 38, 38,153,SYM, 25, /* 9X */ 78 | 18, 11, 37, 33,154,155, 26, 26,156,157,SYM,158, 25,159,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM, 18,160, 23,161,SYM,SYM,SYM,SYM,162,163,SYM, /* BX */ 80 | SYM,SYM,SYM,SYM,SYM,SYM,164,165,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ 81 | 166,167, 39,168, 39, 35, 11,169, 23,SYM,SYM,SYM,SYM,170, 31,SYM, /* DX */ 82 | 37,171,172,173,174, 35, 29, 29,175, 33,176,177, 28, 28,178,SYM, /* EX */ 83 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,179, 27, 27,SYM,SYM, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Ibm852_CzechModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Polish/Ibm852_PolishModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangPolishModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Polish 44 | { 45 | public class Ibm852_PolishModel : PolishModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-21 17:21:04.405363 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 4X */ 73 | 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 0, 20, 11, 14, 3, 26, 21, 22, 1, 18, 7, 15, 16, 5, 2, /* 6X */ 75 | 13, 36, 4, 6, 10, 17, 31, 9, 33, 12, 8,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | 47, 39, 34, 54, 40, 78, 30, 47, 19, 58, 49, 49, 77, 32, 40, 30, /* 8X */ 77 | 34, 79, 80, 55, 38, 74, 74, 28, 28, 38, 39, 76, 76, 19,SYM, 44, /* 9X */ 78 | 35, 37, 24, 51, 25, 25, 45, 45, 23, 23,SYM, 32, 44, 56,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM, 35, 54, 46, 56,SYM,SYM,SYM,SYM, 27, 27,SYM, /* BX */ 80 | SYM,SYM,SYM,SYM,SYM,SYM, 53, 53,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ 81 | 70, 70, 69, 58, 69, 81, 37, 77, 46,SYM,SYM,SYM,SYM, 65, 82,SYM, /* DX */ 82 | 24, 57, 55, 29, 29, 83, 41, 41, 84, 51, 85, 86, 60, 60, 65,SYM, /* EX */ 83 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 87, 50, 50,SYM,SYM, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Ibm852_PolishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Slovak/Ibm852_SlovakModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangSlovakModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Slovak 44 | { 45 | public class Ibm852_SlovakModel : SlovakModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-21 13:33:10.331339 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 1, 20, 15, 11, 2, 29, 30, 17, 4, 18, 7, 10, 12, 3, 0, /* 4X */ 73 | 13, 40, 6, 8, 5, 14, 9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 1, 20, 15, 11, 2, 29, 30, 17, 4, 18, 7, 10, 12, 3, 0, /* 6X */ 75 | 13, 40, 6, 8, 5, 14, 9, 37, 34, 19, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | 51, 46, 25, 62, 38, 48, 47, 51, 49, 54, 50, 50, 63, 64, 38, 47, /* 8X */ 77 | 25, 42, 42, 32, 43, 33, 33, 65, 66, 43, 46, 31, 31, 49,SYM, 24, /* 9X */ 78 | 21, 23, 35, 27, 67, 68, 26, 26, 69, 70,SYM, 71, 24, 59,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM, 21, 72, 41, 59,SYM,SYM,SYM,SYM, 61, 61,SYM, /* BX */ 80 | SYM,SYM,SYM,SYM,SYM,SYM, 56, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ 81 | 55, 55, 39, 54, 39, 36, 23, 73, 41,SYM,SYM,SYM,SYM, 74, 48,SYM, /* DX */ 82 | 35, 58, 32, 52, 52, 36, 28, 28, 44, 27, 44, 60, 22, 22, 75,SYM, /* EX */ 83 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 60, 45, 45,SYM,SYM, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Ibm852_SlovakModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Czech/Iso_8859_2_CzechModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCzechModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Czech 44 | { 45 | public class Iso_8859_2_CzechModel : CzechModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-21 03:28:11.733089 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */ 73 | 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */ 75 | 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ 77 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ 78 | SYM,180,SYM,181,SYM, 45, 46,SYM,SYM, 29,182, 38,183,SYM, 26,184, /* AX */ 79 | SYM,185,SYM,186,SYM, 45, 46,SYM,SYM, 29,187, 38,188,SYM, 26,189, /* BX */ 80 | 190, 18,191,192, 42,193,194,195, 25, 24,196,197, 23, 11,198, 39, /* CX */ 81 | 199,200, 35, 37,201,202, 41,SYM, 27, 31, 33,203, 43, 28,204,205, /* DX */ 82 | 206, 18,207,208, 42,209,210,211, 25, 24,212,213, 23, 11,214, 39, /* EX */ 83 | 215,216, 35, 37,217,218, 41,SYM, 27, 31, 33,219, 43, 28,220,SYM, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Iso_8859_2_CzechModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_2) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Irish/Iso_8859_1_IrishModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangIrishModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Irish 44 | { 45 | public class Iso_8859_1_IrishModel : IrishModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-27 00:33:40.158624 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ 73 | 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ 75 | 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ 77 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ 78 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ 80 | 45, 14, 46, 47, 33, 48, 49, 39, 35, 18, 42, 37, 50, 17, 51, 40, /* CX */ 81 | 52, 32, 43, 22, 53, 54, 38,SYM, 36, 55, 20, 56, 31, 57, 58, 59, /* DX */ 82 | 60, 14, 61, 62, 33, 63, 64, 39, 35, 18, 42, 37, 65, 17, 66, 40, /* EX */ 83 | 67, 32, 43, 22, 68, 69, 38,SYM, 36, 70, 20, 71, 31, 72, 73, 74, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Iso_8859_1_IrishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_1) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Irish/Iso_8859_9_IrishModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangIrishModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Irish 44 | { 45 | public class Iso_8859_9_IrishModel : IrishModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-27 00:33:40.158624 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ 73 | 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ 75 | 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ 77 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ 78 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM,148,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ 80 | 149, 14,150,151, 33,152,153, 39, 35, 18, 42, 37,154, 17,155, 40, /* CX */ 81 | 156, 32, 43, 22,157,158, 38,SYM, 36,159, 20,160, 31,161,162,163, /* DX */ 82 | 164, 14,165,166, 33,167,168, 39, 35, 18, 42, 37,169, 17,170, 40, /* EX */ 83 | 171, 32, 43, 22,172,173, 38,SYM, 36,174, 20,175, 31, 41,176,177, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Iso_8859_9_IrishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_9) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Slovene/Ibm852_SloveneModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangSloveneModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Slovene 44 | { 45 | public class Ibm852_SloveneModel : SloveneModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-28 22:06:46.134717 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 4X */ 73 | 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 0, 18, 19, 13, 1, 24, 17, 20, 2, 8, 12, 9, 14, 4, 3, /* 6X */ 75 | 11, 28, 5, 6, 7, 16, 10, 27, 25, 26, 15,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | 34,249, 29,249,249,249, 37, 34,249, 36,249,249,249,249,249, 37, /* 8X */ 77 | 29,249,249, 35,249,249,249,249,249,249,249,249,249,249,SYM, 21, /* 9X */ 78 | 32, 30, 31, 39,249,249, 23, 23,249,249,SYM,249, 21,249,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM, 32,249,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* BX */ 80 | SYM,SYM,SYM,SYM,SYM,SYM,249,249,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ 81 | 249,249,249, 36,249,249, 30,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* DX */ 82 | 31,249, 35,249,249,249, 22, 22,249, 39,249,249, 40, 40,249,SYM, /* EX */ 83 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,249,249,249,SYM,SYM, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Ibm852_SloveneModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Probers/MultiByte/Chinese/Big5Prober.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Universal charset detector code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 2001 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * Shy Shalom 23 | * Rudi Pettazzi (C# port) 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * either the GNU General Public License Version 2 or later (the "GPL"), or 27 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 | * in which case the provisions of the GPL or the LGPL are applicable instead 29 | * of those above. If you wish to allow use of your version of this file only 30 | * under the terms of either the GPL or the LGPL, and not to allow others to 31 | * use your version of this file under the terms of the MPL, indicate your 32 | * decision by deleting the provisions above and replace them with the notice 33 | * and other provisions required by the GPL or the LGPL. If you do not delete 34 | * the provisions above, a recipient may use your version of this file under 35 | * the terms of any one of the MPL, the GPL or the LGPL. 36 | * 37 | * ***** END LICENSE BLOCK ***** */ 38 | 39 | using System.Text; 40 | 41 | using UtfUnknown.Core.Analyzers.Chinese; 42 | using UtfUnknown.Core.Models; 43 | using UtfUnknown.Core.Models.MultiByte.Chinese; 44 | 45 | namespace UtfUnknown.Core.Probers.MultiByte.Chinese 46 | { 47 | public class Big5Prober : CharsetProber 48 | { 49 | //void GetDistribution(PRUint32 aCharLen, const char* aStr); 50 | private CodingStateMachine codingSM; 51 | private BIG5DistributionAnalyser distributionAnalyser; 52 | private byte[] lastChar = new byte[2]; 53 | 54 | public Big5Prober() 55 | { 56 | codingSM = new CodingStateMachine(new BIG5SMModel()); 57 | distributionAnalyser = new BIG5DistributionAnalyser(); 58 | Reset(); 59 | } 60 | 61 | public override ProbingState HandleData(byte[] buf, int offset, int len) 62 | { 63 | int max = offset + len; 64 | 65 | for (int i = offset; i < max; i++) 66 | { 67 | var codingState = codingSM.NextState(buf[i]); 68 | if (codingState == StateMachineModel.ERROR) 69 | { 70 | state = ProbingState.NotMe; 71 | break; 72 | } 73 | if (codingState == StateMachineModel.ITSME) 74 | { 75 | state = ProbingState.FoundIt; 76 | break; 77 | } 78 | if (codingState == StateMachineModel.START) 79 | { 80 | int charLen = codingSM.CurrentCharLen; 81 | if (i == offset) 82 | { 83 | lastChar[1] = buf[offset]; 84 | distributionAnalyser.HandleOneChar(lastChar, 0, charLen); 85 | } 86 | else 87 | { 88 | distributionAnalyser.HandleOneChar(buf, i - 1, charLen); 89 | } 90 | } 91 | } 92 | 93 | lastChar[0] = buf[max - 1]; 94 | 95 | if (state == ProbingState.Detecting) 96 | if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) 97 | state = ProbingState.FoundIt; 98 | 99 | return state; 100 | } 101 | 102 | public override void Reset() 103 | { 104 | codingSM.Reset(); 105 | state = ProbingState.Detecting; 106 | distributionAnalyser.Reset(); 107 | } 108 | 109 | public override string GetCharsetName() 110 | { 111 | return CodepageName.BIG5; 112 | } 113 | 114 | public override float GetConfidence(StringBuilder status = null) 115 | { 116 | return distributionAnalyser.GetConfidence(); 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Croatian/Ibm852_CroatianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCroatianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Croatian 44 | { 45 | public class Ibm852_CroatianModel : CroatianModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-25 23:50:27.590137 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 4X */ 73 | 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 0, 19, 20, 15, 2, 22, 17, 21, 1, 7, 9, 10, 12, 4, 3, /* 6X */ 75 | 14, 30, 6, 8, 5, 11, 13, 28, 29, 27, 16,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | 39, 33, 31, 43, 36,249, 25, 39, 40, 47,249,249,249,249, 36, 25, /* 8X */ 77 | 31,249,249,249, 32,249,249,249,249, 32, 33,249,249, 40,SYM, 18, /* 9X */ 78 | 41,249, 44, 48,249,249, 24, 24,249,249,SYM,249, 18,249,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM, 41, 43,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* BX */ 80 | SYM,SYM,SYM,SYM,SYM,SYM,249,249,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ 81 | 26, 26,249, 47,249,249,249,249,249,SYM,SYM,SYM,SYM,249,249,SYM, /* DX */ 82 | 44,249,249,249,249,249, 23, 23,249, 48,249,249,249,249,249,SYM, /* EX */ 83 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,249,249,249,SYM,SYM, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Ibm852_CroatianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Irish/Iso_8859_15_IrishModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangIrishModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Irish 44 | { 45 | public class Iso_8859_15_IrishModel : IrishModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-27 00:33:40.158624 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 4X */ 73 | 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 0, 16, 8, 11, 5, 19, 12, 3, 1, 27, 25, 9, 13, 2, 10, /* 6X */ 75 | 21, 30, 4, 6, 7, 15, 23, 26, 29, 24, 28,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ 77 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ 78 | SYM,SYM,SYM,SYM,SYM,SYM, 34,SYM, 34,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,112,113,SYM,SYM,114,SYM,SYM,SYM,115,116,117,SYM, /* BX */ 80 | 118, 14,119,120, 33,121,122, 39, 35, 18, 42, 37,123, 17,124, 40, /* CX */ 81 | 125, 32, 43, 22,126,127, 38,SYM, 36,128, 20,129, 31,130,131,132, /* DX */ 82 | 133, 14,134,135, 33,136,137, 39, 35, 18, 42, 37,138, 17,139, 40, /* EX */ 83 | 140, 32, 43, 22,141,142, 38,SYM, 36,143, 20,144, 31,145,146,147, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Iso_8859_15_IrishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_15) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Romanian/Ibm852_RomanianModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangRomanianModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Romanian 44 | { 45 | public class Ibm852_RomanianModel : RomanianModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-28 18:58:13.757152 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 2, 17, 9, 11, 0, 16, 15, 23, 1, 26, 27, 6, 12, 4, 8, /* 4X */ 73 | 13, 32, 3, 10, 5, 7, 21, 29, 25, 28, 22,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 2, 17, 9, 11, 0, 16, 15, 23, 1, 26, 27, 6, 12, 4, 8, /* 6X */ 75 | 13, 32, 3, 10, 5, 7, 21, 29, 25, 28, 22,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | 42, 39, 31, 24, 33,138, 35, 42, 46, 49, 44, 44, 20,139, 33, 35, /* 8X */ 77 | 31,140,141,142, 36,143,144, 56, 56, 36, 39,145,146, 46,SYM, 41, /* 9X */ 78 | 30, 37, 34, 47,147,148, 40, 40,149,150,SYM,151, 41,152,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM, 30, 24, 51,153,SYM,SYM,SYM,SYM,154,155,SYM, /* BX */ 80 | SYM,SYM,SYM,SYM,SYM,SYM, 14, 14,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ 81 | 43, 43,156, 49,157,158, 37, 20, 51,SYM,SYM,SYM,SYM,159,160,SYM, /* DX */ 82 | 34, 57,161, 52, 52,162, 38, 38,163, 47,164, 50, 54, 54,165,SYM, /* EX */ 83 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 50, 55, 55,SYM,SYM, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Ibm852_RomanianModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM852) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Arabic/Iso_8859_6_ArabicModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangArabicModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Arabic 44 | { 45 | public class Iso_8859_6_ArabicModel : ArabicModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2015-12-13 18:33:58.848027 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 52, 72, 61, 68, 74, 69, 59, 78, 60, 90, 86, 67, 65, 71, 75, /* 4X */ 73 | 64, 85, 76, 55, 57, 79, 81, 70, 82, 87, 91,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 37, 58, 49, 47, 38, 54, 66, 46, 39, 88, 63, 45, 51, 43, 40, /* 6X */ 75 | 62, 89, 42, 44, 41, 50, 77, 73, 83, 56, 80,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ 77 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ 78 | SYM,ILL,ILL,ILL,SYM,ILL,ILL,ILL,ILL,ILL,ILL,ILL,SYM,SYM,ILL,ILL, /* AX */ 79 | ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,SYM,ILL,ILL,ILL,SYM, /* BX */ 80 | ILL, 32, 34, 15, 35, 22, 31, 0, 9, 8, 7, 27, 19, 18, 25, 11, /* CX */ 81 | 30, 5, 26, 12, 21, 23, 28,SYM, 33, 10, 29,ILL,ILL,ILL,ILL,ILL, /* DX */ 82 | 36, 13, 14, 17, 1, 3, 6, 16, 4, 24, 2,SYM,SYM,SYM,SYM,SYM, /* EX */ 83 | SYM,SYM,SYM,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Iso_8859_6_ArabicModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_6) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Czech/Windows_1250_CzechModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangCzechModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Czech 44 | { 45 | public class Windows_1250_CzechModel : CzechModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-09-21 03:28:11.733089 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 4X */ 73 | 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 3, 22, 14, 15, 1, 30, 32, 17, 4, 21, 12, 10, 16, 2, 0, /* 6X */ 75 | 8, 40, 9, 6, 5, 13, 7, 36, 34, 20, 19,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM, 29,SYM, 46, 38, 26, 47, /* 8X */ 77 | ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 29,SYM, 46, 38, 26, 48, /* 9X */ 78 | SYM,SYM,SYM, 49,SYM, 50,SYM,SYM,SYM,SYM, 51,SYM,SYM,SYM,SYM, 52, /* AX */ 79 | SYM,SYM,SYM, 53,SYM,SYM,SYM,SYM,SYM, 54, 55,SYM, 45,SYM, 45, 56, /* BX */ 80 | 57, 18, 58, 59, 42, 60, 61, 62, 25, 24, 63, 64, 23, 11, 65, 39, /* CX */ 81 | 66, 67, 35, 37, 68, 69, 41,SYM, 27, 31, 33, 70, 43, 28, 71, 72, /* DX */ 82 | 73, 18, 74, 75, 42, 76, 77, 78, 25, 24, 79, 80, 23, 11, 81, 39, /* EX */ 83 | 82, 83, 35, 37, 84, 85, 41,SYM, 27, 31, 33, 86, 43, 28, 87,SYM, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Windows_1250_CzechModel() : base(CHAR_TO_ORDER_MAP, CodepageName.WINDOWS_1250) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/Danish/Iso_8859_1_DanishModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangDanishModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.Danish 44 | { 45 | public class Iso_8859_1_DanishModel : DanishModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2016-02-19 17:56:42.163975 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 4X */ 73 | 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 4, 15, 24, 7, 0, 13, 10, 18, 5, 23, 11, 8, 12, 2, 9, /* 6X */ 75 | 17, 29, 1, 6, 3, 16, 14, 25, 27, 20, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ 77 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ 78 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM, 42,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ 80 | 71, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 72, 34, 73, 50, /* CX */ 81 | 43, 47, 51, 36, 52, 74, 30,SYM, 19, 75, 37, 44, 31, 46, 76, 48, /* DX */ 82 | 77, 33, 40, 35, 32, 21, 22, 38, 41, 28, 49, 45, 78, 34, 79, 50, /* EX */ 83 | 43, 47, 51, 36, 52, 80, 30,SYM, 19, 81, 37, 44, 31, 46, 82, 83, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Iso_8859_1_DanishModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_1) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/French/Iso_8859_1_FrenchModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangFrenchModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.French 44 | { 45 | public class Iso_8859_1_FrenchModel : FrenchModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2015-12-03 21:10:27.685575 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 4X */ 73 | 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 2, 18, 11, 10, 0, 17, 15, 19, 4, 25, 26, 7, 13, 3, 8, /* 6X */ 75 | 12, 20, 5, 1, 6, 9, 16, 30, 21, 22, 29,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ 77 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ 78 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM, 67,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ 80 | 24, 38, 32, 46, 49, 68, 47, 27, 23, 14, 28, 41, 69, 39, 33, 36, /* CX */ 81 | 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 70, /* DX */ 82 | 24, 38, 32, 46, 49, 71, 47, 27, 23, 14, 28, 41, 72, 39, 33, 36, /* EX */ 83 | 48, 45, 54, 40, 31, 55, 42,SYM, 52, 37, 43, 34, 44, 53, 50, 73, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Iso_8859_1_FrenchModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_1) 88 | { 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /sources/EncodingChecker/UtfUnknown/Core/Models/SingleByte/German/Iso_8859_1_GermanModel.cs: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * The contents of this file are subject to the Mozilla Public License Version 5 | * 1.1 (the "License"); you may not use this file except in compliance with 6 | * the License. You may obtain a copy of the License at 7 | * http://www.mozilla.org/MPL/ 8 | * 9 | * Software distributed under the License is distributed on an "AS IS" basis, 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 11 | * for the specific language governing rights and limitations under the 12 | * License. 13 | * 14 | * The Original Code is Mozilla Communicator client code. 15 | * 16 | * The Initial Developer of the Original Code is 17 | * Netscape Communications Corporation. 18 | * Portions created by the Initial Developer are Copyright (C) 1998 19 | * the Initial Developer. All Rights Reserved. 20 | * 21 | * Contributor(s): 22 | * 23 | * Alternatively, the contents of this file may be used under the terms of 24 | * either the GNU General Public License Version 2 or later (the "GPL"), or 25 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 | * in which case the provisions of the GPL or the LGPL are applicable instead 27 | * of those above. If you wish to allow use of your version of this file only 28 | * under the terms of either the GPL or the LGPL, and not to allow others to 29 | * use your version of this file under the terms of the MPL, indicate your 30 | * decision by deleting the provisions above and replace them with the notice 31 | * and other provisions required by the GPL or the LGPL. If you do not delete 32 | * the provisions above, a recipient may use your version of this file under 33 | * the terms of any one of the MPL, the GPL or the LGPL. 34 | * 35 | * ***** END LICENSE BLOCK ***** */ 36 | 37 | /* 38 | * The following part was imported from https://gitlab.freedesktop.org/uchardet/uchardet 39 | * The implementation of this feature was originally done on https://gitlab.freedesktop.org/uchardet/uchardet/blob/master/src/LangModels/LangGermanModel.cpp 40 | * and adjusted to language specific support. 41 | */ 42 | 43 | namespace UtfUnknown.Core.Models.SingleByte.German 44 | { 45 | public class Iso_8859_1_GermanModel : GermanModel 46 | { 47 | // Generated by BuildLangModel.py 48 | // On: 2015-12-03 22:50:46.518374 49 | 50 | // Character Mapping Table: 51 | // ILL: illegal character. 52 | // CTR: control character specific to the charset. 53 | // RET: carriage/return. 54 | // SYM: symbol (punctuation) that does not belong to word. 55 | // NUM: 0 - 9. 56 | 57 | // Other characters are ordered by probabilities 58 | // (0 is the most common character in the language). 59 | 60 | // Orders are generic to a language. So the codepoint with order X in 61 | // CHARSET1 maps to the same character as the codepoint with the same 62 | // order X in CHARSET2 for the same language. 63 | // As such, it is possible to get missing order. For instance the 64 | // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 65 | // even though they are both used for French. Same for the euro sign. 66 | 67 | private static byte[] CHAR_TO_ORDER_MAP = { 68 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ 69 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ 70 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ 71 | NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ 72 | SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 4X */ 73 | 18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ 74 | SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 6X */ 75 | 18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ 76 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ 77 | CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ 78 | SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ 79 | SYM,SYM,SYM,SYM,SYM, 65,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ 80 | 41, 31, 37, 44, 22, 49, 50, 35, 32, 29, 48, 43, 57, 33, 47, 52, /* CX */ 81 | 53, 39, 51, 34, 40, 55, 26,SYM, 38, 58, 46, 66, 24, 45, 67, 27, /* DX */ 82 | 41, 31, 37, 44, 22, 49, 50, 35, 32, 29, 48, 43, 57, 33, 47, 52, /* EX */ 83 | 53, 39, 51, 34, 40, 55, 26,SYM, 38, 58, 46, 68, 24, 45, 69, 56, /* FX */ 84 | }; 85 | /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ 86 | 87 | public Iso_8859_1_GermanModel() : base(CHAR_TO_ORDER_MAP, CodepageName.ISO_8859_1) 88 | { 89 | } 90 | } 91 | } 92 | --------------------------------------------------------------------------------